ia64/xen-unstable

view linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c @ 10601:8242c0c24db7

[LINUX] When ballooning out (returning memory to xen), don't try too hard
to allocate pages - this reduces the chance of the oom killer being invoked.

Signed-off-by: Steven Hand <steven@xensource.com>
author shand@kneesaa.uk.xensource.com
date Thu Jun 29 15:02:38 2006 +0100 (2006-06-29)
parents 9b35fada9e65
children c1850c659e40
line source
1 /******************************************************************************
2 * balloon.c
3 *
4 * Xen balloon driver - enables returning/claiming memory to/from Xen.
5 *
6 * Copyright (c) 2003, B Dragovic
7 * Copyright (c) 2003-2004, M Williamson, K Fraser
8 * Copyright (c) 2005 Dan M. Smith, IBM Corporation
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License version 2
12 * as published by the Free Software Foundation; or, when distributed
13 * separately from the Linux kernel or incorporated into other
14 * software packages, subject to the following license:
15 *
16 * Permission is hereby granted, free of charge, to any person obtaining a copy
17 * of this source file (the "Software"), to deal in the Software without
18 * restriction, including without limitation the rights to use, copy, modify,
19 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
20 * and to permit persons to whom the Software is furnished to do so, subject to
21 * the following conditions:
22 *
23 * The above copyright notice and this permission notice shall be included in
24 * all copies or substantial portions of the Software.
25 *
26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
29 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
30 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
31 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
32 * IN THE SOFTWARE.
33 */
35 #include <linux/config.h>
36 #include <linux/kernel.h>
37 #include <linux/module.h>
38 #include <linux/sched.h>
39 #include <linux/errno.h>
40 #include <linux/mm.h>
41 #include <linux/mman.h>
42 #include <linux/smp_lock.h>
43 #include <linux/pagemap.h>
44 #include <linux/bootmem.h>
45 #include <linux/highmem.h>
46 #include <linux/vmalloc.h>
47 #include <xen/xen_proc.h>
48 #include <asm/hypervisor.h>
49 #include <xen/balloon.h>
50 #include <xen/interface/memory.h>
51 #include <asm/pgalloc.h>
52 #include <asm/pgtable.h>
53 #include <asm/uaccess.h>
54 #include <asm/tlb.h>
55 #include <linux/list.h>
57 #include <xen/xenbus.h>
/* Convert a page count to KiB: PAGE_SHIFT turns pages into bytes, -10 bytes into KiB. */
#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))

#ifdef CONFIG_PROC_FS
/* /proc/xen/balloon entry; created in balloon_init(). */
static struct proc_dir_entry *balloon_pde;
#endif

/* Serializes balloon_process() invocations (process context only). */
static DECLARE_MUTEX(balloon_mutex);

/*
 * Protects atomic reservation decrease/increase against concurrent increases.
 * Also protects non-atomic updates of current_pages and driver_pages, and
 * balloon lists.
 */
DEFINE_SPINLOCK(balloon_lock);

/* We aim for 'current allocation' == 'target allocation'. */
static unsigned long current_pages;
static unsigned long target_pages;

/* We increase/decrease in batches which fit in a page */
static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];

/* VM /proc information for memory */
extern unsigned long totalram_pages;

/* We may hit the hard limit in Xen. If we do then we remember it. */
static unsigned long hard_limit;

/*
 * Drivers may alter the memory reservation independently, but they must
 * inform the balloon driver so that we can avoid hitting the hard limit.
 */
static unsigned long driver_pages;

/* List of ballooned pages, threaded through the mem_map array. */
static LIST_HEAD(ballooned_pages);
static unsigned long balloon_low, balloon_high; /* page counts by zone */

/* Main work function, always executed in process context. */
static void balloon_process(void *unused);
static DECLARE_WORK(balloon_worker, balloon_process, NULL);
static struct timer_list balloon_timer; /* retry timer; see balloon_alarm() */

/* When ballooning out (allocating memory to return to Xen) we don't really
   want the kernel to try too hard since that can trigger the oom killer. */
#define GFP_BALLOON \
	(GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)

/* Ballooned pages are threaded through page->lru while owned by us. */
#define PAGE_TO_LIST(p) (&(p)->lru)
#define LIST_TO_PAGE(l) list_entry((l), struct page, lru)
#define UNLIST_PAGE(p)				\
	do {					\
		list_del(PAGE_TO_LIST(p));	\
		PAGE_TO_LIST(p)->next = NULL;	\
		PAGE_TO_LIST(p)->prev = NULL;	\
	} while(0)

#define IPRINTK(fmt, args...) \
	printk(KERN_INFO "xen_mem: " fmt, ##args)
#define WPRINTK(fmt, args...) \
	printk(KERN_WARNING "xen_mem: " fmt, ##args)
121 /* balloon_append: add the given page to the balloon. */
122 static void balloon_append(struct page *page)
123 {
124 /* Lowmem is re-populated first, so highmem pages go at list tail. */
125 if (PageHighMem(page)) {
126 list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
127 balloon_high++;
128 } else {
129 list_add(PAGE_TO_LIST(page), &ballooned_pages);
130 balloon_low++;
131 }
132 }
134 /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
135 static struct page *balloon_retrieve(void)
136 {
137 struct page *page;
139 if (list_empty(&ballooned_pages))
140 return NULL;
142 page = LIST_TO_PAGE(ballooned_pages.next);
143 UNLIST_PAGE(page);
145 if (PageHighMem(page))
146 balloon_high--;
147 else
148 balloon_low--;
150 return page;
151 }
153 static struct page *balloon_first_page(void)
154 {
155 if (list_empty(&ballooned_pages))
156 return NULL;
157 return LIST_TO_PAGE(ballooned_pages.next);
158 }
160 static struct page *balloon_next_page(struct page *page)
161 {
162 struct list_head *next = PAGE_TO_LIST(page)->next;
163 if (next == &ballooned_pages)
164 return NULL;
165 return LIST_TO_PAGE(next);
166 }
168 static void balloon_alarm(unsigned long unused)
169 {
170 schedule_work(&balloon_worker);
171 }
173 static unsigned long current_target(void)
174 {
175 unsigned long target = min(target_pages, hard_limit);
176 if (target > (current_pages + balloon_low + balloon_high))
177 target = current_pages + balloon_low + balloon_high;
178 return target;
179 }
181 static int increase_reservation(unsigned long nr_pages)
182 {
183 unsigned long pfn, i, flags;
184 struct page *page;
185 long rc;
186 struct xen_memory_reservation reservation = {
187 .address_bits = 0,
188 .extent_order = 0,
189 .domid = DOMID_SELF
190 };
192 if (nr_pages > ARRAY_SIZE(frame_list))
193 nr_pages = ARRAY_SIZE(frame_list);
195 balloon_lock(flags);
197 page = balloon_first_page();
198 for (i = 0; i < nr_pages; i++) {
199 BUG_ON(page == NULL);
200 frame_list[i] = page_to_pfn(page);;
201 page = balloon_next_page(page);
202 }
204 set_xen_guest_handle(reservation.extent_start, frame_list);
205 reservation.nr_extents = nr_pages;
206 rc = HYPERVISOR_memory_op(
207 XENMEM_populate_physmap, &reservation);
208 if (rc < nr_pages) {
209 if (rc > 0) {
210 int ret;
212 /* We hit the Xen hard limit: reprobe. */
213 reservation.nr_extents = rc;
214 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
215 &reservation);
216 BUG_ON(ret != rc);
217 }
218 if (rc >= 0)
219 hard_limit = current_pages + rc - driver_pages;
220 goto out;
221 }
223 for (i = 0; i < nr_pages; i++) {
224 page = balloon_retrieve();
225 BUG_ON(page == NULL);
227 pfn = page_to_pfn(page);
228 BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
229 phys_to_machine_mapping_valid(pfn));
231 /* Update P->M and M->P tables. */
232 set_phys_to_machine(pfn, frame_list[i]);
233 xen_machphys_update(frame_list[i], pfn);
235 /* Link back into the page tables if not highmem. */
236 if (pfn < max_low_pfn) {
237 int ret;
238 ret = HYPERVISOR_update_va_mapping(
239 (unsigned long)__va(pfn << PAGE_SHIFT),
240 pfn_pte_ma(frame_list[i], PAGE_KERNEL),
241 0);
242 BUG_ON(ret);
243 }
245 /* Relinquish the page back to the allocator. */
246 ClearPageReserved(page);
247 set_page_count(page, 1);
248 __free_page(page);
249 }
251 current_pages += nr_pages;
252 totalram_pages = current_pages;
254 out:
255 balloon_unlock(flags);
257 return 0;
258 }
/*
 * Hand up to @nr_pages pages back to Xen.  Pages are allocated from the
 * kernel with GFP_BALLOON (no retry, no warning, no memalloc reserves)
 * so the OOM killer is not provoked; if allocation fails we stop early
 * and tell the caller to sleep and retry.  Returns non-zero ("need
 * sleep") when fewer than @nr_pages pages could be ballooned out.
 */
static int decrease_reservation(unsigned long nr_pages)
{
	unsigned long pfn, i, flags;
	struct page *page;
	void *v;
	int need_sleep = 0;
	int ret;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid = DOMID_SELF
	};

	/* Batch size is bounded by frame_list (one page of frame numbers). */
	if (nr_pages > ARRAY_SIZE(frame_list))
		nr_pages = ARRAY_SIZE(frame_list);

	for (i = 0; i < nr_pages; i++) {
		/* Deliberately weak allocation attempt: see GFP_BALLOON. */
		if ((page = alloc_page(GFP_BALLOON)) == NULL) {
			nr_pages = i;
			need_sleep = 1;
			break;
		}

		pfn = page_to_pfn(page);
		frame_list[i] = pfn_to_mfn(pfn);

		if (!PageHighMem(page)) {
			v = phys_to_virt(pfn << PAGE_SHIFT);
			scrub_pages(v, 1);
			/* Remove the linear-map PTE before the frame is
			 * handed back to Xen. */
			ret = HYPERVISOR_update_va_mapping(
				(unsigned long)v, __pte_ma(0), 0);
			BUG_ON(ret);
		}
#ifdef CONFIG_XEN_SCRUB_PAGES
		else {
			/* Highmem pages are scrubbed via a temporary kmap. */
			v = kmap(page);
			scrub_pages(v, 1);
			kunmap(page);
		}
#endif
	}

	/* Ensure that ballooned highmem pages don't have kmaps. */
	kmap_flush_unused();
	flush_tlb_all();

	balloon_lock(flags);

	/* No more mappings: invalidate P2M and add to balloon. */
	for (i = 0; i < nr_pages; i++) {
		pfn = mfn_to_pfn(frame_list[i]);
		set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
		balloon_append(pfn_to_page(pfn));
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents = nr_pages;
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	BUG_ON(ret != nr_pages);

	current_pages -= nr_pages;
	totalram_pages = current_pages;

	balloon_unlock(flags);

	return need_sleep;
}
328 /*
329 * We avoid multiple worker processes conflicting via the balloon mutex.
330 * We may of course race updates of the target counts (which are protected
331 * by the balloon lock), or with changes to the Xen hard limit, but we will
332 * recover from these in time.
333 */
334 static void balloon_process(void *unused)
335 {
336 int need_sleep = 0;
337 long credit;
339 down(&balloon_mutex);
341 do {
342 credit = current_target() - current_pages;
343 if (credit > 0)
344 need_sleep = (increase_reservation(credit) != 0);
345 if (credit < 0)
346 need_sleep = (decrease_reservation(-credit) != 0);
348 #ifndef CONFIG_PREEMPT
349 if (need_resched())
350 schedule();
351 #endif
352 } while ((credit != 0) && !need_sleep);
354 /* Schedule more work if there is some still to be done. */
355 if (current_target() != current_pages)
356 mod_timer(&balloon_timer, jiffies + HZ);
358 up(&balloon_mutex);
359 }
361 /* Resets the Xen limit, sets new target, and kicks off processing. */
362 static void set_new_target(unsigned long target)
363 {
364 /* No need for lock. Not read-modify-write updates. */
365 hard_limit = ~0UL;
366 target_pages = target;
367 schedule_work(&balloon_worker);
368 }
/* Xenbus watch on the "memory/target" key; callback installed in
 * balloon_init(). */
static struct xenbus_watch target_watch =
{
	.node = "memory/target"
};
375 /* React to a change in the target key */
376 static void watch_target(struct xenbus_watch *watch,
377 const char **vec, unsigned int len)
378 {
379 unsigned long long new_target;
380 int err;
382 err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
383 if (err != 1) {
384 /* This is ok (for domain0 at least) - so just return */
385 return;
386 }
388 /* The given memory/target value is in KiB, so it needs converting to
389 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
390 */
391 set_new_target(new_target >> (PAGE_SHIFT - 10));
392 }
394 static int balloon_init_watcher(struct notifier_block *notifier,
395 unsigned long event,
396 void *data)
397 {
398 int err;
400 err = register_xenbus_watch(&target_watch);
401 if (err)
402 printk(KERN_ERR "Failed to set balloon watcher\n");
404 return NOTIFY_DONE;
405 }
407 #ifdef CONFIG_PROC_FS
408 static int balloon_write(struct file *file, const char __user *buffer,
409 unsigned long count, void *data)
410 {
411 char memstring[64], *endchar;
412 unsigned long long target_bytes;
414 if (!capable(CAP_SYS_ADMIN))
415 return -EPERM;
417 if (count <= 1)
418 return -EBADMSG; /* runt */
419 if (count > sizeof(memstring))
420 return -EFBIG; /* too long */
422 if (copy_from_user(memstring, buffer, count))
423 return -EFAULT;
424 memstring[sizeof(memstring)-1] = '\0';
426 target_bytes = memparse(memstring, &endchar);
427 set_new_target(target_bytes >> PAGE_SHIFT);
429 return count;
430 }
432 static int balloon_read(char *page, char **start, off_t off,
433 int count, int *eof, void *data)
434 {
435 int len;
437 len = sprintf(
438 page,
439 "Current allocation: %8lu kB\n"
440 "Requested target: %8lu kB\n"
441 "Low-mem balloon: %8lu kB\n"
442 "High-mem balloon: %8lu kB\n"
443 "Xen hard limit: ",
444 PAGES2KB(current_pages), PAGES2KB(target_pages),
445 PAGES2KB(balloon_low), PAGES2KB(balloon_high));
447 if (hard_limit != ~0UL) {
448 len += sprintf(
449 page + len,
450 "%8lu kB (inc. %8lu kB driver headroom)\n",
451 PAGES2KB(hard_limit), PAGES2KB(driver_pages));
452 } else {
453 len += sprintf(
454 page + len,
455 " ??? kB\n");
456 }
458 *eof = 1;
459 return len;
460 }
461 #endif
463 static struct notifier_block xenstore_notifier;
465 static int __init balloon_init(void)
466 {
467 unsigned long pfn;
468 struct page *page;
470 if (!is_running_on_xen())
471 return -ENODEV;
473 IPRINTK("Initialising balloon driver.\n");
475 current_pages = min(xen_start_info->nr_pages, max_pfn);
476 totalram_pages = current_pages;
477 target_pages = current_pages;
478 balloon_low = 0;
479 balloon_high = 0;
480 driver_pages = 0UL;
481 hard_limit = ~0UL;
483 init_timer(&balloon_timer);
484 balloon_timer.data = 0;
485 balloon_timer.function = balloon_alarm;
487 #ifdef CONFIG_PROC_FS
488 if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
489 WPRINTK("Unable to create /proc/xen/balloon.\n");
490 return -1;
491 }
493 balloon_pde->read_proc = balloon_read;
494 balloon_pde->write_proc = balloon_write;
495 #endif
497 /* Initialise the balloon with excess memory space. */
498 for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
499 page = pfn_to_page(pfn);
500 if (!PageReserved(page))
501 balloon_append(page);
502 }
504 target_watch.callback = watch_target;
505 xenstore_notifier.notifier_call = balloon_init_watcher;
507 register_xenstore_notifier(&xenstore_notifier);
509 return 0;
510 }
512 subsys_initcall(balloon_init);
514 void balloon_update_driver_allowance(long delta)
515 {
516 unsigned long flags;
518 balloon_lock(flags);
519 driver_pages += delta;
520 balloon_unlock(flags);
521 }
/*
 * apply_to_page_range() callback: surrender the single machine frame
 * backing @pte to Xen.  The PTE and the P2M entry are cleared first so
 * no mapping survives the decrease_reservation hypercall.
 * NOTE(review): assumes @addr lies in the kernel linear map, so
 * __pa(addr) yields the matching pseudo-physical frame - confirm at
 * call sites.
 */
static int dealloc_pte_fn(
	pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
{
	unsigned long mfn = pte_mfn(*pte);
	int ret;
	struct xen_memory_reservation reservation = {
		.nr_extents = 1,
		.extent_order = 0,
		.domid = DOMID_SELF
	};
	set_xen_guest_handle(reservation.extent_start, &mfn);
	set_pte_at(&init_mm, addr, pte, __pte_ma(0));
	set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	BUG_ON(ret != 1);
	return 0;
}
/*
 * Allocate a contiguous, empty page range (struct pages with no backing
 * machine frames) for use by another driver, e.g. as a grant-mapping
 * area.  The underlying frames are returned to Xen and the balloon
 * worker is kicked so the overall reservation re-balances.  @nr_pages
 * is rounded up to a power-of-two order.  Returns the first struct
 * page, or NULL if the initial kernel allocation fails.
 */
struct page *balloon_alloc_empty_page_range(unsigned long nr_pages)
{
	unsigned long vstart, flags;
	unsigned int order = get_order(nr_pages * PAGE_SIZE);
	int ret;
	unsigned long i;
	struct page *page;

	vstart = __get_free_pages(GFP_KERNEL, order);
	if (vstart == 0)
		return NULL;

	scrub_pages(vstart, 1 << order);

	balloon_lock(flags);
	if (xen_feature(XENFEAT_auto_translated_physmap)) {
		/* Auto-translated guests release the whole extent at once. */
		unsigned long gmfn = __pa(vstart) >> PAGE_SHIFT;
		struct xen_memory_reservation reservation = {
			.nr_extents = 1,
			.extent_order = order,
			.domid = DOMID_SELF
		};
		set_xen_guest_handle(reservation.extent_start, &gmfn);
		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
					   &reservation);
		BUG_ON(ret != 1);
	} else {
		/* Otherwise tear down and release each PTE individually. */
		ret = apply_to_page_range(&init_mm, vstart, PAGE_SIZE << order,
					  dealloc_pte_fn, NULL);
		BUG_ON(ret);
	}
	current_pages -= 1UL << order;
	totalram_pages = current_pages;
	balloon_unlock(flags);

	schedule_work(&balloon_worker);

	flush_tlb_all();

	page = virt_to_page(vstart);

	/* Frames are gone but the struct pages stay in use by the caller:
	 * reset their reference counts. */
	for (i = 0; i < (1UL << order); i++)
		set_page_count(page + i, 1);

	return page;
}
588 void balloon_dealloc_empty_page_range(
589 struct page *page, unsigned long nr_pages)
590 {
591 unsigned long i, flags;
592 unsigned int order = get_order(nr_pages * PAGE_SIZE);
594 balloon_lock(flags);
595 for (i = 0; i < (1UL << order); i++) {
596 BUG_ON(page_count(page + i) != 1);
597 balloon_append(page + i);
598 }
599 balloon_unlock(flags);
601 schedule_work(&balloon_worker);
602 }
/* Interfaces used by other in-kernel Xen drivers. */
EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
EXPORT_SYMBOL_GPL(balloon_alloc_empty_page_range);
EXPORT_SYMBOL_GPL(balloon_dealloc_empty_page_range);

MODULE_LICENSE("Dual BSD/GPL");