ia64/linux-2.6.18-xen.hg

view arch/mips/sgi-ip27/ip27-memory.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well-behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also if we partially succeed in increasing the reservation
(i.e. receive less pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
line source
1 /*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (C) 2000, 05 by Ralf Baechle (ralf@linux-mips.org)
7 * Copyright (C) 2000 by Silicon Graphics, Inc.
8 * Copyright (C) 2004 by Christoph Hellwig
9 *
10 * On SGI IP27 the ARC memory configuration data is completely bogus but
11 * alternate easier to use mechanisms are available.
12 */
13 #include <linux/init.h>
14 #include <linux/kernel.h>
15 #include <linux/mm.h>
16 #include <linux/mmzone.h>
17 #include <linux/module.h>
18 #include <linux/nodemask.h>
19 #include <linux/swap.h>
20 #include <linux/bootmem.h>
21 #include <linux/pfn.h>
22 #include <asm/page.h>
23 #include <asm/sections.h>
25 #include <asm/sn/arch.h>
26 #include <asm/sn/hub.h>
27 #include <asm/sn/klconfig.h>
28 #include <asm/sn/sn_private.h>
/* Shift converting a slot number into a pfn offset within a node. */
#define SLOT_PFNSHIFT	(SLOT_SHIFT - PAGE_SHIFT)
/* Shift extracting the NASID (node id) bits from a pfn. */
#define PFN_NASIDSHFT	(NASID_SHFT - PAGE_SHIFT)

/* Sentinel stored in slot_psize_cache for slots szmem() decided to skip. */
#define SLOT_IGNORED	0xffff

/* Per node: index of the highest slot that actually contains memory. */
static short __initdata slot_lastfilled_cache[MAX_COMPACT_NODES];
/* Per node and slot: page count, or SLOT_IGNORED for skipped slots. */
static unsigned short __initdata slot_psize_cache[MAX_COMPACT_NODES][MAX_MEM_SLOTS];
static struct bootmem_data __initdata plat_node_bdata[MAX_COMPACT_NODES];

struct node_data *__node_data[MAX_COMPACT_NODES];

EXPORT_SYMBOL(__node_data);

/* Non-zero when the hub is in fine region mode; set once in mlreset(). */
static int fine_mode;
46 static int is_fine_dirmode(void)
47 {
48 return (((LOCAL_HUB_L(NI_STATUS_REV_ID) & NSRI_REGIONSIZE_MASK)
49 >> NSRI_REGIONSIZE_SHFT) & REGIONSIZE_FINE);
50 }
52 static hubreg_t get_region(cnodeid_t cnode)
53 {
54 if (fine_mode)
55 return COMPACT_TO_NASID_NODEID(cnode) >> NASID_TO_FINEREG_SHFT;
56 else
57 return COMPACT_TO_NASID_NODEID(cnode) >> NASID_TO_COARSEREG_SHFT;
58 }
60 static hubreg_t region_mask;
62 static void gen_region_mask(hubreg_t *region_mask)
63 {
64 cnodeid_t cnode;
66 (*region_mask) = 0;
67 for_each_online_node(cnode) {
68 (*region_mask) |= 1ULL << get_region(cnode);
69 }
70 }
72 #define rou_rflag rou_flags
74 static int router_distance;
76 static void router_recurse(klrou_t *router_a, klrou_t *router_b, int depth)
77 {
78 klrou_t *router;
79 lboard_t *brd;
80 int port;
82 if (router_a->rou_rflag == 1)
83 return;
85 if (depth >= router_distance)
86 return;
88 router_a->rou_rflag = 1;
90 for (port = 1; port <= MAX_ROUTER_PORTS; port++) {
91 if (router_a->rou_port[port].port_nasid == INVALID_NASID)
92 continue;
94 brd = (lboard_t *)NODE_OFFSET_TO_K0(
95 router_a->rou_port[port].port_nasid,
96 router_a->rou_port[port].port_offset);
98 if (brd->brd_type == KLTYPE_ROUTER) {
99 router = (klrou_t *)NODE_OFFSET_TO_K0(NASID_GET(brd), brd->brd_compts[0]);
100 if (router == router_b) {
101 if (depth < router_distance)
102 router_distance = depth;
103 }
104 else
105 router_recurse(router, router_b, depth + 1);
106 }
107 }
109 router_a->rou_rflag = 0;
110 }
112 unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
114 static int __init compute_node_distance(nasid_t nasid_a, nasid_t nasid_b)
115 {
116 klrou_t *router, *router_a = NULL, *router_b = NULL;
117 lboard_t *brd, *dest_brd;
118 cnodeid_t cnode;
119 nasid_t nasid;
120 int port;
122 /* Figure out which routers nodes in question are connected to */
123 for_each_online_node(cnode) {
124 nasid = COMPACT_TO_NASID_NODEID(cnode);
126 if (nasid == -1) continue;
128 brd = find_lboard_class((lboard_t *)KL_CONFIG_INFO(nasid),
129 KLTYPE_ROUTER);
131 if (!brd)
132 continue;
134 do {
135 if (brd->brd_flags & DUPLICATE_BOARD)
136 continue;
138 router = (klrou_t *)NODE_OFFSET_TO_K0(NASID_GET(brd), brd->brd_compts[0]);
139 router->rou_rflag = 0;
141 for (port = 1; port <= MAX_ROUTER_PORTS; port++) {
142 if (router->rou_port[port].port_nasid == INVALID_NASID)
143 continue;
145 dest_brd = (lboard_t *)NODE_OFFSET_TO_K0(
146 router->rou_port[port].port_nasid,
147 router->rou_port[port].port_offset);
149 if (dest_brd->brd_type == KLTYPE_IP27) {
150 if (dest_brd->brd_nasid == nasid_a)
151 router_a = router;
152 if (dest_brd->brd_nasid == nasid_b)
153 router_b = router;
154 }
155 }
157 } while ((brd = find_lboard_class(KLCF_NEXT(brd), KLTYPE_ROUTER)));
158 }
160 if (router_a == NULL) {
161 printk("node_distance: router_a NULL\n");
162 return -1;
163 }
164 if (router_b == NULL) {
165 printk("node_distance: router_b NULL\n");
166 return -1;
167 }
169 if (nasid_a == nasid_b)
170 return 0;
172 if (router_a == router_b)
173 return 1;
175 router_distance = 100;
176 router_recurse(router_a, router_b, 2);
178 return router_distance;
179 }
181 static void __init init_topology_matrix(void)
182 {
183 nasid_t nasid, nasid2;
184 cnodeid_t row, col;
186 for (row = 0; row < MAX_COMPACT_NODES; row++)
187 for (col = 0; col < MAX_COMPACT_NODES; col++)
188 __node_distances[row][col] = -1;
190 for_each_online_node(row) {
191 nasid = COMPACT_TO_NASID_NODEID(row);
192 for_each_online_node(col) {
193 nasid2 = COMPACT_TO_NASID_NODEID(col);
194 __node_distances[row][col] =
195 compute_node_distance(nasid, nasid2);
196 }
197 }
198 }
200 static void __init dump_topology(void)
201 {
202 nasid_t nasid;
203 cnodeid_t cnode;
204 lboard_t *brd, *dest_brd;
205 int port;
206 int router_num = 0;
207 klrou_t *router;
208 cnodeid_t row, col;
210 printk("************** Topology ********************\n");
212 printk(" ");
213 for_each_online_node(col)
214 printk("%02d ", col);
215 printk("\n");
216 for_each_online_node(row) {
217 printk("%02d ", row);
218 for_each_online_node(col)
219 printk("%2d ", node_distance(row, col));
220 printk("\n");
221 }
223 for_each_online_node(cnode) {
224 nasid = COMPACT_TO_NASID_NODEID(cnode);
226 if (nasid == -1) continue;
228 brd = find_lboard_class((lboard_t *)KL_CONFIG_INFO(nasid),
229 KLTYPE_ROUTER);
231 if (!brd)
232 continue;
234 do {
235 if (brd->brd_flags & DUPLICATE_BOARD)
236 continue;
237 printk("Router %d:", router_num);
238 router_num++;
240 router = (klrou_t *)NODE_OFFSET_TO_K0(NASID_GET(brd), brd->brd_compts[0]);
242 for (port = 1; port <= MAX_ROUTER_PORTS; port++) {
243 if (router->rou_port[port].port_nasid == INVALID_NASID)
244 continue;
246 dest_brd = (lboard_t *)NODE_OFFSET_TO_K0(
247 router->rou_port[port].port_nasid,
248 router->rou_port[port].port_offset);
250 if (dest_brd->brd_type == KLTYPE_IP27)
251 printk(" %d", dest_brd->brd_nasid);
252 if (dest_brd->brd_type == KLTYPE_ROUTER)
253 printk(" r");
254 }
255 printk("\n");
257 } while ( (brd = find_lboard_class(KLCF_NEXT(brd), KLTYPE_ROUTER)) );
258 }
259 }
261 static pfn_t __init slot_getbasepfn(cnodeid_t cnode, int slot)
262 {
263 nasid_t nasid = COMPACT_TO_NASID_NODEID(cnode);
265 return ((pfn_t)nasid << PFN_NASIDSHFT) | (slot << SLOT_PFNSHIFT);
266 }
268 /*
269 * Return the number of pages of memory provided by the given slot
270 * on the specified node.
271 */
272 static pfn_t __init slot_getsize(cnodeid_t node, int slot)
273 {
274 return (pfn_t) slot_psize_cache[node][slot];
275 }
277 /*
278 * Return highest slot filled
279 */
280 static int __init node_getlastslot(cnodeid_t node)
281 {
282 return (int) slot_lastfilled_cache[node];
283 }
285 /*
286 * Return the pfn of the last free page of memory on a node.
287 */
288 static pfn_t __init node_getmaxclick(cnodeid_t node)
289 {
290 pfn_t slot_psize;
291 int slot;
293 /*
294 * Start at the top slot. When we find a slot with memory in it,
295 * that's the winner.
296 */
297 for (slot = (MAX_MEM_SLOTS - 1); slot >= 0; slot--) {
298 if ((slot_psize = slot_getsize(node, slot))) {
299 if (slot_psize == SLOT_IGNORED)
300 continue;
301 /* Return the basepfn + the slot size, minus 1. */
302 return slot_getbasepfn(node, slot) + slot_psize - 1;
303 }
304 }
306 /*
307 * If there's no memory on the node, return 0. This is likely
308 * to cause problems.
309 */
310 return 0;
311 }
313 static pfn_t __init slot_psize_compute(cnodeid_t node, int slot)
314 {
315 nasid_t nasid;
316 lboard_t *brd;
317 klmembnk_t *banks;
318 unsigned long size;
320 nasid = COMPACT_TO_NASID_NODEID(node);
321 /* Find the node board */
322 brd = find_lboard((lboard_t *)KL_CONFIG_INFO(nasid), KLTYPE_IP27);
323 if (!brd)
324 return 0;
326 /* Get the memory bank structure */
327 banks = (klmembnk_t *) find_first_component(brd, KLSTRUCT_MEMBNK);
328 if (!banks)
329 return 0;
331 /* Size in _Megabytes_ */
332 size = (unsigned long)banks->membnk_bnksz[slot/4];
334 /* hack for 128 dimm banks */
335 if (size <= 128) {
336 if (slot % 4 == 0) {
337 size <<= 20; /* size in bytes */
338 return(size >> PAGE_SHIFT);
339 } else
340 return 0;
341 } else {
342 size /= 4;
343 size <<= 20;
344 return size >> PAGE_SHIFT;
345 }
346 }
/*
 * Early platform reset/initialisation: record the master NASID and
 * region mode, probe CPUs, build and dump the topology matrix, and
 * program every online node's hub region-present and CALIAS registers.
 */
static void __init mlreset(void)
{
	int i;

	master_nasid = get_nasid();
	fine_mode = is_fine_dirmode();

	/*
	 * Probe for all CPUs - this creates the cpumask and sets up the
	 * mapping tables.  We need to do this as early as possible.
	 */
#ifdef CONFIG_SMP
	cpu_node_probe();
#endif

	init_topology_matrix();
	dump_topology();

	gen_region_mask(&region_mask);

	setup_replication_mask();

	/*
	 * Set all nodes' calias sizes to 8k
	 */
	for_each_online_node(i) {
		nasid_t nasid;

		nasid = COMPACT_TO_NASID_NODEID(i);

		/*
		 * Always have node 0 in the region mask, otherwise
		 * CALIAS accesses get exceptions since the hub
		 * thinks it is a node 0 address.
		 */
		REMOTE_HUB_S(nasid, PI_REGION_PRESENT, (region_mask | 1));
#ifdef CONFIG_REPLICATE_EXHANDLERS
		REMOTE_HUB_S(nasid, PI_CALIAS_SIZE, PI_CALIAS_SIZE_8K);
#else
		REMOTE_HUB_S(nasid, PI_CALIAS_SIZE, PI_CALIAS_SIZE_0);
#endif

#ifdef LATER
		/*
		 * Set up all hubs to have a big window pointing at
		 * widget 0. Memory mode, widget 0, offset 0
		 */
		REMOTE_HUB_S(nasid, IIO_ITTE(SWIN0_BIGWIN),
			((HUB_PIO_MAP_TO_MEM << IIO_ITTE_IOSP_SHIFT) |
			(0 << IIO_ITTE_WIDGET_SHIFT)));
#endif
	}
}
402 static void __init szmem(void)
403 {
404 pfn_t slot_psize, slot0sz = 0, nodebytes; /* Hack to detect problem configs */
405 int slot, ignore;
406 cnodeid_t node;
408 num_physpages = 0;
410 for_each_online_node(node) {
411 ignore = nodebytes = 0;
412 for (slot = 0; slot < MAX_MEM_SLOTS; slot++) {
413 slot_psize = slot_psize_compute(node, slot);
414 if (slot == 0)
415 slot0sz = slot_psize;
416 /*
417 * We need to refine the hack when we have replicated
418 * kernel text.
419 */
420 nodebytes += (1LL << SLOT_SHIFT);
421 if ((nodebytes >> PAGE_SHIFT) * (sizeof(struct page)) >
422 (slot0sz << PAGE_SHIFT))
423 ignore = 1;
424 if (ignore && slot_psize) {
425 printk("Ignoring slot %d onwards on node %d\n",
426 slot, node);
427 slot_psize_cache[node][slot] = SLOT_IGNORED;
428 slot = MAX_MEM_SLOTS;
429 continue;
430 }
431 num_physpages += slot_psize;
432 slot_psize_cache[node][slot] =
433 (unsigned short) slot_psize;
434 if (slot_psize)
435 slot_lastfilled_cache[node] = slot;
436 }
437 }
438 }
/*
 * Set up the bootmem allocator for one node: place the node data
 * structures and the bootmem bitmap at the node's first free pfn,
 * then free all of slot 0 and re-reserve the part holding them.
 */
static void __init node_mem_init(cnodeid_t node)
{
	pfn_t slot_firstpfn = slot_getbasepfn(node, 0);
	pfn_t slot_lastpfn = slot_firstpfn + slot_getsize(node, 0);
	pfn_t slot_freepfn = node_getfirstfree(node);
	struct pglist_data *pd;
	unsigned long bootmap_size;

	/*
	 * Allocate the node data structures on the node first.
	 */
	__node_data[node] = __va(slot_freepfn << PAGE_SHIFT);

	pd = NODE_DATA(node);
	pd->bdata = &plat_node_bdata[node];

	cpus_clear(hub_data(node)->h_cpus);

	/* Step past the pglist_data and hub_data just carved out above. */
	slot_freepfn += PFN_UP(sizeof(struct pglist_data) +
			       sizeof(struct hub_data));

	/* The bootmem map itself starts at slot_freepfn. */
	bootmap_size = init_bootmem_node(NODE_DATA(node), slot_freepfn,
					slot_firstpfn, slot_lastpfn);
	/* Make all of slot 0 available ... */
	free_bootmem_node(NODE_DATA(node), slot_firstpfn << PAGE_SHIFT,
			(slot_lastpfn - slot_firstpfn) << PAGE_SHIFT);
	/* ... then reserve the node data structures and the bootmem map. */
	reserve_bootmem_node(NODE_DATA(node), slot_firstpfn << PAGE_SHIFT,
		((slot_freepfn - slot_firstpfn) << PAGE_SHIFT) + bootmap_size);
}
/*
 * A node with nothing. We use it to avoid any special casing in
 * node_to_cpumask
 */
static struct node_data null_node = {
	.hub = {
		.h_cpus = CPU_MASK_NONE	/* no CPUs on the placeholder node */
	}
};
479 /*
480 * Currently, the intranode memory hole support assumes that each slot
481 * contains at least 32 MBytes of memory. We assume all bootmem data
482 * fits on the first slot.
483 */
484 void __init prom_meminit(void)
485 {
486 cnodeid_t node;
488 mlreset();
489 szmem();
491 for (node = 0; node < MAX_COMPACT_NODES; node++) {
492 if (node_online(node)) {
493 node_mem_init(node);
494 continue;
495 }
496 __node_data[node] = &null_node;
497 }
498 }
500 unsigned long __init prom_free_prom_memory(void)
501 {
502 /* We got nothing to free here ... */
503 return 0;
504 }
506 extern void pagetable_init(void);
507 extern unsigned long setup_zero_pages(void);
509 void __init paging_init(void)
510 {
511 unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
512 unsigned node;
514 pagetable_init();
516 for_each_online_node(node) {
517 pfn_t start_pfn = slot_getbasepfn(node, 0);
518 pfn_t end_pfn = node_getmaxclick(node) + 1;
520 zones_size[ZONE_DMA] = end_pfn - start_pfn;
521 free_area_init_node(node, NODE_DATA(node),
522 zones_size, start_pfn, NULL);
524 if (end_pfn > max_low_pfn)
525 max_low_pfn = end_pfn;
526 }
527 }
529 void __init mem_init(void)
530 {
531 unsigned long codesize, datasize, initsize, tmp;
532 unsigned node;
534 high_memory = (void *) __va(num_physpages << PAGE_SHIFT);
536 for_each_online_node(node) {
537 unsigned slot, numslots;
538 struct page *end, *p;
540 /*
541 * This will free up the bootmem, ie, slot 0 memory.
542 */
543 totalram_pages += free_all_bootmem_node(NODE_DATA(node));
545 /*
546 * We need to manually do the other slots.
547 */
548 numslots = node_getlastslot(node);
549 for (slot = 1; slot <= numslots; slot++) {
550 p = nid_page_nr(node, slot_getbasepfn(node, slot) -
551 slot_getbasepfn(node, 0));
553 /*
554 * Free valid memory in current slot.
555 */
556 for (end = p + slot_getsize(node, slot); p < end; p++) {
557 /* if (!page_is_ram(pgnr)) continue; */
558 /* commented out until page_is_ram works */
559 ClearPageReserved(p);
560 init_page_count(p);
561 __free_page(p);
562 totalram_pages++;
563 }
564 }
565 }
567 totalram_pages -= setup_zero_pages(); /* This comes from node 0 */
569 codesize = (unsigned long) &_etext - (unsigned long) &_text;
570 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
571 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
573 tmp = nr_free_pages();
574 printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
575 "%ldk reserved, %ldk data, %ldk init, %ldk highmem)\n",
576 tmp << (PAGE_SHIFT-10),
577 num_physpages << (PAGE_SHIFT-10),
578 codesize >> 10,
579 (num_physpages - tmp) << (PAGE_SHIFT-10),
580 datasize >> 10,
581 initsize >> 10,
582 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)));
583 }