ia64/linux-2.6.18-xen.hg

view init/main.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well-behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
line source
1 /*
2 * linux/init/main.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * GK 2/5/95 - Changed to support mounting root fs via NFS
7 * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96
8 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96
9 * Simplified starting of init: Michael A. Griffith <grif@acm.org>
10 */
12 #define __KERNEL_SYSCALLS__
14 #include <linux/types.h>
15 #include <linux/module.h>
16 #include <linux/proc_fs.h>
17 #include <linux/kernel.h>
18 #include <linux/syscalls.h>
19 #include <linux/string.h>
20 #include <linux/ctype.h>
21 #include <linux/delay.h>
22 #include <linux/utsname.h>
23 #include <linux/ioport.h>
24 #include <linux/init.h>
25 #include <linux/smp_lock.h>
26 #include <linux/initrd.h>
27 #include <linux/hdreg.h>
28 #include <linux/bootmem.h>
29 #include <linux/tty.h>
30 #include <linux/gfp.h>
31 #include <linux/percpu.h>
32 #include <linux/kmod.h>
33 #include <linux/kernel_stat.h>
34 #include <linux/security.h>
35 #include <linux/workqueue.h>
36 #include <linux/profile.h>
37 #include <linux/rcupdate.h>
38 #include <linux/moduleparam.h>
39 #include <linux/kallsyms.h>
40 #include <linux/writeback.h>
41 #include <linux/cpu.h>
42 #include <linux/cpuset.h>
43 #include <linux/efi.h>
44 #include <linux/taskstats_kern.h>
45 #include <linux/delayacct.h>
46 #include <linux/unistd.h>
47 #include <linux/rmap.h>
48 #include <linux/mempolicy.h>
49 #include <linux/key.h>
50 #include <linux/unwind.h>
51 #include <linux/buffer_head.h>
52 #include <linux/debug_locks.h>
53 #include <linux/lockdep.h>
55 #include <asm/io.h>
56 #include <asm/bugs.h>
57 #include <asm/setup.h>
58 #include <asm/sections.h>
59 #include <asm/cacheflush.h>
61 #ifdef CONFIG_X86_LOCAL_APIC
62 #include <asm/smp.h>
63 #endif
65 /*
66 * This is one of the first .c files built. Error out early if we have compiler
67 * trouble.
68 *
69 * Versions of gcc older than that listed below may actually compile and link
70 * okay, but the end product can have subtle run time bugs. To avoid associated
71 * bogus bug reports, we flatly refuse to compile with a gcc that is known to be
72 * too old from the very beginning.
73 */
74 #if (__GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 2)
75 #error Sorry, your GCC is too old. It builds incorrect kernels.
76 #endif
78 static int init(void *);
80 extern void init_IRQ(void);
81 extern void fork_init(unsigned long);
82 extern void mca_init(void);
83 extern void sbus_init(void);
84 extern void sysctl_init(void);
85 extern void signals_init(void);
86 extern void pidhash_init(void);
87 extern void pidmap_init(void);
88 extern void prio_tree_init(void);
89 extern void radix_tree_init(void);
90 extern void free_initmem(void);
91 extern void populate_rootfs(void);
92 extern void driver_init(void);
93 extern void prepare_namespace(void);
94 #ifdef CONFIG_ACPI
95 extern void acpi_early_init(void);
96 #else
97 static inline void acpi_early_init(void) { }
98 #endif
99 #ifndef CONFIG_DEBUG_RODATA
100 static inline void mark_rodata_ro(void) { }
101 #endif
103 #ifdef CONFIG_TC
104 extern void tc_init(void);
105 #endif
107 enum system_states system_state;
108 EXPORT_SYMBOL(system_state);
110 /*
111 * Boot command-line arguments
112 */
113 #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT
114 #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT
116 extern void time_init(void);
117 /* Default late time init is NULL. archs can override this later. */
118 void (*late_time_init)(void);
119 extern void softirq_init(void);
121 /* Untouched command line (eg. for /proc) saved by arch-specific code. */
122 char saved_command_line[COMMAND_LINE_SIZE];
124 static char *execute_command;
125 static char *ramdisk_execute_command;
127 /* Setup configured maximum number of CPUs to activate */
128 static unsigned int max_cpus = NR_CPUS;
130 /*
131 * Setup routine for controlling SMP activation
132 *
133 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
134 * activation entirely (the MPS table probe still happens, though).
135 *
136 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
137 * greater than 0, limits the maximum number of CPUs activated in
138 * SMP mode to <NUM>.
139 */
140 static int __init nosmp(char *str)
141 {
142 max_cpus = 0;
143 return 1;
144 }
146 __setup("nosmp", nosmp);
148 static int __init maxcpus(char *str)
149 {
150 get_option(&str, &max_cpus);
151 return 1;
152 }
154 __setup("maxcpus=", maxcpus);
156 static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
157 char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
158 static const char *panic_later, *panic_param;
160 extern struct obs_kernel_param __setup_start[], __setup_end[];
162 static int __init obsolete_checksetup(char *line)
163 {
164 struct obs_kernel_param *p;
166 p = __setup_start;
167 do {
168 int n = strlen(p->str);
169 if (!strncmp(line, p->str, n)) {
170 if (p->early) {
171 /* Already done in parse_early_param? (Needs
172 * exact match on param part) */
173 if (line[n] == '\0' || line[n] == '=')
174 return 1;
175 } else if (!p->setup_func) {
176 printk(KERN_WARNING "Parameter %s is obsolete,"
177 " ignored\n", p->str);
178 return 1;
179 } else if (p->setup_func(line + n))
180 return 1;
181 }
182 p++;
183 } while (p < __setup_end);
184 return 0;
185 }
187 /*
188 * This should be approx 2 Bo*oMips to start (note initial shift), and will
189 * still work even if initially too large, it will just take slightly longer
190 */
191 unsigned long loops_per_jiffy = (1<<12);
193 EXPORT_SYMBOL(loops_per_jiffy);
195 static int __init debug_kernel(char *str)
196 {
197 if (*str)
198 return 0;
199 console_loglevel = 10;
200 return 1;
201 }
203 static int __init quiet_kernel(char *str)
204 {
205 if (*str)
206 return 0;
207 console_loglevel = 4;
208 return 1;
209 }
211 __setup("debug", debug_kernel);
212 __setup("quiet", quiet_kernel);
214 static int __init loglevel(char *str)
215 {
216 get_option(&str, &console_loglevel);
217 return 1;
218 }
220 __setup("loglevel=", loglevel);
222 /*
223 * Unknown boot options get handed to init, unless they look like
224 * failed parameters
225 */
226 static int __init unknown_bootoption(char *param, char *val)
227 {
228 /* Change NUL term back to "=", to make "param" the whole string. */
229 if (val) {
230 /* param=val or param="val"? */
231 if (val == param+strlen(param)+1)
232 val[-1] = '=';
233 else if (val == param+strlen(param)+2) {
234 val[-2] = '=';
235 memmove(val-1, val, strlen(val)+1);
236 val--;
237 } else
238 BUG();
239 }
241 /* Handle obsolete-style parameters */
242 if (obsolete_checksetup(param))
243 return 0;
245 /*
246 * Preemptive maintenance for "why didn't my mispelled command
247 * line work?"
248 */
249 if (strchr(param, '.') && (!val || strchr(param, '.') < val)) {
250 printk(KERN_ERR "Unknown boot option `%s': ignoring\n", param);
251 return 0;
252 }
254 if (panic_later)
255 return 0;
257 if (val) {
258 /* Environment option */
259 unsigned int i;
260 for (i = 0; envp_init[i]; i++) {
261 if (i == MAX_INIT_ENVS) {
262 panic_later = "Too many boot env vars at `%s'";
263 panic_param = param;
264 }
265 if (!strncmp(param, envp_init[i], val - param))
266 break;
267 }
268 envp_init[i] = param;
269 } else {
270 /* Command line option */
271 unsigned int i;
272 for (i = 0; argv_init[i]; i++) {
273 if (i == MAX_INIT_ARGS) {
274 panic_later = "Too many boot init vars at `%s'";
275 panic_param = param;
276 }
277 }
278 argv_init[i] = param;
279 }
280 return 0;
281 }
283 static int __init init_setup(char *str)
284 {
285 unsigned int i;
287 execute_command = str;
288 /*
289 * In case LILO is going to boot us with default command line,
290 * it prepends "auto" before the whole cmdline which makes
291 * the shell think it should execute a script with such name.
292 * So we ignore all arguments entered _before_ init=... [MJ]
293 */
294 for (i = 1; i < MAX_INIT_ARGS; i++)
295 argv_init[i] = NULL;
296 return 1;
297 }
298 __setup("init=", init_setup);
300 static int __init rdinit_setup(char *str)
301 {
302 unsigned int i;
304 ramdisk_execute_command = str;
305 /* See "auto" comment in init_setup */
306 for (i = 1; i < MAX_INIT_ARGS; i++)
307 argv_init[i] = NULL;
308 return 1;
309 }
310 __setup("rdinit=", rdinit_setup);
312 #ifndef CONFIG_SMP
314 #ifdef CONFIG_X86_LOCAL_APIC
315 static void __init smp_init(void)
316 {
317 APIC_init_uniprocessor();
318 }
319 #else
320 #define smp_init() do { } while (0)
321 #endif
323 static inline void setup_per_cpu_areas(void) { }
324 static inline void smp_prepare_cpus(unsigned int maxcpus) { }
326 #else
328 #ifdef __GENERIC_PER_CPU
329 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
331 EXPORT_SYMBOL(__per_cpu_offset);
333 static void __init setup_per_cpu_areas(void)
334 {
335 unsigned long size, i;
336 char *ptr;
337 unsigned long nr_possible_cpus = num_possible_cpus();
339 /* Copy section for each CPU (we discard the original) */
340 size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
341 #ifdef CONFIG_MODULES
342 if (size < PERCPU_ENOUGH_ROOM)
343 size = PERCPU_ENOUGH_ROOM;
344 #endif
345 ptr = alloc_bootmem(size * nr_possible_cpus);
347 for_each_possible_cpu(i) {
348 __per_cpu_offset[i] = ptr - __per_cpu_start;
349 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
350 ptr += size;
351 }
352 }
353 #endif /* __GENERIC_PER_CPU */
355 /* Called by boot processor to activate the rest. */
356 static void __init smp_init(void)
357 {
358 unsigned int i;
360 /* FIXME: This should be done in userspace --RR */
361 for_each_present_cpu(i) {
362 if (num_online_cpus() >= max_cpus)
363 break;
364 if (!cpu_online(i))
365 cpu_up(i);
366 }
368 /* Any cleanup work */
369 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
370 smp_cpus_done(max_cpus);
371 #if 0
372 /* Get other processors into their bootup holding patterns. */
374 smp_commence();
375 #endif
376 }
378 #endif
380 /*
381 * We need to finalize in a non-__init function or else race conditions
382 * between the root thread and the init thread may cause start_kernel to
383 * be reaped by free_initmem before the root thread has proceeded to
384 * cpu_idle.
385 *
386 * gcc-3.4 accidentally inlines this function, so use noinline.
387 */
389 static void noinline rest_init(void)
390 __releases(kernel_lock)
391 {
392 kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND);
393 numa_default_policy();
394 unlock_kernel();
396 /*
397 * The boot idle thread must execute schedule()
398 * at least one to get things moving:
399 */
400 preempt_enable_no_resched();
401 schedule();
402 preempt_disable();
404 /* Call into cpu_idle with preempt disabled */
405 cpu_idle();
406 }
408 /* Check for early params. */
409 static int __init do_early_param(char *param, char *val)
410 {
411 struct obs_kernel_param *p;
413 for (p = __setup_start; p < __setup_end; p++) {
414 if (p->early && strcmp(param, p->str) == 0) {
415 if (p->setup_func(val) != 0)
416 printk(KERN_WARNING
417 "Malformed early option '%s'\n", param);
418 }
419 }
420 /* We accept everything at this stage. */
421 return 0;
422 }
424 /* Arch code calls this early on, or if not, just before other parsing. */
425 void __init parse_early_param(void)
426 {
427 static __initdata int done = 0;
428 static __initdata char tmp_cmdline[COMMAND_LINE_SIZE];
430 if (done)
431 return;
433 /* All fall through to do_early_param. */
434 strlcpy(tmp_cmdline, saved_command_line, COMMAND_LINE_SIZE);
435 parse_args("early options", tmp_cmdline, NULL, 0, do_early_param);
436 done = 1;
437 }
439 /*
440 * Activate the first processor.
441 */
443 static void __init boot_cpu_init(void)
444 {
445 int cpu = smp_processor_id();
446 /* Mark the boot cpu "present", "online" etc for SMP and UP case */
447 cpu_set(cpu, cpu_online_map);
448 cpu_set(cpu, cpu_present_map);
449 cpu_set(cpu, cpu_possible_map);
450 }
452 void __init __attribute__((weak)) smp_setup_processor_id(void)
453 {
454 }
456 asmlinkage void __init start_kernel(void)
457 {
458 char * command_line;
459 extern struct kernel_param __start___param[], __stop___param[];
461 smp_setup_processor_id();
463 /*
464 * Need to run as early as possible, to initialize the
465 * lockdep hash:
466 */
467 lockdep_init();
469 local_irq_disable();
470 early_boot_irqs_off();
471 early_init_irq_lock_class();
473 /*
474 * Interrupts are still disabled. Do necessary setups, then
475 * enable them
476 */
477 lock_kernel();
478 boot_cpu_init();
479 page_address_init();
480 printk(KERN_NOTICE);
481 printk(linux_banner);
482 setup_arch(&command_line);
483 setup_per_cpu_areas();
484 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
486 /*
487 * Set up the scheduler prior starting any interrupts (such as the
488 * timer interrupt). Full topology setup happens at smp_init()
489 * time - but meanwhile we still have a functioning scheduler.
490 */
491 sched_init();
492 /*
493 * Disable preemption - early bootup scheduling is extremely
494 * fragile until we cpu_idle() for the first time.
495 */
496 preempt_disable();
497 build_all_zonelists();
498 page_alloc_init();
499 printk(KERN_NOTICE "Kernel command line: %s\n", saved_command_line);
500 parse_early_param();
501 parse_args("Booting kernel", command_line, __start___param,
502 __stop___param - __start___param,
503 &unknown_bootoption);
504 sort_main_extable();
505 unwind_init();
506 trap_init();
507 rcu_init();
508 init_IRQ();
509 pidhash_init();
510 init_timers();
511 hrtimers_init();
512 softirq_init();
513 timekeeping_init();
514 time_init();
515 profile_init();
516 if (!irqs_disabled())
517 printk("start_kernel(): bug: interrupts were enabled early\n");
518 early_boot_irqs_on();
519 local_irq_enable();
521 /*
522 * HACK ALERT! This is early. We're enabling the console before
523 * we've done PCI setups etc, and console_init() must be aware of
524 * this. But we do want output early, in case something goes wrong.
525 */
526 console_init();
527 if (panic_later)
528 panic(panic_later, panic_param);
530 lockdep_info();
532 /*
533 * Need to run this when irqs are enabled, because it wants
534 * to self-test [hard/soft]-irqs on/off lock inversion bugs
535 * too:
536 */
537 locking_selftest();
539 #ifdef CONFIG_BLK_DEV_INITRD
540 if (initrd_start && !initrd_below_start_ok &&
541 initrd_start < min_low_pfn << PAGE_SHIFT) {
542 printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
543 "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT);
544 initrd_start = 0;
545 }
546 #endif
547 vfs_caches_init_early();
548 cpuset_init_early();
549 mem_init();
550 kmem_cache_init();
551 setup_per_cpu_pageset();
552 numa_policy_init();
553 if (late_time_init)
554 late_time_init();
555 calibrate_delay();
556 pidmap_init();
557 pgtable_cache_init();
558 prio_tree_init();
559 anon_vma_init();
560 #ifdef CONFIG_X86
561 if (efi_enabled)
562 efi_enter_virtual_mode();
563 #endif
564 fork_init(num_physpages);
565 proc_caches_init();
566 buffer_init();
567 unnamed_dev_init();
568 key_init();
569 security_init();
570 vfs_caches_init(num_physpages);
571 radix_tree_init();
572 signals_init();
573 /* rootfs populating might need page-writeback */
574 page_writeback_init();
575 #ifdef CONFIG_PROC_FS
576 proc_root_init();
577 #endif
578 cpuset_init();
579 taskstats_init_early();
580 delayacct_init();
582 check_bugs();
584 acpi_early_init(); /* before LAPIC and SMP init */
586 /* Do the rest non-__init'ed, we're now alive */
587 rest_init();
588 }
590 static int __initdata initcall_debug;
592 static int __init initcall_debug_setup(char *str)
593 {
594 initcall_debug = 1;
595 return 1;
596 }
597 __setup("initcall_debug", initcall_debug_setup);
599 struct task_struct *child_reaper = &init_task;
601 extern initcall_t __initcall_start[], __initcall_end[];
603 static void __init do_initcalls(void)
604 {
605 initcall_t *call;
606 int count = preempt_count();
608 for (call = __initcall_start; call < __initcall_end; call++) {
609 char *msg = NULL;
610 char msgbuf[40];
611 int result;
613 if (initcall_debug) {
614 printk("Calling initcall 0x%p", *call);
615 print_fn_descriptor_symbol(": %s()",
616 (unsigned long) *call);
617 printk("\n");
618 }
620 result = (*call)();
622 if (result && result != -ENODEV && initcall_debug) {
623 sprintf(msgbuf, "error code %d", result);
624 msg = msgbuf;
625 }
626 if (preempt_count() != count) {
627 msg = "preemption imbalance";
628 preempt_count() = count;
629 }
630 if (irqs_disabled()) {
631 msg = "disabled interrupts";
632 local_irq_enable();
633 }
634 if (msg) {
635 printk(KERN_WARNING "initcall at 0x%p", *call);
636 print_fn_descriptor_symbol(": %s()",
637 (unsigned long) *call);
638 printk(": returned with %s\n", msg);
639 }
640 }
642 /* Make sure there is no pending stuff from the initcall sequence */
643 flush_scheduled_work();
644 }
646 /*
647 * Ok, the machine is now initialized. None of the devices
648 * have been touched yet, but the CPU subsystem is up and
649 * running, and memory and process management works.
650 *
651 * Now we can finally start doing some real work..
652 */
653 static void __init do_basic_setup(void)
654 {
655 /* drivers will send hotplug events */
656 init_workqueues();
657 usermodehelper_init();
658 driver_init();
660 #ifdef CONFIG_SYSCTL
661 sysctl_init();
662 #endif
664 do_initcalls();
665 }
/*
 * Initialisation that init() runs before smp_init(): migration_init()
 * (SMP builds only), then spawn_ksoftirqd() and
 * spawn_softlockup_task().
 */
static void do_pre_smp_initcalls(void)
{
	extern int spawn_ksoftirqd(void);
#ifdef CONFIG_SMP
	extern int migration_init(void);

	migration_init();
#endif
	spawn_ksoftirqd();
	spawn_softlockup_task();
}
679 static void run_init_process(char *init_filename)
680 {
681 argv_init[0] = init_filename;
682 execve(init_filename, argv_init, envp_init);
683 }
685 static int init(void * unused)
686 {
687 lock_kernel();
688 /*
689 * init can run on any cpu.
690 */
691 set_cpus_allowed(current, CPU_MASK_ALL);
692 /*
693 * Tell the world that we're going to be the grim
694 * reaper of innocent orphaned children.
695 *
696 * We don't want people to have to make incorrect
697 * assumptions about where in the task array this
698 * can be found.
699 */
700 child_reaper = current;
702 smp_prepare_cpus(max_cpus);
704 do_pre_smp_initcalls();
706 smp_init();
707 sched_init_smp();
709 cpuset_init_smp();
711 /*
712 * Do this before initcalls, because some drivers want to access
713 * firmware files.
714 */
715 populate_rootfs();
717 do_basic_setup();
719 /*
720 * check if there is an early userspace init. If yes, let it do all
721 * the work
722 */
724 if (!ramdisk_execute_command)
725 ramdisk_execute_command = "/init";
727 if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
728 ramdisk_execute_command = NULL;
729 prepare_namespace();
730 }
732 /*
733 * Ok, we have completed the initial bootup, and
734 * we're essentially up and running. Get rid of the
735 * initmem segments and start the user-mode stuff..
736 */
737 free_initmem();
738 unlock_kernel();
739 mark_rodata_ro();
740 system_state = SYSTEM_RUNNING;
741 numa_default_policy();
743 if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
744 printk(KERN_WARNING "Warning: unable to open an initial console.\n");
746 (void) sys_dup(0);
747 (void) sys_dup(0);
749 if (ramdisk_execute_command) {
750 run_init_process(ramdisk_execute_command);
751 printk(KERN_WARNING "Failed to execute %s\n",
752 ramdisk_execute_command);
753 }
755 /*
756 * We try each of these until one succeeds.
757 *
758 * The Bourne shell can be used instead of init if we are
759 * trying to recover a really broken machine.
760 */
761 if (execute_command) {
762 run_init_process(execute_command);
763 printk(KERN_WARNING "Failed to execute %s. Attempting "
764 "defaults...\n", execute_command);
765 }
766 run_init_process("/sbin/init");
767 run_init_process("/etc/init");
768 run_init_process("/bin/init");
769 run_init_process("/bin/sh");
771 panic("No init found. Try passing init= option to kernel.");
772 }