ia64/linux-2.6.18-xen.hg

view arch/alpha/kernel/process.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well behaved
toolstack to ask a domain to balloon to more than its allocation nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
line source
1 /*
2 * linux/arch/alpha/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 */
7 /*
8 * This file handles the architecture-dependent parts of process handling.
9 */
11 #include <linux/errno.h>
12 #include <linux/module.h>
13 #include <linux/sched.h>
14 #include <linux/kernel.h>
15 #include <linux/mm.h>
16 #include <linux/smp.h>
17 #include <linux/smp_lock.h>
18 #include <linux/stddef.h>
19 #include <linux/unistd.h>
20 #include <linux/ptrace.h>
21 #include <linux/slab.h>
22 #include <linux/user.h>
23 #include <linux/a.out.h>
24 #include <linux/utsname.h>
25 #include <linux/time.h>
26 #include <linux/major.h>
27 #include <linux/stat.h>
28 #include <linux/vt.h>
29 #include <linux/mman.h>
30 #include <linux/elfcore.h>
31 #include <linux/reboot.h>
32 #include <linux/tty.h>
33 #include <linux/console.h>
35 #include <asm/reg.h>
36 #include <asm/uaccess.h>
37 #include <asm/system.h>
38 #include <asm/io.h>
39 #include <asm/pgtable.h>
40 #include <asm/hwrpb.h>
41 #include <asm/fpu.h>
43 #include "proto.h"
44 #include "pci_impl.h"
46 /*
47 * Power off function, if any
 *
 * Function pointer initialised to machine_power_off(), which is defined
 * further down in this file.
48 */
49 void (*pm_power_off)(void) = machine_power_off;
51 void
52 cpu_idle(void)
53 {
54 set_thread_flag(TIF_POLLING_NRFLAG);
56 while (1) {
57 /* FIXME -- EV6 and LCA45 know how to power down
58 the CPU. */
60 while (!need_resched())
61 cpu_relax();
62 schedule();
63 }
64 }
/* Argument bundle handed from common_shutdown() to common_shutdown_1(). */
67 struct halt_info {
68 int mode; /* LINUX_REBOOT_CMD_* value selecting restart, halt or power-off */
69 char *restart_cmd; /* restart command string; NULL when none was supplied */
70 };
72 static void
73 common_shutdown_1(void *generic_ptr)
74 {
75 struct halt_info *how = (struct halt_info *)generic_ptr;
76 struct percpu_struct *cpup;
77 unsigned long *pflags, flags;
78 int cpuid = smp_processor_id();
80 /* No point in taking interrupts anymore. */
81 local_irq_disable();
83 cpup = (struct percpu_struct *)
84 ((unsigned long)hwrpb + hwrpb->processor_offset
85 + hwrpb->processor_size * cpuid);
86 pflags = &cpup->flags;
87 flags = *pflags;
89 /* Clear reason to "default"; clear "bootstrap in progress". */
90 flags &= ~0x00ff0001UL;
92 #ifdef CONFIG_SMP
93 /* Secondaries halt here. */
94 if (cpuid != boot_cpuid) {
95 flags |= 0x00040000UL; /* "remain halted" */
96 *pflags = flags;
97 cpu_clear(cpuid, cpu_present_map);
98 halt();
99 }
100 #endif
102 if (how->mode == LINUX_REBOOT_CMD_RESTART) {
103 if (!how->restart_cmd) {
104 flags |= 0x00020000UL; /* "cold bootstrap" */
105 } else {
106 /* For SRM, we could probably set environment
107 variables to get this to work. We'd have to
108 delay this until after srm_paging_stop unless
109 we ever got srm_fixup working.
111 At the moment, SRM will use the last boot device,
112 but the file and flags will be the defaults, when
113 doing a "warm" bootstrap. */
114 flags |= 0x00030000UL; /* "warm bootstrap" */
115 }
116 } else {
117 flags |= 0x00040000UL; /* "remain halted" */
118 }
119 *pflags = flags;
121 #ifdef CONFIG_SMP
122 /* Wait for the secondaries to halt. */
123 cpu_clear(boot_cpuid, cpu_present_map);
124 while (cpus_weight(cpu_present_map))
125 barrier();
126 #endif
128 /* If booted from SRM, reset some of the original environment. */
129 if (alpha_using_srm) {
130 #ifdef CONFIG_DUMMY_CONSOLE
131 /* If we've gotten here after SysRq-b, leave interrupt
132 context before taking over the console. */
133 if (in_interrupt())
134 irq_exit();
135 /* This has the effect of resetting the VGA video origin. */
136 take_over_console(&dummy_con, 0, MAX_NR_CONSOLES-1, 1);
137 #endif
138 pci_restore_srm_config();
139 set_hae(srm_hae);
140 }
142 if (alpha_mv.kill_arch)
143 alpha_mv.kill_arch(how->mode);
145 if (! alpha_using_srm && how->mode != LINUX_REBOOT_CMD_RESTART) {
146 /* Unfortunately, since MILO doesn't currently understand
147 the hwrpb bits above, we can't reliably halt the
148 processor and keep it halted. So just loop. */
149 return;
150 }
152 if (alpha_using_srm)
153 srm_paging_stop();
155 halt();
156 }
158 static void
159 common_shutdown(int mode, char *restart_cmd)
160 {
161 struct halt_info args;
162 args.mode = mode;
163 args.restart_cmd = restart_cmd;
164 on_each_cpu(common_shutdown_1, &args, 1, 0);
165 }
/*
 * Restart the machine: runs the common shutdown path in
 * LINUX_REBOOT_CMD_RESTART mode.  restart_cmd is forwarded unchanged and
 * may be NULL.
 */
167 void
168 machine_restart(char *restart_cmd)
169 {
170 common_shutdown(LINUX_REBOOT_CMD_RESTART, restart_cmd);
171 }
/*
 * Halt the machine: runs the common shutdown path in
 * LINUX_REBOOT_CMD_HALT mode with no restart command.
 */
174 void
175 machine_halt(void)
176 {
177 common_shutdown(LINUX_REBOOT_CMD_HALT, NULL);
178 }
/*
 * Power the machine off: runs the common shutdown path in
 * LINUX_REBOOT_CMD_POWER_OFF mode with no restart command.
 */
181 void
182 machine_power_off(void)
183 {
184 common_shutdown(LINUX_REBOOT_CMD_POWER_OFF, NULL);
185 }
188 /* Used by sysrq-p, among others. I don't believe r9-r15 are ever
189 saved in the context it's used. */
/* Delegates to dik_show_regs(), passing NULL as its second argument
   (presumably the r9-r15 block -- confirm against dik_show_regs). */
191 void
192 show_regs(struct pt_regs *regs)
193 {
194 dik_show_regs(regs, NULL);
195 }
197 /*
198 * Re-start a thread when doing execve()
 *
 * Switches the address limit to USER_DS, stores the new program counter
 * and PS value in the saved register frame, and installs sp as the user
 * stack pointer.
199 */
200 void
201 start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp)
202 {
203 set_fs(USER_DS);
204 regs->pc = pc;
205 regs->ps = 8; /* PS value used for user mode */
206 wrusp(sp);
207 }
209 /*
210 * Free current thread data structures etc..
 *
 * The body is empty: no architecture-specific work is done here.
211 */
212 void
213 exit_thread(void)
214 {
215 }
/*
 * Reset per-thread state of the current thread: zeroes the software IEEE
 * state, writes FPCR_DYN_NORMAL | ieee_swcr_to_fpcr(0) to the FPCR, and
 * zeroes the PCB "unique" (TLS) value.
 */
217 void
218 flush_thread(void)
219 {
220 /* Arrange for each exec'ed process to start off with a clean slate
221 with respect to the FPU. This is all exceptions disabled. */
222 current_thread_info()->ieee_state = 0;
223 wrfpcr(FPCR_DYN_NORMAL | ieee_swcr_to_fpcr(0));
225 /* Clean slate for TLS. */
226 current_thread_info()->pcb.unique = 0;
227 }
/*
 * Hook called with a dead task; the body is empty, so no
 * architecture-specific work is done here.
 */
229 void
230 release_thread(struct task_struct *dead_task)
231 {
232 }
234 /*
235 * "alpha_clone()".. By the time we get here, the
236 * non-volatile registers have also been saved on the
237 * stack. We do some ugly pointer stuff here.. (see
238 * also copy_thread)
239 *
240 * Notice that "fork()" is implemented in terms of clone,
241 * with parameters (SIGCHLD, 0).
242 */
243 int
244 alpha_clone(unsigned long clone_flags, unsigned long usp,
245 int __user *parent_tid, int __user *child_tid,
246 unsigned long tls_value, struct pt_regs *regs)
247 {
248 if (!usp)
249 usp = rdusp();
251 return do_fork(clone_flags, usp, regs, 0, parent_tid, child_tid);
252 }
254 int
255 alpha_vfork(struct pt_regs *regs)
256 {
257 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, rdusp(),
258 regs, 0, NULL, NULL);
259 }
261 /*
262 * Copy an alpha thread..
263 *
264 * Note the "stack_offset" stuff: when returning to kernel mode, we need
265 * to have some extra stack-space for the kernel stack that still exists
266 * after the "ret_from_fork". When returning to user mode, we only want
267 * the space needed by the syscall stack frame (ie "struct pt_regs").
268 * Use the passed "regs" pointer to determine how much space we need
269 * for a kernel fork().
270 */
272 int
273 copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
274 unsigned long unused,
275 struct task_struct * p, struct pt_regs * regs)
276 {
277 extern void ret_from_fork(void);
279 struct thread_info *childti = task_thread_info(p);
280 struct pt_regs * childregs;
281 struct switch_stack * childstack, *stack;
282 unsigned long stack_offset, settls;
284 stack_offset = PAGE_SIZE - sizeof(struct pt_regs);
285 if (!(regs->ps & 8))
286 stack_offset = (PAGE_SIZE-1) & (unsigned long) regs;
287 childregs = (struct pt_regs *)
288 (stack_offset + PAGE_SIZE + task_stack_page(p));
290 *childregs = *regs;
291 settls = regs->r20;
292 childregs->r0 = 0;
293 childregs->r19 = 0;
294 childregs->r20 = 1; /* OSF/1 has some strange fork() semantics. */
295 regs->r20 = 0;
296 stack = ((struct switch_stack *) regs) - 1;
297 childstack = ((struct switch_stack *) childregs) - 1;
298 *childstack = *stack;
299 childstack->r26 = (unsigned long) ret_from_fork;
300 childti->pcb.usp = usp;
301 childti->pcb.ksp = (unsigned long) childstack;
302 childti->pcb.flags = 1; /* set FEN, clear everything else */
304 /* Set a new TLS for the child thread? Peek back into the
305 syscall arguments that we saved on syscall entry. Oops,
306 except we'd have clobbered it with the parent/child set
307 of r20. Read the saved copy. */
308 /* Note: if CLONE_SETTLS is not set, then we must inherit the
309 value from the parent, which will have been set by the block
310 copy in dup_task_struct. This is non-intuitive, but is
311 required for proper operation in the case of a threaded
312 application calling fork. */
313 if (clone_flags & CLONE_SETTLS)
314 childti->pcb.unique = settls;
316 return 0;
317 }
319 /*
320 * Fill in the user structure for an ECOFF core dump.
321 */
322 void
323 dump_thread(struct pt_regs * pt, struct user * dump)
324 {
325 /* switch stack follows right below pt_regs: */
326 struct switch_stack * sw = ((struct switch_stack *) pt) - 1;
328 dump->magic = CMAGIC;
329 dump->start_code = current->mm->start_code;
330 dump->start_data = current->mm->start_data;
331 dump->start_stack = rdusp() & ~(PAGE_SIZE - 1);
332 dump->u_tsize = ((current->mm->end_code - dump->start_code)
333 >> PAGE_SHIFT);
334 dump->u_dsize = ((current->mm->brk + PAGE_SIZE-1 - dump->start_data)
335 >> PAGE_SHIFT);
336 dump->u_ssize = (current->mm->start_stack - dump->start_stack
337 + PAGE_SIZE-1) >> PAGE_SHIFT;
339 /*
340 * We store the registers in an order/format that is
341 * compatible with DEC Unix/OSF/1 as this makes life easier
342 * for gdb.
343 */
344 dump->regs[EF_V0] = pt->r0;
345 dump->regs[EF_T0] = pt->r1;
346 dump->regs[EF_T1] = pt->r2;
347 dump->regs[EF_T2] = pt->r3;
348 dump->regs[EF_T3] = pt->r4;
349 dump->regs[EF_T4] = pt->r5;
350 dump->regs[EF_T5] = pt->r6;
351 dump->regs[EF_T6] = pt->r7;
352 dump->regs[EF_T7] = pt->r8;
353 dump->regs[EF_S0] = sw->r9;
354 dump->regs[EF_S1] = sw->r10;
355 dump->regs[EF_S2] = sw->r11;
356 dump->regs[EF_S3] = sw->r12;
357 dump->regs[EF_S4] = sw->r13;
358 dump->regs[EF_S5] = sw->r14;
359 dump->regs[EF_S6] = sw->r15;
360 dump->regs[EF_A3] = pt->r19;
361 dump->regs[EF_A4] = pt->r20;
362 dump->regs[EF_A5] = pt->r21;
363 dump->regs[EF_T8] = pt->r22;
364 dump->regs[EF_T9] = pt->r23;
365 dump->regs[EF_T10] = pt->r24;
366 dump->regs[EF_T11] = pt->r25;
367 dump->regs[EF_RA] = pt->r26;
368 dump->regs[EF_T12] = pt->r27;
369 dump->regs[EF_AT] = pt->r28;
370 dump->regs[EF_SP] = rdusp();
371 dump->regs[EF_PS] = pt->ps;
372 dump->regs[EF_PC] = pt->pc;
373 dump->regs[EF_GP] = pt->gp;
374 dump->regs[EF_A0] = pt->r16;
375 dump->regs[EF_A1] = pt->r17;
376 dump->regs[EF_A2] = pt->r18;
377 memcpy((char *)dump->regs + EF_SIZE, sw->fp, 32 * 8);
378 }
380 /*
381 * Fill in the user structure for a ELF core dump.
382 */
383 void
384 dump_elf_thread(elf_greg_t *dest, struct pt_regs *pt, struct thread_info *ti)
385 {
386 /* switch stack follows right below pt_regs: */
387 struct switch_stack * sw = ((struct switch_stack *) pt) - 1;
389 dest[ 0] = pt->r0;
390 dest[ 1] = pt->r1;
391 dest[ 2] = pt->r2;
392 dest[ 3] = pt->r3;
393 dest[ 4] = pt->r4;
394 dest[ 5] = pt->r5;
395 dest[ 6] = pt->r6;
396 dest[ 7] = pt->r7;
397 dest[ 8] = pt->r8;
398 dest[ 9] = sw->r9;
399 dest[10] = sw->r10;
400 dest[11] = sw->r11;
401 dest[12] = sw->r12;
402 dest[13] = sw->r13;
403 dest[14] = sw->r14;
404 dest[15] = sw->r15;
405 dest[16] = pt->r16;
406 dest[17] = pt->r17;
407 dest[18] = pt->r18;
408 dest[19] = pt->r19;
409 dest[20] = pt->r20;
410 dest[21] = pt->r21;
411 dest[22] = pt->r22;
412 dest[23] = pt->r23;
413 dest[24] = pt->r24;
414 dest[25] = pt->r25;
415 dest[26] = pt->r26;
416 dest[27] = pt->r27;
417 dest[28] = pt->r28;
418 dest[29] = pt->gp;
419 dest[30] = rdusp();
420 dest[31] = pt->pc;
422 /* Once upon a time this was the PS value. Which is stupid
423 since that is always 8 for usermode. Usurped for the more
424 useful value of the thread's UNIQUE field. */
425 dest[32] = ti->pcb.unique;
426 }
/*
 * Dump the general registers of an arbitrary task into dest by calling
 * dump_elf_thread() with the task's saved pt_regs and thread_info.
 * Always returns 1.
 */
428 int
429 dump_elf_task(elf_greg_t *dest, struct task_struct *task)
430 {
431 dump_elf_thread(dest, task_pt_regs(task), task_thread_info(task));
432 return 1;
433 }
435 int
436 dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task)
437 {
438 struct switch_stack *sw = (struct switch_stack *)task_pt_regs(task) - 1;
439 memcpy(dest, sw->fp, 32 * 8);
440 return 1;
441 }
443 /*
444 * sys_execve() executes a new program.
445 */
446 asmlinkage int
447 do_sys_execve(char __user *ufilename, char __user * __user *argv,
448 char __user * __user *envp, struct pt_regs *regs)
449 {
450 int error;
451 char *filename;
453 filename = getname(ufilename);
454 error = PTR_ERR(filename);
455 if (IS_ERR(filename))
456 goto out;
457 error = do_execve(filename, argv, envp, regs);
458 putname(filename);
459 out:
460 return error;
461 }
463 /*
464 * Return saved PC of a blocked thread. This assumes the frame
465 * pointer is the 6th saved long on the kernel stack and that the
466 * saved return address is the first long in the frame. This all
467 * holds provided the thread blocked through a call to schedule() ($15
468 * is the frame pointer in schedule() and $15 is saved at offset 48 by
469 * entry.S:do_switch_stack).
470 *
471 * Under heavy swap load I've seen this lose in an ugly way. So do
472 * some extra sanity checking on the ranges we expect these pointers
473 * to be in so that we can fail gracefully. This is just for ps after
474 * all. -- r~
475 */
477 unsigned long
478 thread_saved_pc(struct task_struct *t)
479 {
480 unsigned long base = (unsigned long)task_stack_page(t);
481 unsigned long fp, sp = task_thread_info(t)->pcb.ksp;
483 if (sp > base && sp+6*8 < base + 16*1024) {
484 fp = ((unsigned long*)sp)[6];
485 if (fp > sp && fp < base + 16*1024)
486 return *(unsigned long *)fp;
487 }
489 return 0;
490 }
492 unsigned long
493 get_wchan(struct task_struct *p)
494 {
495 unsigned long schedule_frame;
496 unsigned long pc;
497 if (!p || p == current || p->state == TASK_RUNNING)
498 return 0;
499 /*
500 * This one depends on the frame size of schedule(). Do a
501 * "disass schedule" in gdb to find the frame size. Also, the
502 * code assumes that sleep_on() follows immediately after
503 * interruptible_sleep_on() and that add_timer() follows
504 * immediately after interruptible_sleep(). Ugly, isn't it?
505 * Maybe adding a wchan field to task_struct would be better,
506 * after all...
507 */
509 pc = thread_saved_pc(p);
510 if (in_sched_functions(pc)) {
511 schedule_frame = ((unsigned long *)task_thread_info(p)->pcb.ksp)[6];
512 return ((unsigned long *)schedule_frame)[12];
513 }
514 return pc;
515 }