ia64/linux-2.6.18-xen.hg

view arch/sparc64/kernel/binfmt_aout32.c @ 897:329ea0ccb344

balloon: try harder to balloon up under memory pressure.

Currently if the balloon driver is unable to increase the guest's
reservation it assumes the failure was due to reaching its full
allocation, gives up on the ballooning operation and records the limit
it reached as the "hard limit". The driver will not try again until
the target is set again (even to the same value).

However it is possible that ballooning has in fact failed due to
memory pressure in the host and therefore it is desirable to keep
attempting to reach the target in case memory becomes available. The
most likely scenario is that some guests are ballooning down while
others are ballooning up and therefore there is temporary memory
pressure while things stabilise. You would not expect a well-behaved
toolstack to ask a domain to balloon to more than its allocation, nor
would you expect it to deliberately over-commit memory by setting
balloon targets which exceed the total host memory.

This patch drops the concept of a hard limit and causes the balloon
driver to retry increasing the reservation on a timer in the same
manner as when decreasing the reservation.

Also, if we partially succeed in increasing the reservation
(i.e. receive fewer pages than we asked for) then we may as well keep
those pages rather than returning them to Xen.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 05 14:01:20 2009 +0100 (2009-06-05)
parents 831230e53067
children
line source
1 /*
2 * linux/fs/binfmt_aout.c
3 *
4 * Copyright (C) 1991, 1992, 1996 Linus Torvalds
5 *
6 * Hacked a bit by DaveM to make it work with 32-bit SunOS
7 * binaries on the sparc64 port.
8 */
10 #include <linux/module.h>
12 #include <linux/sched.h>
13 #include <linux/kernel.h>
14 #include <linux/mm.h>
15 #include <linux/mman.h>
16 #include <linux/a.out.h>
17 #include <linux/errno.h>
18 #include <linux/signal.h>
19 #include <linux/string.h>
20 #include <linux/fs.h>
21 #include <linux/file.h>
22 #include <linux/stat.h>
23 #include <linux/fcntl.h>
24 #include <linux/ptrace.h>
25 #include <linux/user.h>
26 #include <linux/slab.h>
27 #include <linux/binfmts.h>
28 #include <linux/personality.h>
29 #include <linux/init.h>
31 #include <asm/system.h>
32 #include <asm/uaccess.h>
33 #include <asm/pgalloc.h>
34 #include <asm/mmu_context.h>
36 static int load_aout32_binary(struct linux_binprm *, struct pt_regs * regs);
37 static int load_aout32_library(struct file*);
38 static int aout32_core_dump(long signr, struct pt_regs * regs, struct file *file);
40 static struct linux_binfmt aout32_format = {
41 NULL, THIS_MODULE, load_aout32_binary, load_aout32_library, aout32_core_dump,
42 PAGE_SIZE
43 };
45 static void set_brk(unsigned long start, unsigned long end)
46 {
47 start = PAGE_ALIGN(start);
48 end = PAGE_ALIGN(end);
49 if (end <= start)
50 return;
51 down_write(&current->mm->mmap_sem);
52 do_brk(start, end - start);
53 up_write(&current->mm->mmap_sem);
54 }
56 /*
57 * These are the only things you should do on a core-file: use only these
58 * macros to write out all the necessary info.
59 */
61 static int dump_write(struct file *file, const void *addr, int nr)
62 {
63 return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
64 }
/*
 * Core-dump helpers.  Both expect a 'struct file *file' variable and an
 * 'end_coredump:' label in the calling function, and both jump to that
 * label on failure.
 *
 * They are wrapped in do { } while (0) so that each use is exactly one
 * statement: "DUMP_WRITE(a, n);" can sit in an unbraced if/else branch
 * without breaking the syntax or capturing the caller's else.
 */

/* Write nr bytes from addr; bail out to end_coredump on a short write. */
#define DUMP_WRITE(addr, nr) \
	do { \
		if (!dump_write(file, (void *)(addr), (nr))) \
			goto end_coredump; \
	} while (0)

/*
 * Position the file at offset: through ->llseek when the file provides
 * one (bailing out if it lands elsewhere), otherwise by setting f_pos
 * directly.  Note that 'offset' is evaluated more than once.
 */
#define DUMP_SEEK(offset) \
	do { \
		if (file->f_op->llseek) { \
			if (file->f_op->llseek(file, (offset), 0) != (offset)) \
				goto end_coredump; \
		} else { \
			file->f_pos = (offset); \
		} \
	} while (0)
76 /*
77 * Routine writes a core dump image in the current directory.
78 * Currently only a stub-function.
79 *
80 * Note that setuid/setgid files won't make a core-dump if the uid/gid
81 * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
82 * field, which also makes sure the core-dumps won't be recursive if the
83 * dumping of the process results in another error..
84 */
86 static int aout32_core_dump(long signr, struct pt_regs *regs, struct file *file)
87 {
88 mm_segment_t fs;
89 int has_dumped = 0;
90 unsigned long dump_start, dump_size;
91 struct user dump;
92 # define START_DATA(u) (u.u_tsize)
93 # define START_STACK(u) ((regs->u_regs[UREG_FP]) & ~(PAGE_SIZE - 1))
95 fs = get_fs();
96 set_fs(KERNEL_DS);
97 has_dumped = 1;
98 current->flags |= PF_DUMPCORE;
99 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
100 dump.signal = signr;
101 dump_thread(regs, &dump);
103 /* If the size of the dump file exceeds the rlimit, then see what would happen
104 if we wrote the stack, but not the data area. */
105 if ((dump.u_dsize+dump.u_ssize) >
106 current->signal->rlim[RLIMIT_CORE].rlim_cur)
107 dump.u_dsize = 0;
109 /* Make sure we have enough room to write the stack and data areas. */
110 if ((dump.u_ssize) >
111 current->signal->rlim[RLIMIT_CORE].rlim_cur)
112 dump.u_ssize = 0;
114 /* make sure we actually have a data and stack area to dump */
115 set_fs(USER_DS);
116 if (!access_ok(VERIFY_READ, (void __user *) START_DATA(dump), dump.u_dsize))
117 dump.u_dsize = 0;
118 if (!access_ok(VERIFY_READ, (void __user *) START_STACK(dump), dump.u_ssize))
119 dump.u_ssize = 0;
121 set_fs(KERNEL_DS);
122 /* struct user */
123 DUMP_WRITE(&dump,sizeof(dump));
124 /* now we start writing out the user space info */
125 set_fs(USER_DS);
126 /* Dump the data area */
127 if (dump.u_dsize != 0) {
128 dump_start = START_DATA(dump);
129 dump_size = dump.u_dsize;
130 DUMP_WRITE(dump_start,dump_size);
131 }
132 /* Now prepare to dump the stack area */
133 if (dump.u_ssize != 0) {
134 dump_start = START_STACK(dump);
135 dump_size = dump.u_ssize;
136 DUMP_WRITE(dump_start,dump_size);
137 }
138 /* Finally dump the task struct. Not be used by gdb, but could be useful */
139 set_fs(KERNEL_DS);
140 DUMP_WRITE(current,sizeof(*current));
141 end_coredump:
142 set_fs(fs);
143 return has_dumped;
144 }
146 /*
147 * create_aout32_tables() parses the env- and arg-strings in new user
148 * memory and creates the pointer tables from them, and puts their
149 * addresses on the "stack", returning the new stack pointer value.
150 */
152 static u32 __user *create_aout32_tables(char __user *p, struct linux_binprm *bprm)
153 {
154 u32 __user *argv;
155 u32 __user *envp;
156 u32 __user *sp;
157 int argc = bprm->argc;
158 int envc = bprm->envc;
160 sp = (u32 __user *)((-(unsigned long)sizeof(char *))&(unsigned long)p);
162 /* This imposes the proper stack alignment for a new process. */
163 sp = (u32 __user *) (((unsigned long) sp) & ~7);
164 if ((envc+argc+3)&1)
165 --sp;
167 sp -= envc+1;
168 envp = sp;
169 sp -= argc+1;
170 argv = sp;
171 put_user(argc,--sp);
172 current->mm->arg_start = (unsigned long) p;
173 while (argc-->0) {
174 char c;
175 put_user(((u32)(unsigned long)(p)),argv++);
176 do {
177 get_user(c,p++);
178 } while (c);
179 }
180 put_user(NULL,argv);
181 current->mm->arg_end = current->mm->env_start = (unsigned long) p;
182 while (envc-->0) {
183 char c;
184 put_user(((u32)(unsigned long)(p)),envp++);
185 do {
186 get_user(c,p++);
187 } while (c);
188 }
189 put_user(NULL,envp);
190 current->mm->env_end = (unsigned long) p;
191 return sp;
192 }
194 /*
195 * These are the functions used to load a.out style executables and shared
196 * libraries. There is no binary dependent code anywhere else.
197 */
199 static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs)
200 {
201 struct exec ex;
202 unsigned long error;
203 unsigned long fd_offset;
204 unsigned long rlim;
205 unsigned long orig_thr_flags;
206 int retval;
208 ex = *((struct exec *) bprm->buf); /* exec-header */
209 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
210 N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
211 N_TRSIZE(ex) || N_DRSIZE(ex) ||
212 bprm->file->f_dentry->d_inode->i_size < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
213 return -ENOEXEC;
214 }
216 fd_offset = N_TXTOFF(ex);
218 /* Check initial limits. This avoids letting people circumvent
219 * size limits imposed on them by creating programs with large
220 * arrays in the data or bss.
221 */
222 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
223 if (rlim >= RLIM_INFINITY)
224 rlim = ~0;
225 if (ex.a_data + ex.a_bss > rlim)
226 return -ENOMEM;
228 /* Flush all traces of the currently running executable */
229 retval = flush_old_exec(bprm);
230 if (retval)
231 return retval;
233 /* OK, This is the point of no return */
234 set_personality(PER_SUNOS);
236 current->mm->end_code = ex.a_text +
237 (current->mm->start_code = N_TXTADDR(ex));
238 current->mm->end_data = ex.a_data +
239 (current->mm->start_data = N_DATADDR(ex));
240 current->mm->brk = ex.a_bss +
241 (current->mm->start_brk = N_BSSADDR(ex));
242 current->mm->free_area_cache = current->mm->mmap_base;
243 current->mm->cached_hole_size = 0;
245 current->mm->mmap = NULL;
246 compute_creds(bprm);
247 current->flags &= ~PF_FORKNOEXEC;
248 if (N_MAGIC(ex) == NMAGIC) {
249 loff_t pos = fd_offset;
250 /* Fuck me plenty... */
251 down_write(&current->mm->mmap_sem);
252 error = do_brk(N_TXTADDR(ex), ex.a_text);
253 up_write(&current->mm->mmap_sem);
254 bprm->file->f_op->read(bprm->file, (char __user *)N_TXTADDR(ex),
255 ex.a_text, &pos);
256 down_write(&current->mm->mmap_sem);
257 error = do_brk(N_DATADDR(ex), ex.a_data);
258 up_write(&current->mm->mmap_sem);
259 bprm->file->f_op->read(bprm->file, (char __user *)N_DATADDR(ex),
260 ex.a_data, &pos);
261 goto beyond_if;
262 }
264 if (N_MAGIC(ex) == OMAGIC) {
265 loff_t pos = fd_offset;
266 down_write(&current->mm->mmap_sem);
267 do_brk(N_TXTADDR(ex) & PAGE_MASK,
268 ex.a_text+ex.a_data + PAGE_SIZE - 1);
269 up_write(&current->mm->mmap_sem);
270 bprm->file->f_op->read(bprm->file, (char __user *)N_TXTADDR(ex),
271 ex.a_text+ex.a_data, &pos);
272 } else {
273 static unsigned long error_time;
274 if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
275 (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time) > 5*HZ)
276 {
277 printk(KERN_NOTICE "executable not page aligned\n");
278 error_time = jiffies;
279 }
281 if (!bprm->file->f_op->mmap) {
282 loff_t pos = fd_offset;
283 down_write(&current->mm->mmap_sem);
284 do_brk(0, ex.a_text+ex.a_data);
285 up_write(&current->mm->mmap_sem);
286 bprm->file->f_op->read(bprm->file,
287 (char __user *)N_TXTADDR(ex),
288 ex.a_text+ex.a_data, &pos);
289 goto beyond_if;
290 }
292 down_write(&current->mm->mmap_sem);
293 error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
294 PROT_READ | PROT_EXEC,
295 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
296 fd_offset);
297 up_write(&current->mm->mmap_sem);
299 if (error != N_TXTADDR(ex)) {
300 send_sig(SIGKILL, current, 0);
301 return error;
302 }
304 down_write(&current->mm->mmap_sem);
305 error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
306 PROT_READ | PROT_WRITE | PROT_EXEC,
307 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
308 fd_offset + ex.a_text);
309 up_write(&current->mm->mmap_sem);
310 if (error != N_DATADDR(ex)) {
311 send_sig(SIGKILL, current, 0);
312 return error;
313 }
314 }
315 beyond_if:
316 set_binfmt(&aout32_format);
318 set_brk(current->mm->start_brk, current->mm->brk);
320 /* Make sure STACK_TOP returns the right thing. */
321 orig_thr_flags = current_thread_info()->flags;
322 current_thread_info()->flags |= _TIF_32BIT;
324 retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
325 if (retval < 0) {
326 current_thread_info()->flags = orig_thr_flags;
328 /* Someone check-me: is this error path enough? */
329 send_sig(SIGKILL, current, 0);
330 return retval;
331 }
333 current->mm->start_stack =
334 (unsigned long) create_aout32_tables((char __user *)bprm->p, bprm);
335 tsb_context_switch(current->mm);
337 start_thread32(regs, ex.a_entry, current->mm->start_stack);
338 if (current->ptrace & PT_PTRACED)
339 send_sig(SIGTRAP, current, 0);
340 return 0;
341 }
343 /* N.B. Move to .h file and use code in fs/binfmt_aout.c? */
344 static int load_aout32_library(struct file *file)
345 {
346 struct inode * inode;
347 unsigned long bss, start_addr, len;
348 unsigned long error;
349 int retval;
350 struct exec ex;
352 inode = file->f_dentry->d_inode;
354 retval = -ENOEXEC;
355 error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
356 if (error != sizeof(ex))
357 goto out;
359 /* We come in here for the regular a.out style of shared libraries */
360 if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
361 N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
362 inode->i_size < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
363 goto out;
364 }
366 if (N_MAGIC(ex) == ZMAGIC && N_TXTOFF(ex) &&
367 (N_TXTOFF(ex) < inode->i_sb->s_blocksize)) {
368 printk("N_TXTOFF < BLOCK_SIZE. Please convert library\n");
369 goto out;
370 }
372 if (N_FLAGS(ex))
373 goto out;
375 /* For QMAGIC, the starting address is 0x20 into the page. We mask
376 this off to get the starting address for the page */
378 start_addr = ex.a_entry & 0xfffff000;
380 /* Now use mmap to map the library into memory. */
381 down_write(&current->mm->mmap_sem);
382 error = do_mmap(file, start_addr, ex.a_text + ex.a_data,
383 PROT_READ | PROT_WRITE | PROT_EXEC,
384 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
385 N_TXTOFF(ex));
386 up_write(&current->mm->mmap_sem);
387 retval = error;
388 if (error != start_addr)
389 goto out;
391 len = PAGE_ALIGN(ex.a_text + ex.a_data);
392 bss = ex.a_text + ex.a_data + ex.a_bss;
393 if (bss > len) {
394 down_write(&current->mm->mmap_sem);
395 error = do_brk(start_addr + len, bss - len);
396 up_write(&current->mm->mmap_sem);
397 retval = error;
398 if (error != start_addr + len)
399 goto out;
400 }
401 retval = 0;
402 out:
403 return retval;
404 }
406 static int __init init_aout32_binfmt(void)
407 {
408 return register_binfmt(&aout32_format);
409 }
411 static void __exit exit_aout32_binfmt(void)
412 {
413 unregister_binfmt(&aout32_format);
414 }
416 module_init(init_aout32_binfmt);
417 module_exit(exit_aout32_binfmt);