ia64/xen-unstable

view linux-2.6-xen-sparse/kernel/fork.c @ 12945:79bb96e0ba73

[XEN][POWERPC] Create a Domain Foreign Map space
The following patch creates a Domain Foreign Map space that is used to
map granted memory into the Linear Map of the domain. The Linear Map
of Linux is the kernel virtual address space where VA = PA +
PAGE_OFFSET.
Also:
- lots of grant_* interfaces work now
- mm.[ch] cleanups
- first pass at extracting Page Table operations from PAPR interfaces
- fix a logic bug in get_page_type()
- recognize a grant table mapping by placing its gmfn at the end of
real memory.
- the grant table is usually mapped like an IO page, so force WIMG bits I=0
- mfn_to_gmfn and pfn2mfn got WAY too complex; we need to get a simpler model in.
- communicate the Domain Foreign Map to domains using /xen/foreign-map
- make sure all bit definitions are UL where possible
- now that we actually assign Xen heap pages to domains, they must be
relinquished
Signed-off-by: Jimi Xenidis <jimix@watson.ibm.com>
Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
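As a rough illustration of the linear-map relationship described above (VA = PA + PAGE_OFFSET), the address conversion amounts to adding or subtracting the page offset, in the style of the kernel's __va()/__pa() helpers. This is a minimal sketch only; it is not part of this changeset, and the EXAMPLE_PAGE_OFFSET value and example_* names are illustrative placeholders.

    /* Sketch: linear-map ("kernel virtual") address conversion.
     * Not from this changeset; EXAMPLE_PAGE_OFFSET is a placeholder value. */
    #define EXAMPLE_PAGE_OFFSET 0xc000000000000000UL

    static inline void *example_phys_to_virt(unsigned long pa)
    {
            return (void *)(pa + EXAMPLE_PAGE_OFFSET);      /* VA = PA + PAGE_OFFSET */
    }

    static inline unsigned long example_virt_to_phys(const void *va)
    {
            return (unsigned long)va - EXAMPLE_PAGE_OFFSET; /* PA = VA - PAGE_OFFSET */
    }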
author Jimi Xenidis <jimix@watson.ibm.com>
date Sun Oct 08 11:34:24 2006 -0400 (2006-10-08)
parents 2fea03842f40
children 4fad820a2233
line source
1 /*
2 * linux/kernel/fork.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'fork.c' contains the help-routines for the 'fork' system call
9 * (see also entry.S and others).
10 * Fork is rather simple, once you get the hang of it, but the memory
11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
12 */
14 #include <linux/config.h>
15 #include <linux/slab.h>
16 #include <linux/init.h>
17 #include <linux/unistd.h>
18 #include <linux/smp_lock.h>
19 #include <linux/module.h>
20 #include <linux/vmalloc.h>
21 #include <linux/completion.h>
22 #include <linux/namespace.h>
23 #include <linux/personality.h>
24 #include <linux/mempolicy.h>
25 #include <linux/sem.h>
26 #include <linux/file.h>
27 #include <linux/key.h>
28 #include <linux/binfmts.h>
29 #include <linux/mman.h>
30 #include <linux/fs.h>
31 #include <linux/capability.h>
32 #include <linux/cpu.h>
33 #include <linux/cpuset.h>
34 #include <linux/security.h>
35 #include <linux/swap.h>
36 #include <linux/syscalls.h>
37 #include <linux/jiffies.h>
38 #include <linux/futex.h>
39 #include <linux/rcupdate.h>
40 #include <linux/ptrace.h>
41 #include <linux/mount.h>
42 #include <linux/audit.h>
43 #include <linux/profile.h>
44 #include <linux/rmap.h>
45 #include <linux/acct.h>
46 #include <linux/cn_proc.h>
48 #include <asm/pgtable.h>
49 #include <asm/pgalloc.h>
50 #include <asm/uaccess.h>
51 #include <asm/mmu_context.h>
52 #include <asm/cacheflush.h>
53 #include <asm/tlbflush.h>
55 /*
56 * Protected counters by write_lock_irq(&tasklist_lock)
57 */
58 unsigned long total_forks; /* Handle normal Linux uptimes. */
59 int nr_threads; /* The idle threads do not count.. */
61 int max_threads; /* tunable limit on nr_threads */
63 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
65 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
67 EXPORT_SYMBOL(tasklist_lock);
69 int nr_processes(void)
70 {
71 int cpu;
72 int total = 0;
74 for_each_online_cpu(cpu)
75 total += per_cpu(process_counts, cpu);
77 return total;
78 }
80 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
81 # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
82 # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
83 static kmem_cache_t *task_struct_cachep;
84 #endif
86 /* SLAB cache for signal_struct structures (tsk->signal) */
87 kmem_cache_t *signal_cachep;
89 /* SLAB cache for sighand_struct structures (tsk->sighand) */
90 kmem_cache_t *sighand_cachep;
92 /* SLAB cache for files_struct structures (tsk->files) */
93 kmem_cache_t *files_cachep;
95 /* SLAB cache for fs_struct structures (tsk->fs) */
96 kmem_cache_t *fs_cachep;
98 /* SLAB cache for vm_area_struct structures */
99 kmem_cache_t *vm_area_cachep;
101 /* SLAB cache for mm_struct structures (tsk->mm) */
102 static kmem_cache_t *mm_cachep;
104 void free_task(struct task_struct *tsk)
105 {
106 free_thread_info(tsk->thread_info);
107 free_task_struct(tsk);
108 }
109 EXPORT_SYMBOL(free_task);
111 void __put_task_struct_cb(struct rcu_head *rhp)
112 {
113 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
115 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
116 WARN_ON(atomic_read(&tsk->usage));
117 WARN_ON(tsk == current);
119 if (unlikely(tsk->audit_context))
120 audit_free(tsk);
121 security_task_free(tsk);
122 free_uid(tsk->user);
123 put_group_info(tsk->group_info);
125 if (!profile_handoff_task(tsk))
126 free_task(tsk);
127 }
129 void __init fork_init(unsigned long mempages)
130 {
131 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
132 #ifndef ARCH_MIN_TASKALIGN
133 #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
134 #endif
135 /* create a slab on which task_structs can be allocated */
136 task_struct_cachep =
137 kmem_cache_create("task_struct", sizeof(struct task_struct),
138 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
139 #endif
141 /*
142 * The default maximum number of threads is set to a safe
143 * value: the thread structures can take up at most half
144 * of memory.
145 */
146 max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
148 /*
149 * we need to allow at least 20 threads to boot a system
150 */
151 if(max_threads < 20)
152 max_threads = 20;
154 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
155 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
156 init_task.signal->rlim[RLIMIT_SIGPENDING] =
157 init_task.signal->rlim[RLIMIT_NPROC];
158 }
160 static struct task_struct *dup_task_struct(struct task_struct *orig)
161 {
162 struct task_struct *tsk;
163 struct thread_info *ti;
165 prepare_to_copy(orig);
167 tsk = alloc_task_struct();
168 if (!tsk)
169 return NULL;
171 ti = alloc_thread_info(tsk);
172 if (!ti) {
173 free_task_struct(tsk);
174 return NULL;
175 }
177 *tsk = *orig;
178 tsk->thread_info = ti;
179 setup_thread_stack(tsk, orig);
181 /* One for us, one for whoever does the "release_task()" (usually parent) */
182 atomic_set(&tsk->usage,2);
183 atomic_set(&tsk->fs_excl, 0);
184 return tsk;
185 }
187 #ifdef CONFIG_MMU
188 static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
189 {
190 struct vm_area_struct *mpnt, *tmp, **pprev;
191 struct rb_node **rb_link, *rb_parent;
192 int retval;
193 unsigned long charge;
194 struct mempolicy *pol;
196 down_write(&oldmm->mmap_sem);
197 flush_cache_mm(oldmm);
198 down_write(&mm->mmap_sem);
200 mm->locked_vm = 0;
201 mm->mmap = NULL;
202 mm->mmap_cache = NULL;
203 mm->free_area_cache = oldmm->mmap_base;
204 mm->cached_hole_size = ~0UL;
205 mm->map_count = 0;
206 cpus_clear(mm->cpu_vm_mask);
207 mm->mm_rb = RB_ROOT;
208 rb_link = &mm->mm_rb.rb_node;
209 rb_parent = NULL;
210 pprev = &mm->mmap;
212 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
213 struct file *file;
215 if (mpnt->vm_flags & VM_DONTCOPY) {
216 long pages = vma_pages(mpnt);
217 mm->total_vm -= pages;
218 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
219 -pages);
220 continue;
221 }
222 charge = 0;
223 if (mpnt->vm_flags & VM_ACCOUNT) {
224 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
225 if (security_vm_enough_memory(len))
226 goto fail_nomem;
227 charge = len;
228 }
229 tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
230 if (!tmp)
231 goto fail_nomem;
232 *tmp = *mpnt;
233 pol = mpol_copy(vma_policy(mpnt));
234 retval = PTR_ERR(pol);
235 if (IS_ERR(pol))
236 goto fail_nomem_policy;
237 vma_set_policy(tmp, pol);
238 tmp->vm_flags &= ~VM_LOCKED;
239 tmp->vm_mm = mm;
240 tmp->vm_next = NULL;
241 anon_vma_link(tmp);
242 file = tmp->vm_file;
243 if (file) {
244 struct inode *inode = file->f_dentry->d_inode;
245 get_file(file);
246 if (tmp->vm_flags & VM_DENYWRITE)
247 atomic_dec(&inode->i_writecount);
249 /* insert tmp into the share list, just after mpnt */
250 spin_lock(&file->f_mapping->i_mmap_lock);
251 tmp->vm_truncate_count = mpnt->vm_truncate_count;
252 flush_dcache_mmap_lock(file->f_mapping);
253 vma_prio_tree_add(tmp, mpnt);
254 flush_dcache_mmap_unlock(file->f_mapping);
255 spin_unlock(&file->f_mapping->i_mmap_lock);
256 }
258 /*
259 * Link in the new vma and copy the page table entries.
260 */
261 *pprev = tmp;
262 pprev = &tmp->vm_next;
264 __vma_link_rb(mm, tmp, rb_link, rb_parent);
265 rb_link = &tmp->vm_rb.rb_right;
266 rb_parent = &tmp->vm_rb;
268 mm->map_count++;
269 retval = copy_page_range(mm, oldmm, mpnt);
271 if (tmp->vm_ops && tmp->vm_ops->open)
272 tmp->vm_ops->open(tmp);
274 if (retval)
275 goto out;
276 }
277 #ifdef arch_dup_mmap
278 arch_dup_mmap(mm, oldmm);
279 #endif
280 retval = 0;
281 out:
282 up_write(&mm->mmap_sem);
283 flush_tlb_mm(oldmm);
284 up_write(&oldmm->mmap_sem);
285 return retval;
286 fail_nomem_policy:
287 kmem_cache_free(vm_area_cachep, tmp);
288 fail_nomem:
289 retval = -ENOMEM;
290 vm_unacct_memory(charge);
291 goto out;
292 }
294 static inline int mm_alloc_pgd(struct mm_struct * mm)
295 {
296 mm->pgd = pgd_alloc(mm);
297 if (unlikely(!mm->pgd))
298 return -ENOMEM;
299 return 0;
300 }
302 static inline void mm_free_pgd(struct mm_struct * mm)
303 {
304 pgd_free(mm->pgd);
305 }
306 #else
307 #define dup_mmap(mm, oldmm) (0)
308 #define mm_alloc_pgd(mm) (0)
309 #define mm_free_pgd(mm)
310 #endif /* CONFIG_MMU */
312 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
314 #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
315 #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
317 #include <linux/init_task.h>
319 static struct mm_struct * mm_init(struct mm_struct * mm)
320 {
321 atomic_set(&mm->mm_users, 1);
322 atomic_set(&mm->mm_count, 1);
323 init_rwsem(&mm->mmap_sem);
324 INIT_LIST_HEAD(&mm->mmlist);
325 mm->core_waiters = 0;
326 mm->nr_ptes = 0;
327 set_mm_counter(mm, file_rss, 0);
328 set_mm_counter(mm, anon_rss, 0);
329 spin_lock_init(&mm->page_table_lock);
330 rwlock_init(&mm->ioctx_list_lock);
331 mm->ioctx_list = NULL;
332 mm->free_area_cache = TASK_UNMAPPED_BASE;
333 mm->cached_hole_size = ~0UL;
335 if (likely(!mm_alloc_pgd(mm))) {
336 mm->def_flags = 0;
337 return mm;
338 }
339 free_mm(mm);
340 return NULL;
341 }
343 /*
344 * Allocate and initialize an mm_struct.
345 */
346 struct mm_struct * mm_alloc(void)
347 {
348 struct mm_struct * mm;
350 mm = allocate_mm();
351 if (mm) {
352 memset(mm, 0, sizeof(*mm));
353 mm = mm_init(mm);
354 }
355 return mm;
356 }
358 /*
359 * Called when the last reference to the mm
360 * is dropped: either by a lazy thread or by
361 * mmput. Free the page directory and the mm.
362 */
363 void fastcall __mmdrop(struct mm_struct *mm)
364 {
365 BUG_ON(mm == &init_mm);
366 mm_free_pgd(mm);
367 destroy_context(mm);
368 free_mm(mm);
369 }
371 /*
372 * Decrement the use count and release all resources for an mm.
373 */
374 void mmput(struct mm_struct *mm)
375 {
376 if (atomic_dec_and_test(&mm->mm_users)) {
377 exit_aio(mm);
378 exit_mmap(mm);
379 if (!list_empty(&mm->mmlist)) {
380 spin_lock(&mmlist_lock);
381 list_del(&mm->mmlist);
382 spin_unlock(&mmlist_lock);
383 }
384 put_swap_token(mm);
385 mmdrop(mm);
386 }
387 }
388 EXPORT_SYMBOL_GPL(mmput);
390 /**
391 * get_task_mm - acquire a reference to the task's mm
392 *
393 * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning
394 * this kernel workthread has transiently adopted a user mm with use_mm,
395 * to do its AIO) is not set and if so returns a reference to it, after
396 * bumping up the use count. User must release the mm via mmput()
397 * after use. Typically used by /proc and ptrace.
398 */
399 struct mm_struct *get_task_mm(struct task_struct *task)
400 {
401 struct mm_struct *mm;
403 task_lock(task);
404 mm = task->mm;
405 if (mm) {
406 if (task->flags & PF_BORROWED_MM)
407 mm = NULL;
408 else
409 atomic_inc(&mm->mm_users);
410 }
411 task_unlock(task);
412 return mm;
413 }
414 EXPORT_SYMBOL_GPL(get_task_mm);
416 /* Please note the differences between mmput and mm_release.
417 * mmput is called whenever we stop holding onto a mm_struct,
418 * error success whatever.
419 *
420 * mm_release is called after a mm_struct has been removed
421 * from the current process.
422 *
423 * This difference is important for error handling, when we
424 * only half set up a mm_struct for a new process and need to restore
425 * the old one. Because we mmput the new mm_struct before
426 * restoring the old one. . .
427 * Eric Biederman 10 January 1998
428 */
429 void mm_release(struct task_struct *tsk, struct mm_struct *mm)
430 {
431 struct completion *vfork_done = tsk->vfork_done;
433 /* Get rid of any cached register state */
434 deactivate_mm(tsk, mm);
436 /* notify parent sleeping on vfork() */
437 if (vfork_done) {
438 tsk->vfork_done = NULL;
439 complete(vfork_done);
440 }
441 if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
442 u32 __user * tidptr = tsk->clear_child_tid;
443 tsk->clear_child_tid = NULL;
445 /*
446 * We don't check the error code - if userspace has
447 * not set up a proper pointer then tough luck.
448 */
449 put_user(0, tidptr);
450 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
451 }
452 }
454 /*
455 * Allocate a new mm structure and copy contents from the
456 * mm structure of the passed in task structure.
457 */
458 static struct mm_struct *dup_mm(struct task_struct *tsk)
459 {
460 struct mm_struct *mm, *oldmm = current->mm;
461 int err;
463 if (!oldmm)
464 return NULL;
466 mm = allocate_mm();
467 if (!mm)
468 goto fail_nomem;
470 memcpy(mm, oldmm, sizeof(*mm));
472 if (!mm_init(mm))
473 goto fail_nomem;
475 if (init_new_context(tsk, mm))
476 goto fail_nocontext;
478 err = dup_mmap(mm, oldmm);
479 if (err)
480 goto free_pt;
482 mm->hiwater_rss = get_mm_rss(mm);
483 mm->hiwater_vm = mm->total_vm;
485 return mm;
487 free_pt:
488 mmput(mm);
490 fail_nomem:
491 return NULL;
493 fail_nocontext:
494 /*
495 * If init_new_context() failed, we cannot use mmput() to free the mm
496 * because it calls destroy_context()
497 */
498 mm_free_pgd(mm);
499 free_mm(mm);
500 return NULL;
501 }
503 static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
504 {
505 struct mm_struct * mm, *oldmm;
506 int retval;
508 tsk->min_flt = tsk->maj_flt = 0;
509 tsk->nvcsw = tsk->nivcsw = 0;
511 tsk->mm = NULL;
512 tsk->active_mm = NULL;
514 /*
515 * Are we cloning a kernel thread?
516 *
517 * We need to steal a active VM for that..
518 */
519 oldmm = current->mm;
520 if (!oldmm)
521 return 0;
523 if (clone_flags & CLONE_VM) {
524 atomic_inc(&oldmm->mm_users);
525 mm = oldmm;
526 goto good_mm;
527 }
529 retval = -ENOMEM;
530 mm = dup_mm(tsk);
531 if (!mm)
532 goto fail_nomem;
534 good_mm:
535 tsk->mm = mm;
536 tsk->active_mm = mm;
537 return 0;
539 fail_nomem:
540 return retval;
541 }
543 static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
544 {
545 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
546 /* We don't need to lock fs - think why ;-) */
547 if (fs) {
548 atomic_set(&fs->count, 1);
549 rwlock_init(&fs->lock);
550 fs->umask = old->umask;
551 read_lock(&old->lock);
552 fs->rootmnt = mntget(old->rootmnt);
553 fs->root = dget(old->root);
554 fs->pwdmnt = mntget(old->pwdmnt);
555 fs->pwd = dget(old->pwd);
556 if (old->altroot) {
557 fs->altrootmnt = mntget(old->altrootmnt);
558 fs->altroot = dget(old->altroot);
559 } else {
560 fs->altrootmnt = NULL;
561 fs->altroot = NULL;
562 }
563 read_unlock(&old->lock);
564 }
565 return fs;
566 }
568 struct fs_struct *copy_fs_struct(struct fs_struct *old)
569 {
570 return __copy_fs_struct(old);
571 }
573 EXPORT_SYMBOL_GPL(copy_fs_struct);
575 static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
576 {
577 if (clone_flags & CLONE_FS) {
578 atomic_inc(&current->fs->count);
579 return 0;
580 }
581 tsk->fs = __copy_fs_struct(current->fs);
582 if (!tsk->fs)
583 return -ENOMEM;
584 return 0;
585 }
587 static int count_open_files(struct fdtable *fdt)
588 {
589 int size = fdt->max_fdset;
590 int i;
592 /* Find the last open fd */
593 for (i = size/(8*sizeof(long)); i > 0; ) {
594 if (fdt->open_fds->fds_bits[--i])
595 break;
596 }
597 i = (i+1) * 8 * sizeof(long);
598 return i;
599 }
601 static struct files_struct *alloc_files(void)
602 {
603 struct files_struct *newf;
604 struct fdtable *fdt;
606 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
607 if (!newf)
608 goto out;
610 atomic_set(&newf->count, 1);
612 spin_lock_init(&newf->file_lock);
613 fdt = &newf->fdtab;
614 fdt->next_fd = 0;
615 fdt->max_fds = NR_OPEN_DEFAULT;
616 fdt->max_fdset = __FD_SETSIZE;
617 fdt->close_on_exec = &newf->close_on_exec_init;
618 fdt->open_fds = &newf->open_fds_init;
619 fdt->fd = &newf->fd_array[0];
620 INIT_RCU_HEAD(&fdt->rcu);
621 fdt->free_files = NULL;
622 fdt->next = NULL;
623 rcu_assign_pointer(newf->fdt, fdt);
624 out:
625 return newf;
626 }
628 /*
629 * Allocate a new files structure and copy contents from the
630 * passed in files structure.
631 */
632 static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
633 {
634 struct files_struct *newf;
635 struct file **old_fds, **new_fds;
636 int open_files, size, i, expand;
637 struct fdtable *old_fdt, *new_fdt;
639 newf = alloc_files();
640 if (!newf)
641 goto out;
643 spin_lock(&oldf->file_lock);
644 old_fdt = files_fdtable(oldf);
645 new_fdt = files_fdtable(newf);
646 size = old_fdt->max_fdset;
647 open_files = count_open_files(old_fdt);
648 expand = 0;
650 /*
651 * Check whether we need to allocate a larger fd array or fd set.
652 * Note: we're not a clone task, so the open count won't change.
653 */
654 if (open_files > new_fdt->max_fdset) {
655 new_fdt->max_fdset = 0;
656 expand = 1;
657 }
658 if (open_files > new_fdt->max_fds) {
659 new_fdt->max_fds = 0;
660 expand = 1;
661 }
663 /* if the old fdset gets grown now, we'll only copy up to "size" fds */
664 if (expand) {
665 spin_unlock(&oldf->file_lock);
666 spin_lock(&newf->file_lock);
667 *errorp = expand_files(newf, open_files-1);
668 spin_unlock(&newf->file_lock);
669 if (*errorp < 0)
670 goto out_release;
671 new_fdt = files_fdtable(newf);
672 /*
673 * Reacquire the oldf lock and a pointer to its fd table
674 * who knows it may have a new bigger fd table. We need
675 * the latest pointer.
676 */
677 spin_lock(&oldf->file_lock);
678 old_fdt = files_fdtable(oldf);
679 }
681 old_fds = old_fdt->fd;
682 new_fds = new_fdt->fd;
684 memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
685 memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
687 for (i = open_files; i != 0; i--) {
688 struct file *f = *old_fds++;
689 if (f) {
690 get_file(f);
691 } else {
692 /*
693 * The fd may be claimed in the fd bitmap but not yet
694 * instantiated in the files array if a sibling thread
695 * is partway through open(). So make sure that this
696 * fd is available to the new process.
697 */
698 FD_CLR(open_files - i, new_fdt->open_fds);
699 }
700 rcu_assign_pointer(*new_fds++, f);
701 }
702 spin_unlock(&oldf->file_lock);
704 /* compute the remainder to be cleared */
705 size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
707 /* This is long word aligned thus could use a optimized version */
708 memset(new_fds, 0, size);
710 if (new_fdt->max_fdset > open_files) {
711 int left = (new_fdt->max_fdset-open_files)/8;
712 int start = open_files / (8 * sizeof(unsigned long));
714 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
715 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
716 }
718 out:
719 return newf;
721 out_release:
722 free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
723 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
724 free_fd_array(new_fdt->fd, new_fdt->max_fds);
725 kmem_cache_free(files_cachep, newf);
726 return NULL;
727 }
729 static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
730 {
731 struct files_struct *oldf, *newf;
732 int error = 0;
734 /*
735 * A background process may not have any files ...
736 */
737 oldf = current->files;
738 if (!oldf)
739 goto out;
741 if (clone_flags & CLONE_FILES) {
742 atomic_inc(&oldf->count);
743 goto out;
744 }
746 /*
747 * Note: we may be using current for both targets (See exec.c)
748 * This works because we cache current->files (old) as oldf. Don't
749 * break this.
750 */
751 tsk->files = NULL;
752 error = -ENOMEM;
753 newf = dup_fd(oldf, &error);
754 if (!newf)
755 goto out;
757 tsk->files = newf;
758 error = 0;
759 out:
760 return error;
761 }
763 /*
764 * Helper to unshare the files of the current task.
765 * We don't want to expose copy_files internals to
766 * the exec layer of the kernel.
767 */
769 int unshare_files(void)
770 {
771 struct files_struct *files = current->files;
772 int rc;
774 if(!files)
775 BUG();
777 /* This can race but the race causes us to copy when we don't
778 need to and drop the copy */
779 if(atomic_read(&files->count) == 1)
780 {
781 atomic_inc(&files->count);
782 return 0;
783 }
784 rc = copy_files(0, current);
785 if(rc)
786 current->files = files;
787 return rc;
788 }
790 EXPORT_SYMBOL(unshare_files);
792 void sighand_free_cb(struct rcu_head *rhp)
793 {
794 struct sighand_struct *sp;
796 sp = container_of(rhp, struct sighand_struct, rcu);
797 kmem_cache_free(sighand_cachep, sp);
798 }
800 static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
801 {
802 struct sighand_struct *sig;
804 if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
805 atomic_inc(&current->sighand->count);
806 return 0;
807 }
808 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
809 rcu_assign_pointer(tsk->sighand, sig);
810 if (!sig)
811 return -ENOMEM;
812 spin_lock_init(&sig->siglock);
813 atomic_set(&sig->count, 1);
814 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
815 return 0;
816 }
818 static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
819 {
820 struct signal_struct *sig;
821 int ret;
823 if (clone_flags & CLONE_THREAD) {
824 atomic_inc(&current->signal->count);
825 atomic_inc(&current->signal->live);
826 return 0;
827 }
828 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
829 tsk->signal = sig;
830 if (!sig)
831 return -ENOMEM;
833 ret = copy_thread_group_keys(tsk);
834 if (ret < 0) {
835 kmem_cache_free(signal_cachep, sig);
836 return ret;
837 }
839 atomic_set(&sig->count, 1);
840 atomic_set(&sig->live, 1);
841 init_waitqueue_head(&sig->wait_chldexit);
842 sig->flags = 0;
843 sig->group_exit_code = 0;
844 sig->group_exit_task = NULL;
845 sig->group_stop_count = 0;
846 sig->curr_target = NULL;
847 init_sigpending(&sig->shared_pending);
848 INIT_LIST_HEAD(&sig->posix_timers);
850 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
851 sig->it_real_incr.tv64 = 0;
852 sig->real_timer.function = it_real_fn;
853 sig->real_timer.data = tsk;
855 sig->it_virt_expires = cputime_zero;
856 sig->it_virt_incr = cputime_zero;
857 sig->it_prof_expires = cputime_zero;
858 sig->it_prof_incr = cputime_zero;
860 sig->leader = 0; /* session leadership doesn't inherit */
861 sig->tty_old_pgrp = 0;
863 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
864 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
865 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
866 sig->sched_time = 0;
867 INIT_LIST_HEAD(&sig->cpu_timers[0]);
868 INIT_LIST_HEAD(&sig->cpu_timers[1]);
869 INIT_LIST_HEAD(&sig->cpu_timers[2]);
871 task_lock(current->group_leader);
872 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
873 task_unlock(current->group_leader);
875 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
876 /*
877 * New sole thread in the process gets an expiry time
878 * of the whole CPU time limit.
879 */
880 tsk->it_prof_expires =
881 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
882 }
884 return 0;
885 }
887 static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
888 {
889 unsigned long new_flags = p->flags;
891 new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
892 new_flags |= PF_FORKNOEXEC;
893 if (!(clone_flags & CLONE_PTRACE))
894 p->ptrace = 0;
895 p->flags = new_flags;
896 }
898 asmlinkage long sys_set_tid_address(int __user *tidptr)
899 {
900 current->clear_child_tid = tidptr;
902 return current->pid;
903 }
905 /*
906 * This creates a new process as a copy of the old one,
907 * but does not actually start it yet.
908 *
909 * It copies the registers, and all the appropriate
910 * parts of the process environment (as per the clone
911 * flags). The actual kick-off is left to the caller.
912 */
913 static task_t *copy_process(unsigned long clone_flags,
914 unsigned long stack_start,
915 struct pt_regs *regs,
916 unsigned long stack_size,
917 int __user *parent_tidptr,
918 int __user *child_tidptr,
919 int pid)
920 {
921 int retval;
922 struct task_struct *p = NULL;
924 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
925 return ERR_PTR(-EINVAL);
927 /*
928 * Thread groups must share signals as well, and detached threads
929 * can only be started up within the thread group.
930 */
931 if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
932 return ERR_PTR(-EINVAL);
934 /*
935 * Shared signal handlers imply shared VM. By way of the above,
936 * thread groups also imply shared VM. Blocking this case allows
937 * for various simplifications in other code.
938 */
939 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
940 return ERR_PTR(-EINVAL);
942 retval = security_task_create(clone_flags);
943 if (retval)
944 goto fork_out;
946 retval = -ENOMEM;
947 p = dup_task_struct(current);
948 if (!p)
949 goto fork_out;
951 retval = -EAGAIN;
952 if (atomic_read(&p->user->processes) >=
953 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
954 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
955 p->user != &root_user)
956 goto bad_fork_free;
957 }
959 atomic_inc(&p->user->__count);
960 atomic_inc(&p->user->processes);
961 get_group_info(p->group_info);
963 /*
964 * If multiple threads are within copy_process(), then this check
965 * triggers too late. This doesn't hurt, the check is only there
966 * to stop root fork bombs.
967 */
968 if (nr_threads >= max_threads)
969 goto bad_fork_cleanup_count;
971 if (!try_module_get(task_thread_info(p)->exec_domain->module))
972 goto bad_fork_cleanup_count;
974 if (p->binfmt && !try_module_get(p->binfmt->module))
975 goto bad_fork_cleanup_put_domain;
977 p->did_exec = 0;
978 copy_flags(clone_flags, p);
979 p->pid = pid;
980 retval = -EFAULT;
981 if (clone_flags & CLONE_PARENT_SETTID)
982 if (put_user(p->pid, parent_tidptr))
983 goto bad_fork_cleanup;
985 p->proc_dentry = NULL;
987 INIT_LIST_HEAD(&p->children);
988 INIT_LIST_HEAD(&p->sibling);
989 p->vfork_done = NULL;
990 spin_lock_init(&p->alloc_lock);
991 spin_lock_init(&p->proc_lock);
993 clear_tsk_thread_flag(p, TIF_SIGPENDING);
994 init_sigpending(&p->pending);
996 p->utime = cputime_zero;
997 p->stime = cputime_zero;
998 p->sched_time = 0;
999 p->rchar = 0; /* I/O counter: bytes read */
1000 p->wchar = 0; /* I/O counter: bytes written */
1001 p->syscr = 0; /* I/O counter: read syscalls */
1002 p->syscw = 0; /* I/O counter: write syscalls */
1003 acct_clear_integrals(p);
1005 p->it_virt_expires = cputime_zero;
1006 p->it_prof_expires = cputime_zero;
1007 p->it_sched_expires = 0;
1008 INIT_LIST_HEAD(&p->cpu_timers[0]);
1009 INIT_LIST_HEAD(&p->cpu_timers[1]);
1010 INIT_LIST_HEAD(&p->cpu_timers[2]);
1012 p->lock_depth = -1; /* -1 = no lock */
1013 do_posix_clock_monotonic_gettime(&p->start_time);
1014 p->security = NULL;
1015 p->io_context = NULL;
1016 p->io_wait = NULL;
1017 p->audit_context = NULL;
1018 cpuset_fork(p);
1019 #ifdef CONFIG_NUMA
1020 p->mempolicy = mpol_copy(p->mempolicy);
1021 if (IS_ERR(p->mempolicy)) {
1022 retval = PTR_ERR(p->mempolicy);
1023 p->mempolicy = NULL;
1024 goto bad_fork_cleanup_cpuset;
1025 }
1026 #endif
1028 #ifdef CONFIG_DEBUG_MUTEXES
1029 p->blocked_on = NULL; /* not blocked yet */
1030 #endif
1032 p->tgid = p->pid;
1033 if (clone_flags & CLONE_THREAD)
1034 p->tgid = current->tgid;
1036 if ((retval = security_task_alloc(p)))
1037 goto bad_fork_cleanup_policy;
1038 if ((retval = audit_alloc(p)))
1039 goto bad_fork_cleanup_security;
1040 /* copy all the process information */
1041 if ((retval = copy_semundo(clone_flags, p)))
1042 goto bad_fork_cleanup_audit;
1043 if ((retval = copy_files(clone_flags, p)))
1044 goto bad_fork_cleanup_semundo;
1045 if ((retval = copy_fs(clone_flags, p)))
1046 goto bad_fork_cleanup_files;
1047 if ((retval = copy_sighand(clone_flags, p)))
1048 goto bad_fork_cleanup_fs;
1049 if ((retval = copy_signal(clone_flags, p)))
1050 goto bad_fork_cleanup_sighand;
1051 if ((retval = copy_mm(clone_flags, p)))
1052 goto bad_fork_cleanup_signal;
1053 if ((retval = copy_keys(clone_flags, p)))
1054 goto bad_fork_cleanup_mm;
1055 if ((retval = copy_namespace(clone_flags, p)))
1056 goto bad_fork_cleanup_keys;
1057 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
1058 if (retval)
1059 goto bad_fork_cleanup_namespace;
1061 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1062 /*
1063 * Clear TID on mm_release()?
1064 */
1065 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1067 /*
1068 * sigaltstack should be cleared when sharing the same VM
1069 */
1070 if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
1071 p->sas_ss_sp = p->sas_ss_size = 0;
1073 /*
1074 * Syscall tracing should be turned off in the child regardless
1075 * of CLONE_PTRACE.
1076 */
1077 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1078 #ifdef TIF_SYSCALL_EMU
1079 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
1080 #endif
1082 /* Our parent execution domain becomes current domain
1083 These must match for thread signalling to apply */
1085 p->parent_exec_id = p->self_exec_id;
1087 /* ok, now we should be set up.. */
1088 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
1089 p->pdeath_signal = 0;
1090 p->exit_state = 0;
1092 /*
1093 * Ok, make it visible to the rest of the system.
1094 * We dont wake it up yet.
1095 */
1096 p->group_leader = p;
1097 INIT_LIST_HEAD(&p->ptrace_children);
1098 INIT_LIST_HEAD(&p->ptrace_list);
1100 /* Perform scheduler related setup. Assign this task to a CPU. */
1101 sched_fork(p, clone_flags);
1103 /* Need tasklist lock for parent etc handling! */
1104 write_lock_irq(&tasklist_lock);
1106 /*
1107 * The task hasn't been attached yet, so its cpus_allowed mask will
1108 * not be changed, nor will its assigned CPU.
1110 * The cpus_allowed mask of the parent may have changed after it was
1111 * copied first time - so re-copy it here, then check the child's CPU
1112 * to ensure it is on a valid CPU (and if not, just force it back to
1113 * parent's CPU). This avoids alot of nasty races.
1114 */
1115 p->cpus_allowed = current->cpus_allowed;
1116 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1117 !cpu_online(task_cpu(p))))
1118 set_task_cpu(p, smp_processor_id());
1120 /*
1121 * Check for pending SIGKILL! The new thread should not be allowed
1122 * to slip out of an OOM kill. (or normal SIGKILL.)
1123 */
1124 if (sigismember(&current->pending.signal, SIGKILL)) {
1125 write_unlock_irq(&tasklist_lock);
1126 retval = -EINTR;
1127 goto bad_fork_cleanup_namespace;
1128 }
1130 /* CLONE_PARENT re-uses the old parent */
1131 if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
1132 p->real_parent = current->real_parent;
1133 else
1134 p->real_parent = current;
1135 p->parent = p->real_parent;
1137 spin_lock(&current->sighand->siglock);
1138 if (clone_flags & CLONE_THREAD) {
1139 /*
1140 * Important: if an exit-all has been started then
1141 * do not create this new thread - the whole thread
1142 * group is supposed to exit anyway.
1143 */
1144 if (current->signal->flags & SIGNAL_GROUP_EXIT) {
1145 spin_unlock(&current->sighand->siglock);
1146 write_unlock_irq(&tasklist_lock);
1147 retval = -EAGAIN;
1148 goto bad_fork_cleanup_namespace;
1149 }
1150 p->group_leader = current->group_leader;
1152 if (current->signal->group_stop_count > 0) {
1153 /*
1154 * There is an all-stop in progress for the group.
1155 * We ourselves will stop as soon as we check signals.
1156 * Make the new thread part of that group stop too.
1157 */
1158 current->signal->group_stop_count++;
1159 set_tsk_thread_flag(p, TIF_SIGPENDING);
1160 }
1162 if (!cputime_eq(current->signal->it_virt_expires,
1163 cputime_zero) ||
1164 !cputime_eq(current->signal->it_prof_expires,
1165 cputime_zero) ||
1166 current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
1167 !list_empty(&current->signal->cpu_timers[0]) ||
1168 !list_empty(&current->signal->cpu_timers[1]) ||
1169 !list_empty(&current->signal->cpu_timers[2])) {
1170 /*
1171 * Have child wake up on its first tick to check
1172 * for process CPU timers.
1173 */
1174 p->it_prof_expires = jiffies_to_cputime(1);
1175 }
1176 }
1178 /*
1179 * inherit ioprio
1180 */
1181 p->ioprio = current->ioprio;
1183 SET_LINKS(p);
1184 if (unlikely(p->ptrace & PT_PTRACED))
1185 __ptrace_link(p, current->parent);
1187 if (thread_group_leader(p)) {
1188 p->signal->tty = current->signal->tty;
1189 p->signal->pgrp = process_group(current);
1190 p->signal->session = current->signal->session;
1191 attach_pid(p, PIDTYPE_PGID, process_group(p));
1192 attach_pid(p, PIDTYPE_SID, p->signal->session);
1193 if (p->pid)
1194 __get_cpu_var(process_counts)++;
1195 }
1196 attach_pid(p, PIDTYPE_TGID, p->tgid);
1197 attach_pid(p, PIDTYPE_PID, p->pid);
1199 nr_threads++;
1200 total_forks++;
1201 spin_unlock(&current->sighand->siglock);
1202 write_unlock_irq(&tasklist_lock);
1203 proc_fork_connector(p);
1204 return p;
1206 bad_fork_cleanup_namespace:
1207 exit_namespace(p);
1208 bad_fork_cleanup_keys:
1209 exit_keys(p);
1210 bad_fork_cleanup_mm:
1211 if (p->mm)
1212 mmput(p->mm);
1213 bad_fork_cleanup_signal:
1214 exit_signal(p);
1215 bad_fork_cleanup_sighand:
1216 exit_sighand(p);
1217 bad_fork_cleanup_fs:
1218 exit_fs(p); /* blocking */
1219 bad_fork_cleanup_files:
1220 exit_files(p); /* blocking */
1221 bad_fork_cleanup_semundo:
1222 exit_sem(p);
1223 bad_fork_cleanup_audit:
1224 audit_free(p);
1225 bad_fork_cleanup_security:
1226 security_task_free(p);
1227 bad_fork_cleanup_policy:
1228 #ifdef CONFIG_NUMA
1229 mpol_free(p->mempolicy);
1230 bad_fork_cleanup_cpuset:
1231 #endif
1232 cpuset_exit(p);
1233 bad_fork_cleanup:
1234 if (p->binfmt)
1235 module_put(p->binfmt->module);
1236 bad_fork_cleanup_put_domain:
1237 module_put(task_thread_info(p)->exec_domain->module);
1238 bad_fork_cleanup_count:
1239 put_group_info(p->group_info);
1240 atomic_dec(&p->user->processes);
1241 free_uid(p->user);
1242 bad_fork_free:
1243 free_task(p);
1244 fork_out:
1245 return ERR_PTR(retval);
1246 }
1248 struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1249 {
1250 memset(regs, 0, sizeof(struct pt_regs));
1251 return regs;
1252 }
1254 task_t * __devinit fork_idle(int cpu)
1255 {
1256 task_t *task;
1257 struct pt_regs regs;
1259 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
1260 if (!task)
1261 return ERR_PTR(-ENOMEM);
1262 init_idle(task, cpu);
1263 unhash_process(task);
1264 return task;
1265 }
1267 static inline int fork_traceflag (unsigned clone_flags)
1268 {
1269 if (clone_flags & CLONE_UNTRACED)
1270 return 0;
1271 else if (clone_flags & CLONE_VFORK) {
1272 if (current->ptrace & PT_TRACE_VFORK)
1273 return PTRACE_EVENT_VFORK;
1274 } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
1275 if (current->ptrace & PT_TRACE_CLONE)
1276 return PTRACE_EVENT_CLONE;
1277 } else if (current->ptrace & PT_TRACE_FORK)
1278 return PTRACE_EVENT_FORK;
1280 return 0;
1281 }
1283 /*
1284 * Ok, this is the main fork-routine.
1286 * It copies the process, and if successful kick-starts
1287 * it and waits for it to finish using the VM if required.
1288 */
1289 long do_fork(unsigned long clone_flags,
1290 unsigned long stack_start,
1291 struct pt_regs *regs,
1292 unsigned long stack_size,
1293 int __user *parent_tidptr,
1294 int __user *child_tidptr)
1295 {
1296 struct task_struct *p;
1297 int trace = 0;
1298 long pid = alloc_pidmap();
1300 if (pid < 0)
1301 return -EAGAIN;
1302 if (unlikely(current->ptrace)) {
1303 trace = fork_traceflag (clone_flags);
1304 if (trace)
1305 clone_flags |= CLONE_PTRACE;
1306 }
1308 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
1309 /*
1310 * Do this prior waking up the new thread - the thread pointer
1311 * might get invalid after that point, if the thread exits quickly.
1312 */
1313 if (!IS_ERR(p)) {
1314 struct completion vfork;
1316 if (clone_flags & CLONE_VFORK) {
1317 p->vfork_done = &vfork;
1318 init_completion(&vfork);
1319 }
1321 if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
1322 /*
1323 * We'll start up with an immediate SIGSTOP.
1324 */
1325 sigaddset(&p->pending.signal, SIGSTOP);
1326 set_tsk_thread_flag(p, TIF_SIGPENDING);
1327 }
1329 if (!(clone_flags & CLONE_STOPPED))
1330 wake_up_new_task(p, clone_flags);
1331 else
1332 p->state = TASK_STOPPED;
1334 if (unlikely (trace)) {
1335 current->ptrace_message = pid;
1336 ptrace_notify ((trace << 8) | SIGTRAP);
1337 }
1339 if (clone_flags & CLONE_VFORK) {
1340 wait_for_completion(&vfork);
1341 if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
1342 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1343 }
1344 } else {
1345 free_pidmap(pid);
1346 pid = PTR_ERR(p);
1347 }
1348 return pid;
1349 }
1351 #ifndef ARCH_MIN_MMSTRUCT_ALIGN
1352 #define ARCH_MIN_MMSTRUCT_ALIGN 0
1353 #endif
1355 void __init proc_caches_init(void)
1356 {
1357 sighand_cachep = kmem_cache_create("sighand_cache",
1358 sizeof(struct sighand_struct), 0,
1359 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1360 signal_cachep = kmem_cache_create("signal_cache",
1361 sizeof(struct signal_struct), 0,
1362 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1363 files_cachep = kmem_cache_create("files_cache",
1364 sizeof(struct files_struct), 0,
1365 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1366 fs_cachep = kmem_cache_create("fs_cache",
1367 sizeof(struct fs_struct), 0,
1368 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1369 vm_area_cachep = kmem_cache_create("vm_area_struct",
1370 sizeof(struct vm_area_struct), 0,
1371 SLAB_PANIC, NULL, NULL);
1372 mm_cachep = kmem_cache_create("mm_struct",
1373 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1374 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1375 }
1378 /*
1379 * Check constraints on flags passed to the unshare system call and
1380 * force unsharing of additional process context as appropriate.
1381 */
1382 static inline void check_unshare_flags(unsigned long *flags_ptr)
1383 {
1384 /*
1385 * If unsharing a thread from a thread group, must also
1386 * unshare vm.
1387 */
1388 if (*flags_ptr & CLONE_THREAD)
1389 *flags_ptr |= CLONE_VM;
1391 /*
1392 * If unsharing vm, must also unshare signal handlers.
1393 */
1394 if (*flags_ptr & CLONE_VM)
1395 *flags_ptr |= CLONE_SIGHAND;
1397 /*
1398 * If unsharing signal handlers and the task was created
1399 * using CLONE_THREAD, then must unshare the thread
1400 */
1401 if ((*flags_ptr & CLONE_SIGHAND) &&
1402 (atomic_read(&current->signal->count) > 1))
1403 *flags_ptr |= CLONE_THREAD;
1405 /*
1406 * If unsharing namespace, must also unshare filesystem information.
1407 */
1408 if (*flags_ptr & CLONE_NEWNS)
1409 *flags_ptr |= CLONE_FS;
1410 }
1412 /*
1413 * Unsharing of tasks created with CLONE_THREAD is not supported yet
1414 */
1415 static int unshare_thread(unsigned long unshare_flags)
1416 {
1417 if (unshare_flags & CLONE_THREAD)
1418 return -EINVAL;
1420 return 0;
1421 }
1423 /*
1424 * Unshare the filesystem structure if it is being shared
1425 */
1426 static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1427 {
1428 struct fs_struct *fs = current->fs;
1430 if ((unshare_flags & CLONE_FS) &&
1431 (fs && atomic_read(&fs->count) > 1)) {
1432 *new_fsp = __copy_fs_struct(current->fs);
1433 if (!*new_fsp)
1434 return -ENOMEM;
1435 }
1437 return 0;
1438 }
1440 /*
1441 * Unshare the namespace structure if it is being shared
1442 */
1443 static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
1444 {
1445 struct namespace *ns = current->namespace;
1447 if ((unshare_flags & CLONE_NEWNS) &&
1448 (ns && atomic_read(&ns->count) > 1)) {
1449 if (!capable(CAP_SYS_ADMIN))
1450 return -EPERM;
1452 *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
1453 if (!*new_nsp)
1454 return -ENOMEM;
1455 }
1457 return 0;
1458 }
1460 /*
1461 * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
1462 * supported yet
1463 */
1464 static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1465 {
1466 struct sighand_struct *sigh = current->sighand;
1468 if ((unshare_flags & CLONE_SIGHAND) &&
1469 (sigh && atomic_read(&sigh->count) > 1))
1470 return -EINVAL;
1471 else
1472 return 0;
1473 }
1475 /*
1476 * Unshare vm if it is being shared
1477 */
1478 static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
1479 {
1480 struct mm_struct *mm = current->mm;
1482 if ((unshare_flags & CLONE_VM) &&
1483 (mm && atomic_read(&mm->mm_users) > 1)) {
1484 return -EINVAL;
1485 }
1487 return 0;
1488 }
1490 /*
1491 * Unshare file descriptor table if it is being shared
1492 */
1493 static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
1494 {
1495 struct files_struct *fd = current->files;
1496 int error = 0;
1498 if ((unshare_flags & CLONE_FILES) &&
1499 (fd && atomic_read(&fd->count) > 1)) {
1500 *new_fdp = dup_fd(fd, &error);
1501 if (!*new_fdp)
1502 return error;
1503 }
1505 return 0;
1506 }
1508 /*
1509 * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
1510 * supported yet
1511 */
1512 static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
1513 {
1514 if (unshare_flags & CLONE_SYSVSEM)
1515 return -EINVAL;
1517 return 0;
1518 }
1520 /*
1521 * unshare allows a process to 'unshare' part of the process
1522 * context which was originally shared using clone. copy_*
1523 * functions used by do_fork() cannot be used here directly
1524 * because they modify an inactive task_struct that is being
1525 * constructed. Here we are modifying the current, active,
1526 * task_struct.
1527 */
1528 asmlinkage long sys_unshare(unsigned long unshare_flags)
1529 {
1530 int err = 0;
1531 struct fs_struct *fs, *new_fs = NULL;
1532 struct namespace *ns, *new_ns = NULL;
1533 struct sighand_struct *sigh, *new_sigh = NULL;
1534 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1535 struct files_struct *fd, *new_fd = NULL;
1536 struct sem_undo_list *new_ulist = NULL;
1538 check_unshare_flags(&unshare_flags);
1540 if ((err = unshare_thread(unshare_flags)))
1541 goto bad_unshare_out;
1542 if ((err = unshare_fs(unshare_flags, &new_fs)))
1543 goto bad_unshare_cleanup_thread;
1544 if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
1545 goto bad_unshare_cleanup_fs;
1546 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1547 goto bad_unshare_cleanup_ns;
1548 if ((err = unshare_vm(unshare_flags, &new_mm)))
1549 goto bad_unshare_cleanup_sigh;
1550 if ((err = unshare_fd(unshare_flags, &new_fd)))
1551 goto bad_unshare_cleanup_vm;
1552 if ((err = unshare_semundo(unshare_flags, &new_ulist)))
1553 goto bad_unshare_cleanup_fd;
1555 if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
1557 task_lock(current);
1559 if (new_fs) {
1560 fs = current->fs;
1561 current->fs = new_fs;
1562 new_fs = fs;
1563 }
1565 if (new_ns) {
1566 ns = current->namespace;
1567 current->namespace = new_ns;
1568 new_ns = ns;
1569 }
1571 if (new_sigh) {
1572 sigh = current->sighand;
1573 rcu_assign_pointer(current->sighand, new_sigh);
1574 new_sigh = sigh;
1575 }
1577 if (new_mm) {
1578 mm = current->mm;
1579 active_mm = current->active_mm;
1580 current->mm = new_mm;
1581 current->active_mm = new_mm;
1582 activate_mm(active_mm, new_mm);
1583 new_mm = mm;
1584 }
1586 if (new_fd) {
1587 fd = current->files;
1588 current->files = new_fd;
1589 new_fd = fd;
1590 }
1592 task_unlock(current);
1593 }
1595 bad_unshare_cleanup_fd:
1596 if (new_fd)
1597 put_files_struct(new_fd);
1599 bad_unshare_cleanup_vm:
1600 if (new_mm)
1601 mmput(new_mm);
1603 bad_unshare_cleanup_sigh:
1604 if (new_sigh)
1605 if (atomic_dec_and_test(&new_sigh->count))
1606 kmem_cache_free(sighand_cachep, new_sigh);
1608 bad_unshare_cleanup_ns:
1609 if (new_ns)
1610 put_namespace(new_ns);
1612 bad_unshare_cleanup_fs:
1613 if (new_fs)
1614 put_fs_struct(new_fs);
1616 bad_unshare_cleanup_thread:
1617 bad_unshare_out:
1618 return err;
1619 }