ia64/xen-unstable

changeset 10398:2fea03842f40

[LINUX] Eliminates a deadlock and reduces (sometimes significantly) the time
interrupts are disabled during context switch.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author kaf24@firebug.cl.cam.ac.uk
date Tue Jun 13 15:56:28 2006 +0100 (2006-06-13)
parents 5d4b9dc88218
children a734745bf058
files linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c linux-2.6-xen-sparse/arch/x86_64/mm/pageattr-xen.c linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu.h linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu.h linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu_context.h linux-2.6-xen-sparse/kernel/fork.c
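
In outline (an explanatory sketch added here, not part of the changeset itself): before this change, switch_mm() in the i386 and x86_64 mach-xen headers called mm_pin() whenever the next mm was not yet pinned. switch_mm() runs with interrupts disabled during context switch, while mm_pin() does locking and pinning-hypercall work, which is where the deadlock and the long interrupt-off windows came from. The patch instead pins the mm before it can ever be switched to: at fork time through a new arch_dup_mmap() hook (called from the copied kernel/fork.c:dup_mmap()), and in activate_mm() for the exec and unshare paths; switch_mm() now only asserts the invariant with BUG_ON(). A minimal userspace model of that pattern follows, with deliberately simplified, hypothetical types:

/*
 * Minimal userspace model of the pattern this changeset adopts
 * (simplified types, not kernel code): pin an mm before it can ever
 * be switched to, so the interrupts-off context-switch path only
 * asserts the invariant instead of doing the expensive pin there.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct mm { bool pinned; };

static void mm_pin(struct mm *mm)
{
	/* stands in for the real mm_pin(): locking plus pin hypercalls */
	mm->pinned = true;
}

/* fork path: kernel/fork.c:dup_mmap() calls this via the arch_dup_mmap hook */
static void arch_dup_mmap(struct mm *mm)
{
	if (!mm->pinned)
		mm_pin(mm);
}

/* exec/unshare path: activate_mm() pins before the first switch */
static void activate_mm(struct mm *next)
{
	if (!next->pinned)
		mm_pin(next);
}

/* context-switch path: runs with interrupts off, so it must not pin */
static void switch_mm(struct mm *next)
{
	assert(next->pinned);	/* mirrors the new BUG_ON() in the patch */
	/* ... load page tables, update cpu_vm_mask, etc. ... */
}

int main(void)
{
	struct mm child = { .pinned = false };

	arch_dup_mmap(&child);	/* fork() */
	activate_mm(&child);	/* exec() */
	switch_mm(&child);	/* later context switches assert only */
	printf("child mm pinned before any switch_mm()\n");
	return 0;
}

In the actual diff below, the fork-time call stays behind #ifdef arch_dup_mmap in dup_mmap(), so architectures that do not define the hook are unaffected.
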
line diff
     1.1 --- a/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c	Tue Jun 13 15:41:27 2006 +0100
     1.2 +++ b/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c	Tue Jun 13 15:56:28 2006 +0100
     1.3 @@ -614,6 +614,12 @@ void mm_pin_all(void)
     1.4  	}
     1.5  }
     1.6  
     1.7 +void _arch_dup_mmap(struct mm_struct *mm)
     1.8 +{
     1.9 +	if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
    1.10 +		mm_pin(mm);
    1.11 +}
    1.12 +
    1.13  void _arch_exit_mmap(struct mm_struct *mm)
    1.14  {
    1.15  	struct task_struct *tsk = current;
     2.1 --- a/linux-2.6-xen-sparse/arch/x86_64/mm/pageattr-xen.c	Tue Jun 13 15:41:27 2006 +0100
     2.2 +++ b/linux-2.6-xen-sparse/arch/x86_64/mm/pageattr-xen.c	Tue Jun 13 15:56:28 2006 +0100
     2.3 @@ -130,6 +130,12 @@ void mm_pin_all(void)
     2.4  				  context.unpinned));
     2.5  }
     2.6  
     2.7 +void _arch_dup_mmap(struct mm_struct *mm)
     2.8 +{
     2.9 +    if (!mm->context.pinned)
    2.10 +        mm_pin(mm);
    2.11 +}
    2.12 +
    2.13  void _arch_exit_mmap(struct mm_struct *mm)
    2.14  {
    2.15      struct task_struct *tsk = current;
     3.1 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu.h	Tue Jun 13 15:41:27 2006 +0100
     3.2 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu.h	Tue Jun 13 15:56:28 2006 +0100
     3.3 @@ -18,4 +18,8 @@ typedef struct {
     3.4  extern void _arch_exit_mmap(struct mm_struct *mm);
     3.5  #define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
     3.6  
     3.7 +/* kernel/fork.c:dup_mmap hook */
     3.8 +extern void _arch_dup_mmap(struct mm_struct *mm);
     3.9 +#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
    3.10 +
    3.11  #endif
     4.1 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h	Tue Jun 13 15:41:27 2006 +0100
     4.2 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h	Tue Jun 13 15:56:28 2006 +0100
     4.3 @@ -51,8 +51,7 @@ static inline void switch_mm(struct mm_s
     4.4  	struct mmuext_op _op[2], *op = _op;
     4.5  
     4.6  	if (likely(prev != next)) {
     4.7 -		if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
     4.8 -			mm_pin(next);
     4.9 +		BUG_ON(!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags));
    4.10  
    4.11  		/* stop flush ipis for the previous mm */
    4.12  		cpu_clear(cpu, prev->cpu_vm_mask);
    4.13 @@ -99,7 +98,11 @@ static inline void switch_mm(struct mm_s
    4.14  #define deactivate_mm(tsk, mm) \
    4.15  	asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
    4.16  
    4.17 -#define activate_mm(prev, next) \
    4.18 -	switch_mm((prev),(next),NULL)
    4.19 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
    4.20 +{
    4.21 +	if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
    4.22 +		mm_pin(next);
    4.23 +	switch_mm(prev, next, NULL);
    4.24 +}
    4.25  
    4.26  #endif
     5.1 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu.h	Tue Jun 13 15:41:27 2006 +0100
     5.2 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu.h	Tue Jun 13 15:56:28 2006 +0100
     5.3 @@ -28,6 +28,10 @@ extern spinlock_t mm_unpinned_lock;
     5.4  /* mm/memory.c:exit_mmap hook */
     5.5  extern void _arch_exit_mmap(struct mm_struct *mm);
     5.6  #define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
     5.7 +
     5.8 +/* kernel/fork.c:dup_mmap hook */
     5.9 +extern void _arch_dup_mmap(struct mm_struct *mm);
    5.10 +#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
    5.11  #endif
    5.12  
    5.13  #endif
     6.1 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu_context.h	Tue Jun 13 15:41:27 2006 +0100
     6.2 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu_context.h	Tue Jun 13 15:56:28 2006 +0100
     6.3 @@ -73,8 +73,7 @@ static inline void switch_mm(struct mm_s
     6.4  	struct mmuext_op _op[3], *op = _op;
     6.5  
     6.6  	if (likely(prev != next)) {
     6.7 -		if (!next->context.pinned)
     6.8 -			mm_pin(next);
     6.9 +		BUG_ON(!next->context.pinned);
    6.10  
    6.11  		/* stop flush ipis for the previous mm */
    6.12  		clear_bit(cpu, &prev->cpu_vm_mask);
    6.13 @@ -127,8 +126,11 @@ static inline void switch_mm(struct mm_s
    6.14  	asm volatile("movl %0,%%fs"::"r"(0));  \
    6.15  } while(0)
    6.16  
    6.17 -#define activate_mm(prev, next) do {		\
    6.18 -	switch_mm((prev),(next),NULL);		\
    6.19 -} while (0)
    6.20 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
    6.21 +{
    6.22 +	if (!next->context.pinned)
    6.23 +		mm_pin(next);
    6.24 +	switch_mm(prev, next, NULL);
    6.25 +}
    6.26  
    6.27  #endif
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/linux-2.6-xen-sparse/kernel/fork.c	Tue Jun 13 15:56:28 2006 +0100
     7.3 @@ -0,0 +1,1619 @@
     7.4 +/*
     7.5 + *  linux/kernel/fork.c
     7.6 + *
     7.7 + *  Copyright (C) 1991, 1992  Linus Torvalds
     7.8 + */
     7.9 +
    7.10 +/*
    7.11 + *  'fork.c' contains the help-routines for the 'fork' system call
    7.12 + * (see also entry.S and others).
    7.13 + * Fork is rather simple, once you get the hang of it, but the memory
    7.14 + * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
    7.15 + */
    7.16 +
    7.17 +#include <linux/config.h>
    7.18 +#include <linux/slab.h>
    7.19 +#include <linux/init.h>
    7.20 +#include <linux/unistd.h>
    7.21 +#include <linux/smp_lock.h>
    7.22 +#include <linux/module.h>
    7.23 +#include <linux/vmalloc.h>
    7.24 +#include <linux/completion.h>
    7.25 +#include <linux/namespace.h>
    7.26 +#include <linux/personality.h>
    7.27 +#include <linux/mempolicy.h>
    7.28 +#include <linux/sem.h>
    7.29 +#include <linux/file.h>
    7.30 +#include <linux/key.h>
    7.31 +#include <linux/binfmts.h>
    7.32 +#include <linux/mman.h>
    7.33 +#include <linux/fs.h>
    7.34 +#include <linux/capability.h>
    7.35 +#include <linux/cpu.h>
    7.36 +#include <linux/cpuset.h>
    7.37 +#include <linux/security.h>
    7.38 +#include <linux/swap.h>
    7.39 +#include <linux/syscalls.h>
    7.40 +#include <linux/jiffies.h>
    7.41 +#include <linux/futex.h>
    7.42 +#include <linux/rcupdate.h>
    7.43 +#include <linux/ptrace.h>
    7.44 +#include <linux/mount.h>
    7.45 +#include <linux/audit.h>
    7.46 +#include <linux/profile.h>
    7.47 +#include <linux/rmap.h>
    7.48 +#include <linux/acct.h>
    7.49 +#include <linux/cn_proc.h>
    7.50 +
    7.51 +#include <asm/pgtable.h>
    7.52 +#include <asm/pgalloc.h>
    7.53 +#include <asm/uaccess.h>
    7.54 +#include <asm/mmu_context.h>
    7.55 +#include <asm/cacheflush.h>
    7.56 +#include <asm/tlbflush.h>
    7.57 +
    7.58 +/*
    7.59 + * Protected counters by write_lock_irq(&tasklist_lock)
    7.60 + */
    7.61 +unsigned long total_forks;	/* Handle normal Linux uptimes. */
    7.62 +int nr_threads; 		/* The idle threads do not count.. */
    7.63 +
    7.64 +int max_threads;		/* tunable limit on nr_threads */
    7.65 +
    7.66 +DEFINE_PER_CPU(unsigned long, process_counts) = 0;
    7.67 +
    7.68 + __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
    7.69 +
    7.70 +EXPORT_SYMBOL(tasklist_lock);
    7.71 +
    7.72 +int nr_processes(void)
    7.73 +{
    7.74 +	int cpu;
    7.75 +	int total = 0;
    7.76 +
    7.77 +	for_each_online_cpu(cpu)
    7.78 +		total += per_cpu(process_counts, cpu);
    7.79 +
    7.80 +	return total;
    7.81 +}
    7.82 +
    7.83 +#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
    7.84 +# define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
    7.85 +# define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
    7.86 +static kmem_cache_t *task_struct_cachep;
    7.87 +#endif
    7.88 +
    7.89 +/* SLAB cache for signal_struct structures (tsk->signal) */
    7.90 +kmem_cache_t *signal_cachep;
    7.91 +
    7.92 +/* SLAB cache for sighand_struct structures (tsk->sighand) */
    7.93 +kmem_cache_t *sighand_cachep;
    7.94 +
    7.95 +/* SLAB cache for files_struct structures (tsk->files) */
    7.96 +kmem_cache_t *files_cachep;
    7.97 +
    7.98 +/* SLAB cache for fs_struct structures (tsk->fs) */
    7.99 +kmem_cache_t *fs_cachep;
   7.100 +
   7.101 +/* SLAB cache for vm_area_struct structures */
   7.102 +kmem_cache_t *vm_area_cachep;
   7.103 +
   7.104 +/* SLAB cache for mm_struct structures (tsk->mm) */
   7.105 +static kmem_cache_t *mm_cachep;
   7.106 +
   7.107 +void free_task(struct task_struct *tsk)
   7.108 +{
   7.109 +	free_thread_info(tsk->thread_info);
   7.110 +	free_task_struct(tsk);
   7.111 +}
   7.112 +EXPORT_SYMBOL(free_task);
   7.113 +
   7.114 +void __put_task_struct_cb(struct rcu_head *rhp)
   7.115 +{
   7.116 +	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
   7.117 +
   7.118 +	WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
   7.119 +	WARN_ON(atomic_read(&tsk->usage));
   7.120 +	WARN_ON(tsk == current);
   7.121 +
   7.122 +	if (unlikely(tsk->audit_context))
   7.123 +		audit_free(tsk);
   7.124 +	security_task_free(tsk);
   7.125 +	free_uid(tsk->user);
   7.126 +	put_group_info(tsk->group_info);
   7.127 +
   7.128 +	if (!profile_handoff_task(tsk))
   7.129 +		free_task(tsk);
   7.130 +}
   7.131 +
   7.132 +void __init fork_init(unsigned long mempages)
   7.133 +{
   7.134 +#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
   7.135 +#ifndef ARCH_MIN_TASKALIGN
   7.136 +#define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
   7.137 +#endif
   7.138 +	/* create a slab on which task_structs can be allocated */
   7.139 +	task_struct_cachep =
   7.140 +		kmem_cache_create("task_struct", sizeof(struct task_struct),
   7.141 +			ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
   7.142 +#endif
   7.143 +
   7.144 +	/*
   7.145 +	 * The default maximum number of threads is set to a safe
   7.146 +	 * value: the thread structures can take up at most half
   7.147 +	 * of memory.
   7.148 +	 */
   7.149 +	max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
   7.150 +
   7.151 +	/*
   7.152 +	 * we need to allow at least 20 threads to boot a system
   7.153 +	 */
   7.154 +	if(max_threads < 20)
   7.155 +		max_threads = 20;
   7.156 +
   7.157 +	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
   7.158 +	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
   7.159 +	init_task.signal->rlim[RLIMIT_SIGPENDING] =
   7.160 +		init_task.signal->rlim[RLIMIT_NPROC];
   7.161 +}
   7.162 +
   7.163 +static struct task_struct *dup_task_struct(struct task_struct *orig)
   7.164 +{
   7.165 +	struct task_struct *tsk;
   7.166 +	struct thread_info *ti;
   7.167 +
   7.168 +	prepare_to_copy(orig);
   7.169 +
   7.170 +	tsk = alloc_task_struct();
   7.171 +	if (!tsk)
   7.172 +		return NULL;
   7.173 +
   7.174 +	ti = alloc_thread_info(tsk);
   7.175 +	if (!ti) {
   7.176 +		free_task_struct(tsk);
   7.177 +		return NULL;
   7.178 +	}
   7.179 +
   7.180 +	*tsk = *orig;
   7.181 +	tsk->thread_info = ti;
   7.182 +	setup_thread_stack(tsk, orig);
   7.183 +
   7.184 +	/* One for us, one for whoever does the "release_task()" (usually parent) */
   7.185 +	atomic_set(&tsk->usage,2);
   7.186 +	atomic_set(&tsk->fs_excl, 0);
   7.187 +	return tsk;
   7.188 +}
   7.189 +
   7.190 +#ifdef CONFIG_MMU
   7.191 +static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
   7.192 +{
   7.193 +	struct vm_area_struct *mpnt, *tmp, **pprev;
   7.194 +	struct rb_node **rb_link, *rb_parent;
   7.195 +	int retval;
   7.196 +	unsigned long charge;
   7.197 +	struct mempolicy *pol;
   7.198 +
   7.199 +	down_write(&oldmm->mmap_sem);
   7.200 +	flush_cache_mm(oldmm);
   7.201 +	down_write(&mm->mmap_sem);
   7.202 +
   7.203 +	mm->locked_vm = 0;
   7.204 +	mm->mmap = NULL;
   7.205 +	mm->mmap_cache = NULL;
   7.206 +	mm->free_area_cache = oldmm->mmap_base;
   7.207 +	mm->cached_hole_size = ~0UL;
   7.208 +	mm->map_count = 0;
   7.209 +	cpus_clear(mm->cpu_vm_mask);
   7.210 +	mm->mm_rb = RB_ROOT;
   7.211 +	rb_link = &mm->mm_rb.rb_node;
   7.212 +	rb_parent = NULL;
   7.213 +	pprev = &mm->mmap;
   7.214 +
   7.215 +	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
   7.216 +		struct file *file;
   7.217 +
   7.218 +		if (mpnt->vm_flags & VM_DONTCOPY) {
   7.219 +			long pages = vma_pages(mpnt);
   7.220 +			mm->total_vm -= pages;
   7.221 +			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
   7.222 +								-pages);
   7.223 +			continue;
   7.224 +		}
   7.225 +		charge = 0;
   7.226 +		if (mpnt->vm_flags & VM_ACCOUNT) {
   7.227 +			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
   7.228 +			if (security_vm_enough_memory(len))
   7.229 +				goto fail_nomem;
   7.230 +			charge = len;
   7.231 +		}
   7.232 +		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
   7.233 +		if (!tmp)
   7.234 +			goto fail_nomem;
   7.235 +		*tmp = *mpnt;
   7.236 +		pol = mpol_copy(vma_policy(mpnt));
   7.237 +		retval = PTR_ERR(pol);
   7.238 +		if (IS_ERR(pol))
   7.239 +			goto fail_nomem_policy;
   7.240 +		vma_set_policy(tmp, pol);
   7.241 +		tmp->vm_flags &= ~VM_LOCKED;
   7.242 +		tmp->vm_mm = mm;
   7.243 +		tmp->vm_next = NULL;
   7.244 +		anon_vma_link(tmp);
   7.245 +		file = tmp->vm_file;
   7.246 +		if (file) {
   7.247 +			struct inode *inode = file->f_dentry->d_inode;
   7.248 +			get_file(file);
   7.249 +			if (tmp->vm_flags & VM_DENYWRITE)
   7.250 +				atomic_dec(&inode->i_writecount);
   7.251 +      
   7.252 +			/* insert tmp into the share list, just after mpnt */
   7.253 +			spin_lock(&file->f_mapping->i_mmap_lock);
   7.254 +			tmp->vm_truncate_count = mpnt->vm_truncate_count;
   7.255 +			flush_dcache_mmap_lock(file->f_mapping);
   7.256 +			vma_prio_tree_add(tmp, mpnt);
   7.257 +			flush_dcache_mmap_unlock(file->f_mapping);
   7.258 +			spin_unlock(&file->f_mapping->i_mmap_lock);
   7.259 +		}
   7.260 +
   7.261 +		/*
   7.262 +		 * Link in the new vma and copy the page table entries.
   7.263 +		 */
   7.264 +		*pprev = tmp;
   7.265 +		pprev = &tmp->vm_next;
   7.266 +
   7.267 +		__vma_link_rb(mm, tmp, rb_link, rb_parent);
   7.268 +		rb_link = &tmp->vm_rb.rb_right;
   7.269 +		rb_parent = &tmp->vm_rb;
   7.270 +
   7.271 +		mm->map_count++;
   7.272 +		retval = copy_page_range(mm, oldmm, mpnt);
   7.273 +
   7.274 +		if (tmp->vm_ops && tmp->vm_ops->open)
   7.275 +			tmp->vm_ops->open(tmp);
   7.276 +
   7.277 +		if (retval)
   7.278 +			goto out;
   7.279 +	}
   7.280 +#ifdef arch_dup_mmap
   7.281 +	arch_dup_mmap(mm, oldmm);
   7.282 +#endif
   7.283 +	retval = 0;
   7.284 +out:
   7.285 +	up_write(&mm->mmap_sem);
   7.286 +	flush_tlb_mm(oldmm);
   7.287 +	up_write(&oldmm->mmap_sem);
   7.288 +	return retval;
   7.289 +fail_nomem_policy:
   7.290 +	kmem_cache_free(vm_area_cachep, tmp);
   7.291 +fail_nomem:
   7.292 +	retval = -ENOMEM;
   7.293 +	vm_unacct_memory(charge);
   7.294 +	goto out;
   7.295 +}
   7.296 +
   7.297 +static inline int mm_alloc_pgd(struct mm_struct * mm)
   7.298 +{
   7.299 +	mm->pgd = pgd_alloc(mm);
   7.300 +	if (unlikely(!mm->pgd))
   7.301 +		return -ENOMEM;
   7.302 +	return 0;
   7.303 +}
   7.304 +
   7.305 +static inline void mm_free_pgd(struct mm_struct * mm)
   7.306 +{
   7.307 +	pgd_free(mm->pgd);
   7.308 +}
   7.309 +#else
   7.310 +#define dup_mmap(mm, oldmm)	(0)
   7.311 +#define mm_alloc_pgd(mm)	(0)
   7.312 +#define mm_free_pgd(mm)
   7.313 +#endif /* CONFIG_MMU */
   7.314 +
   7.315 + __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
   7.316 +
   7.317 +#define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
   7.318 +#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
   7.319 +
   7.320 +#include <linux/init_task.h>
   7.321 +
   7.322 +static struct mm_struct * mm_init(struct mm_struct * mm)
   7.323 +{
   7.324 +	atomic_set(&mm->mm_users, 1);
   7.325 +	atomic_set(&mm->mm_count, 1);
   7.326 +	init_rwsem(&mm->mmap_sem);
   7.327 +	INIT_LIST_HEAD(&mm->mmlist);
   7.328 +	mm->core_waiters = 0;
   7.329 +	mm->nr_ptes = 0;
   7.330 +	set_mm_counter(mm, file_rss, 0);
   7.331 +	set_mm_counter(mm, anon_rss, 0);
   7.332 +	spin_lock_init(&mm->page_table_lock);
   7.333 +	rwlock_init(&mm->ioctx_list_lock);
   7.334 +	mm->ioctx_list = NULL;
   7.335 +	mm->free_area_cache = TASK_UNMAPPED_BASE;
   7.336 +	mm->cached_hole_size = ~0UL;
   7.337 +
   7.338 +	if (likely(!mm_alloc_pgd(mm))) {
   7.339 +		mm->def_flags = 0;
   7.340 +		return mm;
   7.341 +	}
   7.342 +	free_mm(mm);
   7.343 +	return NULL;
   7.344 +}
   7.345 +
   7.346 +/*
   7.347 + * Allocate and initialize an mm_struct.
   7.348 + */
   7.349 +struct mm_struct * mm_alloc(void)
   7.350 +{
   7.351 +	struct mm_struct * mm;
   7.352 +
   7.353 +	mm = allocate_mm();
   7.354 +	if (mm) {
   7.355 +		memset(mm, 0, sizeof(*mm));
   7.356 +		mm = mm_init(mm);
   7.357 +	}
   7.358 +	return mm;
   7.359 +}
   7.360 +
   7.361 +/*
   7.362 + * Called when the last reference to the mm
   7.363 + * is dropped: either by a lazy thread or by
   7.364 + * mmput. Free the page directory and the mm.
   7.365 + */
   7.366 +void fastcall __mmdrop(struct mm_struct *mm)
   7.367 +{
   7.368 +	BUG_ON(mm == &init_mm);
   7.369 +	mm_free_pgd(mm);
   7.370 +	destroy_context(mm);
   7.371 +	free_mm(mm);
   7.372 +}
   7.373 +
   7.374 +/*
   7.375 + * Decrement the use count and release all resources for an mm.
   7.376 + */
   7.377 +void mmput(struct mm_struct *mm)
   7.378 +{
   7.379 +	if (atomic_dec_and_test(&mm->mm_users)) {
   7.380 +		exit_aio(mm);
   7.381 +		exit_mmap(mm);
   7.382 +		if (!list_empty(&mm->mmlist)) {
   7.383 +			spin_lock(&mmlist_lock);
   7.384 +			list_del(&mm->mmlist);
   7.385 +			spin_unlock(&mmlist_lock);
   7.386 +		}
   7.387 +		put_swap_token(mm);
   7.388 +		mmdrop(mm);
   7.389 +	}
   7.390 +}
   7.391 +EXPORT_SYMBOL_GPL(mmput);
   7.392 +
   7.393 +/**
   7.394 + * get_task_mm - acquire a reference to the task's mm
   7.395 + *
   7.396 + * Returns %NULL if the task has no mm.  Checks PF_BORROWED_MM (meaning
   7.397 + * this kernel workthread has transiently adopted a user mm with use_mm,
   7.398 + * to do its AIO) is not set and if so returns a reference to it, after
   7.399 + * bumping up the use count.  User must release the mm via mmput()
   7.400 + * after use.  Typically used by /proc and ptrace.
   7.401 + */
   7.402 +struct mm_struct *get_task_mm(struct task_struct *task)
   7.403 +{
   7.404 +	struct mm_struct *mm;
   7.405 +
   7.406 +	task_lock(task);
   7.407 +	mm = task->mm;
   7.408 +	if (mm) {
   7.409 +		if (task->flags & PF_BORROWED_MM)
   7.410 +			mm = NULL;
   7.411 +		else
   7.412 +			atomic_inc(&mm->mm_users);
   7.413 +	}
   7.414 +	task_unlock(task);
   7.415 +	return mm;
   7.416 +}
   7.417 +EXPORT_SYMBOL_GPL(get_task_mm);
   7.418 +
   7.419 +/* Please note the differences between mmput and mm_release.
   7.420 + * mmput is called whenever we stop holding onto a mm_struct,
   7.421 + * error success whatever.
   7.422 + *
   7.423 + * mm_release is called after a mm_struct has been removed
   7.424 + * from the current process.
   7.425 + *
   7.426 + * This difference is important for error handling, when we
   7.427 + * only half set up a mm_struct for a new process and need to restore
   7.428 + * the old one.  Because we mmput the new mm_struct before
   7.429 + * restoring the old one. . .
   7.430 + * Eric Biederman 10 January 1998
   7.431 + */
   7.432 +void mm_release(struct task_struct *tsk, struct mm_struct *mm)
   7.433 +{
   7.434 +	struct completion *vfork_done = tsk->vfork_done;
   7.435 +
   7.436 +	/* Get rid of any cached register state */
   7.437 +	deactivate_mm(tsk, mm);
   7.438 +
   7.439 +	/* notify parent sleeping on vfork() */
   7.440 +	if (vfork_done) {
   7.441 +		tsk->vfork_done = NULL;
   7.442 +		complete(vfork_done);
   7.443 +	}
   7.444 +	if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
   7.445 +		u32 __user * tidptr = tsk->clear_child_tid;
   7.446 +		tsk->clear_child_tid = NULL;
   7.447 +
   7.448 +		/*
   7.449 +		 * We don't check the error code - if userspace has
   7.450 +		 * not set up a proper pointer then tough luck.
   7.451 +		 */
   7.452 +		put_user(0, tidptr);
   7.453 +		sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
   7.454 +	}
   7.455 +}
   7.456 +
   7.457 +/*
   7.458 + * Allocate a new mm structure and copy contents from the
   7.459 + * mm structure of the passed in task structure.
   7.460 + */
   7.461 +static struct mm_struct *dup_mm(struct task_struct *tsk)
   7.462 +{
   7.463 +	struct mm_struct *mm, *oldmm = current->mm;
   7.464 +	int err;
   7.465 +
   7.466 +	if (!oldmm)
   7.467 +		return NULL;
   7.468 +
   7.469 +	mm = allocate_mm();
   7.470 +	if (!mm)
   7.471 +		goto fail_nomem;
   7.472 +
   7.473 +	memcpy(mm, oldmm, sizeof(*mm));
   7.474 +
   7.475 +	if (!mm_init(mm))
   7.476 +		goto fail_nomem;
   7.477 +
   7.478 +	if (init_new_context(tsk, mm))
   7.479 +		goto fail_nocontext;
   7.480 +
   7.481 +	err = dup_mmap(mm, oldmm);
   7.482 +	if (err)
   7.483 +		goto free_pt;
   7.484 +
   7.485 +	mm->hiwater_rss = get_mm_rss(mm);
   7.486 +	mm->hiwater_vm = mm->total_vm;
   7.487 +
   7.488 +	return mm;
   7.489 +
   7.490 +free_pt:
   7.491 +	mmput(mm);
   7.492 +
   7.493 +fail_nomem:
   7.494 +	return NULL;
   7.495 +
   7.496 +fail_nocontext:
   7.497 +	/*
   7.498 +	 * If init_new_context() failed, we cannot use mmput() to free the mm
   7.499 +	 * because it calls destroy_context()
   7.500 +	 */
   7.501 +	mm_free_pgd(mm);
   7.502 +	free_mm(mm);
   7.503 +	return NULL;
   7.504 +}
   7.505 +
   7.506 +static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
   7.507 +{
   7.508 +	struct mm_struct * mm, *oldmm;
   7.509 +	int retval;
   7.510 +
   7.511 +	tsk->min_flt = tsk->maj_flt = 0;
   7.512 +	tsk->nvcsw = tsk->nivcsw = 0;
   7.513 +
   7.514 +	tsk->mm = NULL;
   7.515 +	tsk->active_mm = NULL;
   7.516 +
   7.517 +	/*
   7.518 +	 * Are we cloning a kernel thread?
   7.519 +	 *
   7.520 +	 * We need to steal a active VM for that..
   7.521 +	 */
   7.522 +	oldmm = current->mm;
   7.523 +	if (!oldmm)
   7.524 +		return 0;
   7.525 +
   7.526 +	if (clone_flags & CLONE_VM) {
   7.527 +		atomic_inc(&oldmm->mm_users);
   7.528 +		mm = oldmm;
   7.529 +		goto good_mm;
   7.530 +	}
   7.531 +
   7.532 +	retval = -ENOMEM;
   7.533 +	mm = dup_mm(tsk);
   7.534 +	if (!mm)
   7.535 +		goto fail_nomem;
   7.536 +
   7.537 +good_mm:
   7.538 +	tsk->mm = mm;
   7.539 +	tsk->active_mm = mm;
   7.540 +	return 0;
   7.541 +
   7.542 +fail_nomem:
   7.543 +	return retval;
   7.544 +}
   7.545 +
   7.546 +static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
   7.547 +{
   7.548 +	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
   7.549 +	/* We don't need to lock fs - think why ;-) */
   7.550 +	if (fs) {
   7.551 +		atomic_set(&fs->count, 1);
   7.552 +		rwlock_init(&fs->lock);
   7.553 +		fs->umask = old->umask;
   7.554 +		read_lock(&old->lock);
   7.555 +		fs->rootmnt = mntget(old->rootmnt);
   7.556 +		fs->root = dget(old->root);
   7.557 +		fs->pwdmnt = mntget(old->pwdmnt);
   7.558 +		fs->pwd = dget(old->pwd);
   7.559 +		if (old->altroot) {
   7.560 +			fs->altrootmnt = mntget(old->altrootmnt);
   7.561 +			fs->altroot = dget(old->altroot);
   7.562 +		} else {
   7.563 +			fs->altrootmnt = NULL;
   7.564 +			fs->altroot = NULL;
   7.565 +		}
   7.566 +		read_unlock(&old->lock);
   7.567 +	}
   7.568 +	return fs;
   7.569 +}
   7.570 +
   7.571 +struct fs_struct *copy_fs_struct(struct fs_struct *old)
   7.572 +{
   7.573 +	return __copy_fs_struct(old);
   7.574 +}
   7.575 +
   7.576 +EXPORT_SYMBOL_GPL(copy_fs_struct);
   7.577 +
   7.578 +static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
   7.579 +{
   7.580 +	if (clone_flags & CLONE_FS) {
   7.581 +		atomic_inc(&current->fs->count);
   7.582 +		return 0;
   7.583 +	}
   7.584 +	tsk->fs = __copy_fs_struct(current->fs);
   7.585 +	if (!tsk->fs)
   7.586 +		return -ENOMEM;
   7.587 +	return 0;
   7.588 +}
   7.589 +
   7.590 +static int count_open_files(struct fdtable *fdt)
   7.591 +{
   7.592 +	int size = fdt->max_fdset;
   7.593 +	int i;
   7.594 +
   7.595 +	/* Find the last open fd */
   7.596 +	for (i = size/(8*sizeof(long)); i > 0; ) {
   7.597 +		if (fdt->open_fds->fds_bits[--i])
   7.598 +			break;
   7.599 +	}
   7.600 +	i = (i+1) * 8 * sizeof(long);
   7.601 +	return i;
   7.602 +}
   7.603 +
   7.604 +static struct files_struct *alloc_files(void)
   7.605 +{
   7.606 +	struct files_struct *newf;
   7.607 +	struct fdtable *fdt;
   7.608 +
   7.609 +	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
   7.610 +	if (!newf)
   7.611 +		goto out;
   7.612 +
   7.613 +	atomic_set(&newf->count, 1);
   7.614 +
   7.615 +	spin_lock_init(&newf->file_lock);
   7.616 +	fdt = &newf->fdtab;
   7.617 +	fdt->next_fd = 0;
   7.618 +	fdt->max_fds = NR_OPEN_DEFAULT;
   7.619 +	fdt->max_fdset = __FD_SETSIZE;
   7.620 +	fdt->close_on_exec = &newf->close_on_exec_init;
   7.621 +	fdt->open_fds = &newf->open_fds_init;
   7.622 +	fdt->fd = &newf->fd_array[0];
   7.623 +	INIT_RCU_HEAD(&fdt->rcu);
   7.624 +	fdt->free_files = NULL;
   7.625 +	fdt->next = NULL;
   7.626 +	rcu_assign_pointer(newf->fdt, fdt);
   7.627 +out:
   7.628 +	return newf;
   7.629 +}
   7.630 +
   7.631 +/*
   7.632 + * Allocate a new files structure and copy contents from the
   7.633 + * passed in files structure.
   7.634 + */
   7.635 +static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
   7.636 +{
   7.637 +	struct files_struct *newf;
   7.638 +	struct file **old_fds, **new_fds;
   7.639 +	int open_files, size, i, expand;
   7.640 +	struct fdtable *old_fdt, *new_fdt;
   7.641 +
   7.642 +	newf = alloc_files();
   7.643 +	if (!newf)
   7.644 +		goto out;
   7.645 +
   7.646 +	spin_lock(&oldf->file_lock);
   7.647 +	old_fdt = files_fdtable(oldf);
   7.648 +	new_fdt = files_fdtable(newf);
   7.649 +	size = old_fdt->max_fdset;
   7.650 +	open_files = count_open_files(old_fdt);
   7.651 +	expand = 0;
   7.652 +
   7.653 +	/*
   7.654 +	 * Check whether we need to allocate a larger fd array or fd set.
   7.655 +	 * Note: we're not a clone task, so the open count won't  change.
   7.656 +	 */
   7.657 +	if (open_files > new_fdt->max_fdset) {
   7.658 +		new_fdt->max_fdset = 0;
   7.659 +		expand = 1;
   7.660 +	}
   7.661 +	if (open_files > new_fdt->max_fds) {
   7.662 +		new_fdt->max_fds = 0;
   7.663 +		expand = 1;
   7.664 +	}
   7.665 +
   7.666 +	/* if the old fdset gets grown now, we'll only copy up to "size" fds */
   7.667 +	if (expand) {
   7.668 +		spin_unlock(&oldf->file_lock);
   7.669 +		spin_lock(&newf->file_lock);
   7.670 +		*errorp = expand_files(newf, open_files-1);
   7.671 +		spin_unlock(&newf->file_lock);
   7.672 +		if (*errorp < 0)
   7.673 +			goto out_release;
   7.674 +		new_fdt = files_fdtable(newf);
   7.675 +		/*
   7.676 +		 * Reacquire the oldf lock and a pointer to its fd table
   7.677 +		 * who knows it may have a new bigger fd table. We need
   7.678 +		 * the latest pointer.
   7.679 +		 */
   7.680 +		spin_lock(&oldf->file_lock);
   7.681 +		old_fdt = files_fdtable(oldf);
   7.682 +	}
   7.683 +
   7.684 +	old_fds = old_fdt->fd;
   7.685 +	new_fds = new_fdt->fd;
   7.686 +
   7.687 +	memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
   7.688 +	memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
   7.689 +
   7.690 +	for (i = open_files; i != 0; i--) {
   7.691 +		struct file *f = *old_fds++;
   7.692 +		if (f) {
   7.693 +			get_file(f);
   7.694 +		} else {
   7.695 +			/*
   7.696 +			 * The fd may be claimed in the fd bitmap but not yet
   7.697 +			 * instantiated in the files array if a sibling thread
   7.698 +			 * is partway through open().  So make sure that this
   7.699 +			 * fd is available to the new process.
   7.700 +			 */
   7.701 +			FD_CLR(open_files - i, new_fdt->open_fds);
   7.702 +		}
   7.703 +		rcu_assign_pointer(*new_fds++, f);
   7.704 +	}
   7.705 +	spin_unlock(&oldf->file_lock);
   7.706 +
   7.707 +	/* compute the remainder to be cleared */
   7.708 +	size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
   7.709 +
   7.710 +	/* This is long word aligned thus could use a optimized version */ 
   7.711 +	memset(new_fds, 0, size); 
   7.712 +
   7.713 +	if (new_fdt->max_fdset > open_files) {
   7.714 +		int left = (new_fdt->max_fdset-open_files)/8;
   7.715 +		int start = open_files / (8 * sizeof(unsigned long));
   7.716 +
   7.717 +		memset(&new_fdt->open_fds->fds_bits[start], 0, left);
   7.718 +		memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
   7.719 +	}
   7.720 +
   7.721 +out:
   7.722 +	return newf;
   7.723 +
   7.724 +out_release:
   7.725 +	free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
   7.726 +	free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
   7.727 +	free_fd_array(new_fdt->fd, new_fdt->max_fds);
   7.728 +	kmem_cache_free(files_cachep, newf);
   7.729 +	return NULL;
   7.730 +}
   7.731 +
   7.732 +static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
   7.733 +{
   7.734 +	struct files_struct *oldf, *newf;
   7.735 +	int error = 0;
   7.736 +
   7.737 +	/*
   7.738 +	 * A background process may not have any files ...
   7.739 +	 */
   7.740 +	oldf = current->files;
   7.741 +	if (!oldf)
   7.742 +		goto out;
   7.743 +
   7.744 +	if (clone_flags & CLONE_FILES) {
   7.745 +		atomic_inc(&oldf->count);
   7.746 +		goto out;
   7.747 +	}
   7.748 +
   7.749 +	/*
   7.750 +	 * Note: we may be using current for both targets (See exec.c)
   7.751 +	 * This works because we cache current->files (old) as oldf. Don't
   7.752 +	 * break this.
   7.753 +	 */
   7.754 +	tsk->files = NULL;
   7.755 +	error = -ENOMEM;
   7.756 +	newf = dup_fd(oldf, &error);
   7.757 +	if (!newf)
   7.758 +		goto out;
   7.759 +
   7.760 +	tsk->files = newf;
   7.761 +	error = 0;
   7.762 +out:
   7.763 +	return error;
   7.764 +}
   7.765 +
   7.766 +/*
   7.767 + *	Helper to unshare the files of the current task.
   7.768 + *	We don't want to expose copy_files internals to
   7.769 + *	the exec layer of the kernel.
   7.770 + */
   7.771 +
   7.772 +int unshare_files(void)
   7.773 +{
   7.774 +	struct files_struct *files  = current->files;
   7.775 +	int rc;
   7.776 +
   7.777 +	if(!files)
   7.778 +		BUG();
   7.779 +
   7.780 +	/* This can race but the race causes us to copy when we don't
   7.781 +	   need to and drop the copy */
   7.782 +	if(atomic_read(&files->count) == 1)
   7.783 +	{
   7.784 +		atomic_inc(&files->count);
   7.785 +		return 0;
   7.786 +	}
   7.787 +	rc = copy_files(0, current);
   7.788 +	if(rc)
   7.789 +		current->files = files;
   7.790 +	return rc;
   7.791 +}
   7.792 +
   7.793 +EXPORT_SYMBOL(unshare_files);
   7.794 +
   7.795 +void sighand_free_cb(struct rcu_head *rhp)
   7.796 +{
   7.797 +	struct sighand_struct *sp;
   7.798 +
   7.799 +	sp = container_of(rhp, struct sighand_struct, rcu);
   7.800 +	kmem_cache_free(sighand_cachep, sp);
   7.801 +}
   7.802 +
   7.803 +static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
   7.804 +{
   7.805 +	struct sighand_struct *sig;
   7.806 +
   7.807 +	if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
   7.808 +		atomic_inc(&current->sighand->count);
   7.809 +		return 0;
   7.810 +	}
   7.811 +	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
   7.812 +	rcu_assign_pointer(tsk->sighand, sig);
   7.813 +	if (!sig)
   7.814 +		return -ENOMEM;
   7.815 +	spin_lock_init(&sig->siglock);
   7.816 +	atomic_set(&sig->count, 1);
   7.817 +	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
   7.818 +	return 0;
   7.819 +}
   7.820 +
   7.821 +static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
   7.822 +{
   7.823 +	struct signal_struct *sig;
   7.824 +	int ret;
   7.825 +
   7.826 +	if (clone_flags & CLONE_THREAD) {
   7.827 +		atomic_inc(&current->signal->count);
   7.828 +		atomic_inc(&current->signal->live);
   7.829 +		return 0;
   7.830 +	}
   7.831 +	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
   7.832 +	tsk->signal = sig;
   7.833 +	if (!sig)
   7.834 +		return -ENOMEM;
   7.835 +
   7.836 +	ret = copy_thread_group_keys(tsk);
   7.837 +	if (ret < 0) {
   7.838 +		kmem_cache_free(signal_cachep, sig);
   7.839 +		return ret;
   7.840 +	}
   7.841 +
   7.842 +	atomic_set(&sig->count, 1);
   7.843 +	atomic_set(&sig->live, 1);
   7.844 +	init_waitqueue_head(&sig->wait_chldexit);
   7.845 +	sig->flags = 0;
   7.846 +	sig->group_exit_code = 0;
   7.847 +	sig->group_exit_task = NULL;
   7.848 +	sig->group_stop_count = 0;
   7.849 +	sig->curr_target = NULL;
   7.850 +	init_sigpending(&sig->shared_pending);
   7.851 +	INIT_LIST_HEAD(&sig->posix_timers);
   7.852 +
   7.853 +	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
   7.854 +	sig->it_real_incr.tv64 = 0;
   7.855 +	sig->real_timer.function = it_real_fn;
   7.856 +	sig->real_timer.data = tsk;
   7.857 +
   7.858 +	sig->it_virt_expires = cputime_zero;
   7.859 +	sig->it_virt_incr = cputime_zero;
   7.860 +	sig->it_prof_expires = cputime_zero;
   7.861 +	sig->it_prof_incr = cputime_zero;
   7.862 +
   7.863 +	sig->leader = 0;	/* session leadership doesn't inherit */
   7.864 +	sig->tty_old_pgrp = 0;
   7.865 +
   7.866 +	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
   7.867 +	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
   7.868 +	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
   7.869 +	sig->sched_time = 0;
   7.870 +	INIT_LIST_HEAD(&sig->cpu_timers[0]);
   7.871 +	INIT_LIST_HEAD(&sig->cpu_timers[1]);
   7.872 +	INIT_LIST_HEAD(&sig->cpu_timers[2]);
   7.873 +
   7.874 +	task_lock(current->group_leader);
   7.875 +	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
   7.876 +	task_unlock(current->group_leader);
   7.877 +
   7.878 +	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
   7.879 +		/*
   7.880 +		 * New sole thread in the process gets an expiry time
   7.881 +		 * of the whole CPU time limit.
   7.882 +		 */
   7.883 +		tsk->it_prof_expires =
   7.884 +			secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
   7.885 +	}
   7.886 +
   7.887 +	return 0;
   7.888 +}
   7.889 +
   7.890 +static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
   7.891 +{
   7.892 +	unsigned long new_flags = p->flags;
   7.893 +
   7.894 +	new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
   7.895 +	new_flags |= PF_FORKNOEXEC;
   7.896 +	if (!(clone_flags & CLONE_PTRACE))
   7.897 +		p->ptrace = 0;
   7.898 +	p->flags = new_flags;
   7.899 +}
   7.900 +
   7.901 +asmlinkage long sys_set_tid_address(int __user *tidptr)
   7.902 +{
   7.903 +	current->clear_child_tid = tidptr;
   7.904 +
   7.905 +	return current->pid;
   7.906 +}
   7.907 +
   7.908 +/*
   7.909 + * This creates a new process as a copy of the old one,
   7.910 + * but does not actually start it yet.
   7.911 + *
   7.912 + * It copies the registers, and all the appropriate
   7.913 + * parts of the process environment (as per the clone
   7.914 + * flags). The actual kick-off is left to the caller.
   7.915 + */
   7.916 +static task_t *copy_process(unsigned long clone_flags,
   7.917 +				 unsigned long stack_start,
   7.918 +				 struct pt_regs *regs,
   7.919 +				 unsigned long stack_size,
   7.920 +				 int __user *parent_tidptr,
   7.921 +				 int __user *child_tidptr,
   7.922 +				 int pid)
   7.923 +{
   7.924 +	int retval;
   7.925 +	struct task_struct *p = NULL;
   7.926 +
   7.927 +	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
   7.928 +		return ERR_PTR(-EINVAL);
   7.929 +
   7.930 +	/*
   7.931 +	 * Thread groups must share signals as well, and detached threads
   7.932 +	 * can only be started up within the thread group.
   7.933 +	 */
   7.934 +	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
   7.935 +		return ERR_PTR(-EINVAL);
   7.936 +
   7.937 +	/*
   7.938 +	 * Shared signal handlers imply shared VM. By way of the above,
   7.939 +	 * thread groups also imply shared VM. Blocking this case allows
   7.940 +	 * for various simplifications in other code.
   7.941 +	 */
   7.942 +	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
   7.943 +		return ERR_PTR(-EINVAL);
   7.944 +
   7.945 +	retval = security_task_create(clone_flags);
   7.946 +	if (retval)
   7.947 +		goto fork_out;
   7.948 +
   7.949 +	retval = -ENOMEM;
   7.950 +	p = dup_task_struct(current);
   7.951 +	if (!p)
   7.952 +		goto fork_out;
   7.953 +
   7.954 +	retval = -EAGAIN;
   7.955 +	if (atomic_read(&p->user->processes) >=
   7.956 +			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
   7.957 +		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
   7.958 +				p->user != &root_user)
   7.959 +			goto bad_fork_free;
   7.960 +	}
   7.961 +
   7.962 +	atomic_inc(&p->user->__count);
   7.963 +	atomic_inc(&p->user->processes);
   7.964 +	get_group_info(p->group_info);
   7.965 +
   7.966 +	/*
   7.967 +	 * If multiple threads are within copy_process(), then this check
   7.968 +	 * triggers too late. This doesn't hurt, the check is only there
   7.969 +	 * to stop root fork bombs.
   7.970 +	 */
   7.971 +	if (nr_threads >= max_threads)
   7.972 +		goto bad_fork_cleanup_count;
   7.973 +
   7.974 +	if (!try_module_get(task_thread_info(p)->exec_domain->module))
   7.975 +		goto bad_fork_cleanup_count;
   7.976 +
   7.977 +	if (p->binfmt && !try_module_get(p->binfmt->module))
   7.978 +		goto bad_fork_cleanup_put_domain;
   7.979 +
   7.980 +	p->did_exec = 0;
   7.981 +	copy_flags(clone_flags, p);
   7.982 +	p->pid = pid;
   7.983 +	retval = -EFAULT;
   7.984 +	if (clone_flags & CLONE_PARENT_SETTID)
   7.985 +		if (put_user(p->pid, parent_tidptr))
   7.986 +			goto bad_fork_cleanup;
   7.987 +
   7.988 +	p->proc_dentry = NULL;
   7.989 +
   7.990 +	INIT_LIST_HEAD(&p->children);
   7.991 +	INIT_LIST_HEAD(&p->sibling);
   7.992 +	p->vfork_done = NULL;
   7.993 +	spin_lock_init(&p->alloc_lock);
   7.994 +	spin_lock_init(&p->proc_lock);
   7.995 +
   7.996 +	clear_tsk_thread_flag(p, TIF_SIGPENDING);
   7.997 +	init_sigpending(&p->pending);
   7.998 +
   7.999 +	p->utime = cputime_zero;
  7.1000 +	p->stime = cputime_zero;
  7.1001 + 	p->sched_time = 0;
  7.1002 +	p->rchar = 0;		/* I/O counter: bytes read */
  7.1003 +	p->wchar = 0;		/* I/O counter: bytes written */
  7.1004 +	p->syscr = 0;		/* I/O counter: read syscalls */
  7.1005 +	p->syscw = 0;		/* I/O counter: write syscalls */
  7.1006 +	acct_clear_integrals(p);
  7.1007 +
  7.1008 + 	p->it_virt_expires = cputime_zero;
  7.1009 +	p->it_prof_expires = cputime_zero;
  7.1010 + 	p->it_sched_expires = 0;
  7.1011 + 	INIT_LIST_HEAD(&p->cpu_timers[0]);
  7.1012 + 	INIT_LIST_HEAD(&p->cpu_timers[1]);
  7.1013 + 	INIT_LIST_HEAD(&p->cpu_timers[2]);
  7.1014 +
  7.1015 +	p->lock_depth = -1;		/* -1 = no lock */
  7.1016 +	do_posix_clock_monotonic_gettime(&p->start_time);
  7.1017 +	p->security = NULL;
  7.1018 +	p->io_context = NULL;
  7.1019 +	p->io_wait = NULL;
  7.1020 +	p->audit_context = NULL;
  7.1021 +	cpuset_fork(p);
  7.1022 +#ifdef CONFIG_NUMA
  7.1023 + 	p->mempolicy = mpol_copy(p->mempolicy);
  7.1024 + 	if (IS_ERR(p->mempolicy)) {
  7.1025 + 		retval = PTR_ERR(p->mempolicy);
  7.1026 + 		p->mempolicy = NULL;
  7.1027 + 		goto bad_fork_cleanup_cpuset;
  7.1028 + 	}
  7.1029 +#endif
  7.1030 +
  7.1031 +#ifdef CONFIG_DEBUG_MUTEXES
  7.1032 +	p->blocked_on = NULL; /* not blocked yet */
  7.1033 +#endif
  7.1034 +
  7.1035 +	p->tgid = p->pid;
  7.1036 +	if (clone_flags & CLONE_THREAD)
  7.1037 +		p->tgid = current->tgid;
  7.1038 +
  7.1039 +	if ((retval = security_task_alloc(p)))
  7.1040 +		goto bad_fork_cleanup_policy;
  7.1041 +	if ((retval = audit_alloc(p)))
  7.1042 +		goto bad_fork_cleanup_security;
  7.1043 +	/* copy all the process information */
  7.1044 +	if ((retval = copy_semundo(clone_flags, p)))
  7.1045 +		goto bad_fork_cleanup_audit;
  7.1046 +	if ((retval = copy_files(clone_flags, p)))
  7.1047 +		goto bad_fork_cleanup_semundo;
  7.1048 +	if ((retval = copy_fs(clone_flags, p)))
  7.1049 +		goto bad_fork_cleanup_files;
  7.1050 +	if ((retval = copy_sighand(clone_flags, p)))
  7.1051 +		goto bad_fork_cleanup_fs;
  7.1052 +	if ((retval = copy_signal(clone_flags, p)))
  7.1053 +		goto bad_fork_cleanup_sighand;
  7.1054 +	if ((retval = copy_mm(clone_flags, p)))
  7.1055 +		goto bad_fork_cleanup_signal;
  7.1056 +	if ((retval = copy_keys(clone_flags, p)))
  7.1057 +		goto bad_fork_cleanup_mm;
  7.1058 +	if ((retval = copy_namespace(clone_flags, p)))
  7.1059 +		goto bad_fork_cleanup_keys;
  7.1060 +	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
  7.1061 +	if (retval)
  7.1062 +		goto bad_fork_cleanup_namespace;
  7.1063 +
  7.1064 +	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
  7.1065 +	/*
  7.1066 +	 * Clear TID on mm_release()?
  7.1067 +	 */
  7.1068 +	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
  7.1069 +
  7.1070 +	/*
  7.1071 +	 * sigaltstack should be cleared when sharing the same VM
  7.1072 +	 */
  7.1073 +	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
  7.1074 +		p->sas_ss_sp = p->sas_ss_size = 0;
  7.1075 +
  7.1076 +	/*
  7.1077 +	 * Syscall tracing should be turned off in the child regardless
  7.1078 +	 * of CLONE_PTRACE.
  7.1079 +	 */
  7.1080 +	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
  7.1081 +#ifdef TIF_SYSCALL_EMU
  7.1082 +	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
  7.1083 +#endif
  7.1084 +
  7.1085 +	/* Our parent execution domain becomes current domain
  7.1086 +	   These must match for thread signalling to apply */
  7.1087 +	   
  7.1088 +	p->parent_exec_id = p->self_exec_id;
  7.1089 +
  7.1090 +	/* ok, now we should be set up.. */
  7.1091 +	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
  7.1092 +	p->pdeath_signal = 0;
  7.1093 +	p->exit_state = 0;
  7.1094 +
  7.1095 +	/*
  7.1096 +	 * Ok, make it visible to the rest of the system.
  7.1097 +	 * We dont wake it up yet.
  7.1098 +	 */
  7.1099 +	p->group_leader = p;
  7.1100 +	INIT_LIST_HEAD(&p->ptrace_children);
  7.1101 +	INIT_LIST_HEAD(&p->ptrace_list);
  7.1102 +
  7.1103 +	/* Perform scheduler related setup. Assign this task to a CPU. */
  7.1104 +	sched_fork(p, clone_flags);
  7.1105 +
  7.1106 +	/* Need tasklist lock for parent etc handling! */
  7.1107 +	write_lock_irq(&tasklist_lock);
  7.1108 +
  7.1109 +	/*
  7.1110 +	 * The task hasn't been attached yet, so its cpus_allowed mask will
  7.1111 +	 * not be changed, nor will its assigned CPU.
  7.1112 +	 *
  7.1113 +	 * The cpus_allowed mask of the parent may have changed after it was
  7.1114 +	 * copied first time - so re-copy it here, then check the child's CPU
  7.1115 +	 * to ensure it is on a valid CPU (and if not, just force it back to
  7.1116 +	 * parent's CPU). This avoids alot of nasty races.
  7.1117 +	 */
  7.1118 +	p->cpus_allowed = current->cpus_allowed;
  7.1119 +	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
  7.1120 +			!cpu_online(task_cpu(p))))
  7.1121 +		set_task_cpu(p, smp_processor_id());
  7.1122 +
  7.1123 +	/*
  7.1124 +	 * Check for pending SIGKILL! The new thread should not be allowed
  7.1125 +	 * to slip out of an OOM kill. (or normal SIGKILL.)
  7.1126 +	 */
  7.1127 +	if (sigismember(&current->pending.signal, SIGKILL)) {
  7.1128 +		write_unlock_irq(&tasklist_lock);
  7.1129 +		retval = -EINTR;
  7.1130 +		goto bad_fork_cleanup_namespace;
  7.1131 +	}
  7.1132 +
  7.1133 +	/* CLONE_PARENT re-uses the old parent */
  7.1134 +	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
  7.1135 +		p->real_parent = current->real_parent;
  7.1136 +	else
  7.1137 +		p->real_parent = current;
  7.1138 +	p->parent = p->real_parent;
  7.1139 +
  7.1140 +	spin_lock(&current->sighand->siglock);
  7.1141 +	if (clone_flags & CLONE_THREAD) {
  7.1142 +		/*
  7.1143 +		 * Important: if an exit-all has been started then
  7.1144 +		 * do not create this new thread - the whole thread
  7.1145 +		 * group is supposed to exit anyway.
  7.1146 +		 */
  7.1147 +		if (current->signal->flags & SIGNAL_GROUP_EXIT) {
  7.1148 +			spin_unlock(&current->sighand->siglock);
  7.1149 +			write_unlock_irq(&tasklist_lock);
  7.1150 +			retval = -EAGAIN;
  7.1151 +			goto bad_fork_cleanup_namespace;
  7.1152 +		}
  7.1153 +		p->group_leader = current->group_leader;
  7.1154 +
  7.1155 +		if (current->signal->group_stop_count > 0) {
  7.1156 +			/*
  7.1157 +			 * There is an all-stop in progress for the group.
  7.1158 +			 * We ourselves will stop as soon as we check signals.
  7.1159 +			 * Make the new thread part of that group stop too.
  7.1160 +			 */
  7.1161 +			current->signal->group_stop_count++;
  7.1162 +			set_tsk_thread_flag(p, TIF_SIGPENDING);
  7.1163 +		}
  7.1164 +
  7.1165 +		if (!cputime_eq(current->signal->it_virt_expires,
  7.1166 +				cputime_zero) ||
  7.1167 +		    !cputime_eq(current->signal->it_prof_expires,
  7.1168 +				cputime_zero) ||
  7.1169 +		    current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
  7.1170 +		    !list_empty(&current->signal->cpu_timers[0]) ||
  7.1171 +		    !list_empty(&current->signal->cpu_timers[1]) ||
  7.1172 +		    !list_empty(&current->signal->cpu_timers[2])) {
  7.1173 +			/*
  7.1174 +			 * Have child wake up on its first tick to check
  7.1175 +			 * for process CPU timers.
  7.1176 +			 */
  7.1177 +			p->it_prof_expires = jiffies_to_cputime(1);
  7.1178 +		}
  7.1179 +	}
  7.1180 +
  7.1181 +	/*
  7.1182 +	 * inherit ioprio
  7.1183 +	 */
  7.1184 +	p->ioprio = current->ioprio;
  7.1185 +
  7.1186 +	SET_LINKS(p);
  7.1187 +	if (unlikely(p->ptrace & PT_PTRACED))
  7.1188 +		__ptrace_link(p, current->parent);
  7.1189 +
  7.1190 +	if (thread_group_leader(p)) {
  7.1191 +		p->signal->tty = current->signal->tty;
  7.1192 +		p->signal->pgrp = process_group(current);
  7.1193 +		p->signal->session = current->signal->session;
  7.1194 +		attach_pid(p, PIDTYPE_PGID, process_group(p));
  7.1195 +		attach_pid(p, PIDTYPE_SID, p->signal->session);
  7.1196 +		if (p->pid)
  7.1197 +			__get_cpu_var(process_counts)++;
  7.1198 +	}
  7.1199 +	attach_pid(p, PIDTYPE_TGID, p->tgid);
  7.1200 +	attach_pid(p, PIDTYPE_PID, p->pid);
  7.1201 +
  7.1202 +	nr_threads++;
  7.1203 +	total_forks++;
  7.1204 +	spin_unlock(&current->sighand->siglock);
  7.1205 +	write_unlock_irq(&tasklist_lock);
  7.1206 +	proc_fork_connector(p);
  7.1207 +	return p;
  7.1208 +
  7.1209 +bad_fork_cleanup_namespace:
  7.1210 +	exit_namespace(p);
  7.1211 +bad_fork_cleanup_keys:
  7.1212 +	exit_keys(p);
  7.1213 +bad_fork_cleanup_mm:
  7.1214 +	if (p->mm)
  7.1215 +		mmput(p->mm);
  7.1216 +bad_fork_cleanup_signal:
  7.1217 +	exit_signal(p);
  7.1218 +bad_fork_cleanup_sighand:
  7.1219 +	exit_sighand(p);
  7.1220 +bad_fork_cleanup_fs:
  7.1221 +	exit_fs(p); /* blocking */
  7.1222 +bad_fork_cleanup_files:
  7.1223 +	exit_files(p); /* blocking */
  7.1224 +bad_fork_cleanup_semundo:
  7.1225 +	exit_sem(p);
  7.1226 +bad_fork_cleanup_audit:
  7.1227 +	audit_free(p);
  7.1228 +bad_fork_cleanup_security:
  7.1229 +	security_task_free(p);
  7.1230 +bad_fork_cleanup_policy:
  7.1231 +#ifdef CONFIG_NUMA
  7.1232 +	mpol_free(p->mempolicy);
  7.1233 +bad_fork_cleanup_cpuset:
  7.1234 +#endif
  7.1235 +	cpuset_exit(p);
  7.1236 +bad_fork_cleanup:
  7.1237 +	if (p->binfmt)
  7.1238 +		module_put(p->binfmt->module);
  7.1239 +bad_fork_cleanup_put_domain:
  7.1240 +	module_put(task_thread_info(p)->exec_domain->module);
  7.1241 +bad_fork_cleanup_count:
  7.1242 +	put_group_info(p->group_info);
  7.1243 +	atomic_dec(&p->user->processes);
  7.1244 +	free_uid(p->user);
  7.1245 +bad_fork_free:
  7.1246 +	free_task(p);
  7.1247 +fork_out:
  7.1248 +	return ERR_PTR(retval);
  7.1249 +}
  7.1250 +
  7.1251 +struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
  7.1252 +{
  7.1253 +	memset(regs, 0, sizeof(struct pt_regs));
  7.1254 +	return regs;
  7.1255 +}
  7.1256 +
  7.1257 +task_t * __devinit fork_idle(int cpu)
  7.1258 +{
  7.1259 +	task_t *task;
  7.1260 +	struct pt_regs regs;
  7.1261 +
  7.1262 +	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
  7.1263 +	if (!task)
  7.1264 +		return ERR_PTR(-ENOMEM);
  7.1265 +	init_idle(task, cpu);
  7.1266 +	unhash_process(task);
  7.1267 +	return task;
  7.1268 +}
  7.1269 +
  7.1270 +static inline int fork_traceflag (unsigned clone_flags)
  7.1271 +{
  7.1272 +	if (clone_flags & CLONE_UNTRACED)
  7.1273 +		return 0;
  7.1274 +	else if (clone_flags & CLONE_VFORK) {
  7.1275 +		if (current->ptrace & PT_TRACE_VFORK)
  7.1276 +			return PTRACE_EVENT_VFORK;
  7.1277 +	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
  7.1278 +		if (current->ptrace & PT_TRACE_CLONE)
  7.1279 +			return PTRACE_EVENT_CLONE;
  7.1280 +	} else if (current->ptrace & PT_TRACE_FORK)
  7.1281 +		return PTRACE_EVENT_FORK;
  7.1282 +
  7.1283 +	return 0;
  7.1284 +}
  7.1285 +
  7.1286 +/*
  7.1287 + *  Ok, this is the main fork-routine.
  7.1288 + *
  7.1289 + * It copies the process, and if successful kick-starts
  7.1290 + * it and waits for it to finish using the VM if required.
  7.1291 + */
  7.1292 +long do_fork(unsigned long clone_flags,
  7.1293 +	      unsigned long stack_start,
  7.1294 +	      struct pt_regs *regs,
  7.1295 +	      unsigned long stack_size,
  7.1296 +	      int __user *parent_tidptr,
  7.1297 +	      int __user *child_tidptr)
  7.1298 +{
  7.1299 +	struct task_struct *p;
  7.1300 +	int trace = 0;
  7.1301 +	long pid = alloc_pidmap();
  7.1302 +
  7.1303 +	if (pid < 0)
  7.1304 +		return -EAGAIN;
  7.1305 +	if (unlikely(current->ptrace)) {
  7.1306 +		trace = fork_traceflag (clone_flags);
  7.1307 +		if (trace)
  7.1308 +			clone_flags |= CLONE_PTRACE;
  7.1309 +	}
  7.1310 +
  7.1311 +	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
  7.1312 +	/*
  7.1313 +	 * Do this prior waking up the new thread - the thread pointer
  7.1314 +	 * might get invalid after that point, if the thread exits quickly.
  7.1315 +	 */
  7.1316 +	if (!IS_ERR(p)) {
  7.1317 +		struct completion vfork;
  7.1318 +
  7.1319 +		if (clone_flags & CLONE_VFORK) {
  7.1320 +			p->vfork_done = &vfork;
  7.1321 +			init_completion(&vfork);
  7.1322 +		}
  7.1323 +
  7.1324 +		if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
  7.1325 +			/*
  7.1326 +			 * We'll start up with an immediate SIGSTOP.
  7.1327 +			 */
  7.1328 +			sigaddset(&p->pending.signal, SIGSTOP);
  7.1329 +			set_tsk_thread_flag(p, TIF_SIGPENDING);
  7.1330 +		}
  7.1331 +
  7.1332 +		if (!(clone_flags & CLONE_STOPPED))
  7.1333 +			wake_up_new_task(p, clone_flags);
  7.1334 +		else
  7.1335 +			p->state = TASK_STOPPED;
  7.1336 +
  7.1337 +		if (unlikely (trace)) {
  7.1338 +			current->ptrace_message = pid;
  7.1339 +			ptrace_notify ((trace << 8) | SIGTRAP);
  7.1340 +		}
  7.1341 +
  7.1342 +		if (clone_flags & CLONE_VFORK) {
  7.1343 +			wait_for_completion(&vfork);
  7.1344 +			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
  7.1345 +				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
  7.1346 +		}
  7.1347 +	} else {
  7.1348 +		free_pidmap(pid);
  7.1349 +		pid = PTR_ERR(p);
  7.1350 +	}
  7.1351 +	return pid;
  7.1352 +}
  7.1353 +
  7.1354 +#ifndef ARCH_MIN_MMSTRUCT_ALIGN
  7.1355 +#define ARCH_MIN_MMSTRUCT_ALIGN 0
  7.1356 +#endif
  7.1357 +
  7.1358 +void __init proc_caches_init(void)
  7.1359 +{
  7.1360 +	sighand_cachep = kmem_cache_create("sighand_cache",
  7.1361 +			sizeof(struct sighand_struct), 0,
  7.1362 +			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
  7.1363 +	signal_cachep = kmem_cache_create("signal_cache",
  7.1364 +			sizeof(struct signal_struct), 0,
  7.1365 +			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
  7.1366 +	files_cachep = kmem_cache_create("files_cache", 
  7.1367 +			sizeof(struct files_struct), 0,
  7.1368 +			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
  7.1369 +	fs_cachep = kmem_cache_create("fs_cache", 
  7.1370 +			sizeof(struct fs_struct), 0,
  7.1371 +			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
  7.1372 +	vm_area_cachep = kmem_cache_create("vm_area_struct",
  7.1373 +			sizeof(struct vm_area_struct), 0,
  7.1374 +			SLAB_PANIC, NULL, NULL);
  7.1375 +	mm_cachep = kmem_cache_create("mm_struct",
  7.1376 +			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
  7.1377 +			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
  7.1378 +}
  7.1379 +
  7.1380 +
  7.1381 +/*
  7.1382 + * Check constraints on flags passed to the unshare system call and
  7.1383 + * force unsharing of additional process context as appropriate.
  7.1384 + */
  7.1385 +static inline void check_unshare_flags(unsigned long *flags_ptr)
  7.1386 +{
  7.1387 +	/*
  7.1388 +	 * If unsharing a thread from a thread group, must also
  7.1389 +	 * unshare vm.
  7.1390 +	 */
  7.1391 +	if (*flags_ptr & CLONE_THREAD)
  7.1392 +		*flags_ptr |= CLONE_VM;
  7.1393 +
  7.1394 +	/*
  7.1395 +	 * If unsharing vm, must also unshare signal handlers.
  7.1396 +	 */
  7.1397 +	if (*flags_ptr & CLONE_VM)
  7.1398 +		*flags_ptr |= CLONE_SIGHAND;
  7.1399 +
  7.1400 +	/*
  7.1401 +	 * If unsharing signal handlers and the task was created
  7.1402 +	 * using CLONE_THREAD, then must unshare the thread
  7.1403 +	 */
  7.1404 +	if ((*flags_ptr & CLONE_SIGHAND) &&
  7.1405 +	    (atomic_read(&current->signal->count) > 1))
  7.1406 +		*flags_ptr |= CLONE_THREAD;
  7.1407 +
  7.1408 +	/*
  7.1409 +	 * If unsharing namespace, must also unshare filesystem information.
  7.1410 +	 */
  7.1411 +	if (*flags_ptr & CLONE_NEWNS)
  7.1412 +		*flags_ptr |= CLONE_FS;
  7.1413 +}
  7.1414 +
  7.1415 +/*
  7.1416 + * Unsharing of tasks created with CLONE_THREAD is not supported yet
  7.1417 + */
  7.1418 +static int unshare_thread(unsigned long unshare_flags)
  7.1419 +{
  7.1420 +	if (unshare_flags & CLONE_THREAD)
  7.1421 +		return -EINVAL;
  7.1422 +
  7.1423 +	return 0;
  7.1424 +}
  7.1425 +
  7.1426 +/*
  7.1427 + * Unshare the filesystem structure if it is being shared
  7.1428 + */
  7.1429 +static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
  7.1430 +{
  7.1431 +	struct fs_struct *fs = current->fs;
  7.1432 +
  7.1433 +	if ((unshare_flags & CLONE_FS) &&
  7.1434 +	    (fs && atomic_read(&fs->count) > 1)) {
  7.1435 +		*new_fsp = __copy_fs_struct(current->fs);
  7.1436 +		if (!*new_fsp)
  7.1437 +			return -ENOMEM;
  7.1438 +	}
  7.1439 +
  7.1440 +	return 0;
  7.1441 +}
  7.1442 +
  7.1443 +/*
  7.1444 + * Unshare the namespace structure if it is being shared
  7.1445 + */
  7.1446 +static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
  7.1447 +{
  7.1448 +	struct namespace *ns = current->namespace;
  7.1449 +
  7.1450 +	if ((unshare_flags & CLONE_NEWNS) &&
  7.1451 +	    (ns && atomic_read(&ns->count) > 1)) {
  7.1452 +		if (!capable(CAP_SYS_ADMIN))
  7.1453 +			return -EPERM;
  7.1454 +
  7.1455 +		*new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
  7.1456 +		if (!*new_nsp)
  7.1457 +			return -ENOMEM;
  7.1458 +	}
  7.1459 +
  7.1460 +	return 0;
  7.1461 +}
  7.1462 +
  7.1463 +/*
  7.1464 + * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
  7.1465 + * supported yet
  7.1466 + */
  7.1467 +static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
  7.1468 +{
  7.1469 +	struct sighand_struct *sigh = current->sighand;
  7.1470 +
  7.1471 +	if ((unshare_flags & CLONE_SIGHAND) &&
  7.1472 +	    (sigh && atomic_read(&sigh->count) > 1))
  7.1473 +		return -EINVAL;
  7.1474 +	else
  7.1475 +		return 0;
  7.1476 +}
  7.1477 +
  7.1478 +/*
  7.1479 + * Unshare vm if it is being shared
  7.1480 + */
  7.1481 +static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
  7.1482 +{
  7.1483 +	struct mm_struct *mm = current->mm;
  7.1484 +
  7.1485 +	if ((unshare_flags & CLONE_VM) &&
  7.1486 +	    (mm && atomic_read(&mm->mm_users) > 1)) {
  7.1487 +		return -EINVAL;
  7.1488 +	}
  7.1489 +
  7.1490 +	return 0;
  7.1491 +}
  7.1492 +
  7.1493 +/*
  7.1494 + * Unshare file descriptor table if it is being shared
  7.1495 + */
  7.1496 +static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
  7.1497 +{
  7.1498 +	struct files_struct *fd = current->files;
  7.1499 +	int error = 0;
  7.1500 +
  7.1501 +	if ((unshare_flags & CLONE_FILES) &&
  7.1502 +	    (fd && atomic_read(&fd->count) > 1)) {
  7.1503 +		*new_fdp = dup_fd(fd, &error);
  7.1504 +		if (!*new_fdp)
  7.1505 +			return error;
  7.1506 +	}
  7.1507 +
  7.1508 +	return 0;
  7.1509 +}
  7.1510 +
  7.1511 +/*
  7.1512 + * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
  7.1513 + * supported yet
  7.1514 + */
  7.1515 +static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
  7.1516 +{
  7.1517 +	if (unshare_flags & CLONE_SYSVSEM)
  7.1518 +		return -EINVAL;
  7.1519 +
  7.1520 +	return 0;
  7.1521 +}
  7.1522 +
  7.1523 +/*
  7.1524 + * unshare allows a process to 'unshare' part of the process
  7.1525 + * context which was originally shared using clone.  copy_*
  7.1526 + * functions used by do_fork() cannot be used here directly
  7.1527 + * because they modify an inactive task_struct that is being
  7.1528 + * constructed. Here we are modifying the current, active,
  7.1529 + * task_struct.
  7.1530 + */
  7.1531 +asmlinkage long sys_unshare(unsigned long unshare_flags)
  7.1532 +{
  7.1533 +	int err = 0;
  7.1534 +	struct fs_struct *fs, *new_fs = NULL;
  7.1535 +	struct namespace *ns, *new_ns = NULL;
  7.1536 +	struct sighand_struct *sigh, *new_sigh = NULL;
  7.1537 +	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
  7.1538 +	struct files_struct *fd, *new_fd = NULL;
  7.1539 +	struct sem_undo_list *new_ulist = NULL;
  7.1540 +
  7.1541 +	check_unshare_flags(&unshare_flags);
  7.1542 +
  7.1543 +	if ((err = unshare_thread(unshare_flags)))
  7.1544 +		goto bad_unshare_out;
  7.1545 +	if ((err = unshare_fs(unshare_flags, &new_fs)))
  7.1546 +		goto bad_unshare_cleanup_thread;
  7.1547 +	if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
  7.1548 +		goto bad_unshare_cleanup_fs;
  7.1549 +	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
  7.1550 +		goto bad_unshare_cleanup_ns;
  7.1551 +	if ((err = unshare_vm(unshare_flags, &new_mm)))
  7.1552 +		goto bad_unshare_cleanup_sigh;
  7.1553 +	if ((err = unshare_fd(unshare_flags, &new_fd)))
  7.1554 +		goto bad_unshare_cleanup_vm;
  7.1555 +	if ((err = unshare_semundo(unshare_flags, &new_ulist)))
  7.1556 +		goto bad_unshare_cleanup_fd;
  7.1557 +
  7.1558 +	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
  7.1559 +
  7.1560 +		task_lock(current);
  7.1561 +
  7.1562 +		if (new_fs) {
  7.1563 +			fs = current->fs;
  7.1564 +			current->fs = new_fs;
  7.1565 +			new_fs = fs;
  7.1566 +		}
  7.1567 +
  7.1568 +		if (new_ns) {
  7.1569 +			ns = current->namespace;
  7.1570 +			current->namespace = new_ns;
  7.1571 +			new_ns = ns;
  7.1572 +		}
  7.1573 +
  7.1574 +		if (new_sigh) {
  7.1575 +			sigh = current->sighand;
  7.1576 +			rcu_assign_pointer(current->sighand, new_sigh);
  7.1577 +			new_sigh = sigh;
  7.1578 +		}
  7.1579 +
  7.1580 +		if (new_mm) {
  7.1581 +			mm = current->mm;
  7.1582 +			active_mm = current->active_mm;
  7.1583 +			current->mm = new_mm;
  7.1584 +			current->active_mm = new_mm;
  7.1585 +			activate_mm(active_mm, new_mm);
  7.1586 +			new_mm = mm;
  7.1587 +		}
  7.1588 +
  7.1589 +		if (new_fd) {
  7.1590 +			fd = current->files;
  7.1591 +			current->files = new_fd;
  7.1592 +			new_fd = fd;
  7.1593 +		}
  7.1594 +
  7.1595 +		task_unlock(current);
  7.1596 +	}
  7.1597 +
  7.1598 +bad_unshare_cleanup_fd:
  7.1599 +	if (new_fd)
  7.1600 +		put_files_struct(new_fd);
  7.1601 +
  7.1602 +bad_unshare_cleanup_vm:
  7.1603 +	if (new_mm)
  7.1604 +		mmput(new_mm);
  7.1605 +
  7.1606 +bad_unshare_cleanup_sigh:
  7.1607 +	if (new_sigh)
  7.1608 +		if (atomic_dec_and_test(&new_sigh->count))
  7.1609 +			kmem_cache_free(sighand_cachep, new_sigh);
  7.1610 +
  7.1611 +bad_unshare_cleanup_ns:
  7.1612 +	if (new_ns)
  7.1613 +		put_namespace(new_ns);
  7.1614 +
  7.1615 +bad_unshare_cleanup_fs:
  7.1616 +	if (new_fs)
  7.1617 +		put_fs_struct(new_fs);
  7.1618 +
  7.1619 +bad_unshare_cleanup_thread:
  7.1620 +bad_unshare_out:
  7.1621 +	return err;
  7.1622 +}