ia64/linux-2.6.18-xen.hg

annotate kernel/fork.c @ 562:66faefe721eb

pvSCSI backend driver

Signed-off-by: Tomonari Horikoshi <t.horikoshi@jp.fujitsu.com>
Signed-off-by: Jun Kamada <kama@jp.fujitsu.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jun 02 09:58:27 2008 +0100 (2008-06-02)
parents a533be77c572
children (none)
rev   line source
ian@0 1 /*
ian@0 2 * linux/kernel/fork.c
ian@0 3 *
ian@0 4 * Copyright (C) 1991, 1992 Linus Torvalds
ian@0 5 */
ian@0 6
ian@0 7 /*
ian@0 8 * 'fork.c' contains the help-routines for the 'fork' system call
ian@0 9 * (see also entry.S and others).
ian@0 10 * Fork is rather simple, once you get the hang of it, but the memory
ian@0 11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
ian@0 12 */
ian@0 13
ian@0 14 #include <linux/slab.h>
ian@0 15 #include <linux/init.h>
ian@0 16 #include <linux/unistd.h>
ian@0 17 #include <linux/smp_lock.h>
ian@0 18 #include <linux/module.h>
ian@0 19 #include <linux/vmalloc.h>
ian@0 20 #include <linux/completion.h>
ian@0 21 #include <linux/namespace.h>
ian@0 22 #include <linux/personality.h>
ian@0 23 #include <linux/mempolicy.h>
ian@0 24 #include <linux/sem.h>
ian@0 25 #include <linux/file.h>
ian@0 26 #include <linux/key.h>
ian@0 27 #include <linux/binfmts.h>
ian@0 28 #include <linux/mman.h>
ian@0 29 #include <linux/fs.h>
ian@0 30 #include <linux/capability.h>
ian@0 31 #include <linux/cpu.h>
ian@0 32 #include <linux/cpuset.h>
ian@0 33 #include <linux/security.h>
ian@0 34 #include <linux/swap.h>
ian@0 35 #include <linux/syscalls.h>
ian@0 36 #include <linux/jiffies.h>
ian@0 37 #include <linux/futex.h>
ian@0 38 #include <linux/rcupdate.h>
ian@0 39 #include <linux/ptrace.h>
ian@0 40 #include <linux/mount.h>
ian@0 41 #include <linux/audit.h>
ian@0 42 #include <linux/profile.h>
ian@0 43 #include <linux/rmap.h>
ian@0 44 #include <linux/acct.h>
ian@0 45 #include <linux/cn_proc.h>
ian@0 46 #include <linux/delayacct.h>
ian@0 47 #include <linux/taskstats_kern.h>
ian@0 48
ian@0 49 #include <asm/pgtable.h>
ian@0 50 #include <asm/pgalloc.h>
ian@0 51 #include <asm/uaccess.h>
ian@0 52 #include <asm/mmu_context.h>
ian@0 53 #include <asm/cacheflush.h>
ian@0 54 #include <asm/tlbflush.h>
ian@0 55
ian@0 56 /*
ian@0 57 * Counters protected by write_lock_irq(&tasklist_lock)
ian@0 58 */
ian@0 59 unsigned long total_forks; /* Handle normal Linux uptimes. */
ian@0 60 int nr_threads; /* The idle threads do not count.. */
ian@0 61
ian@0 62 int max_threads; /* tunable limit on nr_threads */
ian@0 63
ian@0 64 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
ian@0 65
ian@0 66 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
ian@0 67
ian@0 68 int nr_processes(void)
ian@0 69 {
ian@0 70 int cpu;
ian@0 71 int total = 0;
ian@0 72
ian@0 73 for_each_online_cpu(cpu)
ian@0 74 total += per_cpu(process_counts, cpu);
ian@0 75
ian@0 76 return total;
ian@0 77 }
ian@0 78
ian@0 79 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
ian@0 80 # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
ian@0 81 # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
ian@0 82 static kmem_cache_t *task_struct_cachep;
ian@0 83 #endif
ian@0 84
ian@0 85 /* SLAB cache for signal_struct structures (tsk->signal) */
ian@0 86 static kmem_cache_t *signal_cachep;
ian@0 87
ian@0 88 /* SLAB cache for sighand_struct structures (tsk->sighand) */
ian@0 89 kmem_cache_t *sighand_cachep;
ian@0 90
ian@0 91 /* SLAB cache for files_struct structures (tsk->files) */
ian@0 92 kmem_cache_t *files_cachep;
ian@0 93
ian@0 94 /* SLAB cache for fs_struct structures (tsk->fs) */
ian@0 95 kmem_cache_t *fs_cachep;
ian@0 96
ian@0 97 /* SLAB cache for vm_area_struct structures */
ian@0 98 kmem_cache_t *vm_area_cachep;
ian@0 99
ian@0 100 /* SLAB cache for mm_struct structures (tsk->mm) */
ian@0 101 static kmem_cache_t *mm_cachep;
ian@0 102
ian@0 103 void free_task(struct task_struct *tsk)
ian@0 104 {
ian@0 105 free_thread_info(tsk->thread_info);
ian@0 106 rt_mutex_debug_task_free(tsk);
ian@0 107 free_task_struct(tsk);
ian@0 108 }
ian@0 109 EXPORT_SYMBOL(free_task);
ian@0 110
ian@0 111 void __put_task_struct(struct task_struct *tsk)
ian@0 112 {
ian@0 113 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
ian@0 114 WARN_ON(atomic_read(&tsk->usage));
ian@0 115 WARN_ON(tsk == current);
ian@0 116
ian@0 117 security_task_free(tsk);
ian@0 118 free_uid(tsk->user);
ian@0 119 put_group_info(tsk->group_info);
ian@0 120 delayacct_tsk_free(tsk);
ian@0 121
ian@0 122 if (!profile_handoff_task(tsk))
ian@0 123 free_task(tsk);
ian@0 124 }
ian@0 125
ian@0 126 void __init fork_init(unsigned long mempages)
ian@0 127 {
ian@0 128 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
ian@0 129 #ifndef ARCH_MIN_TASKALIGN
ian@0 130 #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
ian@0 131 #endif
ian@0 132 /* create a slab on which task_structs can be allocated */
ian@0 133 task_struct_cachep =
ian@0 134 kmem_cache_create("task_struct", sizeof(struct task_struct),
ian@0 135 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
ian@0 136 #endif
ian@0 137
ian@0 138 /*
ian@0 139 * The default maximum number of threads is set to a safe
ian@0 140 * value: the thread structures can take up at most one
ian@0 141 * eighth of memory.
ian@0 142 */
ian@0 143 max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
ian@0 144
ian@0 145 /*
ian@0 146 * we need to allow at least 20 threads to boot a system
ian@0 147 */
ian@0 148 if(max_threads < 20)
ian@0 149 max_threads = 20;
ian@0 150
ian@0 151 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
ian@0 152 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
ian@0 153 init_task.signal->rlim[RLIMIT_SIGPENDING] =
ian@0 154 init_task.signal->rlim[RLIMIT_NPROC];
ian@0 155 }
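
To make the sizing above concrete, a worked example (assuming 4 KiB pages and an 8 KiB THREAD_SIZE, typical 32-bit x86 values) for a machine with 1 GiB of RAM:

        max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE)
                    = 262144 / (8 * 8192 / 4096)
                    = 262144 / 16
                    = 16384

So kernel stacks are capped at 16384 * 8 KiB = 128 MiB, one eighth of memory, and RLIMIT_NPROC defaults to half of that: 8192 processes.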
ian@0 156
ian@0 157 static struct task_struct *dup_task_struct(struct task_struct *orig)
ian@0 158 {
ian@0 159 struct task_struct *tsk;
ian@0 160 struct thread_info *ti;
ian@0 161
ian@0 162 prepare_to_copy(orig);
ian@0 163
ian@0 164 tsk = alloc_task_struct();
ian@0 165 if (!tsk)
ian@0 166 return NULL;
ian@0 167
ian@0 168 ti = alloc_thread_info(tsk);
ian@0 169 if (!ti) {
ian@0 170 free_task_struct(tsk);
ian@0 171 return NULL;
ian@0 172 }
ian@0 173
ian@0 174 *tsk = *orig;
ian@0 175 tsk->thread_info = ti;
ian@0 176 setup_thread_stack(tsk, orig);
ian@0 177
ian@0 178 /* One for us, one for whoever does the "release_task()" (usually parent) */
ian@0 179 atomic_set(&tsk->usage,2);
ian@0 180 atomic_set(&tsk->fs_excl, 0);
ian@0 181 tsk->btrace_seq = 0;
ian@0 182 tsk->splice_pipe = NULL;
ian@0 183 return tsk;
ian@0 184 }
ian@0 185
ian@0 186 #ifdef CONFIG_MMU
ian@0 187 static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
ian@0 188 {
ian@0 189 struct vm_area_struct *mpnt, *tmp, **pprev;
ian@0 190 struct rb_node **rb_link, *rb_parent;
ian@0 191 int retval;
ian@0 192 unsigned long charge;
ian@0 193 struct mempolicy *pol;
ian@0 194
ian@0 195 down_write(&oldmm->mmap_sem);
ian@0 196 flush_cache_mm(oldmm);
ian@0 197 /*
ian@0 198 * Not linked in yet - no deadlock potential:
ian@0 199 */
ian@0 200 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
ian@0 201
ian@0 202 mm->locked_vm = 0;
ian@0 203 mm->mmap = NULL;
ian@0 204 mm->mmap_cache = NULL;
ian@0 205 mm->free_area_cache = oldmm->mmap_base;
ian@0 206 mm->cached_hole_size = ~0UL;
ian@0 207 mm->map_count = 0;
ian@0 208 cpus_clear(mm->cpu_vm_mask);
ian@0 209 mm->mm_rb = RB_ROOT;
ian@0 210 rb_link = &mm->mm_rb.rb_node;
ian@0 211 rb_parent = NULL;
ian@0 212 pprev = &mm->mmap;
ian@0 213
ian@0 214 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
ian@0 215 struct file *file;
ian@0 216
ian@0 217 if (mpnt->vm_flags & VM_DONTCOPY) {
ian@0 218 long pages = vma_pages(mpnt);
ian@0 219 mm->total_vm -= pages;
ian@0 220 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
ian@0 221 -pages);
ian@0 222 continue;
ian@0 223 }
ian@0 224 charge = 0;
ian@0 225 if (mpnt->vm_flags & VM_ACCOUNT) {
ian@0 226 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
ian@0 227 if (security_vm_enough_memory(len))
ian@0 228 goto fail_nomem;
ian@0 229 charge = len;
ian@0 230 }
ian@0 231 tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
ian@0 232 if (!tmp)
ian@0 233 goto fail_nomem;
ian@0 234 *tmp = *mpnt;
ian@0 235 pol = mpol_copy(vma_policy(mpnt));
ian@0 236 retval = PTR_ERR(pol);
ian@0 237 if (IS_ERR(pol))
ian@0 238 goto fail_nomem_policy;
ian@0 239 vma_set_policy(tmp, pol);
ian@0 240 tmp->vm_flags &= ~VM_LOCKED;
ian@0 241 tmp->vm_mm = mm;
ian@0 242 tmp->vm_next = NULL;
ian@0 243 anon_vma_link(tmp);
ian@0 244 file = tmp->vm_file;
ian@0 245 if (file) {
ian@0 246 struct inode *inode = file->f_dentry->d_inode;
ian@0 247 get_file(file);
ian@0 248 if (tmp->vm_flags & VM_DENYWRITE)
ian@0 249 atomic_dec(&inode->i_writecount);
ian@0 250
ian@0 251 /* insert tmp into the share list, just after mpnt */
ian@0 252 spin_lock(&file->f_mapping->i_mmap_lock);
ian@0 253 tmp->vm_truncate_count = mpnt->vm_truncate_count;
ian@0 254 flush_dcache_mmap_lock(file->f_mapping);
ian@0 255 vma_prio_tree_add(tmp, mpnt);
ian@0 256 flush_dcache_mmap_unlock(file->f_mapping);
ian@0 257 spin_unlock(&file->f_mapping->i_mmap_lock);
ian@0 258 }
ian@0 259
ian@0 260 /*
ian@0 261 * Link in the new vma and copy the page table entries.
ian@0 262 */
ian@0 263 *pprev = tmp;
ian@0 264 pprev = &tmp->vm_next;
ian@0 265
ian@0 266 __vma_link_rb(mm, tmp, rb_link, rb_parent);
ian@0 267 rb_link = &tmp->vm_rb.rb_right;
ian@0 268 rb_parent = &tmp->vm_rb;
ian@0 269
ian@0 270 mm->map_count++;
ian@0 271 retval = copy_page_range(mm, oldmm, mpnt);
ian@0 272
ian@0 273 if (tmp->vm_ops && tmp->vm_ops->open)
ian@0 274 tmp->vm_ops->open(tmp);
ian@0 275
ian@0 276 if (retval)
ian@0 277 goto out;
ian@0 278 }
ian@26 279 #ifdef arch_dup_mmap
ian@26 280 arch_dup_mmap(mm, oldmm);
ian@26 281 #endif
ian@0 282 retval = 0;
ian@0 283 out:
ian@0 284 up_write(&mm->mmap_sem);
ian@0 285 flush_tlb_mm(oldmm);
ian@0 286 up_write(&oldmm->mmap_sem);
ian@0 287 return retval;
ian@0 288 fail_nomem_policy:
ian@0 289 kmem_cache_free(vm_area_cachep, tmp);
ian@0 290 fail_nomem:
ian@0 291 retval = -ENOMEM;
ian@0 292 vm_unacct_memory(charge);
ian@0 293 goto out;
ian@0 294 }
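
The VM_DONTCOPY branch above can be exercised from userspace; a minimal sketch, assuming a libc that exposes MADV_DONTFORK (available since kernel 2.6.16), with a hypothetical helper name:

#include <sys/mman.h>

/* Create an anonymous mapping that fork() will not duplicate:
 * MADV_DONTFORK sets VM_DONTCOPY, so dup_mmap() skips the VMA and
 * the child sees no mapping at this address at all. */
static void *map_parent_only(size_t len)
{
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf != MAP_FAILED)
                madvise(buf, len, MADV_DONTFORK);
        return buf;
}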
ian@0 295
ian@0 296 static inline int mm_alloc_pgd(struct mm_struct * mm)
ian@0 297 {
ian@0 298 mm->pgd = pgd_alloc(mm);
ian@0 299 if (unlikely(!mm->pgd))
ian@0 300 return -ENOMEM;
ian@0 301 return 0;
ian@0 302 }
ian@0 303
ian@0 304 static inline void mm_free_pgd(struct mm_struct * mm)
ian@0 305 {
ian@0 306 pgd_free(mm->pgd);
ian@0 307 }
ian@0 308 #else
ian@0 309 #define dup_mmap(mm, oldmm) (0)
ian@0 310 #define mm_alloc_pgd(mm) (0)
ian@0 311 #define mm_free_pgd(mm)
ian@0 312 #endif /* CONFIG_MMU */
ian@0 313
ian@0 314 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
ian@0 315
ian@0 316 #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
ian@0 317 #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
ian@0 318
ian@0 319 #include <linux/init_task.h>
ian@0 320
ian@0 321 static struct mm_struct * mm_init(struct mm_struct * mm)
ian@0 322 {
ian@0 323 atomic_set(&mm->mm_users, 1);
ian@0 324 atomic_set(&mm->mm_count, 1);
ian@0 325 init_rwsem(&mm->mmap_sem);
ian@0 326 INIT_LIST_HEAD(&mm->mmlist);
ian@0 327 mm->core_waiters = 0;
ian@0 328 mm->nr_ptes = 0;
ian@0 329 set_mm_counter(mm, file_rss, 0);
ian@0 330 set_mm_counter(mm, anon_rss, 0);
ian@0 331 spin_lock_init(&mm->page_table_lock);
ian@0 332 rwlock_init(&mm->ioctx_list_lock);
ian@0 333 mm->ioctx_list = NULL;
ian@0 334 mm->free_area_cache = TASK_UNMAPPED_BASE;
ian@0 335 mm->cached_hole_size = ~0UL;
ian@0 336
ian@0 337 if (likely(!mm_alloc_pgd(mm))) {
ian@0 338 mm->def_flags = 0;
ian@0 339 return mm;
ian@0 340 }
ian@0 341 free_mm(mm);
ian@0 342 return NULL;
ian@0 343 }
ian@0 344
ian@0 345 /*
ian@0 346 * Allocate and initialize an mm_struct.
ian@0 347 */
ian@0 348 struct mm_struct * mm_alloc(void)
ian@0 349 {
ian@0 350 struct mm_struct * mm;
ian@0 351
ian@0 352 mm = allocate_mm();
ian@0 353 if (mm) {
ian@0 354 memset(mm, 0, sizeof(*mm));
ian@0 355 mm = mm_init(mm);
ian@0 356 }
ian@0 357 return mm;
ian@0 358 }
ian@0 359
ian@0 360 /*
ian@0 361 * Called when the last reference to the mm
ian@0 362 * is dropped: either by a lazy thread or by
ian@0 363 * mmput. Free the page directory and the mm.
ian@0 364 */
ian@0 365 void fastcall __mmdrop(struct mm_struct *mm)
ian@0 366 {
ian@0 367 BUG_ON(mm == &init_mm);
ian@0 368 mm_free_pgd(mm);
ian@0 369 destroy_context(mm);
ian@0 370 free_mm(mm);
ian@0 371 }
ian@0 372
ian@0 373 /*
ian@0 374 * Decrement the use count and release all resources for an mm.
ian@0 375 */
ian@0 376 void mmput(struct mm_struct *mm)
ian@0 377 {
ian@0 378 might_sleep();
ian@0 379
ian@0 380 if (atomic_dec_and_test(&mm->mm_users)) {
ian@0 381 exit_aio(mm);
ian@0 382 exit_mmap(mm);
ian@0 383 if (!list_empty(&mm->mmlist)) {
ian@0 384 spin_lock(&mmlist_lock);
ian@0 385 list_del(&mm->mmlist);
ian@0 386 spin_unlock(&mmlist_lock);
ian@0 387 }
ian@0 388 put_swap_token(mm);
ian@0 389 mmdrop(mm);
ian@0 390 }
ian@0 391 }
ian@0 392 EXPORT_SYMBOL_GPL(mmput);
ian@0 393
ian@0 394 /**
ian@0 395 * get_task_mm - acquire a reference to the task's mm
ian@0 396 *
ian@0 397 * Returns %NULL if the task has no mm, or if PF_BORROWED_MM is set
ian@0 398 * (meaning this kernel workthread has transiently adopted a user mm
ian@0 399 * with use_mm, to do its AIO). Otherwise returns a reference to the
ian@0 400 * mm, after bumping up its use count. The user must release the mm
ian@0 401 * via mmput() after use. Typically used by /proc and ptrace.
ian@0 402 */
ian@0 403 struct mm_struct *get_task_mm(struct task_struct *task)
ian@0 404 {
ian@0 405 struct mm_struct *mm;
ian@0 406
ian@0 407 task_lock(task);
ian@0 408 mm = task->mm;
ian@0 409 if (mm) {
ian@0 410 if (task->flags & PF_BORROWED_MM)
ian@0 411 mm = NULL;
ian@0 412 else
ian@0 413 atomic_inc(&mm->mm_users);
ian@0 414 }
ian@0 415 task_unlock(task);
ian@0 416 return mm;
ian@0 417 }
ian@0 418 EXPORT_SYMBOL_GPL(get_task_mm);
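
A minimal sketch of the calling convention just described (the helper and its use of mm->total_vm are illustrative, not part of this file):

static unsigned long task_vm_pages(struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);
        unsigned long pages = 0;

        if (mm) {
                /* mm is pinned: safe to use even if the task exits now */
                pages = mm->total_vm;
                mmput(mm);              /* release the reference we took */
        }
        return pages;
}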
ian@0 419
ian@0 420 /* Please note the differences between mmput and mm_release.
ian@0 421 * mmput is called whenever we stop holding onto a mm_struct,
ian@0 422 * whether on error or on success.
ian@0 423 *
ian@0 424 * mm_release is called after a mm_struct has been removed
ian@0 425 * from the current process.
ian@0 426 *
ian@0 427 * This difference is important for error handling, when we
ian@0 428 * only half set up a mm_struct for a new process and need to
ian@0 429 * restore the old one: we mmput the new mm_struct before
ian@0 430 * restoring the old one.
ian@0 431 * Eric Biederman 10 January 1998
ian@0 432 */
ian@0 433 void mm_release(struct task_struct *tsk, struct mm_struct *mm)
ian@0 434 {
ian@0 435 struct completion *vfork_done = tsk->vfork_done;
ian@0 436
ian@0 437 /* Get rid of any cached register state */
ian@0 438 deactivate_mm(tsk, mm);
ian@0 439
ian@0 440 /* notify parent sleeping on vfork() */
ian@0 441 if (vfork_done) {
ian@0 442 tsk->vfork_done = NULL;
ian@0 443 complete(vfork_done);
ian@0 444 }
ian@0 445 if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
ian@0 446 u32 __user * tidptr = tsk->clear_child_tid;
ian@0 447 tsk->clear_child_tid = NULL;
ian@0 448
ian@0 449 /*
ian@0 450 * We don't check the error code - if userspace has
ian@0 451 * not set up a proper pointer then tough luck.
ian@0 452 */
ian@0 453 put_user(0, tidptr);
ian@0 454 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
ian@0 455 }
ian@0 456 }
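
The clear_child_tid handling above is the kernel half of thread join: userspace registers an address with CLONE_CHILD_CLEARTID, and a joiner futex-waits on it until the put_user(0, tidptr)/sys_futex(FUTEX_WAKE) pair above releases it. A minimal userspace sketch of the joiner side (glibc's pthread_join works essentially this way; error handling omitted):

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static pid_t child_tid;         /* set at clone() time via CLONE_CHILD_SETTID,
                                   zeroed by the kernel in mm_release() via
                                   CLONE_CHILD_CLEARTID */

static void join_child(void)
{
        pid_t tid;

        /* Loop until the kernel clears child_tid and wakes us. */
        while ((tid = child_tid) != 0)
                syscall(SYS_futex, &child_tid, FUTEX_WAIT, tid, NULL);
}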
ian@0 457
ian@0 458 /*
ian@0 459 * Allocate a new mm structure and copy contents from the
ian@0 460 * mm structure of the passed in task structure.
ian@0 461 */
ian@0 462 static struct mm_struct *dup_mm(struct task_struct *tsk)
ian@0 463 {
ian@0 464 struct mm_struct *mm, *oldmm = current->mm;
ian@0 465 int err;
ian@0 466
ian@0 467 if (!oldmm)
ian@0 468 return NULL;
ian@0 469
ian@0 470 mm = allocate_mm();
ian@0 471 if (!mm)
ian@0 472 goto fail_nomem;
ian@0 473
ian@0 474 memcpy(mm, oldmm, sizeof(*mm));
ian@0 475
ian@0 476 if (!mm_init(mm))
ian@0 477 goto fail_nomem;
ian@0 478
ian@0 479 if (init_new_context(tsk, mm))
ian@0 480 goto fail_nocontext;
ian@0 481
ian@0 482 err = dup_mmap(mm, oldmm);
ian@0 483 if (err)
ian@0 484 goto free_pt;
ian@0 485
ian@0 486 mm->hiwater_rss = get_mm_rss(mm);
ian@0 487 mm->hiwater_vm = mm->total_vm;
ian@0 488
ian@0 489 return mm;
ian@0 490
ian@0 491 free_pt:
ian@0 492 mmput(mm);
ian@0 493
ian@0 494 fail_nomem:
ian@0 495 return NULL;
ian@0 496
ian@0 497 fail_nocontext:
ian@0 498 /*
ian@0 499 * If init_new_context() failed, we cannot use mmput() to free the mm
ian@0 500 * because it calls destroy_context()
ian@0 501 */
ian@0 502 mm_free_pgd(mm);
ian@0 503 free_mm(mm);
ian@0 504 return NULL;
ian@0 505 }
ian@0 506
ian@0 507 static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
ian@0 508 {
ian@0 509 struct mm_struct * mm, *oldmm;
ian@0 510 int retval;
ian@0 511
ian@0 512 tsk->min_flt = tsk->maj_flt = 0;
ian@0 513 tsk->nvcsw = tsk->nivcsw = 0;
ian@0 514
ian@0 515 tsk->mm = NULL;
ian@0 516 tsk->active_mm = NULL;
ian@0 517
ian@0 518 /*
ian@0 519 * Are we cloning a kernel thread?
ian@0 520 *
ian@0 521 * We need to steal an active VM for that.
ian@0 522 */
ian@0 523 oldmm = current->mm;
ian@0 524 if (!oldmm)
ian@0 525 return 0;
ian@0 526
ian@0 527 if (clone_flags & CLONE_VM) {
ian@0 528 atomic_inc(&oldmm->mm_users);
ian@0 529 mm = oldmm;
ian@0 530 goto good_mm;
ian@0 531 }
ian@0 532
ian@0 533 retval = -ENOMEM;
ian@0 534 mm = dup_mm(tsk);
ian@0 535 if (!mm)
ian@0 536 goto fail_nomem;
ian@0 537
ian@0 538 good_mm:
ian@0 539 tsk->mm = mm;
ian@0 540 tsk->active_mm = mm;
ian@0 541 return 0;
ian@0 542
ian@0 543 fail_nomem:
ian@0 544 return retval;
ian@0 545 }
ian@0 546
ian@0 547 static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
ian@0 548 {
ian@0 549 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
ian@0 550 /* We don't need to lock fs - think why ;-) */
ian@0 551 if (fs) {
ian@0 552 atomic_set(&fs->count, 1);
ian@0 553 rwlock_init(&fs->lock);
ian@0 554 fs->umask = old->umask;
ian@0 555 read_lock(&old->lock);
ian@0 556 fs->rootmnt = mntget(old->rootmnt);
ian@0 557 fs->root = dget(old->root);
ian@0 558 fs->pwdmnt = mntget(old->pwdmnt);
ian@0 559 fs->pwd = dget(old->pwd);
ian@0 560 if (old->altroot) {
ian@0 561 fs->altrootmnt = mntget(old->altrootmnt);
ian@0 562 fs->altroot = dget(old->altroot);
ian@0 563 } else {
ian@0 564 fs->altrootmnt = NULL;
ian@0 565 fs->altroot = NULL;
ian@0 566 }
ian@0 567 read_unlock(&old->lock);
ian@0 568 }
ian@0 569 return fs;
ian@0 570 }
ian@0 571
ian@0 572 struct fs_struct *copy_fs_struct(struct fs_struct *old)
ian@0 573 {
ian@0 574 return __copy_fs_struct(old);
ian@0 575 }
ian@0 576
ian@0 577 EXPORT_SYMBOL_GPL(copy_fs_struct);
ian@0 578
ian@0 579 static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
ian@0 580 {
ian@0 581 if (clone_flags & CLONE_FS) {
ian@0 582 atomic_inc(&current->fs->count);
ian@0 583 return 0;
ian@0 584 }
ian@0 585 tsk->fs = __copy_fs_struct(current->fs);
ian@0 586 if (!tsk->fs)
ian@0 587 return -ENOMEM;
ian@0 588 return 0;
ian@0 589 }
ian@0 590
ian@0 591 static int count_open_files(struct fdtable *fdt)
ian@0 592 {
ian@0 593 int size = fdt->max_fdset;
ian@0 594 int i;
ian@0 595
ian@0 596 /* Find the last open fd */
ian@0 597 for (i = size/(8*sizeof(long)); i > 0; ) {
ian@0 598 if (fdt->open_fds->fds_bits[--i])
ian@0 599 break;
ian@0 600 }
ian@0 601 i = (i+1) * 8 * sizeof(long);
ian@0 602 return i;
ian@0 603 }
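
For example, on a 64-bit kernel (8 * sizeof(long) == 64), if the highest open fd is 70 then fds_bits[1] is the last nonzero word: the loop stops with i == 1 and the function returns (1 + 1) * 64 == 128, a word-aligned upper bound covering every open fd - which is exactly the granularity the copy loop in dup_fd() below relies on.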
ian@0 604
ian@0 605 static struct files_struct *alloc_files(void)
ian@0 606 {
ian@0 607 struct files_struct *newf;
ian@0 608 struct fdtable *fdt;
ian@0 609
ian@0 610 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
ian@0 611 if (!newf)
ian@0 612 goto out;
ian@0 613
ian@0 614 atomic_set(&newf->count, 1);
ian@0 615
ian@0 616 spin_lock_init(&newf->file_lock);
ian@0 617 newf->next_fd = 0;
ian@0 618 fdt = &newf->fdtab;
ian@0 619 fdt->max_fds = NR_OPEN_DEFAULT;
ian@0 620 fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
ian@0 621 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
ian@0 622 fdt->open_fds = (fd_set *)&newf->open_fds_init;
ian@0 623 fdt->fd = &newf->fd_array[0];
ian@0 624 INIT_RCU_HEAD(&fdt->rcu);
ian@0 625 fdt->free_files = NULL;
ian@0 626 fdt->next = NULL;
ian@0 627 rcu_assign_pointer(newf->fdt, fdt);
ian@0 628 out:
ian@0 629 return newf;
ian@0 630 }
ian@0 631
ian@0 632 /*
ian@0 633 * Allocate a new files structure and copy contents from the
ian@0 634 * passed in files structure.
ian@0 635 * errorp will be valid only when the returned files_struct is NULL.
ian@0 636 */
ian@0 637 static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
ian@0 638 {
ian@0 639 struct files_struct *newf;
ian@0 640 struct file **old_fds, **new_fds;
ian@0 641 int open_files, size, i, expand;
ian@0 642 struct fdtable *old_fdt, *new_fdt;
ian@0 643
ian@0 644 *errorp = -ENOMEM;
ian@0 645 newf = alloc_files();
ian@0 646 if (!newf)
ian@0 647 goto out;
ian@0 648
ian@0 649 spin_lock(&oldf->file_lock);
ian@0 650 old_fdt = files_fdtable(oldf);
ian@0 651 new_fdt = files_fdtable(newf);
ian@0 652 size = old_fdt->max_fdset;
ian@0 653 open_files = count_open_files(old_fdt);
ian@0 654 expand = 0;
ian@0 655
ian@0 656 /*
ian@0 657 * Check whether we need to allocate a larger fd array or fd set.
ian@0 658 * Note: we're not a clone task, so the open count won't change.
ian@0 659 */
ian@0 660 if (open_files > new_fdt->max_fdset) {
ian@0 661 new_fdt->max_fdset = 0;
ian@0 662 expand = 1;
ian@0 663 }
ian@0 664 if (open_files > new_fdt->max_fds) {
ian@0 665 new_fdt->max_fds = 0;
ian@0 666 expand = 1;
ian@0 667 }
ian@0 668
ian@0 669 /* if the old fdset gets grown now, we'll only copy up to "size" fds */
ian@0 670 if (expand) {
ian@0 671 spin_unlock(&oldf->file_lock);
ian@0 672 spin_lock(&newf->file_lock);
ian@0 673 *errorp = expand_files(newf, open_files-1);
ian@0 674 spin_unlock(&newf->file_lock);
ian@0 675 if (*errorp < 0)
ian@0 676 goto out_release;
ian@0 677 new_fdt = files_fdtable(newf);
ian@0 678 /*
ian@0 679 * Reacquire the oldf lock and a pointer to its fd table;
ian@0 680 * it may have grown a new, bigger fd table in the meantime.
ian@0 681 * We need the latest pointer.
ian@0 682 */
ian@0 683 spin_lock(&oldf->file_lock);
ian@0 684 old_fdt = files_fdtable(oldf);
ian@0 685 }
ian@0 686
ian@0 687 old_fds = old_fdt->fd;
ian@0 688 new_fds = new_fdt->fd;
ian@0 689
ian@0 690 memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
ian@0 691 memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
ian@0 692
ian@0 693 for (i = open_files; i != 0; i--) {
ian@0 694 struct file *f = *old_fds++;
ian@0 695 if (f) {
ian@0 696 get_file(f);
ian@0 697 } else {
ian@0 698 /*
ian@0 699 * The fd may be claimed in the fd bitmap but not yet
ian@0 700 * instantiated in the files array if a sibling thread
ian@0 701 * is partway through open(). So make sure that this
ian@0 702 * fd is available to the new process.
ian@0 703 */
ian@0 704 FD_CLR(open_files - i, new_fdt->open_fds);
ian@0 705 }
ian@0 706 rcu_assign_pointer(*new_fds++, f);
ian@0 707 }
ian@0 708 spin_unlock(&oldf->file_lock);
ian@0 709
ian@0 710 /* compute the remainder to be cleared */
ian@0 711 size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
ian@0 712
ian@0 713 /* This is long word aligned thus could use an optimized version */
ian@0 714 memset(new_fds, 0, size);
ian@0 715
ian@0 716 if (new_fdt->max_fdset > open_files) {
ian@0 717 int left = (new_fdt->max_fdset-open_files)/8;
ian@0 718 int start = open_files / (8 * sizeof(unsigned long));
ian@0 719
ian@0 720 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
ian@0 721 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
ian@0 722 }
ian@0 723
ian@0 724 out:
ian@0 725 return newf;
ian@0 726
ian@0 727 out_release:
ian@0 728 free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
ian@0 729 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
ian@0 730 free_fd_array(new_fdt->fd, new_fdt->max_fds);
ian@0 731 kmem_cache_free(files_cachep, newf);
ian@0 732 return NULL;
ian@0 733 }
ian@0 734
ian@0 735 static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
ian@0 736 {
ian@0 737 struct files_struct *oldf, *newf;
ian@0 738 int error = 0;
ian@0 739
ian@0 740 /*
ian@0 741 * A background process may not have any files ...
ian@0 742 */
ian@0 743 oldf = current->files;
ian@0 744 if (!oldf)
ian@0 745 goto out;
ian@0 746
ian@0 747 if (clone_flags & CLONE_FILES) {
ian@0 748 atomic_inc(&oldf->count);
ian@0 749 goto out;
ian@0 750 }
ian@0 751
ian@0 752 /*
ian@0 753 * Note: we may be using current for both targets (See exec.c)
ian@0 754 * This works because we cache current->files (old) as oldf. Don't
ian@0 755 * break this.
ian@0 756 */
ian@0 757 tsk->files = NULL;
ian@0 758 newf = dup_fd(oldf, &error);
ian@0 759 if (!newf)
ian@0 760 goto out;
ian@0 761
ian@0 762 tsk->files = newf;
ian@0 763 error = 0;
ian@0 764 out:
ian@0 765 return error;
ian@0 766 }
ian@0 767
ian@0 768 /*
ian@0 769 * Helper to unshare the files of the current task.
ian@0 770 * We don't want to expose copy_files internals to
ian@0 771 * the exec layer of the kernel.
ian@0 772 */
ian@0 773
ian@0 774 int unshare_files(void)
ian@0 775 {
ian@0 776 struct files_struct *files = current->files;
ian@0 777 int rc;
ian@0 778
ian@0 779 BUG_ON(!files);
ian@0 780
ian@0 781 /* This can race, but the race merely causes us to copy when we
ian@0 782 don't need to, and then to drop the copy. */
ian@0 783 if(atomic_read(&files->count) == 1)
ian@0 784 {
ian@0 785 atomic_inc(&files->count);
ian@0 786 return 0;
ian@0 787 }
ian@0 788 rc = copy_files(0, current);
ian@0 789 if(rc)
ian@0 790 current->files = files;
ian@0 791 return rc;
ian@0 792 }
ian@0 793
ian@0 794 EXPORT_SYMBOL(unshare_files);
ian@0 795
ian@0 796 static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
ian@0 797 {
ian@0 798 struct sighand_struct *sig;
ian@0 799
ian@0 800 if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
ian@0 801 atomic_inc(&current->sighand->count);
ian@0 802 return 0;
ian@0 803 }
ian@0 804 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
ian@0 805 rcu_assign_pointer(tsk->sighand, sig);
ian@0 806 if (!sig)
ian@0 807 return -ENOMEM;
ian@0 808 atomic_set(&sig->count, 1);
ian@0 809 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
ian@0 810 return 0;
ian@0 811 }
ian@0 812
ian@0 813 void __cleanup_sighand(struct sighand_struct *sighand)
ian@0 814 {
ian@0 815 if (atomic_dec_and_test(&sighand->count))
ian@0 816 kmem_cache_free(sighand_cachep, sighand);
ian@0 817 }
ian@0 818
ian@0 819 static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
ian@0 820 {
ian@0 821 struct signal_struct *sig;
ian@0 822 int ret;
ian@0 823
ian@0 824 if (clone_flags & CLONE_THREAD) {
ian@0 825 atomic_inc(&current->signal->count);
ian@0 826 atomic_inc(&current->signal->live);
ian@0 827 taskstats_tgid_alloc(current->signal);
ian@0 828 return 0;
ian@0 829 }
ian@0 830 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
ian@0 831 tsk->signal = sig;
ian@0 832 if (!sig)
ian@0 833 return -ENOMEM;
ian@0 834
ian@0 835 ret = copy_thread_group_keys(tsk);
ian@0 836 if (ret < 0) {
ian@0 837 kmem_cache_free(signal_cachep, sig);
ian@0 838 return ret;
ian@0 839 }
ian@0 840
ian@0 841 atomic_set(&sig->count, 1);
ian@0 842 atomic_set(&sig->live, 1);
ian@0 843 init_waitqueue_head(&sig->wait_chldexit);
ian@0 844 sig->flags = 0;
ian@0 845 sig->group_exit_code = 0;
ian@0 846 sig->group_exit_task = NULL;
ian@0 847 sig->group_stop_count = 0;
ian@0 848 sig->curr_target = NULL;
ian@0 849 init_sigpending(&sig->shared_pending);
ian@0 850 INIT_LIST_HEAD(&sig->posix_timers);
ian@0 851
ian@0 852 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
ian@0 853 sig->it_real_incr.tv64 = 0;
ian@0 854 sig->real_timer.function = it_real_fn;
ian@0 855 sig->tsk = tsk;
ian@0 856
ian@0 857 sig->it_virt_expires = cputime_zero;
ian@0 858 sig->it_virt_incr = cputime_zero;
ian@0 859 sig->it_prof_expires = cputime_zero;
ian@0 860 sig->it_prof_incr = cputime_zero;
ian@0 861
ian@0 862 sig->leader = 0; /* session leadership doesn't inherit */
ian@0 863 sig->tty_old_pgrp = 0;
ian@0 864
ian@0 865 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
ian@0 866 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
ian@0 867 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
ian@0 868 sig->sched_time = 0;
ian@0 869 INIT_LIST_HEAD(&sig->cpu_timers[0]);
ian@0 870 INIT_LIST_HEAD(&sig->cpu_timers[1]);
ian@0 871 INIT_LIST_HEAD(&sig->cpu_timers[2]);
ian@0 872 taskstats_tgid_init(sig);
ian@0 873
ian@0 874 task_lock(current->group_leader);
ian@0 875 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
ian@0 876 task_unlock(current->group_leader);
ian@0 877
ian@0 878 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
ian@0 879 /*
ian@0 880 * New sole thread in the process gets an expiry time
ian@0 881 * of the whole CPU time limit.
ian@0 882 */
ian@0 883 tsk->it_prof_expires =
ian@0 884 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
ian@0 885 }
ian@0 886 acct_init_pacct(&sig->pacct);
ian@0 887
ian@0 888 return 0;
ian@0 889 }
ian@0 890
ian@0 891 void __cleanup_signal(struct signal_struct *sig)
ian@0 892 {
ian@0 893 exit_thread_group_keys(sig);
ian@0 894 taskstats_tgid_free(sig);
ian@0 895 kmem_cache_free(signal_cachep, sig);
ian@0 896 }
ian@0 897
ian@0 898 static inline void cleanup_signal(struct task_struct *tsk)
ian@0 899 {
ian@0 900 struct signal_struct *sig = tsk->signal;
ian@0 901
ian@0 902 atomic_dec(&sig->live);
ian@0 903
ian@0 904 if (atomic_dec_and_test(&sig->count))
ian@0 905 __cleanup_signal(sig);
ian@0 906 }
ian@0 907
ian@0 908 static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
ian@0 909 {
ian@0 910 unsigned long new_flags = p->flags;
ian@0 911
ian@0 912 new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
ian@0 913 new_flags |= PF_FORKNOEXEC;
ian@0 914 if (!(clone_flags & CLONE_PTRACE))
ian@0 915 p->ptrace = 0;
ian@0 916 p->flags = new_flags;
ian@0 917 }
ian@0 918
ian@0 919 asmlinkage long sys_set_tid_address(int __user *tidptr)
ian@0 920 {
ian@0 921 current->clear_child_tid = tidptr;
ian@0 922
ian@0 923 return current->pid;
ian@0 924 }
ian@0 925
ian@0 926 static inline void rt_mutex_init_task(struct task_struct *p)
ian@0 927 {
ian@0 928 #ifdef CONFIG_RT_MUTEXES
ian@0 929 spin_lock_init(&p->pi_lock);
ian@0 930 plist_head_init(&p->pi_waiters, &p->pi_lock);
ian@0 931 p->pi_blocked_on = NULL;
ian@0 932 #endif
ian@0 933 }
ian@0 934
ian@0 935 /*
ian@0 936 * This creates a new process as a copy of the old one,
ian@0 937 * but does not actually start it yet.
ian@0 938 *
ian@0 939 * It copies the registers, and all the appropriate
ian@0 940 * parts of the process environment (as per the clone
ian@0 941 * flags). The actual kick-off is left to the caller.
ian@0 942 */
ian@0 943 static struct task_struct *copy_process(unsigned long clone_flags,
ian@0 944 unsigned long stack_start,
ian@0 945 struct pt_regs *regs,
ian@0 946 unsigned long stack_size,
ian@0 947 int __user *parent_tidptr,
ian@0 948 int __user *child_tidptr,
ian@0 949 int pid)
ian@0 950 {
ian@0 951 int retval;
ian@0 952 struct task_struct *p = NULL;
ian@0 953
ian@0 954 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
ian@0 955 return ERR_PTR(-EINVAL);
ian@0 956
ian@0 957 /*
ian@0 958 * Thread groups must share signals as well, and detached threads
ian@0 959 * can only be started up within the thread group.
ian@0 960 */
ian@0 961 if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
ian@0 962 return ERR_PTR(-EINVAL);
ian@0 963
ian@0 964 /*
ian@0 965 * Shared signal handlers imply shared VM. By way of the above,
ian@0 966 * thread groups also imply shared VM. Blocking this case allows
ian@0 967 * for various simplifications in other code.
ian@0 968 */
ian@0 969 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
ian@0 970 return ERR_PTR(-EINVAL);
ian@0 971
ian@0 972 retval = security_task_create(clone_flags);
ian@0 973 if (retval)
ian@0 974 goto fork_out;
ian@0 975
ian@0 976 retval = -ENOMEM;
ian@0 977 p = dup_task_struct(current);
ian@0 978 if (!p)
ian@0 979 goto fork_out;
ian@0 980
ian@0 981 #ifdef CONFIG_TRACE_IRQFLAGS
ian@0 982 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
ian@0 983 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
ian@0 984 #endif
ian@0 985 retval = -EAGAIN;
ian@0 986 if (atomic_read(&p->user->processes) >=
ian@0 987 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
ian@0 988 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
ian@0 989 p->user != &root_user)
ian@0 990 goto bad_fork_free;
ian@0 991 }
ian@0 992
ian@0 993 atomic_inc(&p->user->__count);
ian@0 994 atomic_inc(&p->user->processes);
ian@0 995 get_group_info(p->group_info);
ian@0 996
ian@0 997 /*
ian@0 998 * If multiple threads are within copy_process(), then this check
ian@0 999 * triggers too late. This doesn't hurt; the check is only there
ian@0 1000 * to stop root fork bombs.
ian@0 1001 */
ian@0 1002 if (nr_threads >= max_threads)
ian@0 1003 goto bad_fork_cleanup_count;
ian@0 1004
ian@0 1005 if (!try_module_get(task_thread_info(p)->exec_domain->module))
ian@0 1006 goto bad_fork_cleanup_count;
ian@0 1007
ian@0 1008 if (p->binfmt && !try_module_get(p->binfmt->module))
ian@0 1009 goto bad_fork_cleanup_put_domain;
ian@0 1010
ian@0 1011 p->did_exec = 0;
ian@0 1012 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
ian@0 1013 copy_flags(clone_flags, p);
ian@0 1014 p->pid = pid;
ian@0 1015 retval = -EFAULT;
ian@0 1016 if (clone_flags & CLONE_PARENT_SETTID)
ian@0 1017 if (put_user(p->pid, parent_tidptr))
ian@0 1018 goto bad_fork_cleanup_delays_binfmt;
ian@0 1019
ian@0 1020 INIT_LIST_HEAD(&p->children);
ian@0 1021 INIT_LIST_HEAD(&p->sibling);
ian@0 1022 p->vfork_done = NULL;
ian@0 1023 spin_lock_init(&p->alloc_lock);
ian@0 1024
ian@0 1025 clear_tsk_thread_flag(p, TIF_SIGPENDING);
ian@0 1026 init_sigpending(&p->pending);
ian@0 1027
ian@0 1028 p->utime = cputime_zero;
ian@0 1029 p->stime = cputime_zero;
ian@0 1030 p->sched_time = 0;
ian@0 1031 p->rchar = 0; /* I/O counter: bytes read */
ian@0 1032 p->wchar = 0; /* I/O counter: bytes written */
ian@0 1033 p->syscr = 0; /* I/O counter: read syscalls */
ian@0 1034 p->syscw = 0; /* I/O counter: write syscalls */
ian@0 1035 acct_clear_integrals(p);
ian@0 1036
ian@0 1037 p->it_virt_expires = cputime_zero;
ian@0 1038 p->it_prof_expires = cputime_zero;
ian@0 1039 p->it_sched_expires = 0;
ian@0 1040 INIT_LIST_HEAD(&p->cpu_timers[0]);
ian@0 1041 INIT_LIST_HEAD(&p->cpu_timers[1]);
ian@0 1042 INIT_LIST_HEAD(&p->cpu_timers[2]);
ian@0 1043
ian@0 1044 p->lock_depth = -1; /* -1 = no lock */
ian@0 1045 do_posix_clock_monotonic_gettime(&p->start_time);
ian@0 1046 p->security = NULL;
ian@0 1047 p->io_context = NULL;
ian@0 1048 p->io_wait = NULL;
ian@0 1049 p->audit_context = NULL;
ian@0 1050 cpuset_fork(p);
ian@0 1051 #ifdef CONFIG_NUMA
ian@0 1052 p->mempolicy = mpol_copy(p->mempolicy);
ian@0 1053 if (IS_ERR(p->mempolicy)) {
ian@0 1054 retval = PTR_ERR(p->mempolicy);
ian@0 1055 p->mempolicy = NULL;
ian@0 1056 goto bad_fork_cleanup_cpuset;
ian@0 1057 }
ian@0 1058 mpol_fix_fork_child_flag(p);
ian@0 1059 #endif
ian@0 1060 #ifdef CONFIG_TRACE_IRQFLAGS
ian@0 1061 p->irq_events = 0;
ian@0 1062 p->hardirqs_enabled = 0;
ian@0 1063 p->hardirq_enable_ip = 0;
ian@0 1064 p->hardirq_enable_event = 0;
ian@0 1065 p->hardirq_disable_ip = _THIS_IP_;
ian@0 1066 p->hardirq_disable_event = 0;
ian@0 1067 p->softirqs_enabled = 1;
ian@0 1068 p->softirq_enable_ip = _THIS_IP_;
ian@0 1069 p->softirq_enable_event = 0;
ian@0 1070 p->softirq_disable_ip = 0;
ian@0 1071 p->softirq_disable_event = 0;
ian@0 1072 p->hardirq_context = 0;
ian@0 1073 p->softirq_context = 0;
ian@0 1074 #endif
ian@0 1075 #ifdef CONFIG_LOCKDEP
ian@0 1076 p->lockdep_depth = 0; /* no locks held yet */
ian@0 1077 p->curr_chain_key = 0;
ian@0 1078 p->lockdep_recursion = 0;
ian@0 1079 #endif
ian@0 1080
ian@0 1081 rt_mutex_init_task(p);
ian@0 1082
ian@0 1083 #ifdef CONFIG_DEBUG_MUTEXES
ian@0 1084 p->blocked_on = NULL; /* not blocked yet */
ian@0 1085 #endif
ian@0 1086
ian@0 1087 p->tgid = p->pid;
ian@0 1088 if (clone_flags & CLONE_THREAD)
ian@0 1089 p->tgid = current->tgid;
ian@0 1090
ian@0 1091 if ((retval = security_task_alloc(p)))
ian@0 1092 goto bad_fork_cleanup_policy;
ian@0 1093 if ((retval = audit_alloc(p)))
ian@0 1094 goto bad_fork_cleanup_security;
ian@0 1095 /* copy all the process information */
ian@0 1096 if ((retval = copy_semundo(clone_flags, p)))
ian@0 1097 goto bad_fork_cleanup_audit;
ian@0 1098 if ((retval = copy_files(clone_flags, p)))
ian@0 1099 goto bad_fork_cleanup_semundo;
ian@0 1100 if ((retval = copy_fs(clone_flags, p)))
ian@0 1101 goto bad_fork_cleanup_files;
ian@0 1102 if ((retval = copy_sighand(clone_flags, p)))
ian@0 1103 goto bad_fork_cleanup_fs;
ian@0 1104 if ((retval = copy_signal(clone_flags, p)))
ian@0 1105 goto bad_fork_cleanup_sighand;
ian@0 1106 if ((retval = copy_mm(clone_flags, p)))
ian@0 1107 goto bad_fork_cleanup_signal;
ian@0 1108 if ((retval = copy_keys(clone_flags, p)))
ian@0 1109 goto bad_fork_cleanup_mm;
ian@0 1110 if ((retval = copy_namespace(clone_flags, p)))
ian@0 1111 goto bad_fork_cleanup_keys;
ian@0 1112 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
ian@0 1113 if (retval)
ian@0 1114 goto bad_fork_cleanup_namespace;
ian@0 1115
ian@0 1116 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
ian@0 1117 /*
ian@0 1118 * Clear TID on mm_release()?
ian@0 1119 */
ian@0 1120 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
ian@0 1121 p->robust_list = NULL;
ian@0 1122 #ifdef CONFIG_COMPAT
ian@0 1123 p->compat_robust_list = NULL;
ian@0 1124 #endif
ian@0 1125 INIT_LIST_HEAD(&p->pi_state_list);
ian@0 1126 p->pi_state_cache = NULL;
ian@0 1127
ian@0 1128 /*
ian@0 1129 * sigaltstack should be cleared when sharing the same VM
ian@0 1130 */
ian@0 1131 if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
ian@0 1132 p->sas_ss_sp = p->sas_ss_size = 0;
ian@0 1133
ian@0 1134 /*
ian@0 1135 * Syscall tracing should be turned off in the child regardless
ian@0 1136 * of CLONE_PTRACE.
ian@0 1137 */
ian@0 1138 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
ian@0 1139 #ifdef TIF_SYSCALL_EMU
ian@0 1140 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
ian@0 1141 #endif
ian@0 1142
ian@0 1143 /* Our parent execution domain becomes the current domain.
ian@0 1144 These must match for thread signalling to apply. */
ian@0 1145
ian@0 1146 p->parent_exec_id = p->self_exec_id;
ian@0 1147
ian@0 1148 /* ok, now we should be set up.. */
ian@0 1149 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
ian@0 1150 p->pdeath_signal = 0;
ian@0 1151 p->exit_state = 0;
ian@0 1152
ian@0 1153 /*
ian@0 1154 * Ok, make it visible to the rest of the system.
ian@0 1155 * We don't wake it up yet.
ian@0 1156 */
ian@0 1157 p->group_leader = p;
ian@0 1158 INIT_LIST_HEAD(&p->thread_group);
ian@0 1159 INIT_LIST_HEAD(&p->ptrace_children);
ian@0 1160 INIT_LIST_HEAD(&p->ptrace_list);
ian@0 1161
ian@0 1162 /* Perform scheduler related setup. Assign this task to a CPU. */
ian@0 1163 sched_fork(p, clone_flags);
ian@0 1164
ian@0 1165 /* Need tasklist lock for parent etc handling! */
ian@0 1166 write_lock_irq(&tasklist_lock);
ian@0 1167
ian@0 1168 /*
ian@0 1169 * The task hasn't been attached yet, so its cpus_allowed mask will
ian@0 1170 * not be changed, nor will its assigned CPU.
ian@0 1171 *
ian@0 1172 * The cpus_allowed mask of the parent may have changed after it was
ian@0 1173 * copied the first time - so re-copy it here, then check the child's
ian@0 1174 * CPU to ensure it is on a valid CPU (and if not, just force it back
ian@0 1175 * to the parent's CPU). This avoids a lot of nasty races.
ian@0 1176 */
ian@0 1177 p->cpus_allowed = current->cpus_allowed;
ian@0 1178 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
ian@0 1179 !cpu_online(task_cpu(p))))
ian@0 1180 set_task_cpu(p, smp_processor_id());
ian@0 1181
ian@0 1182 /* CLONE_PARENT re-uses the old parent */
ian@0 1183 if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
ian@0 1184 p->real_parent = current->real_parent;
ian@0 1185 else
ian@0 1186 p->real_parent = current;
ian@0 1187 p->parent = p->real_parent;
ian@0 1188
ian@0 1189 spin_lock(&current->sighand->siglock);
ian@0 1190
ian@0 1191 /*
ian@0 1192 * Process group and session signals need to be delivered to just the
ian@0 1193 * parent before the fork or both the parent and the child after the
ian@0 1194 * fork. Restart if a signal comes in before we add the new process to
ian@0 1195 * its process group.
ian@0 1196 * A fatal signal pending means that current will exit, so the new
ian@0 1197 * thread can't slip out of an OOM kill (or normal SIGKILL).
ian@0 1198 */
ian@0 1199 recalc_sigpending();
ian@0 1200 if (signal_pending(current)) {
ian@0 1201 spin_unlock(&current->sighand->siglock);
ian@0 1202 write_unlock_irq(&tasklist_lock);
ian@0 1203 retval = -ERESTARTNOINTR;
ian@0 1204 goto bad_fork_cleanup_namespace;
ian@0 1205 }
ian@0 1206
ian@0 1207 if (clone_flags & CLONE_THREAD) {
ian@0 1208 p->group_leader = current->group_leader;
ian@0 1209 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
ian@0 1210
ian@0 1211 if (!cputime_eq(current->signal->it_virt_expires,
ian@0 1212 cputime_zero) ||
ian@0 1213 !cputime_eq(current->signal->it_prof_expires,
ian@0 1214 cputime_zero) ||
ian@0 1215 current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
ian@0 1216 !list_empty(&current->signal->cpu_timers[0]) ||
ian@0 1217 !list_empty(&current->signal->cpu_timers[1]) ||
ian@0 1218 !list_empty(&current->signal->cpu_timers[2])) {
ian@0 1219 /*
ian@0 1220 * Have child wake up on its first tick to check
ian@0 1221 * for process CPU timers.
ian@0 1222 */
ian@0 1223 p->it_prof_expires = jiffies_to_cputime(1);
ian@0 1224 }
ian@0 1225 }
ian@0 1226
ian@0 1227 /*
ian@0 1228 * inherit ioprio
ian@0 1229 */
ian@0 1230 p->ioprio = current->ioprio;
ian@0 1231
ian@0 1232 if (likely(p->pid)) {
ian@0 1233 add_parent(p);
ian@0 1234 if (unlikely(p->ptrace & PT_PTRACED))
ian@0 1235 __ptrace_link(p, current->parent);
ian@0 1236
ian@0 1237 if (thread_group_leader(p)) {
ian@0 1238 p->signal->tty = current->signal->tty;
ian@0 1239 p->signal->pgrp = process_group(current);
ian@0 1240 p->signal->session = current->signal->session;
ian@0 1241 attach_pid(p, PIDTYPE_PGID, process_group(p));
ian@0 1242 attach_pid(p, PIDTYPE_SID, p->signal->session);
ian@0 1243
ian@0 1244 list_add_tail_rcu(&p->tasks, &init_task.tasks);
ian@0 1245 __get_cpu_var(process_counts)++;
ian@0 1246 }
ian@0 1247 attach_pid(p, PIDTYPE_PID, p->pid);
ian@0 1248 nr_threads++;
ian@0 1249 }
ian@0 1250
ian@0 1251 total_forks++;
ian@0 1252 spin_unlock(&current->sighand->siglock);
ian@0 1253 write_unlock_irq(&tasklist_lock);
ian@0 1254 proc_fork_connector(p);
ian@0 1255 return p;
ian@0 1256
ian@0 1257 bad_fork_cleanup_namespace:
ian@0 1258 exit_namespace(p);
ian@0 1259 bad_fork_cleanup_keys:
ian@0 1260 exit_keys(p);
ian@0 1261 bad_fork_cleanup_mm:
ian@0 1262 if (p->mm)
ian@0 1263 mmput(p->mm);
ian@0 1264 bad_fork_cleanup_signal:
ian@0 1265 cleanup_signal(p);
ian@0 1266 bad_fork_cleanup_sighand:
ian@0 1267 __cleanup_sighand(p->sighand);
ian@0 1268 bad_fork_cleanup_fs:
ian@0 1269 exit_fs(p); /* blocking */
ian@0 1270 bad_fork_cleanup_files:
ian@0 1271 exit_files(p); /* blocking */
ian@0 1272 bad_fork_cleanup_semundo:
ian@0 1273 exit_sem(p);
ian@0 1274 bad_fork_cleanup_audit:
ian@0 1275 audit_free(p);
ian@0 1276 bad_fork_cleanup_security:
ian@0 1277 security_task_free(p);
ian@0 1278 bad_fork_cleanup_policy:
ian@0 1279 #ifdef CONFIG_NUMA
ian@0 1280 mpol_free(p->mempolicy);
ian@0 1281 bad_fork_cleanup_cpuset:
ian@0 1282 #endif
ian@0 1283 cpuset_exit(p);
ian@0 1284 bad_fork_cleanup_delays_binfmt:
ian@0 1285 delayacct_tsk_free(p);
ian@0 1286 if (p->binfmt)
ian@0 1287 module_put(p->binfmt->module);
ian@0 1288 bad_fork_cleanup_put_domain:
ian@0 1289 module_put(task_thread_info(p)->exec_domain->module);
ian@0 1290 bad_fork_cleanup_count:
ian@0 1291 put_group_info(p->group_info);
ian@0 1292 atomic_dec(&p->user->processes);
ian@0 1293 free_uid(p->user);
ian@0 1294 bad_fork_free:
ian@0 1295 free_task(p);
ian@0 1296 fork_out:
ian@0 1297 return ERR_PTR(retval);
ian@0 1298 }
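
The bad_fork_* ladder above is the standard kernel unwind idiom: resources are acquired in a fixed order and, on failure, released in exactly the reverse order, with each error jumping to the label that undoes everything acquired so far. A standalone sketch of the shape (acquire_a/acquire_b and their release counterparts are hypothetical):

int acquire_a(void);            /* hypothetical helpers */
int acquire_b(void);
void release_a(void);

static int setup_two_resources(void)
{
        int err;

        err = acquire_a();
        if (err)
                goto out;
        err = acquire_b();
        if (err)
                goto undo_a;    /* b failed: only a needs undoing */
        return 0;

undo_a:
        release_a();
out:
        return err;
}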
ian@0 1299
ian@0 1300 struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
ian@0 1301 {
ian@0 1302 memset(regs, 0, sizeof(struct pt_regs));
ian@0 1303 return regs;
ian@0 1304 }
ian@0 1305
ian@0 1306 struct task_struct * __devinit fork_idle(int cpu)
ian@0 1307 {
ian@0 1308 struct task_struct *task;
ian@0 1309 struct pt_regs regs;
ian@0 1310
ian@0 1311 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
ian@0 1312 if (!task)
ian@0 1313 return ERR_PTR(-ENOMEM);
ian@0 1314 init_idle(task, cpu);
ian@0 1315
ian@0 1316 return task;
ian@0 1317 }
ian@0 1318
ian@0 1319 static inline int fork_traceflag (unsigned clone_flags)
ian@0 1320 {
ian@0 1321 if (clone_flags & CLONE_UNTRACED)
ian@0 1322 return 0;
ian@0 1323 else if (clone_flags & CLONE_VFORK) {
ian@0 1324 if (current->ptrace & PT_TRACE_VFORK)
ian@0 1325 return PTRACE_EVENT_VFORK;
ian@0 1326 } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
ian@0 1327 if (current->ptrace & PT_TRACE_CLONE)
ian@0 1328 return PTRACE_EVENT_CLONE;
ian@0 1329 } else if (current->ptrace & PT_TRACE_FORK)
ian@0 1330 return PTRACE_EVENT_FORK;
ian@0 1331
ian@0 1332 return 0;
ian@0 1333 }
ian@0 1334
ian@0 1335 /*
ian@0 1336 * Ok, this is the main fork-routine.
ian@0 1337 *
ian@0 1338 * It copies the process, and if successful kick-starts
ian@0 1339 * it and waits for it to finish using the VM if required.
ian@0 1340 */
ian@0 1341 long do_fork(unsigned long clone_flags,
ian@0 1342 unsigned long stack_start,
ian@0 1343 struct pt_regs *regs,
ian@0 1344 unsigned long stack_size,
ian@0 1345 int __user *parent_tidptr,
ian@0 1346 int __user *child_tidptr)
ian@0 1347 {
ian@0 1348 struct task_struct *p;
ian@0 1349 int trace = 0;
ian@0 1350 struct pid *pid = alloc_pid();
ian@0 1351 long nr;
ian@0 1352
ian@0 1353 if (!pid)
ian@0 1354 return -EAGAIN;
ian@0 1355 nr = pid->nr;
ian@0 1356 if (unlikely(current->ptrace)) {
ian@0 1357 trace = fork_traceflag (clone_flags);
ian@0 1358 if (trace)
ian@0 1359 clone_flags |= CLONE_PTRACE;
ian@0 1360 }
ian@0 1361
ian@0 1362 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
ian@0 1363 /*
ian@0 1364 * Do this prior to waking up the new thread - the thread pointer
ian@0 1365 * might become invalid after that point, if the thread exits quickly.
ian@0 1366 */
ian@0 1367 if (!IS_ERR(p)) {
ian@0 1368 struct completion vfork;
ian@0 1369
ian@0 1370 if (clone_flags & CLONE_VFORK) {
ian@0 1371 p->vfork_done = &vfork;
ian@0 1372 init_completion(&vfork);
ian@0 1373 }
ian@0 1374
ian@0 1375 if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
ian@0 1376 /*
ian@0 1377 * We'll start up with an immediate SIGSTOP.
ian@0 1378 */
ian@0 1379 sigaddset(&p->pending.signal, SIGSTOP);
ian@0 1380 set_tsk_thread_flag(p, TIF_SIGPENDING);
ian@0 1381 }
ian@0 1382
ian@0 1383 if (!(clone_flags & CLONE_STOPPED))
ian@0 1384 wake_up_new_task(p, clone_flags);
ian@0 1385 else
ian@0 1386 p->state = TASK_STOPPED;
ian@0 1387
ian@0 1388 if (unlikely (trace)) {
ian@0 1389 current->ptrace_message = nr;
ian@0 1390 ptrace_notify ((trace << 8) | SIGTRAP);
ian@0 1391 }
ian@0 1392
ian@0 1393 if (clone_flags & CLONE_VFORK) {
ian@0 1394 wait_for_completion(&vfork);
ian@0 1395 if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
ian@0 1396 current->ptrace_message = nr;
ian@0 1397 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
ian@0 1398 }
ian@0 1399 }
ian@0 1400 } else {
ian@0 1401 free_pid(pid);
ian@0 1402 nr = PTR_ERR(p);
ian@0 1403 }
ian@0 1404 return nr;
ian@0 1405 }
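
For orientation: the architecture entry points conventionally funnel into do_fork() with flag sets along these lines (the exact stack/register arguments vary per architecture):

/* fork():        do_fork(SIGCHLD, ...)
 * vfork():       do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, ...)
 * NPTL thread:   clone() with CLONE_VM | CLONE_FS | CLONE_FILES |
 *                CLONE_SIGHAND | CLONE_THREAD, among others
 */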
ian@0 1406
ian@0 1407 #ifndef ARCH_MIN_MMSTRUCT_ALIGN
ian@0 1408 #define ARCH_MIN_MMSTRUCT_ALIGN 0
ian@0 1409 #endif
ian@0 1410
ian@0 1411 static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
ian@0 1412 {
ian@0 1413 struct sighand_struct *sighand = data;
ian@0 1414
ian@0 1415 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
ian@0 1416 SLAB_CTOR_CONSTRUCTOR)
ian@0 1417 spin_lock_init(&sighand->siglock);
ian@0 1418 }
ian@0 1419
ian@0 1420 void __init proc_caches_init(void)
ian@0 1421 {
ian@0 1422 sighand_cachep = kmem_cache_create("sighand_cache",
ian@0 1423 sizeof(struct sighand_struct), 0,
ian@0 1424 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
ian@0 1425 sighand_ctor, NULL);
ian@0 1426 signal_cachep = kmem_cache_create("signal_cache",
ian@0 1427 sizeof(struct signal_struct), 0,
ian@0 1428 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
ian@0 1429 files_cachep = kmem_cache_create("files_cache",
ian@0 1430 sizeof(struct files_struct), 0,
ian@0 1431 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
ian@0 1432 fs_cachep = kmem_cache_create("fs_cache",
ian@0 1433 sizeof(struct fs_struct), 0,
ian@0 1434 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
ian@0 1435 vm_area_cachep = kmem_cache_create("vm_area_struct",
ian@0 1436 sizeof(struct vm_area_struct), 0,
ian@0 1437 SLAB_PANIC, NULL, NULL);
ian@0 1438 mm_cachep = kmem_cache_create("mm_struct",
ian@0 1439 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
ian@0 1440 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
ian@0 1441 }
ian@0 1442
ian@0 1443
ian@0 1444 /*
ian@0 1445 * Check constraints on flags passed to the unshare system call and
ian@0 1446 * force unsharing of additional process context as appropriate.
ian@0 1447 */
ian@0 1448 static inline void check_unshare_flags(unsigned long *flags_ptr)
ian@0 1449 {
ian@0 1450 /*
ian@0 1451 * If unsharing a thread from a thread group, must also
ian@0 1452 * unshare vm.
ian@0 1453 */
ian@0 1454 if (*flags_ptr & CLONE_THREAD)
ian@0 1455 *flags_ptr |= CLONE_VM;
ian@0 1456
ian@0 1457 /*
ian@0 1458 * If unsharing vm, must also unshare signal handlers.
ian@0 1459 */
ian@0 1460 if (*flags_ptr & CLONE_VM)
ian@0 1461 *flags_ptr |= CLONE_SIGHAND;
ian@0 1462
ian@0 1463 /*
ian@0 1464 * If unsharing signal handlers and the task was created
ian@0 1465 * using CLONE_THREAD, then we must unshare the thread as well.
ian@0 1466 */
ian@0 1467 if ((*flags_ptr & CLONE_SIGHAND) &&
ian@0 1468 (atomic_read(&current->signal->count) > 1))
ian@0 1469 *flags_ptr |= CLONE_THREAD;
ian@0 1470
ian@0 1471 /*
ian@0 1472 * If unsharing namespace, must also unshare filesystem information.
ian@0 1473 */
ian@0 1474 if (*flags_ptr & CLONE_NEWNS)
ian@0 1475 *flags_ptr |= CLONE_FS;
ian@0 1476 }
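
Seen from userspace, these implications mean one flag can unshare more than was literally requested. A minimal sketch for a process that wants a private mount namespace (requires CAP_SYS_ADMIN, as enforced in unshare_namespace() below; the function name here is hypothetical):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

static int enter_private_mount_ns(void)
{
        /* CLONE_NEWNS implies CLONE_FS (see above), so the fs_struct
         * (root, cwd, umask) is unshared along with the namespace. */
        if (unshare(CLONE_NEWNS) != 0) {
                perror("unshare(CLONE_NEWNS)");
                return -1;
        }
        return 0;
}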
ian@0 1477
ian@0 1478 /*
ian@0 1479 * Unsharing of tasks created with CLONE_THREAD is not supported yet
ian@0 1480 */
ian@0 1481 static int unshare_thread(unsigned long unshare_flags)
ian@0 1482 {
ian@0 1483 if (unshare_flags & CLONE_THREAD)
ian@0 1484 return -EINVAL;
ian@0 1485
ian@0 1486 return 0;
ian@0 1487 }
ian@0 1488
ian@0 1489 /*
ian@0 1490 * Unshare the filesystem structure if it is being shared
ian@0 1491 */
ian@0 1492 static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
ian@0 1493 {
ian@0 1494 struct fs_struct *fs = current->fs;
ian@0 1495
ian@0 1496 if ((unshare_flags & CLONE_FS) &&
ian@0 1497 (fs && atomic_read(&fs->count) > 1)) {
ian@0 1498 *new_fsp = __copy_fs_struct(current->fs);
ian@0 1499 if (!*new_fsp)
ian@0 1500 return -ENOMEM;
ian@0 1501 }
ian@0 1502
ian@0 1503 return 0;
ian@0 1504 }
ian@0 1505
ian@0 1506 /*
ian@0 1507 * Unshare the namespace structure if it is being shared
ian@0 1508 */
ian@0 1509 static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
ian@0 1510 {
ian@0 1511 struct namespace *ns = current->namespace;
ian@0 1512
ian@0 1513 if ((unshare_flags & CLONE_NEWNS) &&
ian@0 1514 (ns && atomic_read(&ns->count) > 1)) {
ian@0 1515 if (!capable(CAP_SYS_ADMIN))
ian@0 1516 return -EPERM;
ian@0 1517
ian@0 1518 *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
ian@0 1519 if (!*new_nsp)
ian@0 1520 return -ENOMEM;
ian@0 1521 }
ian@0 1522
ian@0 1523 return 0;
ian@0 1524 }
ian@0 1525
ian@0 1526 /*
ian@0 1527 * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
ian@0 1528 * supported yet
ian@0 1529 */
ian@0 1530 static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
ian@0 1531 {
ian@0 1532 struct sighand_struct *sigh = current->sighand;
ian@0 1533
ian@0 1534 if ((unshare_flags & CLONE_SIGHAND) &&
ian@0 1535 (sigh && atomic_read(&sigh->count) > 1))
ian@0 1536 return -EINVAL;
ian@0 1537 else
ian@0 1538 return 0;
ian@0 1539 }
ian@0 1540
ian@0 1541 /*
ian@0 1542 * Unshare vm if it is being shared
ian@0 1543 */
ian@0 1544 static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
ian@0 1545 {
ian@0 1546 struct mm_struct *mm = current->mm;
ian@0 1547
ian@0 1548 if ((unshare_flags & CLONE_VM) &&
ian@0 1549 (mm && atomic_read(&mm->mm_users) > 1)) {
ian@0 1550 return -EINVAL;
ian@0 1551 }
ian@0 1552
ian@0 1553 return 0;
ian@0 1554 }
ian@0 1555
ian@0 1556 /*
ian@0 1557 * Unshare file descriptor table if it is being shared
ian@0 1558 */
ian@0 1559 static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
ian@0 1560 {
ian@0 1561 struct files_struct *fd = current->files;
ian@0 1562 int error = 0;
ian@0 1563
ian@0 1564 if ((unshare_flags & CLONE_FILES) &&
ian@0 1565 (fd && atomic_read(&fd->count) > 1)) {
ian@0 1566 *new_fdp = dup_fd(fd, &error);
ian@0 1567 if (!*new_fdp)
ian@0 1568 return error;
ian@0 1569 }
ian@0 1570
ian@0 1571 return 0;
ian@0 1572 }
ian@0 1573
ian@0 1574 /*
ian@0 1575 * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
ian@0 1576 * supported yet
ian@0 1577 */
ian@0 1578 static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
ian@0 1579 {
ian@0 1580 if (unshare_flags & CLONE_SYSVSEM)
ian@0 1581 return -EINVAL;
ian@0 1582
ian@0 1583 return 0;
ian@0 1584 }
ian@0 1585
ian@0 1586 /*
ian@0 1587 * unshare allows a process to 'unshare' part of the process
ian@0 1588 * context which was originally shared using clone. copy_*
ian@0 1589 * functions used by do_fork() cannot be used here directly
ian@0 1590 * because they modify an inactive task_struct that is being
ian@0 1591 * constructed. Here we are modifying the current, active,
ian@0 1592 * task_struct.
ian@0 1593 */
ian@0 1594 asmlinkage long sys_unshare(unsigned long unshare_flags)
ian@0 1595 {
ian@0 1596 int err = 0;
ian@0 1597 struct fs_struct *fs, *new_fs = NULL;
ian@0 1598 struct namespace *ns, *new_ns = NULL;
ian@0 1599 struct sighand_struct *sigh, *new_sigh = NULL;
ian@0 1600 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
ian@0 1601 struct files_struct *fd, *new_fd = NULL;
ian@0 1602 struct sem_undo_list *new_ulist = NULL;
ian@0 1603
ian@0 1604 check_unshare_flags(&unshare_flags);
ian@0 1605
ian@0 1606 /* Return -EINVAL for all unsupported flags */
ian@0 1607 err = -EINVAL;
ian@0 1608 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
ian@0 1609 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
ian@0 1610 goto bad_unshare_out;
ian@0 1611
ian@0 1612 if ((err = unshare_thread(unshare_flags)))
ian@0 1613 goto bad_unshare_out;
ian@0 1614 if ((err = unshare_fs(unshare_flags, &new_fs)))
ian@0 1615 goto bad_unshare_cleanup_thread;
ian@0 1616 if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
ian@0 1617 goto bad_unshare_cleanup_fs;
ian@0 1618 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
ian@0 1619 goto bad_unshare_cleanup_ns;
ian@0 1620 if ((err = unshare_vm(unshare_flags, &new_mm)))
ian@0 1621 goto bad_unshare_cleanup_sigh;
ian@0 1622 if ((err = unshare_fd(unshare_flags, &new_fd)))
ian@0 1623 goto bad_unshare_cleanup_vm;
ian@0 1624 if ((err = unshare_semundo(unshare_flags, &new_ulist)))
ian@0 1625 goto bad_unshare_cleanup_fd;
ian@0 1626
ian@0 1627 if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
ian@0 1628
ian@0 1629 task_lock(current);
ian@0 1630
ian@0 1631 if (new_fs) {
ian@0 1632 fs = current->fs;
ian@0 1633 current->fs = new_fs;
ian@0 1634 new_fs = fs;
ian@0 1635 }
ian@0 1636
ian@0 1637 if (new_ns) {
ian@0 1638 ns = current->namespace;
ian@0 1639 current->namespace = new_ns;
ian@0 1640 new_ns = ns;
ian@0 1641 }
ian@0 1642
ian@0 1643 if (new_sigh) {
ian@0 1644 sigh = current->sighand;
ian@0 1645 rcu_assign_pointer(current->sighand, new_sigh);
ian@0 1646 new_sigh = sigh;
ian@0 1647 }
ian@0 1648
ian@0 1649 if (new_mm) {
ian@0 1650 mm = current->mm;
ian@0 1651 active_mm = current->active_mm;
ian@0 1652 current->mm = new_mm;
ian@0 1653 current->active_mm = new_mm;
ian@0 1654 activate_mm(active_mm, new_mm);
ian@0 1655 new_mm = mm;
ian@0 1656 }
ian@0 1657
ian@0 1658 if (new_fd) {
ian@0 1659 fd = current->files;
ian@0 1660 current->files = new_fd;
ian@0 1661 new_fd = fd;
ian@0 1662 }
ian@0 1663
ian@0 1664 task_unlock(current);
ian@0 1665 }
ian@0 1666
ian@0 1667 bad_unshare_cleanup_fd:
ian@0 1668 if (new_fd)
ian@0 1669 put_files_struct(new_fd);
ian@0 1670
ian@0 1671 bad_unshare_cleanup_vm:
ian@0 1672 if (new_mm)
ian@0 1673 mmput(new_mm);
ian@0 1674
ian@0 1675 bad_unshare_cleanup_sigh:
ian@0 1676 if (new_sigh)
ian@0 1677 if (atomic_dec_and_test(&new_sigh->count))
ian@0 1678 kmem_cache_free(sighand_cachep, new_sigh);
ian@0 1679
ian@0 1680 bad_unshare_cleanup_ns:
ian@0 1681 if (new_ns)
ian@0 1682 put_namespace(new_ns);
ian@0 1683
ian@0 1684 bad_unshare_cleanup_fs:
ian@0 1685 if (new_fs)
ian@0 1686 put_fs_struct(new_fs);
ian@0 1687
ian@0 1688 bad_unshare_cleanup_thread:
ian@0 1689 bad_unshare_out:
ian@0 1690 return err;
ian@0 1691 }