direct-io.hg
changeset 4157:2c4ca5aad6c4
bitkeeper revision 1.1245 (4236f622mMlu4s1f6bmCbV2qW4kvjw)
added 2.4 batch mode
Signed-off-by: michael.fetterman@cl.cam.ac.uk
author   | rneugeba@wyvis.research.intel-research.net |
date     | Tue Mar 15 14:50:10 2005 +0000 (2005-03-15) |
parents  | 47e1cb8a3d38 |
children | e379e05dfb91 0cf318b324fb |
files    | .rootkeys linux-2.6.10-xen-sparse/arch/xen/i386/mm/fault.c linux-2.6.10-xen-sparse/fs/exec.c linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h linux-2.6.10-xen-sparse/mm/highmem.c linux-2.6.10-xen-sparse/mm/memory.c linux-2.6.10-xen-sparse/mm/swapfile.c |
line diff
1.1  --- a/.rootkeys	Thu Mar 10 18:12:10 2005 +0000
1.2  +++ b/.rootkeys	Tue Mar 15 14:50:10 2005 +0000
1.3  @@ -230,6 +230,7 @@ 41ee5e8bSs3BGC7yegM_ek2Tn0Ahvw linux-2.6
1.4   41ee5e8bglvqKvZSY5uJ5JGQejEwyQ linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c
1.5   41ee5e8ckZ9xVNvu9NHIZDK7JqApmQ linux-2.6.10-xen-sparse/drivers/xen/usbfront/usbfront.c
1.6   41ee5e8ck9scpGirfqEZRARbGDyTXA linux-2.6.10-xen-sparse/drivers/xen/usbfront/xhci.h
1.7  +4236f620IqJ4VZVDPfMJzrpFrio8Sw linux-2.6.10-xen-sparse/fs/exec.c
1.8   412f47e4RKD-R5IS5gEXvcT8L4v8gA linux-2.6.10-xen-sparse/include/asm-generic/pgtable.h
1.9   40f56239YAjS52QG2FIAQpHDZAdGHg linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/desc.h
1.10  4107adf1E5O4ztGHNGMzCCNhcvqNow linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h
1.11  @@ -274,8 +275,10 @@ 419dfc609zbti8rqL60tL2dHXQ_rvQ linux-2.6
1.12   4124f66f4NaKNa0xPiGGykn9QaZk3w linux-2.6.10-xen-sparse/include/linux/skbuff.h
1.13   419dfc6awx7w88wk6cG9P3mPidX6LQ linux-2.6.10-xen-sparse/kernel/irq/manage.c
1.14   40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.10-xen-sparse/mkbuildtree
1.15  +4236f620IaM-42pgVYuNGF4cFrttbw linux-2.6.10-xen-sparse/mm/highmem.c
1.16   412f46c0LJuKAgSPGoC0Z1DEkLfuLA linux-2.6.10-xen-sparse/mm/memory.c
1.17   410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.10-xen-sparse/mm/page_alloc.c
1.18  +4236f620F2ZXlYSPUkwtN85tZMqDFQ linux-2.6.10-xen-sparse/mm/swapfile.c
1.19   41505c572m-s9ATiO1LiD1GPznTTIg linux-2.6.10-xen-sparse/net/core/skbuff.c
1.20   413cb1e4zst25MDYjg63Y-NGC5_pLg netbsd-2.0-xen-sparse/Makefile
1.21   413cb1e5c_Mkxf_X0zimEhTKI_l4DA netbsd-2.0-xen-sparse/mkbuildtree
2.1  --- a/linux-2.6.10-xen-sparse/arch/xen/i386/mm/fault.c	Thu Mar 10 18:12:10 2005 +0000
2.2  +++ b/linux-2.6.10-xen-sparse/arch/xen/i386/mm/fault.c	Tue Mar 15 14:50:10 2005 +0000
2.3  @@ -231,6 +231,12 @@ fastcall void do_page_fault(struct pt_re
2.4   	error_code |= (regs->xcs & 2) << 1;
2.5   	if (regs->eflags & X86_EFLAGS_VM)
2.6   		error_code |= 4;
2.7  +
2.8  +#ifdef CONFIG_XEN_BATCH_MODE2
2.9  +	/* ensure all updates have completed */
2.10 +	flush_page_update_queue();
2.11 +#endif
2.12 +
2.13
2.14 	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
2.15 		       SIGSEGV) == NOTIFY_STOP)
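
Under CONFIG_XEN_BATCH_MODE2, set_pte() only queues a page-table update (see the pgtable-2level.h hunk further down); Xen does not apply it until the queue is flushed. The hunk above therefore drains the queue at the top of do_page_fault(), presumably so the fault is not decoded against page tables with updates still pending. A minimal sketch of that ordering, assuming the queue helper used in this tree (flush_page_update_queue()); the function name and body are illustrative, not code from the handler:

    /* Illustrative sketch only -- not the real do_page_fault(). */
    void xen_fault_entry_sketch(struct pt_regs *regs)
    {
    #ifdef CONFIG_XEN_BATCH_MODE2
    	/* PTE writes queued by set_pte() elsewhere may still be
    	 * pending; push them to Xen before decoding the fault. */
    	flush_page_update_queue();
    #endif
    	/* ... the existing handler continues: notify_die(),
    	 * find_vma(), handle_mm_fault(), ... */
    }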
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/linux-2.6.10-xen-sparse/fs/exec.c Tue Mar 15 14:50:10 2005 +0000 3.3 @@ -0,0 +1,1432 @@ 3.4 +/* 3.5 + * linux/fs/exec.c 3.6 + * 3.7 + * Copyright (C) 1991, 1992 Linus Torvalds 3.8 + */ 3.9 + 3.10 +/* 3.11 + * #!-checking implemented by tytso. 3.12 + */ 3.13 +/* 3.14 + * Demand-loading implemented 01.12.91 - no need to read anything but 3.15 + * the header into memory. The inode of the executable is put into 3.16 + * "current->executable", and page faults do the actual loading. Clean. 3.17 + * 3.18 + * Once more I can proudly say that linux stood up to being changed: it 3.19 + * was less than 2 hours work to get demand-loading completely implemented. 3.20 + * 3.21 + * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, 3.22 + * current->executable is only used by the procfs. This allows a dispatch 3.23 + * table to check for several different types of binary formats. We keep 3.24 + * trying until we recognize the file or we run out of supported binary 3.25 + * formats. 3.26 + */ 3.27 + 3.28 +#include <linux/config.h> 3.29 +#include <linux/slab.h> 3.30 +#include <linux/file.h> 3.31 +#include <linux/mman.h> 3.32 +#include <linux/a.out.h> 3.33 +#include <linux/stat.h> 3.34 +#include <linux/fcntl.h> 3.35 +#include <linux/smp_lock.h> 3.36 +#include <linux/init.h> 3.37 +#include <linux/pagemap.h> 3.38 +#include <linux/highmem.h> 3.39 +#include <linux/spinlock.h> 3.40 +#include <linux/key.h> 3.41 +#include <linux/personality.h> 3.42 +#include <linux/binfmts.h> 3.43 +#include <linux/swap.h> 3.44 +#include <linux/utsname.h> 3.45 +#include <linux/module.h> 3.46 +#include <linux/namei.h> 3.47 +#include <linux/proc_fs.h> 3.48 +#include <linux/ptrace.h> 3.49 +#include <linux/mount.h> 3.50 +#include <linux/security.h> 3.51 +#include <linux/syscalls.h> 3.52 +#include <linux/rmap.h> 3.53 + 3.54 +#include <asm/uaccess.h> 3.55 +#include <asm/mmu_context.h> 3.56 + 3.57 +#ifdef CONFIG_KMOD 3.58 +#include <linux/kmod.h> 3.59 +#endif 3.60 + 3.61 +int core_uses_pid; 3.62 +char core_pattern[65] = "core"; 3.63 +/* The maximal length of core_pattern is also specified in sysctl.c */ 3.64 + 3.65 +static struct linux_binfmt *formats; 3.66 +static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED; 3.67 + 3.68 +int register_binfmt(struct linux_binfmt * fmt) 3.69 +{ 3.70 + struct linux_binfmt ** tmp = &formats; 3.71 + 3.72 + if (!fmt) 3.73 + return -EINVAL; 3.74 + if (fmt->next) 3.75 + return -EBUSY; 3.76 + write_lock(&binfmt_lock); 3.77 + while (*tmp) { 3.78 + if (fmt == *tmp) { 3.79 + write_unlock(&binfmt_lock); 3.80 + return -EBUSY; 3.81 + } 3.82 + tmp = &(*tmp)->next; 3.83 + } 3.84 + fmt->next = formats; 3.85 + formats = fmt; 3.86 + write_unlock(&binfmt_lock); 3.87 + return 0; 3.88 +} 3.89 + 3.90 +EXPORT_SYMBOL(register_binfmt); 3.91 + 3.92 +int unregister_binfmt(struct linux_binfmt * fmt) 3.93 +{ 3.94 + struct linux_binfmt ** tmp = &formats; 3.95 + 3.96 + write_lock(&binfmt_lock); 3.97 + while (*tmp) { 3.98 + if (fmt == *tmp) { 3.99 + *tmp = fmt->next; 3.100 + write_unlock(&binfmt_lock); 3.101 + return 0; 3.102 + } 3.103 + tmp = &(*tmp)->next; 3.104 + } 3.105 + write_unlock(&binfmt_lock); 3.106 + return -EINVAL; 3.107 +} 3.108 + 3.109 +EXPORT_SYMBOL(unregister_binfmt); 3.110 + 3.111 +static inline void put_binfmt(struct linux_binfmt * fmt) 3.112 +{ 3.113 + module_put(fmt->module); 3.114 +} 3.115 + 3.116 +/* 3.117 + * Note that a shared library must be both readable and executable due to 3.118 + * security reasons. 
3.119 + * 3.120 + * Also note that we take the address to load from from the file itself. 3.121 + */ 3.122 +asmlinkage long sys_uselib(const char __user * library) 3.123 +{ 3.124 + struct file * file; 3.125 + struct nameidata nd; 3.126 + int error; 3.127 + 3.128 + nd.intent.open.flags = FMODE_READ; 3.129 + error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); 3.130 + if (error) 3.131 + goto out; 3.132 + 3.133 + error = -EINVAL; 3.134 + if (!S_ISREG(nd.dentry->d_inode->i_mode)) 3.135 + goto exit; 3.136 + 3.137 + error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC, &nd); 3.138 + if (error) 3.139 + goto exit; 3.140 + 3.141 + file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); 3.142 + error = PTR_ERR(file); 3.143 + if (IS_ERR(file)) 3.144 + goto out; 3.145 + 3.146 + error = -ENOEXEC; 3.147 + if(file->f_op) { 3.148 + struct linux_binfmt * fmt; 3.149 + 3.150 + read_lock(&binfmt_lock); 3.151 + for (fmt = formats ; fmt ; fmt = fmt->next) { 3.152 + if (!fmt->load_shlib) 3.153 + continue; 3.154 + if (!try_module_get(fmt->module)) 3.155 + continue; 3.156 + read_unlock(&binfmt_lock); 3.157 + error = fmt->load_shlib(file); 3.158 + read_lock(&binfmt_lock); 3.159 + put_binfmt(fmt); 3.160 + if (error != -ENOEXEC) 3.161 + break; 3.162 + } 3.163 + read_unlock(&binfmt_lock); 3.164 + } 3.165 + fput(file); 3.166 +out: 3.167 + return error; 3.168 +exit: 3.169 + path_release(&nd); 3.170 + goto out; 3.171 +} 3.172 + 3.173 +/* 3.174 + * count() counts the number of strings in array ARGV. 3.175 + */ 3.176 +static int count(char __user * __user * argv, int max) 3.177 +{ 3.178 + int i = 0; 3.179 + 3.180 + if (argv != NULL) { 3.181 + for (;;) { 3.182 + char __user * p; 3.183 + 3.184 + if (get_user(p, argv)) 3.185 + return -EFAULT; 3.186 + if (!p) 3.187 + break; 3.188 + argv++; 3.189 + if(++i > max) 3.190 + return -E2BIG; 3.191 + } 3.192 + } 3.193 + return i; 3.194 +} 3.195 + 3.196 +/* 3.197 + * 'copy_strings()' copies argument/environment strings from user 3.198 + * memory to free pages in kernel mem. These are in a format ready 3.199 + * to be put directly into the top of new user memory. 3.200 + */ 3.201 +int copy_strings(int argc,char __user * __user * argv, struct linux_binprm *bprm) 3.202 +{ 3.203 + struct page *kmapped_page = NULL; 3.204 + char *kaddr = NULL; 3.205 + int ret; 3.206 + 3.207 + while (argc-- > 0) { 3.208 + char __user *str; 3.209 + int len; 3.210 + unsigned long pos; 3.211 + 3.212 + if (get_user(str, argv+argc) || 3.213 + !(len = strnlen_user(str, bprm->p))) { 3.214 + ret = -EFAULT; 3.215 + goto out; 3.216 + } 3.217 + 3.218 + if (bprm->p < len) { 3.219 + ret = -E2BIG; 3.220 + goto out; 3.221 + } 3.222 + 3.223 + bprm->p -= len; 3.224 + /* XXX: add architecture specific overflow check here. 
*/ 3.225 + pos = bprm->p; 3.226 + 3.227 + while (len > 0) { 3.228 + int i, new, err; 3.229 + int offset, bytes_to_copy; 3.230 + struct page *page; 3.231 + 3.232 + offset = pos % PAGE_SIZE; 3.233 + i = pos/PAGE_SIZE; 3.234 + page = bprm->page[i]; 3.235 + new = 0; 3.236 + if (!page) { 3.237 + page = alloc_page(GFP_HIGHUSER); 3.238 + bprm->page[i] = page; 3.239 + if (!page) { 3.240 + ret = -ENOMEM; 3.241 + goto out; 3.242 + } 3.243 + new = 1; 3.244 + } 3.245 + 3.246 + if (page != kmapped_page) { 3.247 + if (kmapped_page) 3.248 + kunmap(kmapped_page); 3.249 + kmapped_page = page; 3.250 + kaddr = kmap(kmapped_page); 3.251 + } 3.252 + if (new && offset) 3.253 + memset(kaddr, 0, offset); 3.254 + bytes_to_copy = PAGE_SIZE - offset; 3.255 + if (bytes_to_copy > len) { 3.256 + bytes_to_copy = len; 3.257 + if (new) 3.258 + memset(kaddr+offset+len, 0, 3.259 + PAGE_SIZE-offset-len); 3.260 + } 3.261 + err = copy_from_user(kaddr+offset, str, bytes_to_copy); 3.262 + if (err) { 3.263 + ret = -EFAULT; 3.264 + goto out; 3.265 + } 3.266 + 3.267 + pos += bytes_to_copy; 3.268 + str += bytes_to_copy; 3.269 + len -= bytes_to_copy; 3.270 + } 3.271 + } 3.272 + ret = 0; 3.273 +out: 3.274 + if (kmapped_page) 3.275 + kunmap(kmapped_page); 3.276 + return ret; 3.277 +} 3.278 + 3.279 +/* 3.280 + * Like copy_strings, but get argv and its values from kernel memory. 3.281 + */ 3.282 +int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm) 3.283 +{ 3.284 + int r; 3.285 + mm_segment_t oldfs = get_fs(); 3.286 + set_fs(KERNEL_DS); 3.287 + r = copy_strings(argc, (char __user * __user *)argv, bprm); 3.288 + set_fs(oldfs); 3.289 + return r; 3.290 +} 3.291 + 3.292 +EXPORT_SYMBOL(copy_strings_kernel); 3.293 + 3.294 +#ifdef CONFIG_MMU 3.295 +/* 3.296 + * This routine is used to map in a page into an address space: needed by 3.297 + * execve() for the initial stack and environment pages. 3.298 + * 3.299 + * vma->vm_mm->mmap_sem is held for writing. 
3.300 + */ 3.301 +void install_arg_page(struct vm_area_struct *vma, 3.302 + struct page *page, unsigned long address) 3.303 +{ 3.304 + struct mm_struct *mm = vma->vm_mm; 3.305 + pgd_t * pgd; 3.306 + pmd_t * pmd; 3.307 + pte_t * pte; 3.308 + 3.309 + if (unlikely(anon_vma_prepare(vma))) 3.310 + goto out_sig; 3.311 + 3.312 + flush_dcache_page(page); 3.313 + pgd = pgd_offset(mm, address); 3.314 + 3.315 + spin_lock(&mm->page_table_lock); 3.316 + pmd = pmd_alloc(mm, pgd, address); 3.317 + if (!pmd) 3.318 + goto out; 3.319 + pte = pte_alloc_map(mm, pmd, address); 3.320 + if (!pte) 3.321 + goto out; 3.322 + if (!pte_none(*pte)) { 3.323 + pte_unmap(pte); 3.324 + goto out; 3.325 + } 3.326 + mm->rss++; 3.327 + lru_cache_add_active(page); 3.328 + set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte( 3.329 + page, vma->vm_page_prot)))); 3.330 +#ifdef CONFIG_XEN_BATCH_MODE2 3.331 + XEN_flush_page_update_queue(); 3.332 +#endif 3.333 + page_add_anon_rmap(page, vma, address); 3.334 + pte_unmap(pte); 3.335 + spin_unlock(&mm->page_table_lock); 3.336 + 3.337 + /* no need for flush_tlb */ 3.338 + return; 3.339 +out: 3.340 + spin_unlock(&mm->page_table_lock); 3.341 +out_sig: 3.342 + __free_page(page); 3.343 + force_sig(SIGKILL, current); 3.344 +} 3.345 + 3.346 +int setup_arg_pages(struct linux_binprm *bprm, int executable_stack) 3.347 +{ 3.348 + unsigned long stack_base; 3.349 + struct vm_area_struct *mpnt; 3.350 + struct mm_struct *mm = current->mm; 3.351 + int i, ret; 3.352 + long arg_size; 3.353 + 3.354 +#ifdef CONFIG_STACK_GROWSUP 3.355 + /* Move the argument and environment strings to the bottom of the 3.356 + * stack space. 3.357 + */ 3.358 + int offset, j; 3.359 + char *to, *from; 3.360 + 3.361 + /* Start by shifting all the pages down */ 3.362 + i = 0; 3.363 + for (j = 0; j < MAX_ARG_PAGES; j++) { 3.364 + struct page *page = bprm->page[j]; 3.365 + if (!page) 3.366 + continue; 3.367 + bprm->page[i++] = page; 3.368 + } 3.369 + 3.370 + /* Now move them within their pages */ 3.371 + offset = bprm->p % PAGE_SIZE; 3.372 + to = kmap(bprm->page[0]); 3.373 + for (j = 1; j < i; j++) { 3.374 + memmove(to, to + offset, PAGE_SIZE - offset); 3.375 + from = kmap(bprm->page[j]); 3.376 + memcpy(to + PAGE_SIZE - offset, from, offset); 3.377 + kunmap(bprm->page[j - 1]); 3.378 + to = from; 3.379 + } 3.380 + memmove(to, to + offset, PAGE_SIZE - offset); 3.381 + kunmap(bprm->page[j - 1]); 3.382 + 3.383 + /* Adjust bprm->p to point to the end of the strings. 
*/ 3.384 + bprm->p = PAGE_SIZE * i - offset; 3.385 + 3.386 + /* Limit stack size to 1GB */ 3.387 + stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; 3.388 + if (stack_base > (1 << 30)) 3.389 + stack_base = 1 << 30; 3.390 + stack_base = PAGE_ALIGN(STACK_TOP - stack_base); 3.391 + 3.392 + mm->arg_start = stack_base; 3.393 + arg_size = i << PAGE_SHIFT; 3.394 + 3.395 + /* zero pages that were copied above */ 3.396 + while (i < MAX_ARG_PAGES) 3.397 + bprm->page[i++] = NULL; 3.398 +#else 3.399 + stack_base = STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE; 3.400 + mm->arg_start = bprm->p + stack_base; 3.401 + arg_size = STACK_TOP - (PAGE_MASK & (unsigned long) mm->arg_start); 3.402 +#endif 3.403 + 3.404 + bprm->p += stack_base; 3.405 + if (bprm->loader) 3.406 + bprm->loader += stack_base; 3.407 + bprm->exec += stack_base; 3.408 + 3.409 + mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 3.410 + if (!mpnt) 3.411 + return -ENOMEM; 3.412 + 3.413 + if (security_vm_enough_memory(arg_size >> PAGE_SHIFT)) { 3.414 + kmem_cache_free(vm_area_cachep, mpnt); 3.415 + return -ENOMEM; 3.416 + } 3.417 + 3.418 + memset(mpnt, 0, sizeof(*mpnt)); 3.419 + 3.420 + down_write(&mm->mmap_sem); 3.421 + { 3.422 + mpnt->vm_mm = mm; 3.423 +#ifdef CONFIG_STACK_GROWSUP 3.424 + mpnt->vm_start = stack_base; 3.425 + mpnt->vm_end = PAGE_MASK & 3.426 + (PAGE_SIZE - 1 + (unsigned long) bprm->p); 3.427 +#else 3.428 + mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; 3.429 + mpnt->vm_end = STACK_TOP; 3.430 +#endif 3.431 + /* Adjust stack execute permissions; explicitly enable 3.432 + * for EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X 3.433 + * and leave alone (arch default) otherwise. */ 3.434 + if (unlikely(executable_stack == EXSTACK_ENABLE_X)) 3.435 + mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC; 3.436 + else if (executable_stack == EXSTACK_DISABLE_X) 3.437 + mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC; 3.438 + else 3.439 + mpnt->vm_flags = VM_STACK_FLAGS; 3.440 + mpnt->vm_flags |= mm->def_flags; 3.441 + mpnt->vm_page_prot = protection_map[mpnt->vm_flags & 0x7]; 3.442 + if ((ret = insert_vm_struct(mm, mpnt))) { 3.443 + up_write(&mm->mmap_sem); 3.444 + kmem_cache_free(vm_area_cachep, mpnt); 3.445 + return ret; 3.446 + } 3.447 + mm->stack_vm = mm->total_vm = vma_pages(mpnt); 3.448 + } 3.449 + 3.450 + for (i = 0 ; i < MAX_ARG_PAGES ; i++) { 3.451 + struct page *page = bprm->page[i]; 3.452 + if (page) { 3.453 + bprm->page[i] = NULL; 3.454 + install_arg_page(mpnt, page, stack_base); 3.455 + } 3.456 + stack_base += PAGE_SIZE; 3.457 + } 3.458 + up_write(&mm->mmap_sem); 3.459 + 3.460 + return 0; 3.461 +} 3.462 + 3.463 +EXPORT_SYMBOL(setup_arg_pages); 3.464 + 3.465 +#define free_arg_pages(bprm) do { } while (0) 3.466 + 3.467 +#else 3.468 + 3.469 +static inline void free_arg_pages(struct linux_binprm *bprm) 3.470 +{ 3.471 + int i; 3.472 + 3.473 + for (i = 0; i < MAX_ARG_PAGES; i++) { 3.474 + if (bprm->page[i]) 3.475 + __free_page(bprm->page[i]); 3.476 + bprm->page[i] = NULL; 3.477 + } 3.478 +} 3.479 + 3.480 +#endif /* CONFIG_MMU */ 3.481 + 3.482 +struct file *open_exec(const char *name) 3.483 +{ 3.484 + struct nameidata nd; 3.485 + int err; 3.486 + struct file *file; 3.487 + 3.488 + nd.intent.open.flags = FMODE_READ; 3.489 + err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); 3.490 + file = ERR_PTR(err); 3.491 + 3.492 + if (!err) { 3.493 + struct inode *inode = nd.dentry->d_inode; 3.494 + file = ERR_PTR(-EACCES); 3.495 + if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && 3.496 + S_ISREG(inode->i_mode)) { 3.497 + int err = permission(inode, 
MAY_EXEC, &nd); 3.498 + if (!err && !(inode->i_mode & 0111)) 3.499 + err = -EACCES; 3.500 + file = ERR_PTR(err); 3.501 + if (!err) { 3.502 + file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); 3.503 + if (!IS_ERR(file)) { 3.504 + err = deny_write_access(file); 3.505 + if (err) { 3.506 + fput(file); 3.507 + file = ERR_PTR(err); 3.508 + } 3.509 + } 3.510 +out: 3.511 + return file; 3.512 + } 3.513 + } 3.514 + path_release(&nd); 3.515 + } 3.516 + goto out; 3.517 +} 3.518 + 3.519 +EXPORT_SYMBOL(open_exec); 3.520 + 3.521 +int kernel_read(struct file *file, unsigned long offset, 3.522 + char *addr, unsigned long count) 3.523 +{ 3.524 + mm_segment_t old_fs; 3.525 + loff_t pos = offset; 3.526 + int result; 3.527 + 3.528 + old_fs = get_fs(); 3.529 + set_fs(get_ds()); 3.530 + /* The cast to a user pointer is valid due to the set_fs() */ 3.531 + result = vfs_read(file, (void __user *)addr, count, &pos); 3.532 + set_fs(old_fs); 3.533 + return result; 3.534 +} 3.535 + 3.536 +EXPORT_SYMBOL(kernel_read); 3.537 + 3.538 +static int exec_mmap(struct mm_struct *mm) 3.539 +{ 3.540 + struct task_struct *tsk; 3.541 + struct mm_struct * old_mm, *active_mm; 3.542 + 3.543 + /* Notify parent that we're no longer interested in the old VM */ 3.544 + tsk = current; 3.545 + old_mm = current->mm; 3.546 + mm_release(tsk, old_mm); 3.547 + 3.548 + task_lock(tsk); 3.549 + active_mm = tsk->active_mm; 3.550 + tsk->mm = mm; 3.551 + tsk->active_mm = mm; 3.552 + activate_mm(active_mm, mm); 3.553 + task_unlock(tsk); 3.554 + arch_pick_mmap_layout(mm); 3.555 + if (old_mm) { 3.556 + if (active_mm != old_mm) BUG(); 3.557 + mmput(old_mm); 3.558 + return 0; 3.559 + } 3.560 + mmdrop(active_mm); 3.561 + return 0; 3.562 +} 3.563 + 3.564 +/* 3.565 + * This function makes sure the current process has its own signal table, 3.566 + * so that flush_signal_handlers can later reset the handlers without 3.567 + * disturbing other processes. (Other processes might share the signal 3.568 + * table via the CLONE_SIGHAND option to clone().) 3.569 + */ 3.570 +static inline int de_thread(struct task_struct *tsk) 3.571 +{ 3.572 + struct signal_struct *sig = tsk->signal; 3.573 + struct sighand_struct *newsighand, *oldsighand = tsk->sighand; 3.574 + spinlock_t *lock = &oldsighand->siglock; 3.575 + int count; 3.576 + 3.577 + /* 3.578 + * If we don't share sighandlers, then we aren't sharing anything 3.579 + * and we can just re-use it all. 3.580 + */ 3.581 + if (atomic_read(&oldsighand->count) <= 1) { 3.582 + BUG_ON(atomic_read(&sig->count) != 1); 3.583 + exit_itimers(sig); 3.584 + return 0; 3.585 + } 3.586 + 3.587 + newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); 3.588 + if (!newsighand) 3.589 + return -ENOMEM; 3.590 + 3.591 + if (thread_group_empty(current)) 3.592 + goto no_thread_group; 3.593 + 3.594 + /* 3.595 + * Kill all other threads in the thread group. 3.596 + * We must hold tasklist_lock to call zap_other_threads. 3.597 + */ 3.598 + read_lock(&tasklist_lock); 3.599 + spin_lock_irq(lock); 3.600 + if (sig->group_exit) { 3.601 + /* 3.602 + * Another group action in progress, just 3.603 + * return so that the signal is processed. 
3.604 + */ 3.605 + spin_unlock_irq(lock); 3.606 + read_unlock(&tasklist_lock); 3.607 + kmem_cache_free(sighand_cachep, newsighand); 3.608 + return -EAGAIN; 3.609 + } 3.610 + sig->group_exit = 1; 3.611 + zap_other_threads(current); 3.612 + read_unlock(&tasklist_lock); 3.613 + 3.614 + /* 3.615 + * Account for the thread group leader hanging around: 3.616 + */ 3.617 + count = 2; 3.618 + if (current->pid == current->tgid) 3.619 + count = 1; 3.620 + while (atomic_read(&sig->count) > count) { 3.621 + sig->group_exit_task = current; 3.622 + sig->notify_count = count; 3.623 + __set_current_state(TASK_UNINTERRUPTIBLE); 3.624 + spin_unlock_irq(lock); 3.625 + schedule(); 3.626 + spin_lock_irq(lock); 3.627 + } 3.628 + sig->group_exit_task = NULL; 3.629 + sig->notify_count = 0; 3.630 + spin_unlock_irq(lock); 3.631 + 3.632 + /* 3.633 + * At this point all other threads have exited, all we have to 3.634 + * do is to wait for the thread group leader to become inactive, 3.635 + * and to assume its PID: 3.636 + */ 3.637 + if (current->pid != current->tgid) { 3.638 + struct task_struct *leader = current->group_leader, *parent; 3.639 + struct dentry *proc_dentry1, *proc_dentry2; 3.640 + unsigned long exit_state, ptrace; 3.641 + 3.642 + /* 3.643 + * Wait for the thread group leader to be a zombie. 3.644 + * It should already be zombie at this point, most 3.645 + * of the time. 3.646 + */ 3.647 + while (leader->exit_state != EXIT_ZOMBIE) 3.648 + yield(); 3.649 + 3.650 + spin_lock(&leader->proc_lock); 3.651 + spin_lock(¤t->proc_lock); 3.652 + proc_dentry1 = proc_pid_unhash(current); 3.653 + proc_dentry2 = proc_pid_unhash(leader); 3.654 + write_lock_irq(&tasklist_lock); 3.655 + 3.656 + if (leader->tgid != current->tgid) 3.657 + BUG(); 3.658 + if (current->pid == current->tgid) 3.659 + BUG(); 3.660 + /* 3.661 + * An exec() starts a new thread group with the 3.662 + * TGID of the previous thread group. Rehash the 3.663 + * two threads with a switched PID, and release 3.664 + * the former thread group leader: 3.665 + */ 3.666 + ptrace = leader->ptrace; 3.667 + parent = leader->parent; 3.668 + 3.669 + ptrace_unlink(current); 3.670 + ptrace_unlink(leader); 3.671 + remove_parent(current); 3.672 + remove_parent(leader); 3.673 + 3.674 + switch_exec_pids(leader, current); 3.675 + 3.676 + current->parent = current->real_parent = leader->real_parent; 3.677 + leader->parent = leader->real_parent = child_reaper; 3.678 + current->group_leader = current; 3.679 + leader->group_leader = leader; 3.680 + 3.681 + add_parent(current, current->parent); 3.682 + add_parent(leader, leader->parent); 3.683 + if (ptrace) { 3.684 + current->ptrace = ptrace; 3.685 + __ptrace_link(current, parent); 3.686 + } 3.687 + 3.688 + list_del(¤t->tasks); 3.689 + list_add_tail(¤t->tasks, &init_task.tasks); 3.690 + current->exit_signal = SIGCHLD; 3.691 + exit_state = leader->exit_state; 3.692 + 3.693 + write_unlock_irq(&tasklist_lock); 3.694 + spin_unlock(&leader->proc_lock); 3.695 + spin_unlock(¤t->proc_lock); 3.696 + proc_pid_flush(proc_dentry1); 3.697 + proc_pid_flush(proc_dentry2); 3.698 + 3.699 + if (exit_state != EXIT_ZOMBIE) 3.700 + BUG(); 3.701 + release_task(leader); 3.702 + } 3.703 + 3.704 + /* 3.705 + * Now there are really no other threads at all, 3.706 + * so it's safe to stop telling them to kill themselves. 
3.707 + */ 3.708 + sig->group_exit = 0; 3.709 + 3.710 +no_thread_group: 3.711 + BUG_ON(atomic_read(&sig->count) != 1); 3.712 + exit_itimers(sig); 3.713 + 3.714 + if (atomic_read(&oldsighand->count) == 1) { 3.715 + /* 3.716 + * Now that we nuked the rest of the thread group, 3.717 + * it turns out we are not sharing sighand any more either. 3.718 + * So we can just keep it. 3.719 + */ 3.720 + kmem_cache_free(sighand_cachep, newsighand); 3.721 + } else { 3.722 + /* 3.723 + * Move our state over to newsighand and switch it in. 3.724 + */ 3.725 + spin_lock_init(&newsighand->siglock); 3.726 + atomic_set(&newsighand->count, 1); 3.727 + memcpy(newsighand->action, oldsighand->action, 3.728 + sizeof(newsighand->action)); 3.729 + 3.730 + write_lock_irq(&tasklist_lock); 3.731 + spin_lock(&oldsighand->siglock); 3.732 + spin_lock(&newsighand->siglock); 3.733 + 3.734 + current->sighand = newsighand; 3.735 + recalc_sigpending(); 3.736 + 3.737 + spin_unlock(&newsighand->siglock); 3.738 + spin_unlock(&oldsighand->siglock); 3.739 + write_unlock_irq(&tasklist_lock); 3.740 + 3.741 + if (atomic_dec_and_test(&oldsighand->count)) 3.742 + kmem_cache_free(sighand_cachep, oldsighand); 3.743 + } 3.744 + 3.745 + if (!thread_group_empty(current)) 3.746 + BUG(); 3.747 + if (current->tgid != current->pid) 3.748 + BUG(); 3.749 + return 0; 3.750 +} 3.751 + 3.752 +/* 3.753 + * These functions flushes out all traces of the currently running executable 3.754 + * so that a new one can be started 3.755 + */ 3.756 + 3.757 +static inline void flush_old_files(struct files_struct * files) 3.758 +{ 3.759 + long j = -1; 3.760 + 3.761 + spin_lock(&files->file_lock); 3.762 + for (;;) { 3.763 + unsigned long set, i; 3.764 + 3.765 + j++; 3.766 + i = j * __NFDBITS; 3.767 + if (i >= files->max_fds || i >= files->max_fdset) 3.768 + break; 3.769 + set = files->close_on_exec->fds_bits[j]; 3.770 + if (!set) 3.771 + continue; 3.772 + files->close_on_exec->fds_bits[j] = 0; 3.773 + spin_unlock(&files->file_lock); 3.774 + for ( ; set ; i++,set >>= 1) { 3.775 + if (set & 1) { 3.776 + sys_close(i); 3.777 + } 3.778 + } 3.779 + spin_lock(&files->file_lock); 3.780 + 3.781 + } 3.782 + spin_unlock(&files->file_lock); 3.783 +} 3.784 + 3.785 +void get_task_comm(char *buf, struct task_struct *tsk) 3.786 +{ 3.787 + /* buf must be at least sizeof(tsk->comm) in size */ 3.788 + task_lock(tsk); 3.789 + memcpy(buf, tsk->comm, sizeof(tsk->comm)); 3.790 + task_unlock(tsk); 3.791 +} 3.792 + 3.793 +void set_task_comm(struct task_struct *tsk, char *buf) 3.794 +{ 3.795 + task_lock(tsk); 3.796 + strlcpy(tsk->comm, buf, sizeof(tsk->comm)); 3.797 + task_unlock(tsk); 3.798 +} 3.799 + 3.800 +int flush_old_exec(struct linux_binprm * bprm) 3.801 +{ 3.802 + char * name; 3.803 + int i, ch, retval; 3.804 + struct files_struct *files; 3.805 + char tcomm[sizeof(current->comm)]; 3.806 + 3.807 + /* 3.808 + * Make sure we have a private signal table and that 3.809 + * we are unassociated from the previous thread group. 3.810 + */ 3.811 + retval = de_thread(current); 3.812 + if (retval) 3.813 + goto out; 3.814 + 3.815 + /* 3.816 + * Make sure we have private file handles. Ask the 3.817 + * fork helper to do the work for us and the exit 3.818 + * helper to do the cleanup of the old one. 
3.819 + */ 3.820 + files = current->files; /* refcounted so safe to hold */ 3.821 + retval = unshare_files(); 3.822 + if (retval) 3.823 + goto out; 3.824 + /* 3.825 + * Release all of the old mmap stuff 3.826 + */ 3.827 + retval = exec_mmap(bprm->mm); 3.828 + if (retval) 3.829 + goto mmap_failed; 3.830 + 3.831 + bprm->mm = NULL; /* We're using it now */ 3.832 + 3.833 + /* This is the point of no return */ 3.834 + steal_locks(files); 3.835 + put_files_struct(files); 3.836 + 3.837 + current->sas_ss_sp = current->sas_ss_size = 0; 3.838 + 3.839 + if (current->euid == current->uid && current->egid == current->gid) 3.840 + current->mm->dumpable = 1; 3.841 + name = bprm->filename; 3.842 + for (i=0; (ch = *(name++)) != '\0';) { 3.843 + if (ch == '/') 3.844 + i = 0; 3.845 + else 3.846 + if (i < (sizeof(tcomm) - 1)) 3.847 + tcomm[i++] = ch; 3.848 + } 3.849 + tcomm[i] = '\0'; 3.850 + set_task_comm(current, tcomm); 3.851 + 3.852 + flush_thread(); 3.853 + 3.854 + if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || 3.855 + permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL) || 3.856 + (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { 3.857 + suid_keys(current); 3.858 + current->mm->dumpable = 0; 3.859 + } 3.860 + 3.861 + /* An exec changes our domain. We are no longer part of the thread 3.862 + group */ 3.863 + 3.864 + current->self_exec_id++; 3.865 + 3.866 + flush_signal_handlers(current, 0); 3.867 + flush_old_files(current->files); 3.868 + 3.869 + return 0; 3.870 + 3.871 +mmap_failed: 3.872 + put_files_struct(current->files); 3.873 + current->files = files; 3.874 +out: 3.875 + return retval; 3.876 +} 3.877 + 3.878 +EXPORT_SYMBOL(flush_old_exec); 3.879 + 3.880 +/* 3.881 + * Fill the binprm structure from the inode. 3.882 + * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes 3.883 + */ 3.884 +int prepare_binprm(struct linux_binprm *bprm) 3.885 +{ 3.886 + int mode; 3.887 + struct inode * inode = bprm->file->f_dentry->d_inode; 3.888 + int retval; 3.889 + 3.890 + mode = inode->i_mode; 3.891 + /* 3.892 + * Check execute perms again - if the caller has CAP_DAC_OVERRIDE, 3.893 + * generic_permission lets a non-executable through 3.894 + */ 3.895 + if (!(mode & 0111)) /* with at least _one_ execute bit set */ 3.896 + return -EACCES; 3.897 + if (bprm->file->f_op == NULL) 3.898 + return -EACCES; 3.899 + 3.900 + bprm->e_uid = current->euid; 3.901 + bprm->e_gid = current->egid; 3.902 + 3.903 + if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) { 3.904 + /* Set-uid? */ 3.905 + if (mode & S_ISUID) { 3.906 + current->personality &= ~PER_CLEAR_ON_SETID; 3.907 + bprm->e_uid = inode->i_uid; 3.908 + } 3.909 + 3.910 + /* Set-gid? */ 3.911 + /* 3.912 + * If setgid is set but no group execute bit then this 3.913 + * is a candidate for mandatory locking, not a setgid 3.914 + * executable. 
3.915 + */ 3.916 + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { 3.917 + current->personality &= ~PER_CLEAR_ON_SETID; 3.918 + bprm->e_gid = inode->i_gid; 3.919 + } 3.920 + } 3.921 + 3.922 + /* fill in binprm security blob */ 3.923 + retval = security_bprm_set(bprm); 3.924 + if (retval) 3.925 + return retval; 3.926 + 3.927 + memset(bprm->buf,0,BINPRM_BUF_SIZE); 3.928 + return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE); 3.929 +} 3.930 + 3.931 +EXPORT_SYMBOL(prepare_binprm); 3.932 + 3.933 +static inline int unsafe_exec(struct task_struct *p) 3.934 +{ 3.935 + int unsafe = 0; 3.936 + if (p->ptrace & PT_PTRACED) { 3.937 + if (p->ptrace & PT_PTRACE_CAP) 3.938 + unsafe |= LSM_UNSAFE_PTRACE_CAP; 3.939 + else 3.940 + unsafe |= LSM_UNSAFE_PTRACE; 3.941 + } 3.942 + if (atomic_read(&p->fs->count) > 1 || 3.943 + atomic_read(&p->files->count) > 1 || 3.944 + atomic_read(&p->sighand->count) > 1) 3.945 + unsafe |= LSM_UNSAFE_SHARE; 3.946 + 3.947 + return unsafe; 3.948 +} 3.949 + 3.950 +void compute_creds(struct linux_binprm *bprm) 3.951 +{ 3.952 + int unsafe; 3.953 + 3.954 + if (bprm->e_uid != current->uid) 3.955 + suid_keys(current); 3.956 + exec_keys(current); 3.957 + 3.958 + task_lock(current); 3.959 + unsafe = unsafe_exec(current); 3.960 + security_bprm_apply_creds(bprm, unsafe); 3.961 + task_unlock(current); 3.962 +} 3.963 + 3.964 +EXPORT_SYMBOL(compute_creds); 3.965 + 3.966 +void remove_arg_zero(struct linux_binprm *bprm) 3.967 +{ 3.968 + if (bprm->argc) { 3.969 + unsigned long offset; 3.970 + char * kaddr; 3.971 + struct page *page; 3.972 + 3.973 + offset = bprm->p % PAGE_SIZE; 3.974 + goto inside; 3.975 + 3.976 + while (bprm->p++, *(kaddr+offset++)) { 3.977 + if (offset != PAGE_SIZE) 3.978 + continue; 3.979 + offset = 0; 3.980 + kunmap_atomic(kaddr, KM_USER0); 3.981 +inside: 3.982 + page = bprm->page[bprm->p/PAGE_SIZE]; 3.983 + kaddr = kmap_atomic(page, KM_USER0); 3.984 + } 3.985 + kunmap_atomic(kaddr, KM_USER0); 3.986 + bprm->argc--; 3.987 + } 3.988 +} 3.989 + 3.990 +EXPORT_SYMBOL(remove_arg_zero); 3.991 + 3.992 +/* 3.993 + * cycle the list of binary formats handler, until one recognizes the image 3.994 + */ 3.995 +int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) 3.996 +{ 3.997 + int try,retval; 3.998 + struct linux_binfmt *fmt; 3.999 +#ifdef __alpha__ 3.1000 + /* handle /sbin/loader.. */ 3.1001 + { 3.1002 + struct exec * eh = (struct exec *) bprm->buf; 3.1003 + 3.1004 + if (!bprm->loader && eh->fh.f_magic == 0x183 && 3.1005 + (eh->fh.f_flags & 0x3000) == 0x3000) 3.1006 + { 3.1007 + struct file * file; 3.1008 + unsigned long loader; 3.1009 + 3.1010 + allow_write_access(bprm->file); 3.1011 + fput(bprm->file); 3.1012 + bprm->file = NULL; 3.1013 + 3.1014 + loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); 3.1015 + 3.1016 + file = open_exec("/sbin/loader"); 3.1017 + retval = PTR_ERR(file); 3.1018 + if (IS_ERR(file)) 3.1019 + return retval; 3.1020 + 3.1021 + /* Remember if the application is TASO. */ 3.1022 + bprm->sh_bang = eh->ah.entry < 0x100000000UL; 3.1023 + 3.1024 + bprm->file = file; 3.1025 + bprm->loader = loader; 3.1026 + retval = prepare_binprm(bprm); 3.1027 + if (retval<0) 3.1028 + return retval; 3.1029 + /* should call search_binary_handler recursively here, 3.1030 + but it does not matter */ 3.1031 + } 3.1032 + } 3.1033 +#endif 3.1034 + retval = security_bprm_check(bprm); 3.1035 + if (retval) 3.1036 + return retval; 3.1037 + 3.1038 + /* kernel module loader fixup */ 3.1039 + /* so we don't try to load run modprobe in kernel space. 
*/ 3.1040 + set_fs(USER_DS); 3.1041 + retval = -ENOENT; 3.1042 + for (try=0; try<2; try++) { 3.1043 + read_lock(&binfmt_lock); 3.1044 + for (fmt = formats ; fmt ; fmt = fmt->next) { 3.1045 + int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary; 3.1046 + if (!fn) 3.1047 + continue; 3.1048 + if (!try_module_get(fmt->module)) 3.1049 + continue; 3.1050 + read_unlock(&binfmt_lock); 3.1051 + retval = fn(bprm, regs); 3.1052 + if (retval >= 0) { 3.1053 + put_binfmt(fmt); 3.1054 + allow_write_access(bprm->file); 3.1055 + if (bprm->file) 3.1056 + fput(bprm->file); 3.1057 + bprm->file = NULL; 3.1058 + current->did_exec = 1; 3.1059 + return retval; 3.1060 + } 3.1061 + read_lock(&binfmt_lock); 3.1062 + put_binfmt(fmt); 3.1063 + if (retval != -ENOEXEC || bprm->mm == NULL) 3.1064 + break; 3.1065 + if (!bprm->file) { 3.1066 + read_unlock(&binfmt_lock); 3.1067 + return retval; 3.1068 + } 3.1069 + } 3.1070 + read_unlock(&binfmt_lock); 3.1071 + if (retval != -ENOEXEC || bprm->mm == NULL) { 3.1072 + break; 3.1073 +#ifdef CONFIG_KMOD 3.1074 + }else{ 3.1075 +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) 3.1076 + if (printable(bprm->buf[0]) && 3.1077 + printable(bprm->buf[1]) && 3.1078 + printable(bprm->buf[2]) && 3.1079 + printable(bprm->buf[3])) 3.1080 + break; /* -ENOEXEC */ 3.1081 + request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); 3.1082 +#endif 3.1083 + } 3.1084 + } 3.1085 + return retval; 3.1086 +} 3.1087 + 3.1088 +EXPORT_SYMBOL(search_binary_handler); 3.1089 + 3.1090 +/* 3.1091 + * sys_execve() executes a new program. 3.1092 + */ 3.1093 +int do_execve(char * filename, 3.1094 + char __user *__user *argv, 3.1095 + char __user *__user *envp, 3.1096 + struct pt_regs * regs) 3.1097 +{ 3.1098 + struct linux_binprm *bprm; 3.1099 + struct file *file; 3.1100 + int retval; 3.1101 + int i; 3.1102 + 3.1103 + retval = -ENOMEM; 3.1104 + bprm = kmalloc(sizeof(*bprm), GFP_KERNEL); 3.1105 + if (!bprm) 3.1106 + goto out_ret; 3.1107 + memset(bprm, 0, sizeof(*bprm)); 3.1108 + 3.1109 + file = open_exec(filename); 3.1110 + retval = PTR_ERR(file); 3.1111 + if (IS_ERR(file)) 3.1112 + goto out_kfree; 3.1113 + 3.1114 + sched_exec(); 3.1115 + 3.1116 + bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); 3.1117 + 3.1118 + bprm->file = file; 3.1119 + bprm->filename = filename; 3.1120 + bprm->interp = filename; 3.1121 + bprm->mm = mm_alloc(); 3.1122 + retval = -ENOMEM; 3.1123 + if (!bprm->mm) 3.1124 + goto out_file; 3.1125 + 3.1126 + retval = init_new_context(current, bprm->mm); 3.1127 + if (retval < 0) 3.1128 + goto out_mm; 3.1129 + 3.1130 + bprm->argc = count(argv, bprm->p / sizeof(void *)); 3.1131 + if ((retval = bprm->argc) < 0) 3.1132 + goto out_mm; 3.1133 + 3.1134 + bprm->envc = count(envp, bprm->p / sizeof(void *)); 3.1135 + if ((retval = bprm->envc) < 0) 3.1136 + goto out_mm; 3.1137 + 3.1138 + retval = security_bprm_alloc(bprm); 3.1139 + if (retval) 3.1140 + goto out; 3.1141 + 3.1142 + retval = prepare_binprm(bprm); 3.1143 + if (retval < 0) 3.1144 + goto out; 3.1145 + 3.1146 + retval = copy_strings_kernel(1, &bprm->filename, bprm); 3.1147 + if (retval < 0) 3.1148 + goto out; 3.1149 + 3.1150 + bprm->exec = bprm->p; 3.1151 + retval = copy_strings(bprm->envc, envp, bprm); 3.1152 + if (retval < 0) 3.1153 + goto out; 3.1154 + 3.1155 + retval = copy_strings(bprm->argc, argv, bprm); 3.1156 + if (retval < 0) 3.1157 + goto out; 3.1158 + 3.1159 + retval = search_binary_handler(bprm,regs); 3.1160 + if (retval >= 0) { 3.1161 + free_arg_pages(bprm); 3.1162 + 3.1163 + /* execve 
success */ 3.1164 + security_bprm_free(bprm); 3.1165 + kfree(bprm); 3.1166 + return retval; 3.1167 + } 3.1168 + 3.1169 +out: 3.1170 + /* Something went wrong, return the inode and free the argument pages*/ 3.1171 + for (i = 0 ; i < MAX_ARG_PAGES ; i++) { 3.1172 + struct page * page = bprm->page[i]; 3.1173 + if (page) 3.1174 + __free_page(page); 3.1175 + } 3.1176 + 3.1177 + if (bprm->security) 3.1178 + security_bprm_free(bprm); 3.1179 + 3.1180 +out_mm: 3.1181 + if (bprm->mm) 3.1182 + mmdrop(bprm->mm); 3.1183 + 3.1184 +out_file: 3.1185 + if (bprm->file) { 3.1186 + allow_write_access(bprm->file); 3.1187 + fput(bprm->file); 3.1188 + } 3.1189 + 3.1190 +out_kfree: 3.1191 + kfree(bprm); 3.1192 + 3.1193 +out_ret: 3.1194 + return retval; 3.1195 +} 3.1196 + 3.1197 +int set_binfmt(struct linux_binfmt *new) 3.1198 +{ 3.1199 + struct linux_binfmt *old = current->binfmt; 3.1200 + 3.1201 + if (new) { 3.1202 + if (!try_module_get(new->module)) 3.1203 + return -1; 3.1204 + } 3.1205 + current->binfmt = new; 3.1206 + if (old) 3.1207 + module_put(old->module); 3.1208 + return 0; 3.1209 +} 3.1210 + 3.1211 +EXPORT_SYMBOL(set_binfmt); 3.1212 + 3.1213 +#define CORENAME_MAX_SIZE 64 3.1214 + 3.1215 +/* format_corename will inspect the pattern parameter, and output a 3.1216 + * name into corename, which must have space for at least 3.1217 + * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. 3.1218 + */ 3.1219 +static void format_corename(char *corename, const char *pattern, long signr) 3.1220 +{ 3.1221 + const char *pat_ptr = pattern; 3.1222 + char *out_ptr = corename; 3.1223 + char *const out_end = corename + CORENAME_MAX_SIZE; 3.1224 + int rc; 3.1225 + int pid_in_pattern = 0; 3.1226 + 3.1227 + /* Repeat as long as we have more pattern to process and more output 3.1228 + space */ 3.1229 + while (*pat_ptr) { 3.1230 + if (*pat_ptr != '%') { 3.1231 + if (out_ptr == out_end) 3.1232 + goto out; 3.1233 + *out_ptr++ = *pat_ptr++; 3.1234 + } else { 3.1235 + switch (*++pat_ptr) { 3.1236 + case 0: 3.1237 + goto out; 3.1238 + /* Double percent, output one percent */ 3.1239 + case '%': 3.1240 + if (out_ptr == out_end) 3.1241 + goto out; 3.1242 + *out_ptr++ = '%'; 3.1243 + break; 3.1244 + /* pid */ 3.1245 + case 'p': 3.1246 + pid_in_pattern = 1; 3.1247 + rc = snprintf(out_ptr, out_end - out_ptr, 3.1248 + "%d", current->tgid); 3.1249 + if (rc > out_end - out_ptr) 3.1250 + goto out; 3.1251 + out_ptr += rc; 3.1252 + break; 3.1253 + /* uid */ 3.1254 + case 'u': 3.1255 + rc = snprintf(out_ptr, out_end - out_ptr, 3.1256 + "%d", current->uid); 3.1257 + if (rc > out_end - out_ptr) 3.1258 + goto out; 3.1259 + out_ptr += rc; 3.1260 + break; 3.1261 + /* gid */ 3.1262 + case 'g': 3.1263 + rc = snprintf(out_ptr, out_end - out_ptr, 3.1264 + "%d", current->gid); 3.1265 + if (rc > out_end - out_ptr) 3.1266 + goto out; 3.1267 + out_ptr += rc; 3.1268 + break; 3.1269 + /* signal that caused the coredump */ 3.1270 + case 's': 3.1271 + rc = snprintf(out_ptr, out_end - out_ptr, 3.1272 + "%ld", signr); 3.1273 + if (rc > out_end - out_ptr) 3.1274 + goto out; 3.1275 + out_ptr += rc; 3.1276 + break; 3.1277 + /* UNIX time of coredump */ 3.1278 + case 't': { 3.1279 + struct timeval tv; 3.1280 + do_gettimeofday(&tv); 3.1281 + rc = snprintf(out_ptr, out_end - out_ptr, 3.1282 + "%lu", tv.tv_sec); 3.1283 + if (rc > out_end - out_ptr) 3.1284 + goto out; 3.1285 + out_ptr += rc; 3.1286 + break; 3.1287 + } 3.1288 + /* hostname */ 3.1289 + case 'h': 3.1290 + down_read(&uts_sem); 3.1291 + rc = snprintf(out_ptr, out_end - out_ptr, 3.1292 + "%s", 
system_utsname.nodename); 3.1293 + up_read(&uts_sem); 3.1294 + if (rc > out_end - out_ptr) 3.1295 + goto out; 3.1296 + out_ptr += rc; 3.1297 + break; 3.1298 + /* executable */ 3.1299 + case 'e': 3.1300 + rc = snprintf(out_ptr, out_end - out_ptr, 3.1301 + "%s", current->comm); 3.1302 + if (rc > out_end - out_ptr) 3.1303 + goto out; 3.1304 + out_ptr += rc; 3.1305 + break; 3.1306 + default: 3.1307 + break; 3.1308 + } 3.1309 + ++pat_ptr; 3.1310 + } 3.1311 + } 3.1312 + /* Backward compatibility with core_uses_pid: 3.1313 + * 3.1314 + * If core_pattern does not include a %p (as is the default) 3.1315 + * and core_uses_pid is set, then .%pid will be appended to 3.1316 + * the filename */ 3.1317 + if (!pid_in_pattern 3.1318 + && (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1)) { 3.1319 + rc = snprintf(out_ptr, out_end - out_ptr, 3.1320 + ".%d", current->tgid); 3.1321 + if (rc > out_end - out_ptr) 3.1322 + goto out; 3.1323 + out_ptr += rc; 3.1324 + } 3.1325 + out: 3.1326 + *out_ptr = 0; 3.1327 +} 3.1328 + 3.1329 +static void zap_threads (struct mm_struct *mm) 3.1330 +{ 3.1331 + struct task_struct *g, *p; 3.1332 + struct task_struct *tsk = current; 3.1333 + struct completion *vfork_done = tsk->vfork_done; 3.1334 + 3.1335 + /* 3.1336 + * Make sure nobody is waiting for us to release the VM, 3.1337 + * otherwise we can deadlock when we wait on each other 3.1338 + */ 3.1339 + if (vfork_done) { 3.1340 + tsk->vfork_done = NULL; 3.1341 + complete(vfork_done); 3.1342 + } 3.1343 + 3.1344 + read_lock(&tasklist_lock); 3.1345 + do_each_thread(g,p) 3.1346 + if (mm == p->mm && p != tsk) { 3.1347 + force_sig_specific(SIGKILL, p); 3.1348 + mm->core_waiters++; 3.1349 + } 3.1350 + while_each_thread(g,p); 3.1351 + 3.1352 + read_unlock(&tasklist_lock); 3.1353 +} 3.1354 + 3.1355 +static void coredump_wait(struct mm_struct *mm) 3.1356 +{ 3.1357 + DECLARE_COMPLETION(startup_done); 3.1358 + 3.1359 + mm->core_waiters++; /* let other threads block */ 3.1360 + mm->core_startup_done = &startup_done; 3.1361 + 3.1362 + /* give other threads a chance to run: */ 3.1363 + yield(); 3.1364 + 3.1365 + zap_threads(mm); 3.1366 + if (--mm->core_waiters) { 3.1367 + up_write(&mm->mmap_sem); 3.1368 + wait_for_completion(&startup_done); 3.1369 + } else 3.1370 + up_write(&mm->mmap_sem); 3.1371 + BUG_ON(mm->core_waiters); 3.1372 +} 3.1373 + 3.1374 +int do_coredump(long signr, int exit_code, struct pt_regs * regs) 3.1375 +{ 3.1376 + char corename[CORENAME_MAX_SIZE + 1]; 3.1377 + struct mm_struct *mm = current->mm; 3.1378 + struct linux_binfmt * binfmt; 3.1379 + struct inode * inode; 3.1380 + struct file * file; 3.1381 + int retval = 0; 3.1382 + 3.1383 + binfmt = current->binfmt; 3.1384 + if (!binfmt || !binfmt->core_dump) 3.1385 + goto fail; 3.1386 + down_write(&mm->mmap_sem); 3.1387 + if (!mm->dumpable) { 3.1388 + up_write(&mm->mmap_sem); 3.1389 + goto fail; 3.1390 + } 3.1391 + mm->dumpable = 0; 3.1392 + init_completion(&mm->core_done); 3.1393 + current->signal->group_exit = 1; 3.1394 + current->signal->group_exit_code = exit_code; 3.1395 + coredump_wait(mm); 3.1396 + 3.1397 + if (current->signal->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) 3.1398 + goto fail_unlock; 3.1399 + 3.1400 + /* 3.1401 + * lock_kernel() because format_corename() is controlled by sysctl, which 3.1402 + * uses lock_kernel() 3.1403 + */ 3.1404 + lock_kernel(); 3.1405 + format_corename(corename, core_pattern, signr); 3.1406 + unlock_kernel(); 3.1407 + file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE, 0600); 3.1408 + if (IS_ERR(file)) 3.1409 
+ goto fail_unlock; 3.1410 + inode = file->f_dentry->d_inode; 3.1411 + if (inode->i_nlink > 1) 3.1412 + goto close_fail; /* multiple links - don't dump */ 3.1413 + if (d_unhashed(file->f_dentry)) 3.1414 + goto close_fail; 3.1415 + 3.1416 + if (!S_ISREG(inode->i_mode)) 3.1417 + goto close_fail; 3.1418 + if (!file->f_op) 3.1419 + goto close_fail; 3.1420 + if (!file->f_op->write) 3.1421 + goto close_fail; 3.1422 + if (do_truncate(file->f_dentry, 0) != 0) 3.1423 + goto close_fail; 3.1424 + 3.1425 + retval = binfmt->core_dump(signr, regs, file); 3.1426 + 3.1427 + if (retval) 3.1428 + current->signal->group_exit_code |= 0x80; 3.1429 +close_fail: 3.1430 + filp_close(file, NULL); 3.1431 +fail_unlock: 3.1432 + complete_all(&mm->core_done); 3.1433 +fail: 3.1434 + return retval; 3.1435 +}
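
The whole of fs/exec.c is imported into the sparse tree, but the only Xen-specific code in this copy is in install_arg_page() (lines 3.328-3.332 above): the queued PTE for an argument page is flushed right after set_pte(), before page_add_anon_rmap() runs and the page-table lock is dropped. The relevant excerpt:

    	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(
    		page, vma->vm_page_prot))));
    #ifdef CONFIG_XEN_BATCH_MODE2
    	XEN_flush_page_update_queue();
    #endif
    	page_add_anon_rmap(page, vma, address);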
4.1  --- a/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h	Thu Mar 10 18:12:10 2005 +0000
4.2  +++ b/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h	Tue Mar 15 14:50:10 2005 +0000
4.3  @@ -39,10 +39,16 @@ do { \
4.4   #if defined(CONFIG_XEN_DEBUG_NO_MMU_BATCHING)
4.5   #define set_pte(pteptr, pteval)\
4.6   	set_pte_batched(pteptr, pteval)
4.7  -#elif defined(CONFIG_XEN_BATCH_MODE)
4.8  +
4.9  +#elif defined(CONFIG_XEN_BATCH_MODE1)
4.10  #define set_pte(pteptr, pteval)({\
4.11  	set_pte_batched(pteptr, pteval);\
4.12  	_flush_page_update_queue();})
4.13 +
4.14 +#elif defined(CONFIG_XEN_BATCH_MODE2)
4.15 +#define set_pte(pteptr, pteval)\
4.16 +	set_pte_batched(pteptr, pteval)
4.17 +
4.18  #else
4.19  #define set_pte(pteptr, pteval) (*(pteptr) = pteval)
4.20  #endif
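
This hunk renames CONFIG_XEN_BATCH_MODE to CONFIG_XEN_BATCH_MODE1, under which set_pte() queues the update and immediately flushes it, and adds CONFIG_XEN_BATCH_MODE2, under which set_pte() is a pure enqueue and the flush happens explicitly at the call sites patched elsewhere in this changeset. A rough sketch of the difference, assuming the queue helpers named in the hunk plus XEN_flush_page_update_queue() as used at those call sites; map_range() is a hypothetical example, not code from the tree:

    /* Illustrative only: the same loop under the two batch modes. */
    static void map_range(pte_t *ptep, pte_t *vals, int n)
    {
    	int i;

    	for (i = 0; i < n; i++)
    		set_pte(ptep + i, vals[i]);
    	/* MODE1: each set_pte() above already did set_pte_batched()
    	 *        followed by _flush_page_update_queue().
    	 * MODE2: the writes are still only queued; an explicit flush
    	 *        is needed before anything relies on them: */
    #ifdef CONFIG_XEN_BATCH_MODE2
    	XEN_flush_page_update_queue();
    #endif
    }

Mode 2 trades one flush per update for one per batch, which is why do_page_fault() above and the new mm/ files below gain explicit flush points.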
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/linux-2.6.10-xen-sparse/mm/highmem.c Tue Mar 15 14:50:10 2005 +0000 5.3 @@ -0,0 +1,607 @@ 5.4 +/* 5.5 + * High memory handling common code and variables. 5.6 + * 5.7 + * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de 5.8 + * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de 5.9 + * 5.10 + * 5.11 + * Redesigned the x86 32-bit VM architecture to deal with 5.12 + * 64-bit physical space. With current x86 CPUs this 5.13 + * means up to 64 Gigabytes physical RAM. 5.14 + * 5.15 + * Rewrote high memory support to move the page cache into 5.16 + * high memory. Implemented permanent (schedulable) kmaps 5.17 + * based on Linus' idea. 5.18 + * 5.19 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> 5.20 + */ 5.21 + 5.22 +#include <linux/mm.h> 5.23 +#include <linux/module.h> 5.24 +#include <linux/swap.h> 5.25 +#include <linux/bio.h> 5.26 +#include <linux/pagemap.h> 5.27 +#include <linux/mempool.h> 5.28 +#include <linux/blkdev.h> 5.29 +#include <linux/init.h> 5.30 +#include <linux/hash.h> 5.31 +#include <linux/highmem.h> 5.32 +#include <asm/tlbflush.h> 5.33 + 5.34 +static mempool_t *page_pool, *isa_page_pool; 5.35 + 5.36 +static void *page_pool_alloc(int gfp_mask, void *data) 5.37 +{ 5.38 + int gfp = gfp_mask | (int) (long) data; 5.39 + 5.40 + return alloc_page(gfp); 5.41 +} 5.42 + 5.43 +static void page_pool_free(void *page, void *data) 5.44 +{ 5.45 + __free_page(page); 5.46 +} 5.47 + 5.48 +/* 5.49 + * Virtual_count is not a pure "count". 5.50 + * 0 means that it is not mapped, and has not been mapped 5.51 + * since a TLB flush - it is usable. 5.52 + * 1 means that there are no users, but it has been mapped 5.53 + * since the last TLB flush - so we can't use it. 5.54 + * n means that there are (n-1) current users of it. 5.55 + */ 5.56 +#ifdef CONFIG_HIGHMEM 5.57 +static int pkmap_count[LAST_PKMAP]; 5.58 +static unsigned int last_pkmap_nr; 5.59 +static spinlock_t kmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; 5.60 + 5.61 +pte_t * pkmap_page_table; 5.62 + 5.63 +static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); 5.64 + 5.65 +static void flush_all_zero_pkmaps(void) 5.66 +{ 5.67 + int i; 5.68 + 5.69 + flush_cache_kmaps(); 5.70 + 5.71 + for (i = 0; i < LAST_PKMAP; i++) { 5.72 + struct page *page; 5.73 + 5.74 + /* 5.75 + * zero means we don't have anything to do, 5.76 + * >1 means that it is still in use. Only 5.77 + * a count of 1 means that it is free but 5.78 + * needs to be unmapped 5.79 + */ 5.80 + if (pkmap_count[i] != 1) 5.81 + continue; 5.82 + pkmap_count[i] = 0; 5.83 + 5.84 + /* sanity check */ 5.85 + if (pte_none(pkmap_page_table[i])) 5.86 + BUG(); 5.87 + 5.88 + /* 5.89 + * Don't need an atomic fetch-and-clear op here; 5.90 + * no-one has the page mapped, and cannot get at 5.91 + * its virtual address (and hence PTE) without first 5.92 + * getting the kmap_lock (which is held here). 5.93 + * So no dangers, even with speculative execution. 
5.94 + */ 5.95 + page = pte_page(pkmap_page_table[i]); 5.96 + pte_clear(&pkmap_page_table[i]); 5.97 + 5.98 + set_page_address(page, NULL); 5.99 + } 5.100 + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); 5.101 +} 5.102 + 5.103 +static inline unsigned long map_new_virtual(struct page *page) 5.104 +{ 5.105 + unsigned long vaddr; 5.106 + int count; 5.107 + 5.108 +start: 5.109 + count = LAST_PKMAP; 5.110 + /* Find an empty entry */ 5.111 + for (;;) { 5.112 + last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; 5.113 + if (!last_pkmap_nr) { 5.114 + flush_all_zero_pkmaps(); 5.115 + count = LAST_PKMAP; 5.116 + } 5.117 + if (!pkmap_count[last_pkmap_nr]) 5.118 + break; /* Found a usable entry */ 5.119 + if (--count) 5.120 + continue; 5.121 + 5.122 + /* 5.123 + * Sleep for somebody else to unmap their entries 5.124 + */ 5.125 + { 5.126 + DECLARE_WAITQUEUE(wait, current); 5.127 + 5.128 + __set_current_state(TASK_UNINTERRUPTIBLE); 5.129 + add_wait_queue(&pkmap_map_wait, &wait); 5.130 + spin_unlock(&kmap_lock); 5.131 + schedule(); 5.132 + remove_wait_queue(&pkmap_map_wait, &wait); 5.133 + spin_lock(&kmap_lock); 5.134 + 5.135 + /* Somebody else might have mapped it while we slept */ 5.136 + if (page_address(page)) 5.137 + return (unsigned long)page_address(page); 5.138 + 5.139 + /* Re-start */ 5.140 + goto start; 5.141 + } 5.142 + } 5.143 + vaddr = PKMAP_ADDR(last_pkmap_nr); 5.144 + set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); 5.145 +#ifdef CONFIG_XEN_BATCH_MODE2 5.146 + XEN_flush_page_update_queue(); 5.147 +#endif 5.148 + pkmap_count[last_pkmap_nr] = 1; 5.149 + set_page_address(page, (void *)vaddr); 5.150 + 5.151 + return vaddr; 5.152 +} 5.153 + 5.154 +void fastcall *kmap_high(struct page *page) 5.155 +{ 5.156 + unsigned long vaddr; 5.157 + 5.158 + /* 5.159 + * For highmem pages, we can't trust "virtual" until 5.160 + * after we have the lock. 5.161 + * 5.162 + * We cannot call this from interrupts, as it may block 5.163 + */ 5.164 + spin_lock(&kmap_lock); 5.165 + vaddr = (unsigned long)page_address(page); 5.166 + if (!vaddr) 5.167 + vaddr = map_new_virtual(page); 5.168 + pkmap_count[PKMAP_NR(vaddr)]++; 5.169 + if (pkmap_count[PKMAP_NR(vaddr)] < 2) 5.170 + BUG(); 5.171 + spin_unlock(&kmap_lock); 5.172 + return (void*) vaddr; 5.173 +} 5.174 + 5.175 +EXPORT_SYMBOL(kmap_high); 5.176 + 5.177 +void fastcall kunmap_high(struct page *page) 5.178 +{ 5.179 + unsigned long vaddr; 5.180 + unsigned long nr; 5.181 + int need_wakeup; 5.182 + 5.183 + spin_lock(&kmap_lock); 5.184 + vaddr = (unsigned long)page_address(page); 5.185 + if (!vaddr) 5.186 + BUG(); 5.187 + nr = PKMAP_NR(vaddr); 5.188 + 5.189 + /* 5.190 + * A count must never go down to zero 5.191 + * without a TLB flush! 5.192 + */ 5.193 + need_wakeup = 0; 5.194 + switch (--pkmap_count[nr]) { 5.195 + case 0: 5.196 + BUG(); 5.197 + case 1: 5.198 + /* 5.199 + * Avoid an unnecessary wake_up() function call. 5.200 + * The common case is pkmap_count[] == 1, but 5.201 + * no waiters. 5.202 + * The tasks queued in the wait-queue are guarded 5.203 + * by both the lock in the wait-queue-head and by 5.204 + * the kmap_lock. As the kmap_lock is held here, 5.205 + * no need for the wait-queue-head's lock. Simply 5.206 + * test if the queue is empty. 
5.207 + */ 5.208 + need_wakeup = waitqueue_active(&pkmap_map_wait); 5.209 + } 5.210 + spin_unlock(&kmap_lock); 5.211 + 5.212 + /* do wake-up, if needed, race-free outside of the spin lock */ 5.213 + if (need_wakeup) 5.214 + wake_up(&pkmap_map_wait); 5.215 +} 5.216 + 5.217 +EXPORT_SYMBOL(kunmap_high); 5.218 + 5.219 +#define POOL_SIZE 64 5.220 + 5.221 +static __init int init_emergency_pool(void) 5.222 +{ 5.223 + struct sysinfo i; 5.224 + si_meminfo(&i); 5.225 + si_swapinfo(&i); 5.226 + 5.227 + if (!i.totalhigh) 5.228 + return 0; 5.229 + 5.230 + page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL); 5.231 + if (!page_pool) 5.232 + BUG(); 5.233 + printk("highmem bounce pool size: %d pages\n", POOL_SIZE); 5.234 + 5.235 + return 0; 5.236 +} 5.237 + 5.238 +__initcall(init_emergency_pool); 5.239 + 5.240 +/* 5.241 + * highmem version, map in to vec 5.242 + */ 5.243 +static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) 5.244 +{ 5.245 + unsigned long flags; 5.246 + unsigned char *vto; 5.247 + 5.248 + local_irq_save(flags); 5.249 + vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); 5.250 + memcpy(vto + to->bv_offset, vfrom, to->bv_len); 5.251 + kunmap_atomic(vto, KM_BOUNCE_READ); 5.252 + local_irq_restore(flags); 5.253 +} 5.254 + 5.255 +#else /* CONFIG_HIGHMEM */ 5.256 + 5.257 +#define bounce_copy_vec(to, vfrom) \ 5.258 + memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) 5.259 + 5.260 +#endif 5.261 + 5.262 +#define ISA_POOL_SIZE 16 5.263 + 5.264 +/* 5.265 + * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA 5.266 + * as the max address, so check if the pool has already been created. 5.267 + */ 5.268 +int init_emergency_isa_pool(void) 5.269 +{ 5.270 + if (isa_page_pool) 5.271 + return 0; 5.272 + 5.273 + isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA); 5.274 + if (!isa_page_pool) 5.275 + BUG(); 5.276 + 5.277 + printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); 5.278 + return 0; 5.279 +} 5.280 + 5.281 +/* 5.282 + * Simple bounce buffer support for highmem pages. Depending on the 5.283 + * queue gfp mask set, *to may or may not be a highmem page. 
kmap it 5.284 + * always, it will do the Right Thing 5.285 + */ 5.286 +static void copy_to_high_bio_irq(struct bio *to, struct bio *from) 5.287 +{ 5.288 + unsigned char *vfrom; 5.289 + struct bio_vec *tovec, *fromvec; 5.290 + int i; 5.291 + 5.292 + __bio_for_each_segment(tovec, to, i, 0) { 5.293 + fromvec = from->bi_io_vec + i; 5.294 + 5.295 + /* 5.296 + * not bounced 5.297 + */ 5.298 + if (tovec->bv_page == fromvec->bv_page) 5.299 + continue; 5.300 + 5.301 + /* 5.302 + * fromvec->bv_offset and fromvec->bv_len might have been 5.303 + * modified by the block layer, so use the original copy, 5.304 + * bounce_copy_vec already uses tovec->bv_len 5.305 + */ 5.306 + vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; 5.307 + 5.308 + flush_dcache_page(tovec->bv_page); 5.309 + bounce_copy_vec(tovec, vfrom); 5.310 + } 5.311 +} 5.312 + 5.313 +static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) 5.314 +{ 5.315 + struct bio *bio_orig = bio->bi_private; 5.316 + struct bio_vec *bvec, *org_vec; 5.317 + int i; 5.318 + 5.319 + if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) 5.320 + set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); 5.321 + 5.322 + /* 5.323 + * free up bounce indirect pages used 5.324 + */ 5.325 + __bio_for_each_segment(bvec, bio, i, 0) { 5.326 + org_vec = bio_orig->bi_io_vec + i; 5.327 + if (bvec->bv_page == org_vec->bv_page) 5.328 + continue; 5.329 + 5.330 + mempool_free(bvec->bv_page, pool); 5.331 + } 5.332 + 5.333 + bio_endio(bio_orig, bio_orig->bi_size, err); 5.334 + bio_put(bio); 5.335 +} 5.336 + 5.337 +static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err) 5.338 +{ 5.339 + if (bio->bi_size) 5.340 + return 1; 5.341 + 5.342 + bounce_end_io(bio, page_pool, err); 5.343 + return 0; 5.344 +} 5.345 + 5.346 +static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err) 5.347 +{ 5.348 + if (bio->bi_size) 5.349 + return 1; 5.350 + 5.351 + bounce_end_io(bio, isa_page_pool, err); 5.352 + return 0; 5.353 +} 5.354 + 5.355 +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) 5.356 +{ 5.357 + struct bio *bio_orig = bio->bi_private; 5.358 + 5.359 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 5.360 + copy_to_high_bio_irq(bio_orig, bio); 5.361 + 5.362 + bounce_end_io(bio, pool, err); 5.363 +} 5.364 + 5.365 +static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err) 5.366 +{ 5.367 + if (bio->bi_size) 5.368 + return 1; 5.369 + 5.370 + __bounce_end_io_read(bio, page_pool, err); 5.371 + return 0; 5.372 +} 5.373 + 5.374 +static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err) 5.375 +{ 5.376 + if (bio->bi_size) 5.377 + return 1; 5.378 + 5.379 + __bounce_end_io_read(bio, isa_page_pool, err); 5.380 + return 0; 5.381 +} 5.382 + 5.383 +static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, 5.384 + mempool_t *pool) 5.385 +{ 5.386 + struct page *page; 5.387 + struct bio *bio = NULL; 5.388 + int i, rw = bio_data_dir(*bio_orig); 5.389 + struct bio_vec *to, *from; 5.390 + 5.391 + bio_for_each_segment(from, *bio_orig, i) { 5.392 + page = from->bv_page; 5.393 + 5.394 + /* 5.395 + * is destination page below bounce pfn? 
5.396 + */ 5.397 + if (page_to_pfn(page) < q->bounce_pfn) 5.398 + continue; 5.399 + 5.400 + /* 5.401 + * irk, bounce it 5.402 + */ 5.403 + if (!bio) 5.404 + bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); 5.405 + 5.406 + to = bio->bi_io_vec + i; 5.407 + 5.408 + to->bv_page = mempool_alloc(pool, q->bounce_gfp); 5.409 + to->bv_len = from->bv_len; 5.410 + to->bv_offset = from->bv_offset; 5.411 + 5.412 + if (rw == WRITE) { 5.413 + char *vto, *vfrom; 5.414 + 5.415 + flush_dcache_page(from->bv_page); 5.416 + vto = page_address(to->bv_page) + to->bv_offset; 5.417 + vfrom = kmap(from->bv_page) + from->bv_offset; 5.418 + memcpy(vto, vfrom, to->bv_len); 5.419 + kunmap(from->bv_page); 5.420 + } 5.421 + } 5.422 + 5.423 + /* 5.424 + * no pages bounced 5.425 + */ 5.426 + if (!bio) 5.427 + return; 5.428 + 5.429 + /* 5.430 + * at least one page was bounced, fill in possible non-highmem 5.431 + * pages 5.432 + */ 5.433 + bio_for_each_segment(from, *bio_orig, i) { 5.434 + to = bio_iovec_idx(bio, i); 5.435 + if (!to->bv_page) { 5.436 + to->bv_page = from->bv_page; 5.437 + to->bv_len = from->bv_len; 5.438 + to->bv_offset = from->bv_offset; 5.439 + } 5.440 + } 5.441 + 5.442 + bio->bi_bdev = (*bio_orig)->bi_bdev; 5.443 + bio->bi_flags |= (1 << BIO_BOUNCED); 5.444 + bio->bi_sector = (*bio_orig)->bi_sector; 5.445 + bio->bi_rw = (*bio_orig)->bi_rw; 5.446 + 5.447 + bio->bi_vcnt = (*bio_orig)->bi_vcnt; 5.448 + bio->bi_idx = (*bio_orig)->bi_idx; 5.449 + bio->bi_size = (*bio_orig)->bi_size; 5.450 + 5.451 + if (pool == page_pool) { 5.452 + bio->bi_end_io = bounce_end_io_write; 5.453 + if (rw == READ) 5.454 + bio->bi_end_io = bounce_end_io_read; 5.455 + } else { 5.456 + bio->bi_end_io = bounce_end_io_write_isa; 5.457 + if (rw == READ) 5.458 + bio->bi_end_io = bounce_end_io_read_isa; 5.459 + } 5.460 + 5.461 + bio->bi_private = *bio_orig; 5.462 + *bio_orig = bio; 5.463 +} 5.464 + 5.465 +void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) 5.466 +{ 5.467 + mempool_t *pool; 5.468 + 5.469 + /* 5.470 + * for non-isa bounce case, just check if the bounce pfn is equal 5.471 + * to or bigger than the highest pfn in the system -- in that case, 5.472 + * don't waste time iterating over bio segments 5.473 + */ 5.474 + if (!(q->bounce_gfp & GFP_DMA)) { 5.475 + if (q->bounce_pfn >= blk_max_pfn) 5.476 + return; 5.477 + pool = page_pool; 5.478 + } else { 5.479 + BUG_ON(!isa_page_pool); 5.480 + pool = isa_page_pool; 5.481 + } 5.482 + 5.483 + /* 5.484 + * slow path 5.485 + */ 5.486 + __blk_queue_bounce(q, bio_orig, pool); 5.487 +} 5.488 + 5.489 +EXPORT_SYMBOL(blk_queue_bounce); 5.490 + 5.491 +#if defined(HASHED_PAGE_VIRTUAL) 5.492 + 5.493 +#define PA_HASH_ORDER 7 5.494 + 5.495 +/* 5.496 + * Describes one page->virtual association 5.497 + */ 5.498 +struct page_address_map { 5.499 + struct page *page; 5.500 + void *virtual; 5.501 + struct list_head list; 5.502 +}; 5.503 + 5.504 +/* 5.505 + * page_address_map freelist, allocated from page_address_maps. 
5.506 + */ 5.507 +static struct list_head page_address_pool; /* freelist */ 5.508 +static spinlock_t pool_lock; /* protects page_address_pool */ 5.509 + 5.510 +/* 5.511 + * Hash table bucket 5.512 + */ 5.513 +static struct page_address_slot { 5.514 + struct list_head lh; /* List of page_address_maps */ 5.515 + spinlock_t lock; /* Protect this bucket's list */ 5.516 +} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; 5.517 + 5.518 +static struct page_address_slot *page_slot(struct page *page) 5.519 +{ 5.520 + return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; 5.521 +} 5.522 + 5.523 +void *page_address(struct page *page) 5.524 +{ 5.525 + unsigned long flags; 5.526 + void *ret; 5.527 + struct page_address_slot *pas; 5.528 + 5.529 + if (!PageHighMem(page)) 5.530 + return lowmem_page_address(page); 5.531 + 5.532 + pas = page_slot(page); 5.533 + ret = NULL; 5.534 + spin_lock_irqsave(&pas->lock, flags); 5.535 + if (!list_empty(&pas->lh)) { 5.536 + struct page_address_map *pam; 5.537 + 5.538 + list_for_each_entry(pam, &pas->lh, list) { 5.539 + if (pam->page == page) { 5.540 + ret = pam->virtual; 5.541 + goto done; 5.542 + } 5.543 + } 5.544 + } 5.545 +done: 5.546 + spin_unlock_irqrestore(&pas->lock, flags); 5.547 + return ret; 5.548 +} 5.549 + 5.550 +EXPORT_SYMBOL(page_address); 5.551 + 5.552 +void set_page_address(struct page *page, void *virtual) 5.553 +{ 5.554 + unsigned long flags; 5.555 + struct page_address_slot *pas; 5.556 + struct page_address_map *pam; 5.557 + 5.558 + BUG_ON(!PageHighMem(page)); 5.559 + 5.560 + pas = page_slot(page); 5.561 + if (virtual) { /* Add */ 5.562 + BUG_ON(list_empty(&page_address_pool)); 5.563 + 5.564 + spin_lock_irqsave(&pool_lock, flags); 5.565 + pam = list_entry(page_address_pool.next, 5.566 + struct page_address_map, list); 5.567 + list_del(&pam->list); 5.568 + spin_unlock_irqrestore(&pool_lock, flags); 5.569 + 5.570 + pam->page = page; 5.571 + pam->virtual = virtual; 5.572 + 5.573 + spin_lock_irqsave(&pas->lock, flags); 5.574 + list_add_tail(&pam->list, &pas->lh); 5.575 + spin_unlock_irqrestore(&pas->lock, flags); 5.576 + } else { /* Remove */ 5.577 + spin_lock_irqsave(&pas->lock, flags); 5.578 + list_for_each_entry(pam, &pas->lh, list) { 5.579 + if (pam->page == page) { 5.580 + list_del(&pam->list); 5.581 + spin_unlock_irqrestore(&pas->lock, flags); 5.582 + spin_lock_irqsave(&pool_lock, flags); 5.583 + list_add_tail(&pam->list, &page_address_pool); 5.584 + spin_unlock_irqrestore(&pool_lock, flags); 5.585 + goto done; 5.586 + } 5.587 + } 5.588 + spin_unlock_irqrestore(&pas->lock, flags); 5.589 + } 5.590 +done: 5.591 + return; 5.592 +} 5.593 + 5.594 +static struct page_address_map page_address_maps[LAST_PKMAP]; 5.595 + 5.596 +void __init page_address_init(void) 5.597 +{ 5.598 + int i; 5.599 + 5.600 + INIT_LIST_HEAD(&page_address_pool); 5.601 + for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) 5.602 + list_add(&page_address_maps[i].list, &page_address_pool); 5.603 + for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { 5.604 + INIT_LIST_HEAD(&page_address_htable[i].lh); 5.605 + spin_lock_init(&page_address_htable[i].lock); 5.606 + } 5.607 + spin_lock_init(&pool_lock); 5.608 +} 5.609 + 5.610 +#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
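The tail of the highmem.c hunk above is the HASHED_PAGE_VIRTUAL bookkeeping: page_address() hashes the struct page pointer into one of 1<<PA_HASH_ORDER buckets and walks that bucket's list of page_address_map entries to find the cached kernel virtual address, while set_page_address() moves entries between the buckets and a free list. The stand-alone C sketch below models only that lookup structure; it is not the kernel code, all names (sketch_page, pa_map, pa_hash, ...) are invented for illustration, and locking and the free-list handling are omitted.

/*
 * Minimal user-space model of the hashed page->virtual lookup:
 * hash the page pointer to pick a bucket, chain associations per
 * bucket, walk the chain on lookup.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PA_HASH_ORDER 7
#define PA_HASH_SIZE  (1 << PA_HASH_ORDER)

struct sketch_page { int dummy; };          /* stand-in for struct page */

struct pa_map {                             /* one page->virtual association */
	struct sketch_page *page;
	void *virtual;
	struct pa_map *next;
};

static struct pa_map *pa_htable[PA_HASH_SIZE];   /* hash buckets */

/* crude pointer hash, standing in for hash_ptr(page, PA_HASH_ORDER) */
static unsigned pa_hash(struct sketch_page *page)
{
	return (unsigned)(((uintptr_t)page >> 4) & (PA_HASH_SIZE - 1));
}

/* record that 'page' is currently mapped at 'virtual' */
static void pa_set(struct pa_map *pam, struct sketch_page *page, void *virtual)
{
	unsigned b = pa_hash(page);

	pam->page = page;
	pam->virtual = virtual;
	pam->next = pa_htable[b];
	pa_htable[b] = pam;
}

/* look up the cached virtual address of 'page', or NULL if unmapped */
static void *pa_lookup(struct sketch_page *page)
{
	struct pa_map *pam;

	for (pam = pa_htable[pa_hash(page)]; pam; pam = pam->next)
		if (pam->page == page)
			return pam->virtual;
	return NULL;
}

int main(void)
{
	static struct sketch_page pg;
	static struct pa_map pam;
	char mapping[16];

	pa_set(&pam, &pg, mapping);
	printf("lookup %s\n", pa_lookup(&pg) == mapping ? "hit" : "miss");
	return 0;
}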
6.1 --- a/linux-2.6.10-xen-sparse/mm/memory.c Thu Mar 10 18:12:10 2005 +0000 6.2 +++ b/linux-2.6.10-xen-sparse/mm/memory.c Tue Mar 15 14:50:10 2005 +0000 6.3 @@ -152,6 +152,10 @@ void clear_page_tables(struct mmu_gather 6.4 free_one_pgd(tlb, page_dir); 6.5 page_dir++; 6.6 } while (--nr); 6.7 +#ifdef CONFIG_XEN_BATCH_MODE2 6.8 + XEN_flush_page_update_queue(); 6.9 +#endif 6.10 + 6.11 } 6.12 6.13 pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 6.14 @@ -326,8 +330,15 @@ skip_copy_pte_range: 6.15 * in the parent and the child 6.16 */ 6.17 if (cow) { 6.18 +#ifdef CONFIG_XEN_BATCH_MODE2 6.19 +/* XEN modification: modified ordering here to avoid RaW hazard. */ 6.20 + pte = *src_pte; 6.21 + pte = pte_wrprotect(pte); 6.22 + ptep_set_wrprotect(src_pte); 6.23 +#else 6.24 ptep_set_wrprotect(src_pte); 6.25 pte = *src_pte; 6.26 +#endif 6.27 } 6.28 6.29 /* 6.30 @@ -1451,7 +1462,20 @@ static int do_swap_page(struct mm_struct 6.31 unlock_page(page); 6.32 6.33 flush_icache_page(vma, page); 6.34 + 6.35 +#ifdef CONFIG_XEN_BATCH_MODE2 6.36 + if ( likely(vma->vm_mm == current->mm) ) { 6.37 + XEN_flush_page_update_queue(); 6.38 + HYPERVISOR_update_va_mapping(address, pte, 0); 6.39 + } else { 6.40 + set_pte(page_table, pte); 6.41 + XEN_flush_page_update_queue(); 6.42 + } 6.43 +#else 6.44 set_pte(page_table, pte); 6.45 +#endif 6.46 + 6.47 + 6.48 page_add_anon_rmap(page, vma, address); 6.49 6.50 if (write_access) { 6.51 @@ -1516,7 +1540,17 @@ do_anonymous_page(struct mm_struct *mm, 6.52 page_add_anon_rmap(page, vma, addr); 6.53 } 6.54 6.55 +#ifdef CONFIG_XEN_BATCH_MODE2 6.56 + if ( likely(vma->vm_mm == current->mm) ) { 6.57 + XEN_flush_page_update_queue(); 6.58 + HYPERVISOR_update_va_mapping(addr, entry, 0); 6.59 + } else { 6.60 + set_pte(page_table, entry); 6.61 + XEN_flush_page_update_queue(); 6.62 + } 6.63 +#else 6.64 ptep_establish_new(vma, addr, page_table, entry); 6.65 +#endif 6.66 pte_unmap(page_table); 6.67 6.68 /* No need to invalidate - it was non-present before */ 6.69 @@ -1621,7 +1655,17 @@ retry: 6.70 entry = mk_pte(new_page, vma->vm_page_prot); 6.71 if (write_access) 6.72 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 6.73 +#ifdef CONFIG_XEN_BATCH_MODE2 6.74 + if ( likely(vma->vm_mm == current->mm) ) { 6.75 + XEN_flush_page_update_queue(); 6.76 + HYPERVISOR_update_va_mapping(address, entry, 0); 6.77 + } else { 6.78 + set_pte(page_table, entry); 6.79 + XEN_flush_page_update_queue(); 6.80 + } 6.81 +#else 6.82 ptep_establish_new(vma, address, page_table, entry); 6.83 +#endif 6.84 if (anon) { 6.85 lru_cache_add_active(new_page); 6.86 page_add_anon_rmap(new_page, vma, address);
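The memory.c hunk above is the core of the batch-mode change: under CONFIG_XEN_BATCH_MODE2, page-table writes are queued rather than applied immediately, so do_swap_page() and the anonymous/no-page fault paths either flush the queue and install the PTE directly via HYPERVISOR_update_va_mapping() when the faulting mm is current->mm, or fall back to set_pte() followed by an explicit XEN_flush_page_update_queue() for a foreign mm; the copy-on-write path likewise reads *src_pte before ptep_set_wrprotect() so the read cannot observe a stale, still-queued write-protect. The user-space C sketch below models only the queue-then-flush idea; it is not the Xen interface, and the names (upd_queue, queue_pte_write, flush_update_queue) are illustrative.

/*
 * Conceptual model of batched PTE updates: writes are queued and only
 * become visible when the queue is flushed.
 */
#include <stdio.h>

#define QUEUE_MAX 8

struct pte_update { unsigned long *ptep; unsigned long val; };

static struct pte_update upd_queue[QUEUE_MAX];
static int upd_count;

/* apply all queued writes in one batch (flush_page_update_queue analogue) */
static void flush_update_queue(void)
{
	int i;

	for (i = 0; i < upd_count; i++)
		*upd_queue[i].ptep = upd_queue[i].val;
	upd_count = 0;
}

/* queue a PTE write instead of applying it at once (set_pte analogue) */
static void queue_pte_write(unsigned long *ptep, unsigned long val)
{
	if (upd_count == QUEUE_MAX)
		flush_update_queue();        /* queue full: drain it first */
	upd_queue[upd_count].ptep = ptep;
	upd_queue[upd_count].val = val;
	upd_count++;
}

int main(void)
{
	unsigned long pte = 0;

	queue_pte_write(&pte, 0x1UL);        /* update only queued... */
	printf("before flush: %#lx\n", pte); /* still 0: not yet visible */
	flush_update_queue();                /* ...visible only after flush */
	printf("after flush:  %#lx\n", pte);
	return 0;
}

Until flush_update_queue() runs, the write queued by queue_pte_write() is invisible, which is why the CONFIG_XEN_BATCH_MODE2 sites in this changeset flush the queue before any new page-table entry is relied upon.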
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/linux-2.6.10-xen-sparse/mm/swapfile.c Tue Mar 15 14:50:10 2005 +0000 7.3 @@ -0,0 +1,1711 @@ 7.4 +/* 7.5 + * linux/mm/swapfile.c 7.6 + * 7.7 + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7.8 + * Swap reorganised 29.12.95, Stephen Tweedie 7.9 + */ 7.10 + 7.11 +#include <linux/config.h> 7.12 +#include <linux/mm.h> 7.13 +#include <linux/hugetlb.h> 7.14 +#include <linux/mman.h> 7.15 +#include <linux/slab.h> 7.16 +#include <linux/kernel_stat.h> 7.17 +#include <linux/swap.h> 7.18 +#include <linux/vmalloc.h> 7.19 +#include <linux/pagemap.h> 7.20 +#include <linux/namei.h> 7.21 +#include <linux/shm.h> 7.22 +#include <linux/blkdev.h> 7.23 +#include <linux/writeback.h> 7.24 +#include <linux/proc_fs.h> 7.25 +#include <linux/seq_file.h> 7.26 +#include <linux/init.h> 7.27 +#include <linux/module.h> 7.28 +#include <linux/rmap.h> 7.29 +#include <linux/security.h> 7.30 +#include <linux/backing-dev.h> 7.31 +#include <linux/syscalls.h> 7.32 + 7.33 +#include <asm/pgtable.h> 7.34 +#include <asm/tlbflush.h> 7.35 +#include <linux/swapops.h> 7.36 + 7.37 +spinlock_t swaplock = SPIN_LOCK_UNLOCKED; 7.38 +unsigned int nr_swapfiles; 7.39 +long total_swap_pages; 7.40 +static int swap_overflow; 7.41 + 7.42 +EXPORT_SYMBOL(total_swap_pages); 7.43 + 7.44 +static const char Bad_file[] = "Bad swap file entry "; 7.45 +static const char Unused_file[] = "Unused swap file entry "; 7.46 +static const char Bad_offset[] = "Bad swap offset entry "; 7.47 +static const char Unused_offset[] = "Unused swap offset entry "; 7.48 + 7.49 +struct swap_list_t swap_list = {-1, -1}; 7.50 + 7.51 +struct swap_info_struct swap_info[MAX_SWAPFILES]; 7.52 + 7.53 +static DECLARE_MUTEX(swapon_sem); 7.54 + 7.55 +/* 7.56 + * We need this because the bdev->unplug_fn can sleep and we cannot 7.57 + * hold swap_list_lock while calling the unplug_fn. And swap_list_lock 7.58 + * cannot be turned into a semaphore. 7.59 + */ 7.60 +static DECLARE_RWSEM(swap_unplug_sem); 7.61 + 7.62 +#define SWAPFILE_CLUSTER 256 7.63 + 7.64 +void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) 7.65 +{ 7.66 + swp_entry_t entry; 7.67 + 7.68 + down_read(&swap_unplug_sem); 7.69 + entry.val = page->private; 7.70 + if (PageSwapCache(page)) { 7.71 + struct block_device *bdev = swap_info[swp_type(entry)].bdev; 7.72 + struct backing_dev_info *bdi; 7.73 + 7.74 + /* 7.75 + * If the page is removed from swapcache from under us (with a 7.76 + * racy try_to_unuse/swapoff) we need an additional reference 7.77 + * count to avoid reading garbage from page->private above. If 7.78 + * the WARN_ON triggers during a swapoff it maybe the race 7.79 + * condition and it's harmless. However if it triggers without 7.80 + * swapoff it signals a problem. 7.81 + */ 7.82 + WARN_ON(page_count(page) <= 1); 7.83 + 7.84 + bdi = bdev->bd_inode->i_mapping->backing_dev_info; 7.85 + bdi->unplug_io_fn(bdi, page); 7.86 + } 7.87 + up_read(&swap_unplug_sem); 7.88 +} 7.89 + 7.90 +static inline int scan_swap_map(struct swap_info_struct *si) 7.91 +{ 7.92 + unsigned long offset; 7.93 + /* 7.94 + * We try to cluster swap pages by allocating them 7.95 + * sequentially in swap. Once we've allocated 7.96 + * SWAPFILE_CLUSTER pages this way, however, we resort to 7.97 + * first-free allocation, starting a new cluster. This 7.98 + * prevents us from scattering swap pages all over the entire 7.99 + * swap partition, so that we reduce overall disk seek times 7.100 + * between swap pages. 
-- sct */ 7.101 + if (si->cluster_nr) { 7.102 + while (si->cluster_next <= si->highest_bit) { 7.103 + offset = si->cluster_next++; 7.104 + if (si->swap_map[offset]) 7.105 + continue; 7.106 + si->cluster_nr--; 7.107 + goto got_page; 7.108 + } 7.109 + } 7.110 + si->cluster_nr = SWAPFILE_CLUSTER; 7.111 + 7.112 + /* try to find an empty (even not aligned) cluster. */ 7.113 + offset = si->lowest_bit; 7.114 + check_next_cluster: 7.115 + if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) 7.116 + { 7.117 + unsigned long nr; 7.118 + for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) 7.119 + if (si->swap_map[nr]) 7.120 + { 7.121 + offset = nr+1; 7.122 + goto check_next_cluster; 7.123 + } 7.124 + /* We found a completly empty cluster, so start 7.125 + * using it. 7.126 + */ 7.127 + goto got_page; 7.128 + } 7.129 + /* No luck, so now go finegrined as usual. -Andrea */ 7.130 + for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { 7.131 + if (si->swap_map[offset]) 7.132 + continue; 7.133 + si->lowest_bit = offset+1; 7.134 + got_page: 7.135 + if (offset == si->lowest_bit) 7.136 + si->lowest_bit++; 7.137 + if (offset == si->highest_bit) 7.138 + si->highest_bit--; 7.139 + if (si->lowest_bit > si->highest_bit) { 7.140 + si->lowest_bit = si->max; 7.141 + si->highest_bit = 0; 7.142 + } 7.143 + si->swap_map[offset] = 1; 7.144 + si->inuse_pages++; 7.145 + nr_swap_pages--; 7.146 + si->cluster_next = offset+1; 7.147 + return offset; 7.148 + } 7.149 + si->lowest_bit = si->max; 7.150 + si->highest_bit = 0; 7.151 + return 0; 7.152 +} 7.153 + 7.154 +swp_entry_t get_swap_page(void) 7.155 +{ 7.156 + struct swap_info_struct * p; 7.157 + unsigned long offset; 7.158 + swp_entry_t entry; 7.159 + int type, wrapped = 0; 7.160 + 7.161 + entry.val = 0; /* Out of memory */ 7.162 + swap_list_lock(); 7.163 + type = swap_list.next; 7.164 + if (type < 0) 7.165 + goto out; 7.166 + if (nr_swap_pages <= 0) 7.167 + goto out; 7.168 + 7.169 + while (1) { 7.170 + p = &swap_info[type]; 7.171 + if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { 7.172 + swap_device_lock(p); 7.173 + offset = scan_swap_map(p); 7.174 + swap_device_unlock(p); 7.175 + if (offset) { 7.176 + entry = swp_entry(type,offset); 7.177 + type = swap_info[type].next; 7.178 + if (type < 0 || 7.179 + p->prio != swap_info[type].prio) { 7.180 + swap_list.next = swap_list.head; 7.181 + } else { 7.182 + swap_list.next = type; 7.183 + } 7.184 + goto out; 7.185 + } 7.186 + } 7.187 + type = p->next; 7.188 + if (!wrapped) { 7.189 + if (type < 0 || p->prio != swap_info[type].prio) { 7.190 + type = swap_list.head; 7.191 + wrapped = 1; 7.192 + } 7.193 + } else 7.194 + if (type < 0) 7.195 + goto out; /* out of swap space */ 7.196 + } 7.197 +out: 7.198 + swap_list_unlock(); 7.199 + return entry; 7.200 +} 7.201 + 7.202 +static struct swap_info_struct * swap_info_get(swp_entry_t entry) 7.203 +{ 7.204 + struct swap_info_struct * p; 7.205 + unsigned long offset, type; 7.206 + 7.207 + if (!entry.val) 7.208 + goto out; 7.209 + type = swp_type(entry); 7.210 + if (type >= nr_swapfiles) 7.211 + goto bad_nofile; 7.212 + p = & swap_info[type]; 7.213 + if (!(p->flags & SWP_USED)) 7.214 + goto bad_device; 7.215 + offset = swp_offset(entry); 7.216 + if (offset >= p->max) 7.217 + goto bad_offset; 7.218 + if (!p->swap_map[offset]) 7.219 + goto bad_free; 7.220 + swap_list_lock(); 7.221 + if (p->prio > swap_info[swap_list.next].prio) 7.222 + swap_list.next = type; 7.223 + swap_device_lock(p); 7.224 + return p; 7.225 + 7.226 +bad_free: 7.227 + printk(KERN_ERR "swap_free: %s%08lx\n", 
Unused_offset, entry.val); 7.228 + goto out; 7.229 +bad_offset: 7.230 + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); 7.231 + goto out; 7.232 +bad_device: 7.233 + printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); 7.234 + goto out; 7.235 +bad_nofile: 7.236 + printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 7.237 +out: 7.238 + return NULL; 7.239 +} 7.240 + 7.241 +static void swap_info_put(struct swap_info_struct * p) 7.242 +{ 7.243 + swap_device_unlock(p); 7.244 + swap_list_unlock(); 7.245 +} 7.246 + 7.247 +static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) 7.248 +{ 7.249 + int count = p->swap_map[offset]; 7.250 + 7.251 + if (count < SWAP_MAP_MAX) { 7.252 + count--; 7.253 + p->swap_map[offset] = count; 7.254 + if (!count) { 7.255 + if (offset < p->lowest_bit) 7.256 + p->lowest_bit = offset; 7.257 + if (offset > p->highest_bit) 7.258 + p->highest_bit = offset; 7.259 + nr_swap_pages++; 7.260 + p->inuse_pages--; 7.261 + } 7.262 + } 7.263 + return count; 7.264 +} 7.265 + 7.266 +/* 7.267 + * Caller has made sure that the swapdevice corresponding to entry 7.268 + * is still around or has not been recycled. 7.269 + */ 7.270 +void swap_free(swp_entry_t entry) 7.271 +{ 7.272 + struct swap_info_struct * p; 7.273 + 7.274 + p = swap_info_get(entry); 7.275 + if (p) { 7.276 + swap_entry_free(p, swp_offset(entry)); 7.277 + swap_info_put(p); 7.278 + } 7.279 +} 7.280 + 7.281 +/* 7.282 + * Check if we're the only user of a swap page, 7.283 + * when the page is locked. 7.284 + */ 7.285 +static int exclusive_swap_page(struct page *page) 7.286 +{ 7.287 + int retval = 0; 7.288 + struct swap_info_struct * p; 7.289 + swp_entry_t entry; 7.290 + 7.291 + entry.val = page->private; 7.292 + p = swap_info_get(entry); 7.293 + if (p) { 7.294 + /* Is the only swap cache user the cache itself? */ 7.295 + if (p->swap_map[swp_offset(entry)] == 1) { 7.296 + /* Recheck the page count with the swapcache lock held.. */ 7.297 + spin_lock_irq(&swapper_space.tree_lock); 7.298 + if (page_count(page) == 2) 7.299 + retval = 1; 7.300 + spin_unlock_irq(&swapper_space.tree_lock); 7.301 + } 7.302 + swap_info_put(p); 7.303 + } 7.304 + return retval; 7.305 +} 7.306 + 7.307 +/* 7.308 + * We can use this swap cache entry directly 7.309 + * if there are no other references to it. 7.310 + * 7.311 + * Here "exclusive_swap_page()" does the real 7.312 + * work, but we opportunistically check whether 7.313 + * we need to get all the locks first.. 7.314 + */ 7.315 +int can_share_swap_page(struct page *page) 7.316 +{ 7.317 + int retval = 0; 7.318 + 7.319 + if (!PageLocked(page)) 7.320 + BUG(); 7.321 + switch (page_count(page)) { 7.322 + case 3: 7.323 + if (!PagePrivate(page)) 7.324 + break; 7.325 + /* Fallthrough */ 7.326 + case 2: 7.327 + if (!PageSwapCache(page)) 7.328 + break; 7.329 + retval = exclusive_swap_page(page); 7.330 + break; 7.331 + case 1: 7.332 + if (PageReserved(page)) 7.333 + break; 7.334 + retval = 1; 7.335 + } 7.336 + return retval; 7.337 +} 7.338 + 7.339 +/* 7.340 + * Work out if there are any other processes sharing this 7.341 + * swap cache page. Free it if you can. Return success. 
7.342 + */ 7.343 +int remove_exclusive_swap_page(struct page *page) 7.344 +{ 7.345 + int retval; 7.346 + struct swap_info_struct * p; 7.347 + swp_entry_t entry; 7.348 + 7.349 + BUG_ON(PagePrivate(page)); 7.350 + BUG_ON(!PageLocked(page)); 7.351 + 7.352 + if (!PageSwapCache(page)) 7.353 + return 0; 7.354 + if (PageWriteback(page)) 7.355 + return 0; 7.356 + if (page_count(page) != 2) /* 2: us + cache */ 7.357 + return 0; 7.358 + 7.359 + entry.val = page->private; 7.360 + p = swap_info_get(entry); 7.361 + if (!p) 7.362 + return 0; 7.363 + 7.364 + /* Is the only swap cache user the cache itself? */ 7.365 + retval = 0; 7.366 + if (p->swap_map[swp_offset(entry)] == 1) { 7.367 + /* Recheck the page count with the swapcache lock held.. */ 7.368 + spin_lock_irq(&swapper_space.tree_lock); 7.369 + if ((page_count(page) == 2) && !PageWriteback(page)) { 7.370 + __delete_from_swap_cache(page); 7.371 + SetPageDirty(page); 7.372 + retval = 1; 7.373 + } 7.374 + spin_unlock_irq(&swapper_space.tree_lock); 7.375 + } 7.376 + swap_info_put(p); 7.377 + 7.378 + if (retval) { 7.379 + swap_free(entry); 7.380 + page_cache_release(page); 7.381 + } 7.382 + 7.383 + return retval; 7.384 +} 7.385 + 7.386 +/* 7.387 + * Free the swap entry like above, but also try to 7.388 + * free the page cache entry if it is the last user. 7.389 + */ 7.390 +void free_swap_and_cache(swp_entry_t entry) 7.391 +{ 7.392 + struct swap_info_struct * p; 7.393 + struct page *page = NULL; 7.394 + 7.395 + p = swap_info_get(entry); 7.396 + if (p) { 7.397 + if (swap_entry_free(p, swp_offset(entry)) == 1) { 7.398 + spin_lock_irq(&swapper_space.tree_lock); 7.399 + page = radix_tree_lookup(&swapper_space.page_tree, 7.400 + entry.val); 7.401 + if (page && TestSetPageLocked(page)) 7.402 + page = NULL; 7.403 + spin_unlock_irq(&swapper_space.tree_lock); 7.404 + } 7.405 + swap_info_put(p); 7.406 + } 7.407 + if (page) { 7.408 + int one_user; 7.409 + 7.410 + BUG_ON(PagePrivate(page)); 7.411 + page_cache_get(page); 7.412 + one_user = (page_count(page) == 2); 7.413 + /* Only cache user (+us), or swap space full? Free it! */ 7.414 + if (!PageWriteback(page) && (one_user || vm_swap_full())) { 7.415 + delete_from_swap_cache(page); 7.416 + SetPageDirty(page); 7.417 + } 7.418 + unlock_page(page); 7.419 + page_cache_release(page); 7.420 + } 7.421 +} 7.422 + 7.423 +/* 7.424 + * The swap entry has been read in advance, and we return 1 to indicate 7.425 + * that the page has been used or is no longer needed. 7.426 + * 7.427 + * Always set the resulting pte to be nowrite (the same as COW pages 7.428 + * after one process has exited). We don't know just how many PTEs will 7.429 + * share this swap entry, so be cautious and let do_wp_page work out 7.430 + * what to do if a write is requested later. 
7.431 + */ 7.432 +/* vma->vm_mm->page_table_lock is held */ 7.433 +static void 7.434 +unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, 7.435 + swp_entry_t entry, struct page *page) 7.436 +{ 7.437 + vma->vm_mm->rss++; 7.438 + get_page(page); 7.439 + set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); 7.440 + page_add_anon_rmap(page, vma, address); 7.441 + swap_free(entry); 7.442 +} 7.443 + 7.444 +/* vma->vm_mm->page_table_lock is held */ 7.445 +static unsigned long unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, 7.446 + unsigned long address, unsigned long size, unsigned long offset, 7.447 + swp_entry_t entry, struct page *page) 7.448 +{ 7.449 + pte_t * pte; 7.450 + unsigned long end; 7.451 + pte_t swp_pte = swp_entry_to_pte(entry); 7.452 + 7.453 + if (pmd_none(*dir)) 7.454 + return 0; 7.455 + if (pmd_bad(*dir)) { 7.456 + pmd_ERROR(*dir); 7.457 + pmd_clear(dir); 7.458 + return 0; 7.459 + } 7.460 + pte = pte_offset_map(dir, address); 7.461 + offset += address & PMD_MASK; 7.462 + address &= ~PMD_MASK; 7.463 + end = address + size; 7.464 + if (end > PMD_SIZE) 7.465 + end = PMD_SIZE; 7.466 + do { 7.467 + /* 7.468 + * swapoff spends a _lot_ of time in this loop! 7.469 + * Test inline before going to call unuse_pte. 7.470 + */ 7.471 + if (unlikely(pte_same(*pte, swp_pte))) { 7.472 + unuse_pte(vma, offset + address, pte, entry, page); 7.473 + pte_unmap(pte); 7.474 + 7.475 + /* 7.476 + * Move the page to the active list so it is not 7.477 + * immediately swapped out again after swapon. 7.478 + */ 7.479 + activate_page(page); 7.480 + 7.481 + /* add 1 since address may be 0 */ 7.482 + return 1 + offset + address; 7.483 + } 7.484 + address += PAGE_SIZE; 7.485 + pte++; 7.486 + } while (address && (address < end)); 7.487 + pte_unmap(pte - 1); 7.488 + return 0; 7.489 +} 7.490 + 7.491 +/* vma->vm_mm->page_table_lock is held */ 7.492 +static unsigned long unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, 7.493 + unsigned long address, unsigned long size, 7.494 + swp_entry_t entry, struct page *page) 7.495 +{ 7.496 + pmd_t * pmd; 7.497 + unsigned long offset, end; 7.498 + unsigned long foundaddr; 7.499 + 7.500 + if (pgd_none(*dir)) 7.501 + return 0; 7.502 + if (pgd_bad(*dir)) { 7.503 + pgd_ERROR(*dir); 7.504 + pgd_clear(dir); 7.505 + return 0; 7.506 + } 7.507 + pmd = pmd_offset(dir, address); 7.508 + offset = address & PGDIR_MASK; 7.509 + address &= ~PGDIR_MASK; 7.510 + end = address + size; 7.511 + if (end > PGDIR_SIZE) 7.512 + end = PGDIR_SIZE; 7.513 + if (address >= end) 7.514 + BUG(); 7.515 + do { 7.516 + foundaddr = unuse_pmd(vma, pmd, address, end - address, 7.517 + offset, entry, page); 7.518 + if (foundaddr) 7.519 + return foundaddr; 7.520 + address = (address + PMD_SIZE) & PMD_MASK; 7.521 + pmd++; 7.522 + } while (address && (address < end)); 7.523 + return 0; 7.524 +} 7.525 + 7.526 +/* vma->vm_mm->page_table_lock is held */ 7.527 +static unsigned long unuse_vma(struct vm_area_struct * vma, 7.528 + swp_entry_t entry, struct page *page) 7.529 +{ 7.530 + pgd_t *pgdir; 7.531 + unsigned long start, end; 7.532 + unsigned long foundaddr; 7.533 + 7.534 + if (page->mapping) { 7.535 + start = page_address_in_vma(page, vma); 7.536 + if (start == -EFAULT) 7.537 + return 0; 7.538 + else 7.539 + end = start + PAGE_SIZE; 7.540 + } else { 7.541 + start = vma->vm_start; 7.542 + end = vma->vm_end; 7.543 + } 7.544 + pgdir = pgd_offset(vma->vm_mm, start); 7.545 + do { 7.546 + foundaddr = unuse_pgd(vma, pgdir, start, end - start, 7.547 + entry, page); 7.548 + if (foundaddr) 7.549 + 
return foundaddr; 7.550 + start = (start + PGDIR_SIZE) & PGDIR_MASK; 7.551 + pgdir++; 7.552 + } while (start && (start < end)); 7.553 + return 0; 7.554 +} 7.555 + 7.556 +static int unuse_process(struct mm_struct * mm, 7.557 + swp_entry_t entry, struct page* page) 7.558 +{ 7.559 + struct vm_area_struct* vma; 7.560 + unsigned long foundaddr = 0; 7.561 + 7.562 + /* 7.563 + * Go through process' page directory. 7.564 + */ 7.565 + if (!down_read_trylock(&mm->mmap_sem)) { 7.566 + /* 7.567 + * Our reference to the page stops try_to_unmap_one from 7.568 + * unmapping its ptes, so swapoff can make progress. 7.569 + */ 7.570 + unlock_page(page); 7.571 + down_read(&mm->mmap_sem); 7.572 + lock_page(page); 7.573 + } 7.574 + spin_lock(&mm->page_table_lock); 7.575 + for (vma = mm->mmap; vma; vma = vma->vm_next) { 7.576 + if (vma->anon_vma) { 7.577 + foundaddr = unuse_vma(vma, entry, page); 7.578 + if (foundaddr) 7.579 + break; 7.580 + } 7.581 + } 7.582 +#ifdef CONFIG_XEN_BATCH_MODE2 7.583 + XEN_flush_page_update_queue(); 7.584 +#endif 7.585 + spin_unlock(&mm->page_table_lock); 7.586 + up_read(&mm->mmap_sem); 7.587 + /* 7.588 + * Currently unuse_process cannot fail, but leave error handling 7.589 + * at call sites for now, since we change it from time to time. 7.590 + */ 7.591 + return 0; 7.592 +} 7.593 + 7.594 +/* 7.595 + * Scan swap_map from current position to next entry still in use. 7.596 + * Recycle to start on reaching the end, returning 0 when empty. 7.597 + */ 7.598 +static int find_next_to_unuse(struct swap_info_struct *si, int prev) 7.599 +{ 7.600 + int max = si->max; 7.601 + int i = prev; 7.602 + int count; 7.603 + 7.604 + /* 7.605 + * No need for swap_device_lock(si) here: we're just looking 7.606 + * for whether an entry is in use, not modifying it; false 7.607 + * hits are okay, and sys_swapoff() has already prevented new 7.608 + * allocations from this area (while holding swap_list_lock()). 7.609 + */ 7.610 + for (;;) { 7.611 + if (++i >= max) { 7.612 + if (!prev) { 7.613 + i = 0; 7.614 + break; 7.615 + } 7.616 + /* 7.617 + * No entries in use at top of swap_map, 7.618 + * loop back to start and recheck there. 7.619 + */ 7.620 + max = prev + 1; 7.621 + prev = 0; 7.622 + i = 1; 7.623 + } 7.624 + count = si->swap_map[i]; 7.625 + if (count && count != SWAP_MAP_BAD) 7.626 + break; 7.627 + } 7.628 + return i; 7.629 +} 7.630 + 7.631 +/* 7.632 + * We completely avoid races by reading each swap page in advance, 7.633 + * and then search for the process using it. All the necessary 7.634 + * page table adjustments can then be made atomically. 7.635 + */ 7.636 +static int try_to_unuse(unsigned int type) 7.637 +{ 7.638 + struct swap_info_struct * si = &swap_info[type]; 7.639 + struct mm_struct *start_mm; 7.640 + unsigned short *swap_map; 7.641 + unsigned short swcount; 7.642 + struct page *page; 7.643 + swp_entry_t entry; 7.644 + int i = 0; 7.645 + int retval = 0; 7.646 + int reset_overflow = 0; 7.647 + int shmem; 7.648 + 7.649 + /* 7.650 + * When searching mms for an entry, a good strategy is to 7.651 + * start at the first mm we freed the previous entry from 7.652 + * (though actually we don't notice whether we or coincidence 7.653 + * freed the entry). Initialize this start_mm with a hold. 7.654 + * 7.655 + * A simpler strategy would be to start at the last mm we 7.656 + * freed the previous entry from; but that would take less 7.657 + * advantage of mmlist ordering, which clusters forked mms 7.658 + * together, child after parent. 
If we race with dup_mmap(), we 7.659 + * prefer to resolve parent before child, lest we miss entries 7.660 + * duplicated after we scanned child: using last mm would invert 7.661 + * that. Though it's only a serious concern when an overflowed 7.662 + * swap count is reset from SWAP_MAP_MAX, preventing a rescan. 7.663 + */ 7.664 + start_mm = &init_mm; 7.665 + atomic_inc(&init_mm.mm_users); 7.666 + 7.667 + /* 7.668 + * Keep on scanning until all entries have gone. Usually, 7.669 + * one pass through swap_map is enough, but not necessarily: 7.670 + * there are races when an instance of an entry might be missed. 7.671 + */ 7.672 + while ((i = find_next_to_unuse(si, i)) != 0) { 7.673 + if (signal_pending(current)) { 7.674 + retval = -EINTR; 7.675 + break; 7.676 + } 7.677 + 7.678 + /* 7.679 + * Get a page for the entry, using the existing swap 7.680 + * cache page if there is one. Otherwise, get a clean 7.681 + * page and read the swap into it. 7.682 + */ 7.683 + swap_map = &si->swap_map[i]; 7.684 + entry = swp_entry(type, i); 7.685 + page = read_swap_cache_async(entry, NULL, 0); 7.686 + if (!page) { 7.687 + /* 7.688 + * Either swap_duplicate() failed because entry 7.689 + * has been freed independently, and will not be 7.690 + * reused since sys_swapoff() already disabled 7.691 + * allocation from here, or alloc_page() failed. 7.692 + */ 7.693 + if (!*swap_map) 7.694 + continue; 7.695 + retval = -ENOMEM; 7.696 + break; 7.697 + } 7.698 + 7.699 + /* 7.700 + * Don't hold on to start_mm if it looks like exiting. 7.701 + */ 7.702 + if (atomic_read(&start_mm->mm_users) == 1) { 7.703 + mmput(start_mm); 7.704 + start_mm = &init_mm; 7.705 + atomic_inc(&init_mm.mm_users); 7.706 + } 7.707 + 7.708 + /* 7.709 + * Wait for and lock page. When do_swap_page races with 7.710 + * try_to_unuse, do_swap_page can handle the fault much 7.711 + * faster than try_to_unuse can locate the entry. This 7.712 + * apparently redundant "wait_on_page_locked" lets try_to_unuse 7.713 + * defer to do_swap_page in such a case - in some tests, 7.714 + * do_swap_page and try_to_unuse repeatedly compete. 7.715 + */ 7.716 + wait_on_page_locked(page); 7.717 + wait_on_page_writeback(page); 7.718 + lock_page(page); 7.719 + wait_on_page_writeback(page); 7.720 + 7.721 + /* 7.722 + * Remove all references to entry. 7.723 + * Whenever we reach init_mm, there's no address space 7.724 + * to search, but use it as a reminder to search shmem. 
7.725 + */ 7.726 + shmem = 0; 7.727 + swcount = *swap_map; 7.728 + if (swcount > 1) { 7.729 + if (start_mm == &init_mm) 7.730 + shmem = shmem_unuse(entry, page); 7.731 + else 7.732 + retval = unuse_process(start_mm, entry, page); 7.733 + } 7.734 + if (*swap_map > 1) { 7.735 + int set_start_mm = (*swap_map >= swcount); 7.736 + struct list_head *p = &start_mm->mmlist; 7.737 + struct mm_struct *new_start_mm = start_mm; 7.738 + struct mm_struct *prev_mm = start_mm; 7.739 + struct mm_struct *mm; 7.740 + 7.741 + atomic_inc(&new_start_mm->mm_users); 7.742 + atomic_inc(&prev_mm->mm_users); 7.743 + spin_lock(&mmlist_lock); 7.744 + while (*swap_map > 1 && !retval && 7.745 + (p = p->next) != &start_mm->mmlist) { 7.746 + mm = list_entry(p, struct mm_struct, mmlist); 7.747 + if (atomic_inc_return(&mm->mm_users) == 1) { 7.748 + atomic_dec(&mm->mm_users); 7.749 + continue; 7.750 + } 7.751 + spin_unlock(&mmlist_lock); 7.752 + mmput(prev_mm); 7.753 + prev_mm = mm; 7.754 + 7.755 + cond_resched(); 7.756 + 7.757 + swcount = *swap_map; 7.758 + if (swcount <= 1) 7.759 + ; 7.760 + else if (mm == &init_mm) { 7.761 + set_start_mm = 1; 7.762 + shmem = shmem_unuse(entry, page); 7.763 + } else 7.764 + retval = unuse_process(mm, entry, page); 7.765 + if (set_start_mm && *swap_map < swcount) { 7.766 + mmput(new_start_mm); 7.767 + atomic_inc(&mm->mm_users); 7.768 + new_start_mm = mm; 7.769 + set_start_mm = 0; 7.770 + } 7.771 + spin_lock(&mmlist_lock); 7.772 + } 7.773 + spin_unlock(&mmlist_lock); 7.774 + mmput(prev_mm); 7.775 + mmput(start_mm); 7.776 + start_mm = new_start_mm; 7.777 + } 7.778 + if (retval) { 7.779 + unlock_page(page); 7.780 + page_cache_release(page); 7.781 + break; 7.782 + } 7.783 + 7.784 + /* 7.785 + * How could swap count reach 0x7fff when the maximum 7.786 + * pid is 0x7fff, and there's no way to repeat a swap 7.787 + * page within an mm (except in shmem, where it's the 7.788 + * shared object which takes the reference count)? 7.789 + * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. 7.790 + * 7.791 + * If that's wrong, then we should worry more about 7.792 + * exit_mmap() and do_munmap() cases described above: 7.793 + * we might be resetting SWAP_MAP_MAX too early here. 7.794 + * We know "Undead"s can happen, they're okay, so don't 7.795 + * report them; but do report if we reset SWAP_MAP_MAX. 7.796 + */ 7.797 + if (*swap_map == SWAP_MAP_MAX) { 7.798 + swap_device_lock(si); 7.799 + *swap_map = 1; 7.800 + swap_device_unlock(si); 7.801 + reset_overflow = 1; 7.802 + } 7.803 + 7.804 + /* 7.805 + * If a reference remains (rare), we would like to leave 7.806 + * the page in the swap cache; but try_to_unmap could 7.807 + * then re-duplicate the entry once we drop page lock, 7.808 + * so we might loop indefinitely; also, that page could 7.809 + * not be swapped out to other storage meanwhile. So: 7.810 + * delete from cache even if there's another reference, 7.811 + * after ensuring that the data has been saved to disk - 7.812 + * since if the reference remains (rarer), it will be 7.813 + * read from disk into another page. Splitting into two 7.814 + * pages would be incorrect if swap supported "shared 7.815 + * private" pages, but they are handled by tmpfs files. 7.816 + * 7.817 + * Note shmem_unuse already deleted a swappage from 7.818 + * the swap cache, unless the move to filepage failed: 7.819 + * in which case it left swappage in cache, lowered its 7.820 + * swap count to pass quickly through the loops above, 7.821 + * and now we must reincrement count to try again later. 
7.822 + */ 7.823 + if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 7.824 + struct writeback_control wbc = { 7.825 + .sync_mode = WB_SYNC_NONE, 7.826 + }; 7.827 + 7.828 + swap_writepage(page, &wbc); 7.829 + lock_page(page); 7.830 + wait_on_page_writeback(page); 7.831 + } 7.832 + if (PageSwapCache(page)) { 7.833 + if (shmem) 7.834 + swap_duplicate(entry); 7.835 + else 7.836 + delete_from_swap_cache(page); 7.837 + } 7.838 + 7.839 + /* 7.840 + * So we could skip searching mms once swap count went 7.841 + * to 1, we did not mark any present ptes as dirty: must 7.842 + * mark page dirty so shrink_list will preserve it. 7.843 + */ 7.844 + SetPageDirty(page); 7.845 + unlock_page(page); 7.846 + page_cache_release(page); 7.847 + 7.848 + /* 7.849 + * Make sure that we aren't completely killing 7.850 + * interactive performance. 7.851 + */ 7.852 + cond_resched(); 7.853 + } 7.854 + 7.855 + mmput(start_mm); 7.856 + if (reset_overflow) { 7.857 + printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); 7.858 + swap_overflow = 0; 7.859 + } 7.860 + return retval; 7.861 +} 7.862 + 7.863 +/* 7.864 + * After a successful try_to_unuse, if no swap is now in use, we know we 7.865 + * can empty the mmlist. swap_list_lock must be held on entry and exit. 7.866 + * Note that mmlist_lock nests inside swap_list_lock, and an mm must be 7.867 + * added to the mmlist just after page_duplicate - before would be racy. 7.868 + */ 7.869 +static void drain_mmlist(void) 7.870 +{ 7.871 + struct list_head *p, *next; 7.872 + unsigned int i; 7.873 + 7.874 + for (i = 0; i < nr_swapfiles; i++) 7.875 + if (swap_info[i].inuse_pages) 7.876 + return; 7.877 + spin_lock(&mmlist_lock); 7.878 + list_for_each_safe(p, next, &init_mm.mmlist) 7.879 + list_del_init(p); 7.880 + spin_unlock(&mmlist_lock); 7.881 +} 7.882 + 7.883 +/* 7.884 + * Use this swapdev's extent info to locate the (PAGE_SIZE) block which 7.885 + * corresponds to page offset `offset'. 7.886 + */ 7.887 +sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) 7.888 +{ 7.889 + struct swap_extent *se = sis->curr_swap_extent; 7.890 + struct swap_extent *start_se = se; 7.891 + 7.892 + for ( ; ; ) { 7.893 + struct list_head *lh; 7.894 + 7.895 + if (se->start_page <= offset && 7.896 + offset < (se->start_page + se->nr_pages)) { 7.897 + return se->start_block + (offset - se->start_page); 7.898 + } 7.899 + lh = se->list.prev; 7.900 + if (lh == &sis->extent_list) 7.901 + lh = lh->prev; 7.902 + se = list_entry(lh, struct swap_extent, list); 7.903 + sis->curr_swap_extent = se; 7.904 + BUG_ON(se == start_se); /* It *must* be present */ 7.905 + } 7.906 +} 7.907 + 7.908 +/* 7.909 + * Free all of a swapdev's extent information 7.910 + */ 7.911 +static void destroy_swap_extents(struct swap_info_struct *sis) 7.912 +{ 7.913 + while (!list_empty(&sis->extent_list)) { 7.914 + struct swap_extent *se; 7.915 + 7.916 + se = list_entry(sis->extent_list.next, 7.917 + struct swap_extent, list); 7.918 + list_del(&se->list); 7.919 + kfree(se); 7.920 + } 7.921 + sis->nr_extents = 0; 7.922 +} 7.923 + 7.924 +/* 7.925 + * Add a block range (and the corresponding page range) into this swapdev's 7.926 + * extent list. The extent list is kept sorted in block order. 7.927 + * 7.928 + * This function rather assumes that it is called in ascending sector_t order. 7.929 + * It doesn't look for extent coalescing opportunities. 
7.930 + */ 7.931 +static int 7.932 +add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 7.933 + unsigned long nr_pages, sector_t start_block) 7.934 +{ 7.935 + struct swap_extent *se; 7.936 + struct swap_extent *new_se; 7.937 + struct list_head *lh; 7.938 + 7.939 + lh = sis->extent_list.next; /* The highest-addressed block */ 7.940 + while (lh != &sis->extent_list) { 7.941 + se = list_entry(lh, struct swap_extent, list); 7.942 + if (se->start_block + se->nr_pages == start_block && 7.943 + se->start_page + se->nr_pages == start_page) { 7.944 + /* Merge it */ 7.945 + se->nr_pages += nr_pages; 7.946 + return 0; 7.947 + } 7.948 + lh = lh->next; 7.949 + } 7.950 + 7.951 + /* 7.952 + * No merge. Insert a new extent, preserving ordering. 7.953 + */ 7.954 + new_se = kmalloc(sizeof(*se), GFP_KERNEL); 7.955 + if (new_se == NULL) 7.956 + return -ENOMEM; 7.957 + new_se->start_page = start_page; 7.958 + new_se->nr_pages = nr_pages; 7.959 + new_se->start_block = start_block; 7.960 + 7.961 + lh = sis->extent_list.prev; /* The lowest block */ 7.962 + while (lh != &sis->extent_list) { 7.963 + se = list_entry(lh, struct swap_extent, list); 7.964 + if (se->start_block > start_block) 7.965 + break; 7.966 + lh = lh->prev; 7.967 + } 7.968 + list_add_tail(&new_se->list, lh); 7.969 + sis->nr_extents++; 7.970 + return 0; 7.971 +} 7.972 + 7.973 +/* 7.974 + * A `swap extent' is a simple thing which maps a contiguous range of pages 7.975 + * onto a contiguous range of disk blocks. An ordered list of swap extents 7.976 + * is built at swapon time and is then used at swap_writepage/swap_readpage 7.977 + * time for locating where on disk a page belongs. 7.978 + * 7.979 + * If the swapfile is an S_ISBLK block device, a single extent is installed. 7.980 + * This is done so that the main operating code can treat S_ISBLK and S_ISREG 7.981 + * swap files identically. 7.982 + * 7.983 + * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap 7.984 + * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK 7.985 + * swapfiles are handled *identically* after swapon time. 7.986 + * 7.987 + * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks 7.988 + * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If 7.989 + * some stray blocks are found which do not fall within the PAGE_SIZE alignment 7.990 + * requirements, they are simply tossed out - we will never use those blocks 7.991 + * for swapping. 7.992 + * 7.993 + * For S_ISREG swapfiles we hold i_sem across the life of the swapon. This 7.994 + * prevents root from shooting her foot off by ftruncating an in-use swapfile, 7.995 + * which will scribble on the fs. 7.996 + * 7.997 + * The amount of disk space which a single swap extent represents varies. 7.998 + * Typically it is in the 1-4 megabyte range. So we can have hundreds of 7.999 + * extents in the list. To avoid much list walking, we cache the previous 7.1000 + * search location in `curr_swap_extent', and start new searches from there. 7.1001 + * This is extremely effective. The average number of iterations in 7.1002 + * map_swap_page() has been measured at about 0.3 per page. - akpm. 
7.1003 + */ 7.1004 +static int setup_swap_extents(struct swap_info_struct *sis) 7.1005 +{ 7.1006 + struct inode *inode; 7.1007 + unsigned blocks_per_page; 7.1008 + unsigned long page_no; 7.1009 + unsigned blkbits; 7.1010 + sector_t probe_block; 7.1011 + sector_t last_block; 7.1012 + int ret; 7.1013 + 7.1014 + inode = sis->swap_file->f_mapping->host; 7.1015 + if (S_ISBLK(inode->i_mode)) { 7.1016 + ret = add_swap_extent(sis, 0, sis->max, 0); 7.1017 + goto done; 7.1018 + } 7.1019 + 7.1020 + blkbits = inode->i_blkbits; 7.1021 + blocks_per_page = PAGE_SIZE >> blkbits; 7.1022 + 7.1023 + /* 7.1024 + * Map all the blocks into the extent list. This code doesn't try 7.1025 + * to be very smart. 7.1026 + */ 7.1027 + probe_block = 0; 7.1028 + page_no = 0; 7.1029 + last_block = i_size_read(inode) >> blkbits; 7.1030 + while ((probe_block + blocks_per_page) <= last_block && 7.1031 + page_no < sis->max) { 7.1032 + unsigned block_in_page; 7.1033 + sector_t first_block; 7.1034 + 7.1035 + first_block = bmap(inode, probe_block); 7.1036 + if (first_block == 0) 7.1037 + goto bad_bmap; 7.1038 + 7.1039 + /* 7.1040 + * It must be PAGE_SIZE aligned on-disk 7.1041 + */ 7.1042 + if (first_block & (blocks_per_page - 1)) { 7.1043 + probe_block++; 7.1044 + goto reprobe; 7.1045 + } 7.1046 + 7.1047 + for (block_in_page = 1; block_in_page < blocks_per_page; 7.1048 + block_in_page++) { 7.1049 + sector_t block; 7.1050 + 7.1051 + block = bmap(inode, probe_block + block_in_page); 7.1052 + if (block == 0) 7.1053 + goto bad_bmap; 7.1054 + if (block != first_block + block_in_page) { 7.1055 + /* Discontiguity */ 7.1056 + probe_block++; 7.1057 + goto reprobe; 7.1058 + } 7.1059 + } 7.1060 + 7.1061 + /* 7.1062 + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks 7.1063 + */ 7.1064 + ret = add_swap_extent(sis, page_no, 1, 7.1065 + first_block >> (PAGE_SHIFT - blkbits)); 7.1066 + if (ret) 7.1067 + goto out; 7.1068 + page_no++; 7.1069 + probe_block += blocks_per_page; 7.1070 +reprobe: 7.1071 + continue; 7.1072 + } 7.1073 + ret = 0; 7.1074 + if (page_no == 0) 7.1075 + ret = -EINVAL; 7.1076 + sis->max = page_no; 7.1077 + sis->highest_bit = page_no - 1; 7.1078 +done: 7.1079 + sis->curr_swap_extent = list_entry(sis->extent_list.prev, 7.1080 + struct swap_extent, list); 7.1081 + goto out; 7.1082 +bad_bmap: 7.1083 + printk(KERN_ERR "swapon: swapfile has holes\n"); 7.1084 + ret = -EINVAL; 7.1085 +out: 7.1086 + return ret; 7.1087 +} 7.1088 + 7.1089 +#if 0 /* We don't need this yet */ 7.1090 +#include <linux/backing-dev.h> 7.1091 +int page_queue_congested(struct page *page) 7.1092 +{ 7.1093 + struct backing_dev_info *bdi; 7.1094 + 7.1095 + BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ 7.1096 + 7.1097 + if (PageSwapCache(page)) { 7.1098 + swp_entry_t entry = { .val = page->private }; 7.1099 + struct swap_info_struct *sis; 7.1100 + 7.1101 + sis = get_swap_info_struct(swp_type(entry)); 7.1102 + bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info; 7.1103 + } else 7.1104 + bdi = page->mapping->backing_dev_info; 7.1105 + return bdi_write_congested(bdi); 7.1106 +} 7.1107 +#endif 7.1108 + 7.1109 +asmlinkage long sys_swapoff(const char __user * specialfile) 7.1110 +{ 7.1111 + struct swap_info_struct * p = NULL; 7.1112 + unsigned short *swap_map; 7.1113 + struct file *swap_file, *victim; 7.1114 + struct address_space *mapping; 7.1115 + struct inode *inode; 7.1116 + char * pathname; 7.1117 + int i, type, prev; 7.1118 + int err; 7.1119 + 7.1120 + if (!capable(CAP_SYS_ADMIN)) 7.1121 + return -EPERM; 7.1122 + 7.1123 + pathname = 
getname(specialfile); 7.1124 + err = PTR_ERR(pathname); 7.1125 + if (IS_ERR(pathname)) 7.1126 + goto out; 7.1127 + 7.1128 + victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); 7.1129 + putname(pathname); 7.1130 + err = PTR_ERR(victim); 7.1131 + if (IS_ERR(victim)) 7.1132 + goto out; 7.1133 + 7.1134 + mapping = victim->f_mapping; 7.1135 + prev = -1; 7.1136 + swap_list_lock(); 7.1137 + for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 7.1138 + p = swap_info + type; 7.1139 + if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { 7.1140 + if (p->swap_file->f_mapping == mapping) 7.1141 + break; 7.1142 + } 7.1143 + prev = type; 7.1144 + } 7.1145 + if (type < 0) { 7.1146 + err = -EINVAL; 7.1147 + swap_list_unlock(); 7.1148 + goto out_dput; 7.1149 + } 7.1150 + if (!security_vm_enough_memory(p->pages)) 7.1151 + vm_unacct_memory(p->pages); 7.1152 + else { 7.1153 + err = -ENOMEM; 7.1154 + swap_list_unlock(); 7.1155 + goto out_dput; 7.1156 + } 7.1157 + if (prev < 0) { 7.1158 + swap_list.head = p->next; 7.1159 + } else { 7.1160 + swap_info[prev].next = p->next; 7.1161 + } 7.1162 + if (type == swap_list.next) { 7.1163 + /* just pick something that's safe... */ 7.1164 + swap_list.next = swap_list.head; 7.1165 + } 7.1166 + nr_swap_pages -= p->pages; 7.1167 + total_swap_pages -= p->pages; 7.1168 + p->flags &= ~SWP_WRITEOK; 7.1169 + swap_list_unlock(); 7.1170 + current->flags |= PF_SWAPOFF; 7.1171 + err = try_to_unuse(type); 7.1172 + current->flags &= ~PF_SWAPOFF; 7.1173 + 7.1174 + /* wait for any unplug function to finish */ 7.1175 + down_write(&swap_unplug_sem); 7.1176 + up_write(&swap_unplug_sem); 7.1177 + 7.1178 + if (err) { 7.1179 + /* re-insert swap space back into swap_list */ 7.1180 + swap_list_lock(); 7.1181 + for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) 7.1182 + if (p->prio >= swap_info[i].prio) 7.1183 + break; 7.1184 + p->next = i; 7.1185 + if (prev < 0) 7.1186 + swap_list.head = swap_list.next = p - swap_info; 7.1187 + else 7.1188 + swap_info[prev].next = p - swap_info; 7.1189 + nr_swap_pages += p->pages; 7.1190 + total_swap_pages += p->pages; 7.1191 + p->flags |= SWP_WRITEOK; 7.1192 + swap_list_unlock(); 7.1193 + goto out_dput; 7.1194 + } 7.1195 + down(&swapon_sem); 7.1196 + swap_list_lock(); 7.1197 + drain_mmlist(); 7.1198 + swap_device_lock(p); 7.1199 + swap_file = p->swap_file; 7.1200 + p->swap_file = NULL; 7.1201 + p->max = 0; 7.1202 + swap_map = p->swap_map; 7.1203 + p->swap_map = NULL; 7.1204 + p->flags = 0; 7.1205 + destroy_swap_extents(p); 7.1206 + swap_device_unlock(p); 7.1207 + swap_list_unlock(); 7.1208 + up(&swapon_sem); 7.1209 + vfree(swap_map); 7.1210 + inode = mapping->host; 7.1211 + if (S_ISBLK(inode->i_mode)) { 7.1212 + struct block_device *bdev = I_BDEV(inode); 7.1213 + set_blocksize(bdev, p->old_block_size); 7.1214 + bd_release(bdev); 7.1215 + } else { 7.1216 + down(&inode->i_sem); 7.1217 + inode->i_flags &= ~S_SWAPFILE; 7.1218 + up(&inode->i_sem); 7.1219 + } 7.1220 + filp_close(swap_file, NULL); 7.1221 + err = 0; 7.1222 + 7.1223 +out_dput: 7.1224 + filp_close(victim, NULL); 7.1225 +out: 7.1226 + return err; 7.1227 +} 7.1228 + 7.1229 +#ifdef CONFIG_PROC_FS 7.1230 +/* iterator */ 7.1231 +static void *swap_start(struct seq_file *swap, loff_t *pos) 7.1232 +{ 7.1233 + struct swap_info_struct *ptr = swap_info; 7.1234 + int i; 7.1235 + loff_t l = *pos; 7.1236 + 7.1237 + down(&swapon_sem); 7.1238 + 7.1239 + for (i = 0; i < nr_swapfiles; i++, ptr++) { 7.1240 + if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 7.1241 + continue; 7.1242 + if 
(!l--) 7.1243 + return ptr; 7.1244 + } 7.1245 + 7.1246 + return NULL; 7.1247 +} 7.1248 + 7.1249 +static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 7.1250 +{ 7.1251 + struct swap_info_struct *ptr = v; 7.1252 + struct swap_info_struct *endptr = swap_info + nr_swapfiles; 7.1253 + 7.1254 + for (++ptr; ptr < endptr; ptr++) { 7.1255 + if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 7.1256 + continue; 7.1257 + ++*pos; 7.1258 + return ptr; 7.1259 + } 7.1260 + 7.1261 + return NULL; 7.1262 +} 7.1263 + 7.1264 +static void swap_stop(struct seq_file *swap, void *v) 7.1265 +{ 7.1266 + up(&swapon_sem); 7.1267 +} 7.1268 + 7.1269 +static int swap_show(struct seq_file *swap, void *v) 7.1270 +{ 7.1271 + struct swap_info_struct *ptr = v; 7.1272 + struct file *file; 7.1273 + int len; 7.1274 + 7.1275 + if (v == swap_info) 7.1276 + seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 7.1277 + 7.1278 + file = ptr->swap_file; 7.1279 + len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); 7.1280 + seq_printf(swap, "%*s%s\t%d\t%ld\t%d\n", 7.1281 + len < 40 ? 40 - len : 1, " ", 7.1282 + S_ISBLK(file->f_dentry->d_inode->i_mode) ? 7.1283 + "partition" : "file\t", 7.1284 + ptr->pages << (PAGE_SHIFT - 10), 7.1285 + ptr->inuse_pages << (PAGE_SHIFT - 10), 7.1286 + ptr->prio); 7.1287 + return 0; 7.1288 +} 7.1289 + 7.1290 +static struct seq_operations swaps_op = { 7.1291 + .start = swap_start, 7.1292 + .next = swap_next, 7.1293 + .stop = swap_stop, 7.1294 + .show = swap_show 7.1295 +}; 7.1296 + 7.1297 +static int swaps_open(struct inode *inode, struct file *file) 7.1298 +{ 7.1299 + return seq_open(file, &swaps_op); 7.1300 +} 7.1301 + 7.1302 +static struct file_operations proc_swaps_operations = { 7.1303 + .open = swaps_open, 7.1304 + .read = seq_read, 7.1305 + .llseek = seq_lseek, 7.1306 + .release = seq_release, 7.1307 +}; 7.1308 + 7.1309 +static int __init procswaps_init(void) 7.1310 +{ 7.1311 + struct proc_dir_entry *entry; 7.1312 + 7.1313 + entry = create_proc_entry("swaps", 0, NULL); 7.1314 + if (entry) 7.1315 + entry->proc_fops = &proc_swaps_operations; 7.1316 + return 0; 7.1317 +} 7.1318 +__initcall(procswaps_init); 7.1319 +#endif /* CONFIG_PROC_FS */ 7.1320 + 7.1321 +/* 7.1322 + * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 7.1323 + * 7.1324 + * The swapon system call 7.1325 + */ 7.1326 +asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) 7.1327 +{ 7.1328 + struct swap_info_struct * p; 7.1329 + char *name = NULL; 7.1330 + struct block_device *bdev = NULL; 7.1331 + struct file *swap_file = NULL; 7.1332 + struct address_space *mapping; 7.1333 + unsigned int type; 7.1334 + int i, prev; 7.1335 + int error; 7.1336 + static int least_priority; 7.1337 + union swap_header *swap_header = NULL; 7.1338 + int swap_header_version; 7.1339 + int nr_good_pages = 0; 7.1340 + unsigned long maxpages = 1; 7.1341 + int swapfilesize; 7.1342 + unsigned short *swap_map; 7.1343 + struct page *page = NULL; 7.1344 + struct inode *inode = NULL; 7.1345 + int did_down = 0; 7.1346 + 7.1347 + if (!capable(CAP_SYS_ADMIN)) 7.1348 + return -EPERM; 7.1349 + swap_list_lock(); 7.1350 + p = swap_info; 7.1351 + for (type = 0 ; type < nr_swapfiles ; type++,p++) 7.1352 + if (!(p->flags & SWP_USED)) 7.1353 + break; 7.1354 + error = -EPERM; 7.1355 + /* 7.1356 + * Test if adding another swap device is possible. 
There are 7.1357 + * two limiting factors: 1) the number of bits for the swap 7.1358 + * type swp_entry_t definition and 2) the number of bits for 7.1359 + * the swap type in the swap ptes as defined by the different 7.1360 + * architectures. To honor both limitations a swap entry 7.1361 + * with swap offset 0 and swap type ~0UL is created, encoded 7.1362 + * to a swap pte, decoded to a swp_entry_t again and finally 7.1363 + * the swap type part is extracted. This will mask all bits 7.1364 + * from the initial ~0UL that can't be encoded in either the 7.1365 + * swp_entry_t or the architecture definition of a swap pte. 7.1366 + */ 7.1367 + if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { 7.1368 + swap_list_unlock(); 7.1369 + goto out; 7.1370 + } 7.1371 + if (type >= nr_swapfiles) 7.1372 + nr_swapfiles = type+1; 7.1373 + INIT_LIST_HEAD(&p->extent_list); 7.1374 + p->flags = SWP_USED; 7.1375 + p->nr_extents = 0; 7.1376 + p->swap_file = NULL; 7.1377 + p->old_block_size = 0; 7.1378 + p->swap_map = NULL; 7.1379 + p->lowest_bit = 0; 7.1380 + p->highest_bit = 0; 7.1381 + p->cluster_nr = 0; 7.1382 + p->inuse_pages = 0; 7.1383 + spin_lock_init(&p->sdev_lock); 7.1384 + p->next = -1; 7.1385 + if (swap_flags & SWAP_FLAG_PREFER) { 7.1386 + p->prio = 7.1387 + (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; 7.1388 + } else { 7.1389 + p->prio = --least_priority; 7.1390 + } 7.1391 + swap_list_unlock(); 7.1392 + name = getname(specialfile); 7.1393 + error = PTR_ERR(name); 7.1394 + if (IS_ERR(name)) { 7.1395 + name = NULL; 7.1396 + goto bad_swap_2; 7.1397 + } 7.1398 + swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); 7.1399 + error = PTR_ERR(swap_file); 7.1400 + if (IS_ERR(swap_file)) { 7.1401 + swap_file = NULL; 7.1402 + goto bad_swap_2; 7.1403 + } 7.1404 + 7.1405 + p->swap_file = swap_file; 7.1406 + mapping = swap_file->f_mapping; 7.1407 + inode = mapping->host; 7.1408 + 7.1409 + error = -EBUSY; 7.1410 + for (i = 0; i < nr_swapfiles; i++) { 7.1411 + struct swap_info_struct *q = &swap_info[i]; 7.1412 + 7.1413 + if (i == type || !q->swap_file) 7.1414 + continue; 7.1415 + if (mapping == q->swap_file->f_mapping) 7.1416 + goto bad_swap; 7.1417 + } 7.1418 + 7.1419 + error = -EINVAL; 7.1420 + if (S_ISBLK(inode->i_mode)) { 7.1421 + bdev = I_BDEV(inode); 7.1422 + error = bd_claim(bdev, sys_swapon); 7.1423 + if (error < 0) { 7.1424 + bdev = NULL; 7.1425 + goto bad_swap; 7.1426 + } 7.1427 + p->old_block_size = block_size(bdev); 7.1428 + error = set_blocksize(bdev, PAGE_SIZE); 7.1429 + if (error < 0) 7.1430 + goto bad_swap; 7.1431 + p->bdev = bdev; 7.1432 + } else if (S_ISREG(inode->i_mode)) { 7.1433 + p->bdev = inode->i_sb->s_bdev; 7.1434 + down(&inode->i_sem); 7.1435 + did_down = 1; 7.1436 + if (IS_SWAPFILE(inode)) { 7.1437 + error = -EBUSY; 7.1438 + goto bad_swap; 7.1439 + } 7.1440 + } else { 7.1441 + goto bad_swap; 7.1442 + } 7.1443 + 7.1444 + swapfilesize = i_size_read(inode) >> PAGE_SHIFT; 7.1445 + 7.1446 + /* 7.1447 + * Read the swap header. 
7.1448 + */ 7.1449 + if (!mapping->a_ops->readpage) { 7.1450 + error = -EINVAL; 7.1451 + goto bad_swap; 7.1452 + } 7.1453 + page = read_cache_page(mapping, 0, 7.1454 + (filler_t *)mapping->a_ops->readpage, swap_file); 7.1455 + if (IS_ERR(page)) { 7.1456 + error = PTR_ERR(page); 7.1457 + goto bad_swap; 7.1458 + } 7.1459 + wait_on_page_locked(page); 7.1460 + if (!PageUptodate(page)) 7.1461 + goto bad_swap; 7.1462 + kmap(page); 7.1463 + swap_header = page_address(page); 7.1464 + 7.1465 + if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) 7.1466 + swap_header_version = 1; 7.1467 + else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) 7.1468 + swap_header_version = 2; 7.1469 + else { 7.1470 + printk("Unable to find swap-space signature\n"); 7.1471 + error = -EINVAL; 7.1472 + goto bad_swap; 7.1473 + } 7.1474 + 7.1475 + switch (swap_header_version) { 7.1476 + case 1: 7.1477 + printk(KERN_ERR "version 0 swap is no longer supported. " 7.1478 + "Use mkswap -v1 %s\n", name); 7.1479 + error = -EINVAL; 7.1480 + goto bad_swap; 7.1481 + case 2: 7.1482 + /* Check the swap header's sub-version and the size of 7.1483 + the swap file and bad block lists */ 7.1484 + if (swap_header->info.version != 1) { 7.1485 + printk(KERN_WARNING 7.1486 + "Unable to handle swap header version %d\n", 7.1487 + swap_header->info.version); 7.1488 + error = -EINVAL; 7.1489 + goto bad_swap; 7.1490 + } 7.1491 + 7.1492 + p->lowest_bit = 1; 7.1493 + /* 7.1494 + * Find out how many pages are allowed for a single swap 7.1495 + * device. There are two limiting factors: 1) the number of 7.1496 + * bits for the swap offset in the swp_entry_t type and 7.1497 + * 2) the number of bits in the a swap pte as defined by 7.1498 + * the different architectures. In order to find the 7.1499 + * largest possible bit mask a swap entry with swap type 0 7.1500 + * and swap offset ~0UL is created, encoded to a swap pte, 7.1501 + * decoded to a swp_entry_t again and finally the swap 7.1502 + * offset is extracted. This will mask all the bits from 7.1503 + * the initial ~0UL mask that can't be encoded in either 7.1504 + * the swp_entry_t or the architecture definition of a 7.1505 + * swap pte. 
7.1506 + */ 7.1507 + maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; 7.1508 + if (maxpages > swap_header->info.last_page) 7.1509 + maxpages = swap_header->info.last_page; 7.1510 + p->highest_bit = maxpages - 1; 7.1511 + 7.1512 + error = -EINVAL; 7.1513 + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 7.1514 + goto bad_swap; 7.1515 + 7.1516 + /* OK, set up the swap map and apply the bad block list */ 7.1517 + if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 7.1518 + error = -ENOMEM; 7.1519 + goto bad_swap; 7.1520 + } 7.1521 + 7.1522 + error = 0; 7.1523 + memset(p->swap_map, 0, maxpages * sizeof(short)); 7.1524 + for (i=0; i<swap_header->info.nr_badpages; i++) { 7.1525 + int page = swap_header->info.badpages[i]; 7.1526 + if (page <= 0 || page >= swap_header->info.last_page) 7.1527 + error = -EINVAL; 7.1528 + else 7.1529 + p->swap_map[page] = SWAP_MAP_BAD; 7.1530 + } 7.1531 + nr_good_pages = swap_header->info.last_page - 7.1532 + swap_header->info.nr_badpages - 7.1533 + 1 /* header page */; 7.1534 + if (error) 7.1535 + goto bad_swap; 7.1536 + } 7.1537 + 7.1538 + if (swapfilesize && maxpages > swapfilesize) { 7.1539 + printk(KERN_WARNING 7.1540 + "Swap area shorter than signature indicates\n"); 7.1541 + error = -EINVAL; 7.1542 + goto bad_swap; 7.1543 + } 7.1544 + if (!nr_good_pages) { 7.1545 + printk(KERN_WARNING "Empty swap-file\n"); 7.1546 + error = -EINVAL; 7.1547 + goto bad_swap; 7.1548 + } 7.1549 + p->swap_map[0] = SWAP_MAP_BAD; 7.1550 + p->max = maxpages; 7.1551 + p->pages = nr_good_pages; 7.1552 + 7.1553 + error = setup_swap_extents(p); 7.1554 + if (error) 7.1555 + goto bad_swap; 7.1556 + 7.1557 + down(&swapon_sem); 7.1558 + swap_list_lock(); 7.1559 + swap_device_lock(p); 7.1560 + p->flags = SWP_ACTIVE; 7.1561 + nr_swap_pages += nr_good_pages; 7.1562 + total_swap_pages += nr_good_pages; 7.1563 + printk(KERN_INFO "Adding %dk swap on %s. 
Priority:%d extents:%d\n", 7.1564 + nr_good_pages<<(PAGE_SHIFT-10), name, 7.1565 + p->prio, p->nr_extents); 7.1566 + 7.1567 + /* insert swap space into swap_list: */ 7.1568 + prev = -1; 7.1569 + for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 7.1570 + if (p->prio >= swap_info[i].prio) { 7.1571 + break; 7.1572 + } 7.1573 + prev = i; 7.1574 + } 7.1575 + p->next = i; 7.1576 + if (prev < 0) { 7.1577 + swap_list.head = swap_list.next = p - swap_info; 7.1578 + } else { 7.1579 + swap_info[prev].next = p - swap_info; 7.1580 + } 7.1581 + swap_device_unlock(p); 7.1582 + swap_list_unlock(); 7.1583 + up(&swapon_sem); 7.1584 + error = 0; 7.1585 + goto out; 7.1586 +bad_swap: 7.1587 + if (bdev) { 7.1588 + set_blocksize(bdev, p->old_block_size); 7.1589 + bd_release(bdev); 7.1590 + } 7.1591 +bad_swap_2: 7.1592 + swap_list_lock(); 7.1593 + swap_map = p->swap_map; 7.1594 + p->swap_file = NULL; 7.1595 + p->swap_map = NULL; 7.1596 + p->flags = 0; 7.1597 + if (!(swap_flags & SWAP_FLAG_PREFER)) 7.1598 + ++least_priority; 7.1599 + swap_list_unlock(); 7.1600 + destroy_swap_extents(p); 7.1601 + if (swap_map) 7.1602 + vfree(swap_map); 7.1603 + if (swap_file) 7.1604 + filp_close(swap_file, NULL); 7.1605 +out: 7.1606 + if (page && !IS_ERR(page)) { 7.1607 + kunmap(page); 7.1608 + page_cache_release(page); 7.1609 + } 7.1610 + if (name) 7.1611 + putname(name); 7.1612 + if (did_down) { 7.1613 + if (!error) 7.1614 + inode->i_flags |= S_SWAPFILE; 7.1615 + up(&inode->i_sem); 7.1616 + } 7.1617 + return error; 7.1618 +} 7.1619 + 7.1620 +void si_swapinfo(struct sysinfo *val) 7.1621 +{ 7.1622 + unsigned int i; 7.1623 + unsigned long nr_to_be_unused = 0; 7.1624 + 7.1625 + swap_list_lock(); 7.1626 + for (i = 0; i < nr_swapfiles; i++) { 7.1627 + if (!(swap_info[i].flags & SWP_USED) || 7.1628 + (swap_info[i].flags & SWP_WRITEOK)) 7.1629 + continue; 7.1630 + nr_to_be_unused += swap_info[i].inuse_pages; 7.1631 + } 7.1632 + val->freeswap = nr_swap_pages + nr_to_be_unused; 7.1633 + val->totalswap = total_swap_pages + nr_to_be_unused; 7.1634 + swap_list_unlock(); 7.1635 +} 7.1636 + 7.1637 +/* 7.1638 + * Verify that a swap entry is valid and increment its swap map count. 7.1639 + * 7.1640 + * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as 7.1641 + * "permanent", but will be reclaimed by the next swapoff. 7.1642 + */ 7.1643 +int swap_duplicate(swp_entry_t entry) 7.1644 +{ 7.1645 + struct swap_info_struct * p; 7.1646 + unsigned long offset, type; 7.1647 + int result = 0; 7.1648 + 7.1649 + type = swp_type(entry); 7.1650 + if (type >= nr_swapfiles) 7.1651 + goto bad_file; 7.1652 + p = type + swap_info; 7.1653 + offset = swp_offset(entry); 7.1654 + 7.1655 + swap_device_lock(p); 7.1656 + if (offset < p->max && p->swap_map[offset]) { 7.1657 + if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { 7.1658 + p->swap_map[offset]++; 7.1659 + result = 1; 7.1660 + } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { 7.1661 + if (swap_overflow++ < 5) 7.1662 + printk(KERN_WARNING "swap_dup: swap entry overflow\n"); 7.1663 + p->swap_map[offset] = SWAP_MAP_MAX; 7.1664 + result = 1; 7.1665 + } 7.1666 + } 7.1667 + swap_device_unlock(p); 7.1668 +out: 7.1669 + return result; 7.1670 + 7.1671 +bad_file: 7.1672 + printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 7.1673 + goto out; 7.1674 +} 7.1675 + 7.1676 +struct swap_info_struct * 7.1677 +get_swap_info_struct(unsigned type) 7.1678 +{ 7.1679 + return &swap_info[type]; 7.1680 +} 7.1681 + 7.1682 +/* 7.1683 + * swap_device_lock prevents swap_map being freed. 
Don't grab an extra 7.1684 + * reference on the swaphandle, it doesn't matter if it becomes unused. 7.1685 + */ 7.1686 +int valid_swaphandles(swp_entry_t entry, unsigned long *offset) 7.1687 +{ 7.1688 + int ret = 0, i = 1 << page_cluster; 7.1689 + unsigned long toff; 7.1690 + struct swap_info_struct *swapdev = swp_type(entry) + swap_info; 7.1691 + 7.1692 + if (!page_cluster) /* no readahead */ 7.1693 + return 0; 7.1694 + toff = (swp_offset(entry) >> page_cluster) << page_cluster; 7.1695 + if (!toff) /* first page is swap header */ 7.1696 + toff++, i--; 7.1697 + *offset = toff; 7.1698 + 7.1699 + swap_device_lock(swapdev); 7.1700 + do { 7.1701 + /* Don't read-ahead past the end of the swap area */ 7.1702 + if (toff >= swapdev->max) 7.1703 + break; 7.1704 + /* Don't read in free or bad pages */ 7.1705 + if (!swapdev->swap_map[toff]) 7.1706 + break; 7.1707 + if (swapdev->swap_map[toff] == SWAP_MAP_BAD) 7.1708 + break; 7.1709 + toff++; 7.1710 + ret++; 7.1711 + } while (--i); 7.1712 + swap_device_unlock(swapdev); 7.1713 + return ret; 7.1714 +}
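
Editor's note: the swapon code above bounds both the swap type and the usable page count by round-tripping an all-ones value through the swap-pte encoding; whatever bits the architecture cannot store are masked away, and the decoded result is the limit. The standalone sketch below reproduces that trick with an invented layout (5-bit type, 24-bit offset kept in a software "pte"); all *_x/_X names and widths are assumptions for the example, not the i386/Xen encoding.

/* round_trip.c -- illustration of the limit-probing trick used twice in
 * sys_swapon above: encode an all-ones type (or offset), decode it again,
 * and whatever survives defines the limit.  Layout and names are made up.
 */
#include <stdio.h>

#define TYPE_BITS_X   5
#define OFFSET_BITS_X 24

typedef struct { unsigned long val; } swp_entry_x;   /* stand-in for swp_entry_t */
typedef struct { unsigned long pte; } swp_pte_x;     /* stand-in for a swap pte  */

static swp_entry_x mk_entry(unsigned long type, unsigned long offset)
{
    swp_entry_x e;
    e.val = (type << OFFSET_BITS_X) |
            (offset & ((1UL << OFFSET_BITS_X) - 1));
    return e;
}
static unsigned long entry_type(swp_entry_x e)   { return e.val >> OFFSET_BITS_X; }
static unsigned long entry_offset(swp_entry_x e) { return e.val & ((1UL << OFFSET_BITS_X) - 1); }

/* The "architecture" can only store TYPE_BITS_X + OFFSET_BITS_X bits. */
static swp_pte_x entry_to_pte(swp_entry_x e)
{
    swp_pte_x p;
    p.pte = e.val & ((1UL << (TYPE_BITS_X + OFFSET_BITS_X)) - 1);
    return p;
}
static swp_entry_x pte_to_entry(swp_pte_x p)
{
    swp_entry_x e;
    e.val = p.pte;
    return e;
}

int main(void)
{
    /* type limit: offset 0, type ~0UL -- mirrors the swapon type check   */
    unsigned long max_type = entry_type(pte_to_entry(entry_to_pte(mk_entry(~0UL, 0))));
    /* page limit: type 0, offset ~0UL, minus one -- mirrors maxpages     */
    unsigned long maxpages = entry_offset(pte_to_entry(entry_to_pte(mk_entry(0, ~0UL)))) - 1;

    printf("largest encodable swap type: %lu\n", max_type);   /* 31       */
    printf("largest usable page count : %lu\n", maxpages);    /* 16777214 */
    return 0;
}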
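
Editor's note: the case-2 branch above checks the "SWAPSPACE2" magic, the header sub-version, and the bad-page count, then seeds the swap map with SWAP_MAP_BAD slots and computes the number of good pages. The sketch below walks the same checks over a simplified header; the struct layout, the *_X constants, and the choice to reject immediately on an out-of-range bad page (the code above records an error and finishes the loop first) are assumptions of the example.

/* header_check.c -- simplified sketch of the version-2 swap header checks
 * and bad-block handling above.  Only the fields the checks need are
 * modelled; the real union swap_header is page-sized and laid out
 * differently.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_SWAP_BADPAGES_X 637
#define SWAP_MAP_BAD_X      0x8000

struct swap_header_x {
    char          magic[10];                       /* "SWAPSPACE2"            */
    unsigned int  version;                         /* sub-version, must be 1  */
    unsigned long last_page;                       /* last usable page index  */
    unsigned int  nr_badpages;
    unsigned long badpages[MAX_SWAP_BADPAGES_X];
};

/* Build a per-slot map from the header; NULL means the header was rejected. */
static unsigned short *build_swap_map(const struct swap_header_x *h,
                                      unsigned long maxpages,
                                      unsigned long *good_pages)
{
    unsigned short *map;
    unsigned int i;

    if (memcmp(h->magic, "SWAPSPACE2", 10) != 0)
        return NULL;                               /* no v2 signature         */
    if (h->version != 1)
        return NULL;                               /* unknown sub-version     */
    if (h->nr_badpages > MAX_SWAP_BADPAGES_X)
        return NULL;

    map = calloc(maxpages, sizeof(*map));
    if (!map)
        return NULL;

    for (i = 0; i < h->nr_badpages; i++) {
        unsigned long page = h->badpages[i];
        if (page == 0 || page >= h->last_page) {   /* out-of-range bad page   */
            free(map);
            return NULL;
        }
        map[page] = SWAP_MAP_BAD_X;
    }
    map[0] = SWAP_MAP_BAD_X;                       /* slot 0 holds the header */
    *good_pages = h->last_page - h->nr_badpages - 1;
    return map;
}

int main(void)
{
    struct swap_header_x h = { .version = 1, .last_page = 1024,
                               .nr_badpages = 2, .badpages = { 5, 17 } };
    unsigned long good = 0;
    unsigned short *map;

    memcpy(h.magic, "SWAPSPACE2", 10);
    map = build_swap_map(&h, 1024, &good);
    if (map) {
        printf("usable pages: %lu\n", good);       /* 1024 - 2 - 1 = 1021     */
        free(map);
    }
    return 0;
}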
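
Editor's note: the "insert swap space into swap_list" loop above keeps active devices on a singly linked list ordered by descending priority, using array indices as links and -1 as the terminator. Below is a minimal sketch of the same walk over an invented table; it tracks only a head index, whereas the code above also maintains swap_list.next.

/* prio_list.c -- sketch of the index-linked, priority-ordered insertion
 * used for swap_list above.  Entries live in a fixed array; "next" holds
 * the index of the next entry or -1, and higher priority sorts first.
 */
#include <stdio.h>

#define MAX_SWAPFILES_X 32

struct swap_ent_x {
    int prio;
    int next;                            /* index of next entry, or -1 */
};

static struct swap_ent_x table[MAX_SWAPFILES_X];
static int list_head = -1;

/* Insert entry 'idx' so the list stays sorted by descending priority. */
static void insert_by_prio(int idx)
{
    int i, prev = -1;

    for (i = list_head; i >= 0; i = table[i].next) {
        if (table[idx].prio >= table[i].prio)
            break;                       /* found our spot */
        prev = i;
    }
    table[idx].next = i;
    if (prev < 0)
        list_head = idx;                 /* new highest priority */
    else
        table[prev].next = idx;
}

int main(void)
{
    int i;

    table[0].prio = -1;                  /* first default-priority device  */
    table[1].prio = 10;                  /* SWAP_FLAG_PREFER-style device  */
    table[2].prio = -2;                  /* second default-priority device */

    insert_by_prio(0);
    insert_by_prio(1);
    insert_by_prio(2);

    for (i = list_head; i >= 0; i = table[i].next)
        printf("entry %d prio %d\n", i, table[i].prio);
    /* expected order: 1 (prio 10), 0 (prio -1), 2 (prio -2) */
    return 0;
}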
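
Editor's note: swap_duplicate above validates the entry and bumps the slot's use count, clamping it at SWAP_MAP_MAX so an overflowed slot simply becomes "permanent" until the next swapoff. A minimal sketch of that saturating increment follows; the *_X constants and names are assumptions picked for the example.

/* swap_dup.c -- sketch of the saturating per-slot count used by
 * swap_duplicate above.
 */
#include <stdio.h>

#define SWAP_MAP_MAX_X 0x7fff
#define SWAP_MAP_BAD_X 0x8000

/* Returns 1 if the slot's count was (logically) raised, 0 if the slot is
 * free, bad, or out of range.  Counts saturate at SWAP_MAP_MAX_X. */
static int dup_slot(unsigned short *map, unsigned long max, unsigned long off)
{
    if (off >= max || !map[off] || map[off] == SWAP_MAP_BAD_X)
        return 0;

    if (map[off] < SWAP_MAP_MAX_X - 1) {
        map[off]++;                      /* normal case */
        return 1;
    }
    if (map[off] <= SWAP_MAP_MAX_X) {
        map[off] = SWAP_MAP_MAX_X;       /* saturate: slot is now "permanent" */
        return 1;
    }
    return 0;
}

int main(void)
{
    unsigned short map[4] = { 0, 1, SWAP_MAP_MAX_X - 1, SWAP_MAP_BAD_X };
    unsigned long i;

    for (i = 0; i < 4; i++)
        printf("slot %lu: dup -> %d, count now 0x%x\n",
               i, dup_slot(map, 4, i), map[i]);
    return 0;
}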
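
Editor's note: valid_swaphandles above sizes a readahead window of 1 << page_cluster slots, aligns its start down to a cluster boundary, skips slot 0 (the header), and stops at the device end or at the first free or bad slot. The sketch below reproduces that window calculation against an in-memory map; names and constants are again invented for the example.

/* readahead.c -- sketch of the cluster-aligned readahead window used by
 * valid_swaphandles above.
 */
#include <stdio.h>

#define SWAP_MAP_BAD_X 0x8000

static int readahead_window(const unsigned short *map, unsigned long max,
                            int page_cluster, unsigned long entry_off,
                            unsigned long *start)
{
    int ret = 0, i = 1 << page_cluster;
    unsigned long toff;

    if (!page_cluster)                   /* readahead disabled */
        return 0;

    toff = (entry_off >> page_cluster) << page_cluster;  /* align down */
    if (!toff) {                         /* slot 0 is the swap header */
        toff++;
        i--;
    }
    *start = toff;

    do {
        if (toff >= max)                 /* past end of device */
            break;
        if (!map[toff] || map[toff] == SWAP_MAP_BAD_X)
            break;                       /* free or bad slot: stop the window */
        toff++;
        ret++;
    } while (--i);

    return ret;
}

int main(void)
{
    /* header slot, three in-use slots, a free hole, then more in-use slots */
    unsigned short map[16] = { SWAP_MAP_BAD_X, 1, 1, 1, 0, 1, 1, 1 };
    unsigned long start = 0;
    int n = readahead_window(map, 16, 3, 2, &start);  /* page_cluster = 3 */

    printf("read %d page(s) starting at slot %lu\n", n, start);  /* 3 at slot 1 */
    return 0;
}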