ia64/xen-unstable

changeset 4157:2c4ca5aad6c4

bitkeeper revision 1.1245 (4236f622mMlu4s1f6bmCbV2qW4kvjw)

added 2.4 batch mode

Signed-off-by: michael.fetterman@cl.cam.ac.uk
author rneugeba@wyvis.research.intel-research.net
date Tue Mar 15 14:50:10 2005 +0000 (2005-03-15)
parents 47e1cb8a3d38
children e379e05dfb91 0cf318b324fb
files .rootkeys linux-2.6.10-xen-sparse/arch/xen/i386/mm/fault.c linux-2.6.10-xen-sparse/fs/exec.c linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h linux-2.6.10-xen-sparse/mm/highmem.c linux-2.6.10-xen-sparse/mm/memory.c linux-2.6.10-xen-sparse/mm/swapfile.c
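For orientation: the substance of this change is the new CONFIG_XEN_BATCH_MODE2 variant selected in pgtable-2level.h. In that mode set_pte() only queues the PTE update, and the call sites touched below (do_page_fault(), install_arg_page(), kmap_high()'s map_new_virtual()) flush the queue explicitly. The sketch that follows is illustrative only, not the actual Xen implementation: set_pte_batched() and flush_page_update_queue() are names taken from the patch, while the queue layout and the mode macros are hypothetical stand-ins.

/* Minimal sketch of the two batching strategies, assuming a simple
 * fixed-size queue of pending PTE writes shared with the hypervisor. */
struct mmu_update { unsigned long ptr, val; };
static struct mmu_update update_queue[64];
static int update_idx;

static void flush_page_update_queue(void)
{
	if (!update_idx)
		return;
	/* In the real code a single mmu_update hypercall applies every
	 * queued entry here; this sketch just drops them. */
	update_idx = 0;
}

static void set_pte_batched(unsigned long *pteptr, unsigned long pteval)
{
	if (update_idx == 64)		/* queue full: flush before adding */
		flush_page_update_queue();
	/* Record the update instead of writing the PTE directly. */
	update_queue[update_idx].ptr = (unsigned long)pteptr;
	update_queue[update_idx].val = pteval;
	update_idx++;
}

/* BATCH_MODE1: queue, then flush immediately -- one flush per set_pte(). */
#define set_pte_mode1(p, v) \
	do { set_pte_batched(p, v); flush_page_update_queue(); } while (0)

/* BATCH_MODE2 (this changeset): queue only; callers flush explicitly,
 * so many PTE writes are amortized over one flush. */
#define set_pte_mode2(p, v) set_pte_batched(p, v)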
line diff
     1.1 --- a/.rootkeys	Thu Mar 10 18:12:10 2005 +0000
     1.2 +++ b/.rootkeys	Tue Mar 15 14:50:10 2005 +0000
     1.3 @@ -230,6 +230,7 @@ 41ee5e8bSs3BGC7yegM_ek2Tn0Ahvw linux-2.6
     1.4  41ee5e8bglvqKvZSY5uJ5JGQejEwyQ linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c
     1.5  41ee5e8ckZ9xVNvu9NHIZDK7JqApmQ linux-2.6.10-xen-sparse/drivers/xen/usbfront/usbfront.c
     1.6  41ee5e8ck9scpGirfqEZRARbGDyTXA linux-2.6.10-xen-sparse/drivers/xen/usbfront/xhci.h
     1.7 +4236f620IqJ4VZVDPfMJzrpFrio8Sw linux-2.6.10-xen-sparse/fs/exec.c
     1.8  412f47e4RKD-R5IS5gEXvcT8L4v8gA linux-2.6.10-xen-sparse/include/asm-generic/pgtable.h
     1.9  40f56239YAjS52QG2FIAQpHDZAdGHg linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/desc.h
    1.10  4107adf1E5O4ztGHNGMzCCNhcvqNow linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h
    1.11 @@ -274,8 +275,10 @@ 419dfc609zbti8rqL60tL2dHXQ_rvQ linux-2.6
    1.12  4124f66f4NaKNa0xPiGGykn9QaZk3w linux-2.6.10-xen-sparse/include/linux/skbuff.h
    1.13  419dfc6awx7w88wk6cG9P3mPidX6LQ linux-2.6.10-xen-sparse/kernel/irq/manage.c
    1.14  40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.10-xen-sparse/mkbuildtree
    1.15 +4236f620IaM-42pgVYuNGF4cFrttbw linux-2.6.10-xen-sparse/mm/highmem.c
    1.16  412f46c0LJuKAgSPGoC0Z1DEkLfuLA linux-2.6.10-xen-sparse/mm/memory.c
    1.17  410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.10-xen-sparse/mm/page_alloc.c
    1.18 +4236f620F2ZXlYSPUkwtN85tZMqDFQ linux-2.6.10-xen-sparse/mm/swapfile.c
    1.19  41505c572m-s9ATiO1LiD1GPznTTIg linux-2.6.10-xen-sparse/net/core/skbuff.c
    1.20  413cb1e4zst25MDYjg63Y-NGC5_pLg netbsd-2.0-xen-sparse/Makefile
    1.21  413cb1e5c_Mkxf_X0zimEhTKI_l4DA netbsd-2.0-xen-sparse/mkbuildtree
     2.1 --- a/linux-2.6.10-xen-sparse/arch/xen/i386/mm/fault.c	Thu Mar 10 18:12:10 2005 +0000
     2.2 +++ b/linux-2.6.10-xen-sparse/arch/xen/i386/mm/fault.c	Tue Mar 15 14:50:10 2005 +0000
     2.3 @@ -231,6 +231,12 @@ fastcall void do_page_fault(struct pt_re
     2.4  	error_code |= (regs->xcs & 2) << 1;
     2.5  	if (regs->eflags & X86_EFLAGS_VM)
     2.6  		error_code |= 4;
     2.7 +
     2.8 +#ifdef CONFIG_XEN_BATCH_MODE2
     2.9 +	/* ensure all updates have completed */
    2.10 +	flush_page_update_queue();
    2.11 +#endif
    2.12 +
    2.13  		
    2.14   	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
    2.15   					SIGSEGV) == NOTIFY_STOP)
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/linux-2.6.10-xen-sparse/fs/exec.c	Tue Mar 15 14:50:10 2005 +0000
     3.3 @@ -0,0 +1,1432 @@
     3.4 +/*
     3.5 + *  linux/fs/exec.c
     3.6 + *
     3.7 + *  Copyright (C) 1991, 1992  Linus Torvalds
     3.8 + */
     3.9 +
    3.10 +/*
    3.11 + * #!-checking implemented by tytso.
    3.12 + */
    3.13 +/*
    3.14 + * Demand-loading implemented 01.12.91 - no need to read anything but
    3.15 + * the header into memory. The inode of the executable is put into
    3.16 + * "current->executable", and page faults do the actual loading. Clean.
    3.17 + *
    3.18 + * Once more I can proudly say that linux stood up to being changed: it
    3.19 + * was less than 2 hours work to get demand-loading completely implemented.
    3.20 + *
    3.21 + * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
    3.22 + * current->executable is only used by the procfs.  This allows a dispatch
    3.23 + * table to check for several different types of binary formats.  We keep
    3.24 + * trying until we recognize the file or we run out of supported binary
    3.25 + * formats. 
    3.26 + */
    3.27 +
    3.28 +#include <linux/config.h>
    3.29 +#include <linux/slab.h>
    3.30 +#include <linux/file.h>
    3.31 +#include <linux/mman.h>
    3.32 +#include <linux/a.out.h>
    3.33 +#include <linux/stat.h>
    3.34 +#include <linux/fcntl.h>
    3.35 +#include <linux/smp_lock.h>
    3.36 +#include <linux/init.h>
    3.37 +#include <linux/pagemap.h>
    3.38 +#include <linux/highmem.h>
    3.39 +#include <linux/spinlock.h>
    3.40 +#include <linux/key.h>
    3.41 +#include <linux/personality.h>
    3.42 +#include <linux/binfmts.h>
    3.43 +#include <linux/swap.h>
    3.44 +#include <linux/utsname.h>
    3.45 +#include <linux/module.h>
    3.46 +#include <linux/namei.h>
    3.47 +#include <linux/proc_fs.h>
    3.48 +#include <linux/ptrace.h>
    3.49 +#include <linux/mount.h>
    3.50 +#include <linux/security.h>
    3.51 +#include <linux/syscalls.h>
    3.52 +#include <linux/rmap.h>
    3.53 +
    3.54 +#include <asm/uaccess.h>
    3.55 +#include <asm/mmu_context.h>
    3.56 +
    3.57 +#ifdef CONFIG_KMOD
    3.58 +#include <linux/kmod.h>
    3.59 +#endif
    3.60 +
    3.61 +int core_uses_pid;
    3.62 +char core_pattern[65] = "core";
    3.63 +/* The maximal length of core_pattern is also specified in sysctl.c */
    3.64 +
    3.65 +static struct linux_binfmt *formats;
    3.66 +static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED;
    3.67 +
    3.68 +int register_binfmt(struct linux_binfmt * fmt)
    3.69 +{
    3.70 +	struct linux_binfmt ** tmp = &formats;
    3.71 +
    3.72 +	if (!fmt)
    3.73 +		return -EINVAL;
    3.74 +	if (fmt->next)
    3.75 +		return -EBUSY;
    3.76 +	write_lock(&binfmt_lock);
    3.77 +	while (*tmp) {
    3.78 +		if (fmt == *tmp) {
    3.79 +			write_unlock(&binfmt_lock);
    3.80 +			return -EBUSY;
    3.81 +		}
    3.82 +		tmp = &(*tmp)->next;
    3.83 +	}
    3.84 +	fmt->next = formats;
    3.85 +	formats = fmt;
    3.86 +	write_unlock(&binfmt_lock);
    3.87 +	return 0;	
    3.88 +}
    3.89 +
    3.90 +EXPORT_SYMBOL(register_binfmt);
    3.91 +
    3.92 +int unregister_binfmt(struct linux_binfmt * fmt)
    3.93 +{
    3.94 +	struct linux_binfmt ** tmp = &formats;
    3.95 +
    3.96 +	write_lock(&binfmt_lock);
    3.97 +	while (*tmp) {
    3.98 +		if (fmt == *tmp) {
    3.99 +			*tmp = fmt->next;
   3.100 +			write_unlock(&binfmt_lock);
   3.101 +			return 0;
   3.102 +		}
   3.103 +		tmp = &(*tmp)->next;
   3.104 +	}
   3.105 +	write_unlock(&binfmt_lock);
   3.106 +	return -EINVAL;
   3.107 +}
   3.108 +
   3.109 +EXPORT_SYMBOL(unregister_binfmt);
   3.110 +
   3.111 +static inline void put_binfmt(struct linux_binfmt * fmt)
   3.112 +{
   3.113 +	module_put(fmt->module);
   3.114 +}
   3.115 +
   3.116 +/*
   3.117 + * Note that a shared library must be both readable and executable due to
   3.118 + * security reasons.
   3.119 + *
   3.120 + * Also note that we take the address to load from from the file itself.
   3.121 + */
   3.122 +asmlinkage long sys_uselib(const char __user * library)
   3.123 +{
   3.124 +	struct file * file;
   3.125 +	struct nameidata nd;
   3.126 +	int error;
   3.127 +
   3.128 +	nd.intent.open.flags = FMODE_READ;
   3.129 +	error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
   3.130 +	if (error)
   3.131 +		goto out;
   3.132 +
   3.133 +	error = -EINVAL;
   3.134 +	if (!S_ISREG(nd.dentry->d_inode->i_mode))
   3.135 +		goto exit;
   3.136 +
   3.137 +	error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC, &nd);
   3.138 +	if (error)
   3.139 +		goto exit;
   3.140 +
   3.141 +	file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
   3.142 +	error = PTR_ERR(file);
   3.143 +	if (IS_ERR(file))
   3.144 +		goto out;
   3.145 +
   3.146 +	error = -ENOEXEC;
   3.147 +	if(file->f_op) {
   3.148 +		struct linux_binfmt * fmt;
   3.149 +
   3.150 +		read_lock(&binfmt_lock);
   3.151 +		for (fmt = formats ; fmt ; fmt = fmt->next) {
   3.152 +			if (!fmt->load_shlib)
   3.153 +				continue;
   3.154 +			if (!try_module_get(fmt->module))
   3.155 +				continue;
   3.156 +			read_unlock(&binfmt_lock);
   3.157 +			error = fmt->load_shlib(file);
   3.158 +			read_lock(&binfmt_lock);
   3.159 +			put_binfmt(fmt);
   3.160 +			if (error != -ENOEXEC)
   3.161 +				break;
   3.162 +		}
   3.163 +		read_unlock(&binfmt_lock);
   3.164 +	}
   3.165 +	fput(file);
   3.166 +out:
   3.167 +  	return error;
   3.168 +exit:
   3.169 +	path_release(&nd);
   3.170 +	goto out;
   3.171 +}
   3.172 +
   3.173 +/*
   3.174 + * count() counts the number of strings in array ARGV.
   3.175 + */
   3.176 +static int count(char __user * __user * argv, int max)
   3.177 +{
   3.178 +	int i = 0;
   3.179 +
   3.180 +	if (argv != NULL) {
   3.181 +		for (;;) {
   3.182 +			char __user * p;
   3.183 +
   3.184 +			if (get_user(p, argv))
   3.185 +				return -EFAULT;
   3.186 +			if (!p)
   3.187 +				break;
   3.188 +			argv++;
   3.189 +			if(++i > max)
   3.190 +				return -E2BIG;
   3.191 +		}
   3.192 +	}
   3.193 +	return i;
   3.194 +}
   3.195 +
   3.196 +/*
   3.197 + * 'copy_strings()' copies argument/environment strings from user
   3.198 + * memory to free pages in kernel mem. These are in a format ready
   3.199 + * to be put directly into the top of new user memory.
   3.200 + */
   3.201 +int copy_strings(int argc,char __user * __user * argv, struct linux_binprm *bprm)
   3.202 +{
   3.203 +	struct page *kmapped_page = NULL;
   3.204 +	char *kaddr = NULL;
   3.205 +	int ret;
   3.206 +
   3.207 +	while (argc-- > 0) {
   3.208 +		char __user *str;
   3.209 +		int len;
   3.210 +		unsigned long pos;
   3.211 +
   3.212 +		if (get_user(str, argv+argc) ||
   3.213 +				!(len = strnlen_user(str, bprm->p))) {
   3.214 +			ret = -EFAULT;
   3.215 +			goto out;
   3.216 +		}
   3.217 +
   3.218 +		if (bprm->p < len)  {
   3.219 +			ret = -E2BIG;
   3.220 +			goto out;
   3.221 +		}
   3.222 +
   3.223 +		bprm->p -= len;
   3.224 +		/* XXX: add architecture specific overflow check here. */
   3.225 +		pos = bprm->p;
   3.226 +
   3.227 +		while (len > 0) {
   3.228 +			int i, new, err;
   3.229 +			int offset, bytes_to_copy;
   3.230 +			struct page *page;
   3.231 +
   3.232 +			offset = pos % PAGE_SIZE;
   3.233 +			i = pos/PAGE_SIZE;
   3.234 +			page = bprm->page[i];
   3.235 +			new = 0;
   3.236 +			if (!page) {
   3.237 +				page = alloc_page(GFP_HIGHUSER);
   3.238 +				bprm->page[i] = page;
   3.239 +				if (!page) {
   3.240 +					ret = -ENOMEM;
   3.241 +					goto out;
   3.242 +				}
   3.243 +				new = 1;
   3.244 +			}
   3.245 +
   3.246 +			if (page != kmapped_page) {
   3.247 +				if (kmapped_page)
   3.248 +					kunmap(kmapped_page);
   3.249 +				kmapped_page = page;
   3.250 +				kaddr = kmap(kmapped_page);
   3.251 +			}
   3.252 +			if (new && offset)
   3.253 +				memset(kaddr, 0, offset);
   3.254 +			bytes_to_copy = PAGE_SIZE - offset;
   3.255 +			if (bytes_to_copy > len) {
   3.256 +				bytes_to_copy = len;
   3.257 +				if (new)
   3.258 +					memset(kaddr+offset+len, 0,
   3.259 +						PAGE_SIZE-offset-len);
   3.260 +			}
   3.261 +			err = copy_from_user(kaddr+offset, str, bytes_to_copy);
   3.262 +			if (err) {
   3.263 +				ret = -EFAULT;
   3.264 +				goto out;
   3.265 +			}
   3.266 +
   3.267 +			pos += bytes_to_copy;
   3.268 +			str += bytes_to_copy;
   3.269 +			len -= bytes_to_copy;
   3.270 +		}
   3.271 +	}
   3.272 +	ret = 0;
   3.273 +out:
   3.274 +	if (kmapped_page)
   3.275 +		kunmap(kmapped_page);
   3.276 +	return ret;
   3.277 +}
   3.278 +
   3.279 +/*
   3.280 + * Like copy_strings, but get argv and its values from kernel memory.
   3.281 + */
   3.282 +int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
   3.283 +{
   3.284 +	int r;
   3.285 +	mm_segment_t oldfs = get_fs();
   3.286 +	set_fs(KERNEL_DS);
   3.287 +	r = copy_strings(argc, (char __user * __user *)argv, bprm);
   3.288 +	set_fs(oldfs);
   3.289 +	return r;
   3.290 +}
   3.291 +
   3.292 +EXPORT_SYMBOL(copy_strings_kernel);
   3.293 +
   3.294 +#ifdef CONFIG_MMU
   3.295 +/*
   3.296 + * This routine is used to map in a page into an address space: needed by
   3.297 + * execve() for the initial stack and environment pages.
   3.298 + *
   3.299 + * vma->vm_mm->mmap_sem is held for writing.
   3.300 + */
   3.301 +void install_arg_page(struct vm_area_struct *vma,
   3.302 +			struct page *page, unsigned long address)
   3.303 +{
   3.304 +	struct mm_struct *mm = vma->vm_mm;
   3.305 +	pgd_t * pgd;
   3.306 +	pmd_t * pmd;
   3.307 +	pte_t * pte;
   3.308 +
   3.309 +	if (unlikely(anon_vma_prepare(vma)))
   3.310 +		goto out_sig;
   3.311 +
   3.312 +	flush_dcache_page(page);
   3.313 +	pgd = pgd_offset(mm, address);
   3.314 +
   3.315 +	spin_lock(&mm->page_table_lock);
   3.316 +	pmd = pmd_alloc(mm, pgd, address);
   3.317 +	if (!pmd)
   3.318 +		goto out;
   3.319 +	pte = pte_alloc_map(mm, pmd, address);
   3.320 +	if (!pte)
   3.321 +		goto out;
   3.322 +	if (!pte_none(*pte)) {
   3.323 +		pte_unmap(pte);
   3.324 +		goto out;
   3.325 +	}
   3.326 +	mm->rss++;
   3.327 +	lru_cache_add_active(page);
   3.328 +	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(
   3.329 +					page, vma->vm_page_prot))));
   3.330 +#ifdef CONFIG_XEN_BATCH_MODE2
    3.331 +	XEN_flush_page_update_queue();
   3.332 +#endif
   3.333 +	page_add_anon_rmap(page, vma, address);
   3.334 +	pte_unmap(pte);
   3.335 +	spin_unlock(&mm->page_table_lock);
   3.336 +
   3.337 +	/* no need for flush_tlb */
   3.338 +	return;
   3.339 +out:
   3.340 +	spin_unlock(&mm->page_table_lock);
   3.341 +out_sig:
   3.342 +	__free_page(page);
   3.343 +	force_sig(SIGKILL, current);
   3.344 +}
   3.345 +
   3.346 +int setup_arg_pages(struct linux_binprm *bprm, int executable_stack)
   3.347 +{
   3.348 +	unsigned long stack_base;
   3.349 +	struct vm_area_struct *mpnt;
   3.350 +	struct mm_struct *mm = current->mm;
   3.351 +	int i, ret;
   3.352 +	long arg_size;
   3.353 +
   3.354 +#ifdef CONFIG_STACK_GROWSUP
   3.355 +	/* Move the argument and environment strings to the bottom of the
   3.356 +	 * stack space.
   3.357 +	 */
   3.358 +	int offset, j;
   3.359 +	char *to, *from;
   3.360 +
   3.361 +	/* Start by shifting all the pages down */
   3.362 +	i = 0;
   3.363 +	for (j = 0; j < MAX_ARG_PAGES; j++) {
   3.364 +		struct page *page = bprm->page[j];
   3.365 +		if (!page)
   3.366 +			continue;
   3.367 +		bprm->page[i++] = page;
   3.368 +	}
   3.369 +
   3.370 +	/* Now move them within their pages */
   3.371 +	offset = bprm->p % PAGE_SIZE;
   3.372 +	to = kmap(bprm->page[0]);
   3.373 +	for (j = 1; j < i; j++) {
   3.374 +		memmove(to, to + offset, PAGE_SIZE - offset);
   3.375 +		from = kmap(bprm->page[j]);
   3.376 +		memcpy(to + PAGE_SIZE - offset, from, offset);
   3.377 +		kunmap(bprm->page[j - 1]);
   3.378 +		to = from;
   3.379 +	}
   3.380 +	memmove(to, to + offset, PAGE_SIZE - offset);
   3.381 +	kunmap(bprm->page[j - 1]);
   3.382 +
   3.383 +	/* Adjust bprm->p to point to the end of the strings. */
   3.384 +	bprm->p = PAGE_SIZE * i - offset;
   3.385 +
   3.386 +	/* Limit stack size to 1GB */
   3.387 +	stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max;
   3.388 +	if (stack_base > (1 << 30))
   3.389 +		stack_base = 1 << 30;
   3.390 +	stack_base = PAGE_ALIGN(STACK_TOP - stack_base);
   3.391 +
   3.392 +	mm->arg_start = stack_base;
   3.393 +	arg_size = i << PAGE_SHIFT;
   3.394 +
   3.395 +	/* zero pages that were copied above */
   3.396 +	while (i < MAX_ARG_PAGES)
   3.397 +		bprm->page[i++] = NULL;
   3.398 +#else
   3.399 +	stack_base = STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE;
   3.400 +	mm->arg_start = bprm->p + stack_base;
   3.401 +	arg_size = STACK_TOP - (PAGE_MASK & (unsigned long) mm->arg_start);
   3.402 +#endif
   3.403 +
   3.404 +	bprm->p += stack_base;
   3.405 +	if (bprm->loader)
   3.406 +		bprm->loader += stack_base;
   3.407 +	bprm->exec += stack_base;
   3.408 +
   3.409 +	mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
   3.410 +	if (!mpnt)
   3.411 +		return -ENOMEM;
   3.412 +
   3.413 +	if (security_vm_enough_memory(arg_size >> PAGE_SHIFT)) {
   3.414 +		kmem_cache_free(vm_area_cachep, mpnt);
   3.415 +		return -ENOMEM;
   3.416 +	}
   3.417 +
   3.418 +	memset(mpnt, 0, sizeof(*mpnt));
   3.419 +
   3.420 +	down_write(&mm->mmap_sem);
   3.421 +	{
   3.422 +		mpnt->vm_mm = mm;
   3.423 +#ifdef CONFIG_STACK_GROWSUP
   3.424 +		mpnt->vm_start = stack_base;
   3.425 +		mpnt->vm_end = PAGE_MASK &
   3.426 +			(PAGE_SIZE - 1 + (unsigned long) bprm->p);
   3.427 +#else
   3.428 +		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
   3.429 +		mpnt->vm_end = STACK_TOP;
   3.430 +#endif
   3.431 +		/* Adjust stack execute permissions; explicitly enable
   3.432 +		 * for EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X
   3.433 +		 * and leave alone (arch default) otherwise. */
   3.434 +		if (unlikely(executable_stack == EXSTACK_ENABLE_X))
   3.435 +			mpnt->vm_flags = VM_STACK_FLAGS |  VM_EXEC;
   3.436 +		else if (executable_stack == EXSTACK_DISABLE_X)
   3.437 +			mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
   3.438 +		else
   3.439 +			mpnt->vm_flags = VM_STACK_FLAGS;
   3.440 +		mpnt->vm_flags |= mm->def_flags;
   3.441 +		mpnt->vm_page_prot = protection_map[mpnt->vm_flags & 0x7];
   3.442 +		if ((ret = insert_vm_struct(mm, mpnt))) {
   3.443 +			up_write(&mm->mmap_sem);
   3.444 +			kmem_cache_free(vm_area_cachep, mpnt);
   3.445 +			return ret;
   3.446 +		}
   3.447 +		mm->stack_vm = mm->total_vm = vma_pages(mpnt);
   3.448 +	}
   3.449 +
   3.450 +	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
   3.451 +		struct page *page = bprm->page[i];
   3.452 +		if (page) {
   3.453 +			bprm->page[i] = NULL;
   3.454 +			install_arg_page(mpnt, page, stack_base);
   3.455 +		}
   3.456 +		stack_base += PAGE_SIZE;
   3.457 +	}
   3.458 +	up_write(&mm->mmap_sem);
   3.459 +	
   3.460 +	return 0;
   3.461 +}
   3.462 +
   3.463 +EXPORT_SYMBOL(setup_arg_pages);
   3.464 +
   3.465 +#define free_arg_pages(bprm) do { } while (0)
   3.466 +
   3.467 +#else
   3.468 +
   3.469 +static inline void free_arg_pages(struct linux_binprm *bprm)
   3.470 +{
   3.471 +	int i;
   3.472 +
   3.473 +	for (i = 0; i < MAX_ARG_PAGES; i++) {
   3.474 +		if (bprm->page[i])
   3.475 +			__free_page(bprm->page[i]);
   3.476 +		bprm->page[i] = NULL;
   3.477 +	}
   3.478 +}
   3.479 +
   3.480 +#endif /* CONFIG_MMU */
   3.481 +
   3.482 +struct file *open_exec(const char *name)
   3.483 +{
   3.484 +	struct nameidata nd;
   3.485 +	int err;
   3.486 +	struct file *file;
   3.487 +
   3.488 +	nd.intent.open.flags = FMODE_READ;
   3.489 +	err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
   3.490 +	file = ERR_PTR(err);
   3.491 +
   3.492 +	if (!err) {
   3.493 +		struct inode *inode = nd.dentry->d_inode;
   3.494 +		file = ERR_PTR(-EACCES);
   3.495 +		if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
   3.496 +		    S_ISREG(inode->i_mode)) {
   3.497 +			int err = permission(inode, MAY_EXEC, &nd);
   3.498 +			if (!err && !(inode->i_mode & 0111))
   3.499 +				err = -EACCES;
   3.500 +			file = ERR_PTR(err);
   3.501 +			if (!err) {
   3.502 +				file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
   3.503 +				if (!IS_ERR(file)) {
   3.504 +					err = deny_write_access(file);
   3.505 +					if (err) {
   3.506 +						fput(file);
   3.507 +						file = ERR_PTR(err);
   3.508 +					}
   3.509 +				}
   3.510 +out:
   3.511 +				return file;
   3.512 +			}
   3.513 +		}
   3.514 +		path_release(&nd);
   3.515 +	}
   3.516 +	goto out;
   3.517 +}
   3.518 +
   3.519 +EXPORT_SYMBOL(open_exec);
   3.520 +
   3.521 +int kernel_read(struct file *file, unsigned long offset,
   3.522 +	char *addr, unsigned long count)
   3.523 +{
   3.524 +	mm_segment_t old_fs;
   3.525 +	loff_t pos = offset;
   3.526 +	int result;
   3.527 +
   3.528 +	old_fs = get_fs();
   3.529 +	set_fs(get_ds());
   3.530 +	/* The cast to a user pointer is valid due to the set_fs() */
   3.531 +	result = vfs_read(file, (void __user *)addr, count, &pos);
   3.532 +	set_fs(old_fs);
   3.533 +	return result;
   3.534 +}
   3.535 +
   3.536 +EXPORT_SYMBOL(kernel_read);
   3.537 +
   3.538 +static int exec_mmap(struct mm_struct *mm)
   3.539 +{
   3.540 +	struct task_struct *tsk;
   3.541 +	struct mm_struct * old_mm, *active_mm;
   3.542 +
   3.543 +	/* Notify parent that we're no longer interested in the old VM */
   3.544 +	tsk = current;
   3.545 +	old_mm = current->mm;
   3.546 +	mm_release(tsk, old_mm);
   3.547 +
   3.548 +	task_lock(tsk);
   3.549 +	active_mm = tsk->active_mm;
   3.550 +	tsk->mm = mm;
   3.551 +	tsk->active_mm = mm;
   3.552 +	activate_mm(active_mm, mm);
   3.553 +	task_unlock(tsk);
   3.554 +	arch_pick_mmap_layout(mm);
   3.555 +	if (old_mm) {
   3.556 +		if (active_mm != old_mm) BUG();
   3.557 +		mmput(old_mm);
   3.558 +		return 0;
   3.559 +	}
   3.560 +	mmdrop(active_mm);
   3.561 +	return 0;
   3.562 +}
   3.563 +
   3.564 +/*
   3.565 + * This function makes sure the current process has its own signal table,
   3.566 + * so that flush_signal_handlers can later reset the handlers without
   3.567 + * disturbing other processes.  (Other processes might share the signal
   3.568 + * table via the CLONE_SIGHAND option to clone().)
   3.569 + */
   3.570 +static inline int de_thread(struct task_struct *tsk)
   3.571 +{
   3.572 +	struct signal_struct *sig = tsk->signal;
   3.573 +	struct sighand_struct *newsighand, *oldsighand = tsk->sighand;
   3.574 +	spinlock_t *lock = &oldsighand->siglock;
   3.575 +	int count;
   3.576 +
   3.577 +	/*
   3.578 +	 * If we don't share sighandlers, then we aren't sharing anything
   3.579 +	 * and we can just re-use it all.
   3.580 +	 */
   3.581 +	if (atomic_read(&oldsighand->count) <= 1) {
   3.582 +		BUG_ON(atomic_read(&sig->count) != 1);
   3.583 +		exit_itimers(sig);
   3.584 +		return 0;
   3.585 +	}
   3.586 +
   3.587 +	newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
   3.588 +	if (!newsighand)
   3.589 +		return -ENOMEM;
   3.590 +
   3.591 +	if (thread_group_empty(current))
   3.592 +		goto no_thread_group;
   3.593 +
   3.594 +	/*
   3.595 +	 * Kill all other threads in the thread group.
   3.596 +	 * We must hold tasklist_lock to call zap_other_threads.
   3.597 +	 */
   3.598 +	read_lock(&tasklist_lock);
   3.599 +	spin_lock_irq(lock);
   3.600 +	if (sig->group_exit) {
   3.601 +		/*
   3.602 +		 * Another group action in progress, just
   3.603 +		 * return so that the signal is processed.
   3.604 +		 */
   3.605 +		spin_unlock_irq(lock);
   3.606 +		read_unlock(&tasklist_lock);
   3.607 +		kmem_cache_free(sighand_cachep, newsighand);
   3.608 +		return -EAGAIN;
   3.609 +	}
   3.610 +	sig->group_exit = 1;
   3.611 +	zap_other_threads(current);
   3.612 +	read_unlock(&tasklist_lock);
   3.613 +
   3.614 +	/*
   3.615 +	 * Account for the thread group leader hanging around:
   3.616 +	 */
   3.617 +	count = 2;
   3.618 +	if (current->pid == current->tgid)
   3.619 +		count = 1;
   3.620 +	while (atomic_read(&sig->count) > count) {
   3.621 +		sig->group_exit_task = current;
   3.622 +		sig->notify_count = count;
   3.623 +		__set_current_state(TASK_UNINTERRUPTIBLE);
   3.624 +		spin_unlock_irq(lock);
   3.625 +		schedule();
   3.626 +		spin_lock_irq(lock);
   3.627 +	}
   3.628 +	sig->group_exit_task = NULL;
   3.629 +	sig->notify_count = 0;
   3.630 +	spin_unlock_irq(lock);
   3.631 +
   3.632 +	/*
   3.633 +	 * At this point all other threads have exited, all we have to
   3.634 +	 * do is to wait for the thread group leader to become inactive,
   3.635 +	 * and to assume its PID:
   3.636 +	 */
   3.637 +	if (current->pid != current->tgid) {
   3.638 +		struct task_struct *leader = current->group_leader, *parent;
   3.639 +		struct dentry *proc_dentry1, *proc_dentry2;
   3.640 +		unsigned long exit_state, ptrace;
   3.641 +
   3.642 +		/*
   3.643 +		 * Wait for the thread group leader to be a zombie.
   3.644 +		 * It should already be zombie at this point, most
   3.645 +		 * of the time.
   3.646 +		 */
   3.647 +		while (leader->exit_state != EXIT_ZOMBIE)
   3.648 +			yield();
   3.649 +
   3.650 +		spin_lock(&leader->proc_lock);
   3.651 +		spin_lock(&current->proc_lock);
   3.652 +		proc_dentry1 = proc_pid_unhash(current);
   3.653 +		proc_dentry2 = proc_pid_unhash(leader);
   3.654 +		write_lock_irq(&tasklist_lock);
   3.655 +
   3.656 +		if (leader->tgid != current->tgid)
   3.657 +			BUG();
   3.658 +		if (current->pid == current->tgid)
   3.659 +			BUG();
   3.660 +		/*
   3.661 +		 * An exec() starts a new thread group with the
   3.662 +		 * TGID of the previous thread group. Rehash the
   3.663 +		 * two threads with a switched PID, and release
   3.664 +		 * the former thread group leader:
   3.665 +		 */
   3.666 +		ptrace = leader->ptrace;
   3.667 +		parent = leader->parent;
   3.668 +
   3.669 +		ptrace_unlink(current);
   3.670 +		ptrace_unlink(leader);
   3.671 +		remove_parent(current);
   3.672 +		remove_parent(leader);
   3.673 +
   3.674 +		switch_exec_pids(leader, current);
   3.675 +
   3.676 +		current->parent = current->real_parent = leader->real_parent;
   3.677 +		leader->parent = leader->real_parent = child_reaper;
   3.678 +		current->group_leader = current;
   3.679 +		leader->group_leader = leader;
   3.680 +
   3.681 +		add_parent(current, current->parent);
   3.682 +		add_parent(leader, leader->parent);
   3.683 +		if (ptrace) {
   3.684 +			current->ptrace = ptrace;
   3.685 +			__ptrace_link(current, parent);
   3.686 +		}
   3.687 +
   3.688 +		list_del(&current->tasks);
   3.689 +		list_add_tail(&current->tasks, &init_task.tasks);
   3.690 +		current->exit_signal = SIGCHLD;
   3.691 +		exit_state = leader->exit_state;
   3.692 +
   3.693 +		write_unlock_irq(&tasklist_lock);
   3.694 +		spin_unlock(&leader->proc_lock);
   3.695 +		spin_unlock(&current->proc_lock);
   3.696 +		proc_pid_flush(proc_dentry1);
   3.697 +		proc_pid_flush(proc_dentry2);
   3.698 +
   3.699 +		if (exit_state != EXIT_ZOMBIE)
   3.700 +			BUG();
   3.701 +		release_task(leader);
   3.702 +        }
   3.703 +
   3.704 +	/*
   3.705 +	 * Now there are really no other threads at all,
   3.706 +	 * so it's safe to stop telling them to kill themselves.
   3.707 +	 */
   3.708 +	sig->group_exit = 0;
   3.709 +
   3.710 +no_thread_group:
   3.711 +	BUG_ON(atomic_read(&sig->count) != 1);
   3.712 +	exit_itimers(sig);
   3.713 +
   3.714 +	if (atomic_read(&oldsighand->count) == 1) {
   3.715 +		/*
   3.716 +		 * Now that we nuked the rest of the thread group,
   3.717 +		 * it turns out we are not sharing sighand any more either.
   3.718 +		 * So we can just keep it.
   3.719 +		 */
   3.720 +		kmem_cache_free(sighand_cachep, newsighand);
   3.721 +	} else {
   3.722 +		/*
   3.723 +		 * Move our state over to newsighand and switch it in.
   3.724 +		 */
   3.725 +		spin_lock_init(&newsighand->siglock);
   3.726 +		atomic_set(&newsighand->count, 1);
   3.727 +		memcpy(newsighand->action, oldsighand->action,
   3.728 +		       sizeof(newsighand->action));
   3.729 +
   3.730 +		write_lock_irq(&tasklist_lock);
   3.731 +		spin_lock(&oldsighand->siglock);
   3.732 +		spin_lock(&newsighand->siglock);
   3.733 +
   3.734 +		current->sighand = newsighand;
   3.735 +		recalc_sigpending();
   3.736 +
   3.737 +		spin_unlock(&newsighand->siglock);
   3.738 +		spin_unlock(&oldsighand->siglock);
   3.739 +		write_unlock_irq(&tasklist_lock);
   3.740 +
   3.741 +		if (atomic_dec_and_test(&oldsighand->count))
   3.742 +			kmem_cache_free(sighand_cachep, oldsighand);
   3.743 +	}
   3.744 +
   3.745 +	if (!thread_group_empty(current))
   3.746 +		BUG();
   3.747 +	if (current->tgid != current->pid)
   3.748 +		BUG();
   3.749 +	return 0;
   3.750 +}
   3.751 +	
   3.752 +/*
    3.753 + * These functions flush out all traces of the currently running executable
   3.754 + * so that a new one can be started
   3.755 + */
   3.756 +
   3.757 +static inline void flush_old_files(struct files_struct * files)
   3.758 +{
   3.759 +	long j = -1;
   3.760 +
   3.761 +	spin_lock(&files->file_lock);
   3.762 +	for (;;) {
   3.763 +		unsigned long set, i;
   3.764 +
   3.765 +		j++;
   3.766 +		i = j * __NFDBITS;
   3.767 +		if (i >= files->max_fds || i >= files->max_fdset)
   3.768 +			break;
   3.769 +		set = files->close_on_exec->fds_bits[j];
   3.770 +		if (!set)
   3.771 +			continue;
   3.772 +		files->close_on_exec->fds_bits[j] = 0;
   3.773 +		spin_unlock(&files->file_lock);
   3.774 +		for ( ; set ; i++,set >>= 1) {
   3.775 +			if (set & 1) {
   3.776 +				sys_close(i);
   3.777 +			}
   3.778 +		}
   3.779 +		spin_lock(&files->file_lock);
   3.780 +
   3.781 +	}
   3.782 +	spin_unlock(&files->file_lock);
   3.783 +}
   3.784 +
   3.785 +void get_task_comm(char *buf, struct task_struct *tsk)
   3.786 +{
   3.787 +	/* buf must be at least sizeof(tsk->comm) in size */
   3.788 +	task_lock(tsk);
   3.789 +	memcpy(buf, tsk->comm, sizeof(tsk->comm));
   3.790 +	task_unlock(tsk);
   3.791 +}
   3.792 +
   3.793 +void set_task_comm(struct task_struct *tsk, char *buf)
   3.794 +{
   3.795 +	task_lock(tsk);
   3.796 +	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
   3.797 +	task_unlock(tsk);
   3.798 +}
   3.799 +
   3.800 +int flush_old_exec(struct linux_binprm * bprm)
   3.801 +{
   3.802 +	char * name;
   3.803 +	int i, ch, retval;
   3.804 +	struct files_struct *files;
   3.805 +	char tcomm[sizeof(current->comm)];
   3.806 +
   3.807 +	/*
   3.808 +	 * Make sure we have a private signal table and that
   3.809 +	 * we are unassociated from the previous thread group.
   3.810 +	 */
   3.811 +	retval = de_thread(current);
   3.812 +	if (retval)
   3.813 +		goto out;
   3.814 +
   3.815 +	/*
   3.816 +	 * Make sure we have private file handles. Ask the
   3.817 +	 * fork helper to do the work for us and the exit
   3.818 +	 * helper to do the cleanup of the old one.
   3.819 +	 */
   3.820 +	files = current->files;		/* refcounted so safe to hold */
   3.821 +	retval = unshare_files();
   3.822 +	if (retval)
   3.823 +		goto out;
   3.824 +	/*
   3.825 +	 * Release all of the old mmap stuff
   3.826 +	 */
   3.827 +	retval = exec_mmap(bprm->mm);
   3.828 +	if (retval)
   3.829 +		goto mmap_failed;
   3.830 +
   3.831 +	bprm->mm = NULL;		/* We're using it now */
   3.832 +
   3.833 +	/* This is the point of no return */
   3.834 +	steal_locks(files);
   3.835 +	put_files_struct(files);
   3.836 +
   3.837 +	current->sas_ss_sp = current->sas_ss_size = 0;
   3.838 +
   3.839 +	if (current->euid == current->uid && current->egid == current->gid)
   3.840 +		current->mm->dumpable = 1;
   3.841 +	name = bprm->filename;
   3.842 +	for (i=0; (ch = *(name++)) != '\0';) {
   3.843 +		if (ch == '/')
   3.844 +			i = 0;
   3.845 +		else
   3.846 +			if (i < (sizeof(tcomm) - 1))
   3.847 +				tcomm[i++] = ch;
   3.848 +	}
   3.849 +	tcomm[i] = '\0';
   3.850 +	set_task_comm(current, tcomm);
   3.851 +
   3.852 +	flush_thread();
   3.853 +
   3.854 +	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || 
   3.855 +	    permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL) ||
   3.856 +	    (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
   3.857 +		suid_keys(current);
   3.858 +		current->mm->dumpable = 0;
   3.859 +	}
   3.860 +
   3.861 +	/* An exec changes our domain. We are no longer part of the thread
   3.862 +	   group */
   3.863 +
   3.864 +	current->self_exec_id++;
   3.865 +			
   3.866 +	flush_signal_handlers(current, 0);
   3.867 +	flush_old_files(current->files);
   3.868 +
   3.869 +	return 0;
   3.870 +
   3.871 +mmap_failed:
   3.872 +	put_files_struct(current->files);
   3.873 +	current->files = files;
   3.874 +out:
   3.875 +	return retval;
   3.876 +}
   3.877 +
   3.878 +EXPORT_SYMBOL(flush_old_exec);
   3.879 +
   3.880 +/* 
   3.881 + * Fill the binprm structure from the inode. 
   3.882 + * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
   3.883 + */
   3.884 +int prepare_binprm(struct linux_binprm *bprm)
   3.885 +{
   3.886 +	int mode;
   3.887 +	struct inode * inode = bprm->file->f_dentry->d_inode;
   3.888 +	int retval;
   3.889 +
   3.890 +	mode = inode->i_mode;
   3.891 +	/*
   3.892 +	 * Check execute perms again - if the caller has CAP_DAC_OVERRIDE,
   3.893 +	 * generic_permission lets a non-executable through
   3.894 +	 */
   3.895 +	if (!(mode & 0111))	/* with at least _one_ execute bit set */
   3.896 +		return -EACCES;
   3.897 +	if (bprm->file->f_op == NULL)
   3.898 +		return -EACCES;
   3.899 +
   3.900 +	bprm->e_uid = current->euid;
   3.901 +	bprm->e_gid = current->egid;
   3.902 +
   3.903 +	if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) {
   3.904 +		/* Set-uid? */
   3.905 +		if (mode & S_ISUID) {
   3.906 +			current->personality &= ~PER_CLEAR_ON_SETID;
   3.907 +			bprm->e_uid = inode->i_uid;
   3.908 +		}
   3.909 +
   3.910 +		/* Set-gid? */
   3.911 +		/*
   3.912 +		 * If setgid is set but no group execute bit then this
   3.913 +		 * is a candidate for mandatory locking, not a setgid
   3.914 +		 * executable.
   3.915 +		 */
   3.916 +		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
   3.917 +			current->personality &= ~PER_CLEAR_ON_SETID;
   3.918 +			bprm->e_gid = inode->i_gid;
   3.919 +		}
   3.920 +	}
   3.921 +
   3.922 +	/* fill in binprm security blob */
   3.923 +	retval = security_bprm_set(bprm);
   3.924 +	if (retval)
   3.925 +		return retval;
   3.926 +
   3.927 +	memset(bprm->buf,0,BINPRM_BUF_SIZE);
   3.928 +	return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE);
   3.929 +}
   3.930 +
   3.931 +EXPORT_SYMBOL(prepare_binprm);
   3.932 +
   3.933 +static inline int unsafe_exec(struct task_struct *p)
   3.934 +{
   3.935 +	int unsafe = 0;
   3.936 +	if (p->ptrace & PT_PTRACED) {
   3.937 +		if (p->ptrace & PT_PTRACE_CAP)
   3.938 +			unsafe |= LSM_UNSAFE_PTRACE_CAP;
   3.939 +		else
   3.940 +			unsafe |= LSM_UNSAFE_PTRACE;
   3.941 +	}
   3.942 +	if (atomic_read(&p->fs->count) > 1 ||
   3.943 +	    atomic_read(&p->files->count) > 1 ||
   3.944 +	    atomic_read(&p->sighand->count) > 1)
   3.945 +		unsafe |= LSM_UNSAFE_SHARE;
   3.946 +
   3.947 +	return unsafe;
   3.948 +}
   3.949 +
   3.950 +void compute_creds(struct linux_binprm *bprm)
   3.951 +{
   3.952 +	int unsafe;
   3.953 +
   3.954 +	if (bprm->e_uid != current->uid)
   3.955 +		suid_keys(current);
   3.956 +	exec_keys(current);
   3.957 +
   3.958 +	task_lock(current);
   3.959 +	unsafe = unsafe_exec(current);
   3.960 +	security_bprm_apply_creds(bprm, unsafe);
   3.961 +	task_unlock(current);
   3.962 +}
   3.963 +
   3.964 +EXPORT_SYMBOL(compute_creds);
   3.965 +
   3.966 +void remove_arg_zero(struct linux_binprm *bprm)
   3.967 +{
   3.968 +	if (bprm->argc) {
   3.969 +		unsigned long offset;
   3.970 +		char * kaddr;
   3.971 +		struct page *page;
   3.972 +
   3.973 +		offset = bprm->p % PAGE_SIZE;
   3.974 +		goto inside;
   3.975 +
   3.976 +		while (bprm->p++, *(kaddr+offset++)) {
   3.977 +			if (offset != PAGE_SIZE)
   3.978 +				continue;
   3.979 +			offset = 0;
   3.980 +			kunmap_atomic(kaddr, KM_USER0);
   3.981 +inside:
   3.982 +			page = bprm->page[bprm->p/PAGE_SIZE];
   3.983 +			kaddr = kmap_atomic(page, KM_USER0);
   3.984 +		}
   3.985 +		kunmap_atomic(kaddr, KM_USER0);
   3.986 +		bprm->argc--;
   3.987 +	}
   3.988 +}
   3.989 +
   3.990 +EXPORT_SYMBOL(remove_arg_zero);
   3.991 +
   3.992 +/*
   3.993 + * cycle the list of binary formats handler, until one recognizes the image
   3.994 + */
   3.995 +int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
   3.996 +{
   3.997 +	int try,retval;
   3.998 +	struct linux_binfmt *fmt;
   3.999 +#ifdef __alpha__
  3.1000 +	/* handle /sbin/loader.. */
  3.1001 +	{
  3.1002 +	    struct exec * eh = (struct exec *) bprm->buf;
  3.1003 +
  3.1004 +	    if (!bprm->loader && eh->fh.f_magic == 0x183 &&
  3.1005 +		(eh->fh.f_flags & 0x3000) == 0x3000)
  3.1006 +	    {
  3.1007 +		struct file * file;
  3.1008 +		unsigned long loader;
  3.1009 +
  3.1010 +		allow_write_access(bprm->file);
  3.1011 +		fput(bprm->file);
  3.1012 +		bprm->file = NULL;
  3.1013 +
  3.1014 +	        loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
  3.1015 +
  3.1016 +		file = open_exec("/sbin/loader");
  3.1017 +		retval = PTR_ERR(file);
  3.1018 +		if (IS_ERR(file))
  3.1019 +			return retval;
  3.1020 +
  3.1021 +		/* Remember if the application is TASO.  */
  3.1022 +		bprm->sh_bang = eh->ah.entry < 0x100000000UL;
  3.1023 +
  3.1024 +		bprm->file = file;
  3.1025 +		bprm->loader = loader;
  3.1026 +		retval = prepare_binprm(bprm);
  3.1027 +		if (retval<0)
  3.1028 +			return retval;
  3.1029 +		/* should call search_binary_handler recursively here,
  3.1030 +		   but it does not matter */
  3.1031 +	    }
  3.1032 +	}
  3.1033 +#endif
  3.1034 +	retval = security_bprm_check(bprm);
  3.1035 +	if (retval)
  3.1036 +		return retval;
  3.1037 +
  3.1038 +	/* kernel module loader fixup */
   3.1039 +	/* so we don't try to run modprobe in kernel space. */
  3.1040 +	set_fs(USER_DS);
  3.1041 +	retval = -ENOENT;
  3.1042 +	for (try=0; try<2; try++) {
  3.1043 +		read_lock(&binfmt_lock);
  3.1044 +		for (fmt = formats ; fmt ; fmt = fmt->next) {
  3.1045 +			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
  3.1046 +			if (!fn)
  3.1047 +				continue;
  3.1048 +			if (!try_module_get(fmt->module))
  3.1049 +				continue;
  3.1050 +			read_unlock(&binfmt_lock);
  3.1051 +			retval = fn(bprm, regs);
  3.1052 +			if (retval >= 0) {
  3.1053 +				put_binfmt(fmt);
  3.1054 +				allow_write_access(bprm->file);
  3.1055 +				if (bprm->file)
  3.1056 +					fput(bprm->file);
  3.1057 +				bprm->file = NULL;
  3.1058 +				current->did_exec = 1;
  3.1059 +				return retval;
  3.1060 +			}
  3.1061 +			read_lock(&binfmt_lock);
  3.1062 +			put_binfmt(fmt);
  3.1063 +			if (retval != -ENOEXEC || bprm->mm == NULL)
  3.1064 +				break;
  3.1065 +			if (!bprm->file) {
  3.1066 +				read_unlock(&binfmt_lock);
  3.1067 +				return retval;
  3.1068 +			}
  3.1069 +		}
  3.1070 +		read_unlock(&binfmt_lock);
  3.1071 +		if (retval != -ENOEXEC || bprm->mm == NULL) {
  3.1072 +			break;
  3.1073 +#ifdef CONFIG_KMOD
  3.1074 +		}else{
  3.1075 +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
  3.1076 +			if (printable(bprm->buf[0]) &&
  3.1077 +			    printable(bprm->buf[1]) &&
  3.1078 +			    printable(bprm->buf[2]) &&
  3.1079 +			    printable(bprm->buf[3]))
  3.1080 +				break; /* -ENOEXEC */
  3.1081 +			request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
  3.1082 +#endif
  3.1083 +		}
  3.1084 +	}
  3.1085 +	return retval;
  3.1086 +}
  3.1087 +
  3.1088 +EXPORT_SYMBOL(search_binary_handler);
  3.1089 +
  3.1090 +/*
  3.1091 + * sys_execve() executes a new program.
  3.1092 + */
  3.1093 +int do_execve(char * filename,
  3.1094 +	char __user *__user *argv,
  3.1095 +	char __user *__user *envp,
  3.1096 +	struct pt_regs * regs)
  3.1097 +{
  3.1098 +	struct linux_binprm *bprm;
  3.1099 +	struct file *file;
  3.1100 +	int retval;
  3.1101 +	int i;
  3.1102 +
  3.1103 +	retval = -ENOMEM;
  3.1104 +	bprm = kmalloc(sizeof(*bprm), GFP_KERNEL);
  3.1105 +	if (!bprm)
  3.1106 +		goto out_ret;
  3.1107 +	memset(bprm, 0, sizeof(*bprm));
  3.1108 +
  3.1109 +	file = open_exec(filename);
  3.1110 +	retval = PTR_ERR(file);
  3.1111 +	if (IS_ERR(file))
  3.1112 +		goto out_kfree;
  3.1113 +
  3.1114 +	sched_exec();
  3.1115 +
  3.1116 +	bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
  3.1117 +
  3.1118 +	bprm->file = file;
  3.1119 +	bprm->filename = filename;
  3.1120 +	bprm->interp = filename;
  3.1121 +	bprm->mm = mm_alloc();
  3.1122 +	retval = -ENOMEM;
  3.1123 +	if (!bprm->mm)
  3.1124 +		goto out_file;
  3.1125 +
  3.1126 +	retval = init_new_context(current, bprm->mm);
  3.1127 +	if (retval < 0)
  3.1128 +		goto out_mm;
  3.1129 +
  3.1130 +	bprm->argc = count(argv, bprm->p / sizeof(void *));
  3.1131 +	if ((retval = bprm->argc) < 0)
  3.1132 +		goto out_mm;
  3.1133 +
  3.1134 +	bprm->envc = count(envp, bprm->p / sizeof(void *));
  3.1135 +	if ((retval = bprm->envc) < 0)
  3.1136 +		goto out_mm;
  3.1137 +
  3.1138 +	retval = security_bprm_alloc(bprm);
  3.1139 +	if (retval)
  3.1140 +		goto out;
  3.1141 +
  3.1142 +	retval = prepare_binprm(bprm);
  3.1143 +	if (retval < 0)
  3.1144 +		goto out;
  3.1145 +
  3.1146 +	retval = copy_strings_kernel(1, &bprm->filename, bprm);
  3.1147 +	if (retval < 0)
  3.1148 +		goto out;
  3.1149 +
  3.1150 +	bprm->exec = bprm->p;
  3.1151 +	retval = copy_strings(bprm->envc, envp, bprm);
  3.1152 +	if (retval < 0)
  3.1153 +		goto out;
  3.1154 +
  3.1155 +	retval = copy_strings(bprm->argc, argv, bprm);
  3.1156 +	if (retval < 0)
  3.1157 +		goto out;
  3.1158 +
  3.1159 +	retval = search_binary_handler(bprm,regs);
  3.1160 +	if (retval >= 0) {
  3.1161 +		free_arg_pages(bprm);
  3.1162 +
  3.1163 +		/* execve success */
  3.1164 +		security_bprm_free(bprm);
  3.1165 +		kfree(bprm);
  3.1166 +		return retval;
  3.1167 +	}
  3.1168 +
  3.1169 +out:
  3.1170 +	/* Something went wrong, return the inode and free the argument pages*/
  3.1171 +	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
  3.1172 +		struct page * page = bprm->page[i];
  3.1173 +		if (page)
  3.1174 +			__free_page(page);
  3.1175 +	}
  3.1176 +
  3.1177 +	if (bprm->security)
  3.1178 +		security_bprm_free(bprm);
  3.1179 +
  3.1180 +out_mm:
  3.1181 +	if (bprm->mm)
  3.1182 +		mmdrop(bprm->mm);
  3.1183 +
  3.1184 +out_file:
  3.1185 +	if (bprm->file) {
  3.1186 +		allow_write_access(bprm->file);
  3.1187 +		fput(bprm->file);
  3.1188 +	}
  3.1189 +
  3.1190 +out_kfree:
  3.1191 +	kfree(bprm);
  3.1192 +
  3.1193 +out_ret:
  3.1194 +	return retval;
  3.1195 +}
  3.1196 +
  3.1197 +int set_binfmt(struct linux_binfmt *new)
  3.1198 +{
  3.1199 +	struct linux_binfmt *old = current->binfmt;
  3.1200 +
  3.1201 +	if (new) {
  3.1202 +		if (!try_module_get(new->module))
  3.1203 +			return -1;
  3.1204 +	}
  3.1205 +	current->binfmt = new;
  3.1206 +	if (old)
  3.1207 +		module_put(old->module);
  3.1208 +	return 0;
  3.1209 +}
  3.1210 +
  3.1211 +EXPORT_SYMBOL(set_binfmt);
  3.1212 +
  3.1213 +#define CORENAME_MAX_SIZE 64
  3.1214 +
  3.1215 +/* format_corename will inspect the pattern parameter, and output a
  3.1216 + * name into corename, which must have space for at least
  3.1217 + * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
  3.1218 + */
  3.1219 +static void format_corename(char *corename, const char *pattern, long signr)
  3.1220 +{
  3.1221 +	const char *pat_ptr = pattern;
  3.1222 +	char *out_ptr = corename;
  3.1223 +	char *const out_end = corename + CORENAME_MAX_SIZE;
  3.1224 +	int rc;
  3.1225 +	int pid_in_pattern = 0;
  3.1226 +
  3.1227 +	/* Repeat as long as we have more pattern to process and more output
  3.1228 +	   space */
  3.1229 +	while (*pat_ptr) {
  3.1230 +		if (*pat_ptr != '%') {
  3.1231 +			if (out_ptr == out_end)
  3.1232 +				goto out;
  3.1233 +			*out_ptr++ = *pat_ptr++;
  3.1234 +		} else {
  3.1235 +			switch (*++pat_ptr) {
  3.1236 +			case 0:
  3.1237 +				goto out;
  3.1238 +			/* Double percent, output one percent */
  3.1239 +			case '%':
  3.1240 +				if (out_ptr == out_end)
  3.1241 +					goto out;
  3.1242 +				*out_ptr++ = '%';
  3.1243 +				break;
  3.1244 +			/* pid */
  3.1245 +			case 'p':
  3.1246 +				pid_in_pattern = 1;
  3.1247 +				rc = snprintf(out_ptr, out_end - out_ptr,
  3.1248 +					      "%d", current->tgid);
  3.1249 +				if (rc > out_end - out_ptr)
  3.1250 +					goto out;
  3.1251 +				out_ptr += rc;
  3.1252 +				break;
  3.1253 +			/* uid */
  3.1254 +			case 'u':
  3.1255 +				rc = snprintf(out_ptr, out_end - out_ptr,
  3.1256 +					      "%d", current->uid);
  3.1257 +				if (rc > out_end - out_ptr)
  3.1258 +					goto out;
  3.1259 +				out_ptr += rc;
  3.1260 +				break;
  3.1261 +			/* gid */
  3.1262 +			case 'g':
  3.1263 +				rc = snprintf(out_ptr, out_end - out_ptr,
  3.1264 +					      "%d", current->gid);
  3.1265 +				if (rc > out_end - out_ptr)
  3.1266 +					goto out;
  3.1267 +				out_ptr += rc;
  3.1268 +				break;
  3.1269 +			/* signal that caused the coredump */
  3.1270 +			case 's':
  3.1271 +				rc = snprintf(out_ptr, out_end - out_ptr,
  3.1272 +					      "%ld", signr);
  3.1273 +				if (rc > out_end - out_ptr)
  3.1274 +					goto out;
  3.1275 +				out_ptr += rc;
  3.1276 +				break;
  3.1277 +			/* UNIX time of coredump */
  3.1278 +			case 't': {
  3.1279 +				struct timeval tv;
  3.1280 +				do_gettimeofday(&tv);
  3.1281 +				rc = snprintf(out_ptr, out_end - out_ptr,
  3.1282 +					      "%lu", tv.tv_sec);
  3.1283 +				if (rc > out_end - out_ptr)
  3.1284 +					goto out;
  3.1285 +				out_ptr += rc;
  3.1286 +				break;
  3.1287 +			}
  3.1288 +			/* hostname */
  3.1289 +			case 'h':
  3.1290 +				down_read(&uts_sem);
  3.1291 +				rc = snprintf(out_ptr, out_end - out_ptr,
  3.1292 +					      "%s", system_utsname.nodename);
  3.1293 +				up_read(&uts_sem);
  3.1294 +				if (rc > out_end - out_ptr)
  3.1295 +					goto out;
  3.1296 +				out_ptr += rc;
  3.1297 +				break;
  3.1298 +			/* executable */
  3.1299 +			case 'e':
  3.1300 +				rc = snprintf(out_ptr, out_end - out_ptr,
  3.1301 +					      "%s", current->comm);
  3.1302 +				if (rc > out_end - out_ptr)
  3.1303 +					goto out;
  3.1304 +				out_ptr += rc;
  3.1305 +				break;
  3.1306 +			default:
  3.1307 +				break;
  3.1308 +			}
  3.1309 +			++pat_ptr;
  3.1310 +		}
  3.1311 +	}
  3.1312 +	/* Backward compatibility with core_uses_pid:
  3.1313 +	 *
  3.1314 +	 * If core_pattern does not include a %p (as is the default)
  3.1315 +	 * and core_uses_pid is set, then .%pid will be appended to
  3.1316 +	 * the filename */
  3.1317 +	if (!pid_in_pattern
  3.1318 +            && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
  3.1319 +		rc = snprintf(out_ptr, out_end - out_ptr,
  3.1320 +			      ".%d", current->tgid);
  3.1321 +		if (rc > out_end - out_ptr)
  3.1322 +			goto out;
  3.1323 +		out_ptr += rc;
  3.1324 +	}
  3.1325 +      out:
  3.1326 +	*out_ptr = 0;
  3.1327 +}
  3.1328 +
  3.1329 +static void zap_threads (struct mm_struct *mm)
  3.1330 +{
  3.1331 +	struct task_struct *g, *p;
  3.1332 +	struct task_struct *tsk = current;
  3.1333 +	struct completion *vfork_done = tsk->vfork_done;
  3.1334 +
  3.1335 +	/*
  3.1336 +	 * Make sure nobody is waiting for us to release the VM,
  3.1337 +	 * otherwise we can deadlock when we wait on each other
  3.1338 +	 */
  3.1339 +	if (vfork_done) {
  3.1340 +		tsk->vfork_done = NULL;
  3.1341 +		complete(vfork_done);
  3.1342 +	}
  3.1343 +
  3.1344 +	read_lock(&tasklist_lock);
  3.1345 +	do_each_thread(g,p)
  3.1346 +		if (mm == p->mm && p != tsk) {
  3.1347 +			force_sig_specific(SIGKILL, p);
  3.1348 +			mm->core_waiters++;
  3.1349 +		}
  3.1350 +	while_each_thread(g,p);
  3.1351 +
  3.1352 +	read_unlock(&tasklist_lock);
  3.1353 +}
  3.1354 +
  3.1355 +static void coredump_wait(struct mm_struct *mm)
  3.1356 +{
  3.1357 +	DECLARE_COMPLETION(startup_done);
  3.1358 +
  3.1359 +	mm->core_waiters++; /* let other threads block */
  3.1360 +	mm->core_startup_done = &startup_done;
  3.1361 +
  3.1362 +	/* give other threads a chance to run: */
  3.1363 +	yield();
  3.1364 +
  3.1365 +	zap_threads(mm);
  3.1366 +	if (--mm->core_waiters) {
  3.1367 +		up_write(&mm->mmap_sem);
  3.1368 +		wait_for_completion(&startup_done);
  3.1369 +	} else
  3.1370 +		up_write(&mm->mmap_sem);
  3.1371 +	BUG_ON(mm->core_waiters);
  3.1372 +}
  3.1373 +
  3.1374 +int do_coredump(long signr, int exit_code, struct pt_regs * regs)
  3.1375 +{
  3.1376 +	char corename[CORENAME_MAX_SIZE + 1];
  3.1377 +	struct mm_struct *mm = current->mm;
  3.1378 +	struct linux_binfmt * binfmt;
  3.1379 +	struct inode * inode;
  3.1380 +	struct file * file;
  3.1381 +	int retval = 0;
  3.1382 +
  3.1383 +	binfmt = current->binfmt;
  3.1384 +	if (!binfmt || !binfmt->core_dump)
  3.1385 +		goto fail;
  3.1386 +	down_write(&mm->mmap_sem);
  3.1387 +	if (!mm->dumpable) {
  3.1388 +		up_write(&mm->mmap_sem);
  3.1389 +		goto fail;
  3.1390 +	}
  3.1391 +	mm->dumpable = 0;
  3.1392 +	init_completion(&mm->core_done);
  3.1393 +	current->signal->group_exit = 1;
  3.1394 +	current->signal->group_exit_code = exit_code;
  3.1395 +	coredump_wait(mm);
  3.1396 +
  3.1397 +	if (current->signal->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump)
  3.1398 +		goto fail_unlock;
  3.1399 +
  3.1400 +	/*
  3.1401 +	 * lock_kernel() because format_corename() is controlled by sysctl, which
  3.1402 +	 * uses lock_kernel()
  3.1403 +	 */
  3.1404 + 	lock_kernel();
  3.1405 +	format_corename(corename, core_pattern, signr);
  3.1406 +	unlock_kernel();
  3.1407 +	file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE, 0600);
  3.1408 +	if (IS_ERR(file))
  3.1409 +		goto fail_unlock;
  3.1410 +	inode = file->f_dentry->d_inode;
  3.1411 +	if (inode->i_nlink > 1)
  3.1412 +		goto close_fail;	/* multiple links - don't dump */
  3.1413 +	if (d_unhashed(file->f_dentry))
  3.1414 +		goto close_fail;
  3.1415 +
  3.1416 +	if (!S_ISREG(inode->i_mode))
  3.1417 +		goto close_fail;
  3.1418 +	if (!file->f_op)
  3.1419 +		goto close_fail;
  3.1420 +	if (!file->f_op->write)
  3.1421 +		goto close_fail;
  3.1422 +	if (do_truncate(file->f_dentry, 0) != 0)
  3.1423 +		goto close_fail;
  3.1424 +
  3.1425 +	retval = binfmt->core_dump(signr, regs, file);
  3.1426 +
  3.1427 +	if (retval)
  3.1428 +		current->signal->group_exit_code |= 0x80;
  3.1429 +close_fail:
  3.1430 +	filp_close(file, NULL);
  3.1431 +fail_unlock:
  3.1432 +	complete_all(&mm->core_done);
  3.1433 +fail:
  3.1434 +	return retval;
  3.1435 +}
     4.1 --- a/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h	Thu Mar 10 18:12:10 2005 +0000
     4.2 +++ b/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h	Tue Mar 15 14:50:10 2005 +0000
     4.3 @@ -39,10 +39,16 @@ do { \
     4.4  #if defined(CONFIG_XEN_DEBUG_NO_MMU_BATCHING)
     4.5  #define set_pte(pteptr, pteval)\
     4.6      set_pte_batched(pteptr, pteval)
     4.7 -#elif defined(CONFIG_XEN_BATCH_MODE)
     4.8 +
     4.9 +#elif defined(CONFIG_XEN_BATCH_MODE1)
    4.10  #define set_pte(pteptr, pteval)({\
    4.11      set_pte_batched(pteptr, pteval);\
    4.12      _flush_page_update_queue();})
    4.13 +
    4.14 +#elif defined(CONFIG_XEN_BATCH_MODE2)
    4.15 +#define set_pte(pteptr, pteval)\
    4.16 +    set_pte_batched(pteptr, pteval)
    4.17 +
    4.18  #else
    4.19  #define set_pte(pteptr, pteval) (*(pteptr) = pteval)
    4.20  #endif
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/linux-2.6.10-xen-sparse/mm/highmem.c	Tue Mar 15 14:50:10 2005 +0000
     5.3 @@ -0,0 +1,607 @@
     5.4 +/*
     5.5 + * High memory handling common code and variables.
     5.6 + *
     5.7 + * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
     5.8 + *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
     5.9 + *
    5.10 + *
    5.11 + * Redesigned the x86 32-bit VM architecture to deal with
    5.12 + * 64-bit physical space. With current x86 CPUs this
    5.13 + * means up to 64 Gigabytes physical RAM.
    5.14 + *
    5.15 + * Rewrote high memory support to move the page cache into
    5.16 + * high memory. Implemented permanent (schedulable) kmaps
    5.17 + * based on Linus' idea.
    5.18 + *
    5.19 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
    5.20 + */
    5.21 +
    5.22 +#include <linux/mm.h>
    5.23 +#include <linux/module.h>
    5.24 +#include <linux/swap.h>
    5.25 +#include <linux/bio.h>
    5.26 +#include <linux/pagemap.h>
    5.27 +#include <linux/mempool.h>
    5.28 +#include <linux/blkdev.h>
    5.29 +#include <linux/init.h>
    5.30 +#include <linux/hash.h>
    5.31 +#include <linux/highmem.h>
    5.32 +#include <asm/tlbflush.h>
    5.33 +
    5.34 +static mempool_t *page_pool, *isa_page_pool;
    5.35 +
    5.36 +static void *page_pool_alloc(int gfp_mask, void *data)
    5.37 +{
    5.38 +	int gfp = gfp_mask | (int) (long) data;
    5.39 +
    5.40 +	return alloc_page(gfp);
    5.41 +}
    5.42 +
    5.43 +static void page_pool_free(void *page, void *data)
    5.44 +{
    5.45 +	__free_page(page);
    5.46 +}
    5.47 +
    5.48 +/*
    5.49 + * Virtual_count is not a pure "count".
    5.50 + *  0 means that it is not mapped, and has not been mapped
    5.51 + *    since a TLB flush - it is usable.
    5.52 + *  1 means that there are no users, but it has been mapped
    5.53 + *    since the last TLB flush - so we can't use it.
    5.54 + *  n means that there are (n-1) current users of it.
    5.55 + */
    5.56 +#ifdef CONFIG_HIGHMEM
    5.57 +static int pkmap_count[LAST_PKMAP];
    5.58 +static unsigned int last_pkmap_nr;
    5.59 +static spinlock_t kmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
    5.60 +
    5.61 +pte_t * pkmap_page_table;
    5.62 +
    5.63 +static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
    5.64 +
    5.65 +static void flush_all_zero_pkmaps(void)
    5.66 +{
    5.67 +	int i;
    5.68 +
    5.69 +	flush_cache_kmaps();
    5.70 +
    5.71 +	for (i = 0; i < LAST_PKMAP; i++) {
    5.72 +		struct page *page;
    5.73 +
    5.74 +		/*
    5.75 +		 * zero means we don't have anything to do,
    5.76 +		 * >1 means that it is still in use. Only
    5.77 +		 * a count of 1 means that it is free but
    5.78 +		 * needs to be unmapped
    5.79 +		 */
    5.80 +		if (pkmap_count[i] != 1)
    5.81 +			continue;
    5.82 +		pkmap_count[i] = 0;
    5.83 +
    5.84 +		/* sanity check */
    5.85 +		if (pte_none(pkmap_page_table[i]))
    5.86 +			BUG();
    5.87 +
    5.88 +		/*
    5.89 +		 * Don't need an atomic fetch-and-clear op here;
    5.90 +		 * no-one has the page mapped, and cannot get at
    5.91 +		 * its virtual address (and hence PTE) without first
    5.92 +		 * getting the kmap_lock (which is held here).
    5.93 +		 * So no dangers, even with speculative execution.
    5.94 +		 */
    5.95 +		page = pte_page(pkmap_page_table[i]);
    5.96 +		pte_clear(&pkmap_page_table[i]);
    5.97 +
    5.98 +		set_page_address(page, NULL);
    5.99 +	}
   5.100 +	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
   5.101 +}
   5.102 +
   5.103 +static inline unsigned long map_new_virtual(struct page *page)
   5.104 +{
   5.105 +	unsigned long vaddr;
   5.106 +	int count;
   5.107 +
   5.108 +start:
   5.109 +	count = LAST_PKMAP;
   5.110 +	/* Find an empty entry */
   5.111 +	for (;;) {
   5.112 +		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
   5.113 +		if (!last_pkmap_nr) {
   5.114 +			flush_all_zero_pkmaps();
   5.115 +			count = LAST_PKMAP;
   5.116 +		}
   5.117 +		if (!pkmap_count[last_pkmap_nr])
   5.118 +			break;	/* Found a usable entry */
   5.119 +		if (--count)
   5.120 +			continue;
   5.121 +
   5.122 +		/*
   5.123 +		 * Sleep for somebody else to unmap their entries
   5.124 +		 */
   5.125 +		{
   5.126 +			DECLARE_WAITQUEUE(wait, current);
   5.127 +
   5.128 +			__set_current_state(TASK_UNINTERRUPTIBLE);
   5.129 +			add_wait_queue(&pkmap_map_wait, &wait);
   5.130 +			spin_unlock(&kmap_lock);
   5.131 +			schedule();
   5.132 +			remove_wait_queue(&pkmap_map_wait, &wait);
   5.133 +			spin_lock(&kmap_lock);
   5.134 +
   5.135 +			/* Somebody else might have mapped it while we slept */
   5.136 +			if (page_address(page))
   5.137 +				return (unsigned long)page_address(page);
   5.138 +
   5.139 +			/* Re-start */
   5.140 +			goto start;
   5.141 +		}
   5.142 +	}
   5.143 +	vaddr = PKMAP_ADDR(last_pkmap_nr);
   5.144 +	set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
   5.145 +#ifdef CONFIG_XEN_BATCH_MODE2
   5.146 +	XEN_flush_page_update_queue();
   5.147 +#endif
   5.148 +	pkmap_count[last_pkmap_nr] = 1;
   5.149 +	set_page_address(page, (void *)vaddr);
   5.150 +
   5.151 +	return vaddr;
   5.152 +}
   5.153 +
   5.154 +void fastcall *kmap_high(struct page *page)
   5.155 +{
   5.156 +	unsigned long vaddr;
   5.157 +
   5.158 +	/*
   5.159 +	 * For highmem pages, we can't trust "virtual" until
   5.160 +	 * after we have the lock.
   5.161 +	 *
   5.162 +	 * We cannot call this from interrupts, as it may block
   5.163 +	 */
   5.164 +	spin_lock(&kmap_lock);
   5.165 +	vaddr = (unsigned long)page_address(page);
   5.166 +	if (!vaddr)
   5.167 +		vaddr = map_new_virtual(page);
   5.168 +	pkmap_count[PKMAP_NR(vaddr)]++;
   5.169 +	if (pkmap_count[PKMAP_NR(vaddr)] < 2)
   5.170 +		BUG();
   5.171 +	spin_unlock(&kmap_lock);
   5.172 +	return (void*) vaddr;
   5.173 +}
   5.174 +
   5.175 +EXPORT_SYMBOL(kmap_high);
   5.176 +
   5.177 +void fastcall kunmap_high(struct page *page)
   5.178 +{
   5.179 +	unsigned long vaddr;
   5.180 +	unsigned long nr;
   5.181 +	int need_wakeup;
   5.182 +
   5.183 +	spin_lock(&kmap_lock);
   5.184 +	vaddr = (unsigned long)page_address(page);
   5.185 +	if (!vaddr)
   5.186 +		BUG();
   5.187 +	nr = PKMAP_NR(vaddr);
   5.188 +
   5.189 +	/*
   5.190 +	 * A count must never go down to zero
   5.191 +	 * without a TLB flush!
   5.192 +	 */
   5.193 +	need_wakeup = 0;
   5.194 +	switch (--pkmap_count[nr]) {
   5.195 +	case 0:
   5.196 +		BUG();
   5.197 +	case 1:
   5.198 +		/*
   5.199 +		 * Avoid an unnecessary wake_up() function call.
   5.200 +		 * The common case is pkmap_count[] == 1, but
   5.201 +		 * no waiters.
   5.202 +		 * The tasks queued in the wait-queue are guarded
   5.203 +		 * by both the lock in the wait-queue-head and by
   5.204 +		 * the kmap_lock.  As the kmap_lock is held here,
   5.205 +		 * no need for the wait-queue-head's lock.  Simply
   5.206 +		 * test if the queue is empty.
   5.207 +		 */
   5.208 +		need_wakeup = waitqueue_active(&pkmap_map_wait);
   5.209 +	}
   5.210 +	spin_unlock(&kmap_lock);
   5.211 +
   5.212 +	/* do wake-up, if needed, race-free outside of the spin lock */
   5.213 +	if (need_wakeup)
   5.214 +		wake_up(&pkmap_map_wait);
   5.215 +}
   5.216 +
   5.217 +EXPORT_SYMBOL(kunmap_high);
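kmap_high()/kunmap_high() are normally reached through the kmap()/kunmap() wrappers (used further down in __blk_queue_bounce), which fall through to the direct mapping for lowmem pages. A hedged usage sketch, with the arch wrappers and any error handling omitted:

	/* copy the contents of a (possibly highmem) page into a lowmem buffer */
	static void copy_page_to_buf(struct page *page, void *buf)
	{
		char *vaddr = kmap(page);	/* kmap_high() if PageHighMem(page)       */
		memcpy(buf, vaddr, PAGE_SIZE);
		kunmap(page);			/* kunmap_high(): drops pkmap_count[] and */
						/* may wake pkmap_map_wait waiters        */
	}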
   5.218 +
   5.219 +#define POOL_SIZE	64
   5.220 +
   5.221 +static __init int init_emergency_pool(void)
   5.222 +{
   5.223 +	struct sysinfo i;
   5.224 +	si_meminfo(&i);
   5.225 +	si_swapinfo(&i);
   5.226 +        
   5.227 +	if (!i.totalhigh)
   5.228 +		return 0;
   5.229 +
   5.230 +	page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);
   5.231 +	if (!page_pool)
   5.232 +		BUG();
   5.233 +	printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
   5.234 +
   5.235 +	return 0;
   5.236 +}
   5.237 +
   5.238 +__initcall(init_emergency_pool);
   5.239 +
   5.240 +/*
   5.241 + * highmem version, map into vec
   5.242 + */
   5.243 +static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
   5.244 +{
   5.245 +	unsigned long flags;
   5.246 +	unsigned char *vto;
   5.247 +
   5.248 +	local_irq_save(flags);
   5.249 +	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
   5.250 +	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
   5.251 +	kunmap_atomic(vto, KM_BOUNCE_READ);
   5.252 +	local_irq_restore(flags);
   5.253 +}
   5.254 +
   5.255 +#else /* CONFIG_HIGHMEM */
   5.256 +
   5.257 +#define bounce_copy_vec(to, vfrom)	\
   5.258 +	memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
   5.259 +
   5.260 +#endif
   5.261 +
   5.262 +#define ISA_POOL_SIZE	16
   5.263 +
   5.264 +/*
   5.265 + * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
   5.266 + * as the max address, so check if the pool has already been created.
   5.267 + */
   5.268 +int init_emergency_isa_pool(void)
   5.269 +{
   5.270 +	if (isa_page_pool)
   5.271 +		return 0;
   5.272 +
   5.273 +	isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA);
   5.274 +	if (!isa_page_pool)
   5.275 +		BUG();
   5.276 +
   5.277 +	printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
   5.278 +	return 0;
   5.279 +}
   5.280 +
   5.281 +/*
   5.282 + * Simple bounce buffer support for highmem pages. Depending on the
   5.283 + * queue gfp mask set, *to may or may not be a highmem page. kmap it
   5.284 + * always; it will do the Right Thing
   5.285 + */
   5.286 +static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
   5.287 +{
   5.288 +	unsigned char *vfrom;
   5.289 +	struct bio_vec *tovec, *fromvec;
   5.290 +	int i;
   5.291 +
   5.292 +	__bio_for_each_segment(tovec, to, i, 0) {
   5.293 +		fromvec = from->bi_io_vec + i;
   5.294 +
   5.295 +		/*
   5.296 +		 * not bounced
   5.297 +		 */
   5.298 +		if (tovec->bv_page == fromvec->bv_page)
   5.299 +			continue;
   5.300 +
   5.301 +		/*
   5.302 +		 * fromvec->bv_offset and fromvec->bv_len might have been
   5.303 +		 * modified by the block layer, so use the original copy,
   5.304 +		 * bounce_copy_vec already uses tovec->bv_len
   5.305 +		 */
   5.306 +		vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
   5.307 +
   5.308 +		flush_dcache_page(tovec->bv_page);
   5.309 +		bounce_copy_vec(tovec, vfrom);
   5.310 +	}
   5.311 +}
   5.312 +
   5.313 +static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
   5.314 +{
   5.315 +	struct bio *bio_orig = bio->bi_private;
   5.316 +	struct bio_vec *bvec, *org_vec;
   5.317 +	int i;
   5.318 +
   5.319 +	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
   5.320 +		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
   5.321 +
   5.322 +	/*
   5.323 +	 * free up bounce indirect pages used
   5.324 +	 */
   5.325 +	__bio_for_each_segment(bvec, bio, i, 0) {
   5.326 +		org_vec = bio_orig->bi_io_vec + i;
   5.327 +		if (bvec->bv_page == org_vec->bv_page)
   5.328 +			continue;
   5.329 +
   5.330 +		mempool_free(bvec->bv_page, pool);	
   5.331 +	}
   5.332 +
   5.333 +	bio_endio(bio_orig, bio_orig->bi_size, err);
   5.334 +	bio_put(bio);
   5.335 +}
   5.336 +
   5.337 +static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err)
   5.338 +{
   5.339 +	if (bio->bi_size)
   5.340 +		return 1;
   5.341 +
   5.342 +	bounce_end_io(bio, page_pool, err);
   5.343 +	return 0;
   5.344 +}
   5.345 +
   5.346 +static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
   5.347 +{
   5.348 +	if (bio->bi_size)
   5.349 +		return 1;
   5.350 +
   5.351 +	bounce_end_io(bio, isa_page_pool, err);
   5.352 +	return 0;
   5.353 +}
   5.354 +
   5.355 +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
   5.356 +{
   5.357 +	struct bio *bio_orig = bio->bi_private;
   5.358 +
   5.359 +	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
   5.360 +		copy_to_high_bio_irq(bio_orig, bio);
   5.361 +
   5.362 +	bounce_end_io(bio, pool, err);
   5.363 +}
   5.364 +
   5.365 +static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
   5.366 +{
   5.367 +	if (bio->bi_size)
   5.368 +		return 1;
   5.369 +
   5.370 +	__bounce_end_io_read(bio, page_pool, err);
   5.371 +	return 0;
   5.372 +}
   5.373 +
   5.374 +static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
   5.375 +{
   5.376 +	if (bio->bi_size)
   5.377 +		return 1;
   5.378 +
   5.379 +	__bounce_end_io_read(bio, isa_page_pool, err);
   5.380 +	return 0;
   5.381 +}
   5.382 +
   5.383 +static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
   5.384 +			mempool_t *pool)
   5.385 +{
   5.386 +	struct page *page;
   5.387 +	struct bio *bio = NULL;
   5.388 +	int i, rw = bio_data_dir(*bio_orig);
   5.389 +	struct bio_vec *to, *from;
   5.390 +
   5.391 +	bio_for_each_segment(from, *bio_orig, i) {
   5.392 +		page = from->bv_page;
   5.393 +
   5.394 +		/*
   5.395 +		 * is destination page below bounce pfn?
   5.396 +		 */
   5.397 +		if (page_to_pfn(page) < q->bounce_pfn)
   5.398 +			continue;
   5.399 +
   5.400 +		/*
   5.401 +		 * irk, bounce it
   5.402 +		 */
   5.403 +		if (!bio)
   5.404 +			bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
   5.405 +
   5.406 +		to = bio->bi_io_vec + i;
   5.407 +
   5.408 +		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
   5.409 +		to->bv_len = from->bv_len;
   5.410 +		to->bv_offset = from->bv_offset;
   5.411 +
   5.412 +		if (rw == WRITE) {
   5.413 +			char *vto, *vfrom;
   5.414 +
   5.415 +			flush_dcache_page(from->bv_page);
   5.416 +			vto = page_address(to->bv_page) + to->bv_offset;
   5.417 +			vfrom = kmap(from->bv_page) + from->bv_offset;
   5.418 +			memcpy(vto, vfrom, to->bv_len);
   5.419 +			kunmap(from->bv_page);
   5.420 +		}
   5.421 +	}
   5.422 +
   5.423 +	/*
   5.424 +	 * no pages bounced
   5.425 +	 */
   5.426 +	if (!bio)
   5.427 +		return;
   5.428 +
   5.429 +	/*
   5.430 +	 * at least one page was bounced, fill in possible non-highmem
   5.431 +	 * pages
   5.432 +	 */
   5.433 +	bio_for_each_segment(from, *bio_orig, i) {
   5.434 +		to = bio_iovec_idx(bio, i);
   5.435 +		if (!to->bv_page) {
   5.436 +			to->bv_page = from->bv_page;
   5.437 +			to->bv_len = from->bv_len;
   5.438 +			to->bv_offset = from->bv_offset;
   5.439 +		}
   5.440 +	}
   5.441 +
   5.442 +	bio->bi_bdev = (*bio_orig)->bi_bdev;
   5.443 +	bio->bi_flags |= (1 << BIO_BOUNCED);
   5.444 +	bio->bi_sector = (*bio_orig)->bi_sector;
   5.445 +	bio->bi_rw = (*bio_orig)->bi_rw;
   5.446 +
   5.447 +	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
   5.448 +	bio->bi_idx = (*bio_orig)->bi_idx;
   5.449 +	bio->bi_size = (*bio_orig)->bi_size;
   5.450 +
   5.451 +	if (pool == page_pool) {
   5.452 +		bio->bi_end_io = bounce_end_io_write;
   5.453 +		if (rw == READ)
   5.454 +			bio->bi_end_io = bounce_end_io_read;
   5.455 +	} else {
   5.456 +		bio->bi_end_io = bounce_end_io_write_isa;
   5.457 +		if (rw == READ)
   5.458 +			bio->bi_end_io = bounce_end_io_read_isa;
   5.459 +	}
   5.460 +
   5.461 +	bio->bi_private = *bio_orig;
   5.462 +	*bio_orig = bio;
   5.463 +}
   5.464 +
   5.465 +void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
   5.466 +{
   5.467 +	mempool_t *pool;
   5.468 +
   5.469 +	/*
   5.470 +	 * for non-isa bounce case, just check if the bounce pfn is equal
   5.471 +	 * to or bigger than the highest pfn in the system -- in that case,
   5.472 +	 * don't waste time iterating over bio segments
   5.473 +	 */
   5.474 +	if (!(q->bounce_gfp & GFP_DMA)) {
   5.475 +		if (q->bounce_pfn >= blk_max_pfn)
   5.476 +			return;
   5.477 +		pool = page_pool;
   5.478 +	} else {
   5.479 +		BUG_ON(!isa_page_pool);
   5.480 +		pool = isa_page_pool;
   5.481 +	}
   5.482 +
   5.483 +	/*
   5.484 +	 * slow path
   5.485 +	 */
   5.486 +	__blk_queue_bounce(q, bio_orig, pool);
   5.487 +}
   5.488 +
   5.489 +EXPORT_SYMBOL(blk_queue_bounce);
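The pool choice mirrors the queue's bounce settings: a GFP_DMA bounce_gfp selects the ISA pool (pages below the 16MB ISA DMA limit), anything else uses the general highmem pool, and the whole call is a no-op once bounce_pfn covers every page in the system. A sketch of the typical call pattern, assuming the queue limits were configured earlier (e.g. via blk_queue_bounce_limit(), not shown here):

	struct bio *bio = orig_bio;
	blk_queue_bounce(q, &bio);	/* may replace 'bio' with a bounced clone     */
	/* submit 'bio'; on completion the bounce_end_io_*() handlers copy read data */
	/* back into the original pages and return the bounce pages to the mempool.  */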
   5.490 +
   5.491 +#if defined(HASHED_PAGE_VIRTUAL)
   5.492 +
   5.493 +#define PA_HASH_ORDER	7
   5.494 +
   5.495 +/*
   5.496 + * Describes one page->virtual association
   5.497 + */
   5.498 +struct page_address_map {
   5.499 +	struct page *page;
   5.500 +	void *virtual;
   5.501 +	struct list_head list;
   5.502 +};
   5.503 +
   5.504 +/*
   5.505 + * page_address_map freelist, allocated from page_address_maps.
   5.506 + */
   5.507 +static struct list_head page_address_pool;	/* freelist */
   5.508 +static spinlock_t pool_lock;			/* protects page_address_pool */
   5.509 +
   5.510 +/*
   5.511 + * Hash table bucket
   5.512 + */
   5.513 +static struct page_address_slot {
   5.514 +	struct list_head lh;			/* List of page_address_maps */
   5.515 +	spinlock_t lock;			/* Protect this bucket's list */
   5.516 +} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
   5.517 +
   5.518 +static struct page_address_slot *page_slot(struct page *page)
   5.519 +{
   5.520 +	return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
   5.521 +}
   5.522 +
   5.523 +void *page_address(struct page *page)
   5.524 +{
   5.525 +	unsigned long flags;
   5.526 +	void *ret;
   5.527 +	struct page_address_slot *pas;
   5.528 +
   5.529 +	if (!PageHighMem(page))
   5.530 +		return lowmem_page_address(page);
   5.531 +
   5.532 +	pas = page_slot(page);
   5.533 +	ret = NULL;
   5.534 +	spin_lock_irqsave(&pas->lock, flags);
   5.535 +	if (!list_empty(&pas->lh)) {
   5.536 +		struct page_address_map *pam;
   5.537 +
   5.538 +		list_for_each_entry(pam, &pas->lh, list) {
   5.539 +			if (pam->page == page) {
   5.540 +				ret = pam->virtual;
   5.541 +				goto done;
   5.542 +			}
   5.543 +		}
   5.544 +	}
   5.545 +done:
   5.546 +	spin_unlock_irqrestore(&pas->lock, flags);
   5.547 +	return ret;
   5.548 +}
   5.549 +
   5.550 +EXPORT_SYMBOL(page_address);
   5.551 +
   5.552 +void set_page_address(struct page *page, void *virtual)
   5.553 +{
   5.554 +	unsigned long flags;
   5.555 +	struct page_address_slot *pas;
   5.556 +	struct page_address_map *pam;
   5.557 +
   5.558 +	BUG_ON(!PageHighMem(page));
   5.559 +
   5.560 +	pas = page_slot(page);
   5.561 +	if (virtual) {		/* Add */
   5.562 +		BUG_ON(list_empty(&page_address_pool));
   5.563 +
   5.564 +		spin_lock_irqsave(&pool_lock, flags);
   5.565 +		pam = list_entry(page_address_pool.next,
   5.566 +				struct page_address_map, list);
   5.567 +		list_del(&pam->list);
   5.568 +		spin_unlock_irqrestore(&pool_lock, flags);
   5.569 +
   5.570 +		pam->page = page;
   5.571 +		pam->virtual = virtual;
   5.572 +
   5.573 +		spin_lock_irqsave(&pas->lock, flags);
   5.574 +		list_add_tail(&pam->list, &pas->lh);
   5.575 +		spin_unlock_irqrestore(&pas->lock, flags);
   5.576 +	} else {		/* Remove */
   5.577 +		spin_lock_irqsave(&pas->lock, flags);
   5.578 +		list_for_each_entry(pam, &pas->lh, list) {
   5.579 +			if (pam->page == page) {
   5.580 +				list_del(&pam->list);
   5.581 +				spin_unlock_irqrestore(&pas->lock, flags);
   5.582 +				spin_lock_irqsave(&pool_lock, flags);
   5.583 +				list_add_tail(&pam->list, &page_address_pool);
   5.584 +				spin_unlock_irqrestore(&pool_lock, flags);
   5.585 +				goto done;
   5.586 +			}
   5.587 +		}
   5.588 +		spin_unlock_irqrestore(&pas->lock, flags);
   5.589 +	}
   5.590 +done:
   5.591 +	return;
   5.592 +}
   5.593 +
   5.594 +static struct page_address_map page_address_maps[LAST_PKMAP];
   5.595 +
   5.596 +void __init page_address_init(void)
   5.597 +{
   5.598 +	int i;
   5.599 +
   5.600 +	INIT_LIST_HEAD(&page_address_pool);
   5.601 +	for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
   5.602 +		list_add(&page_address_maps[i].list, &page_address_pool);
   5.603 +	for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
   5.604 +		INIT_LIST_HEAD(&page_address_htable[i].lh);
   5.605 +		spin_lock_init(&page_address_htable[i].lock);
   5.606 +	}
   5.607 +	spin_lock_init(&pool_lock);
   5.608 +}
   5.609 +
   5.610 +#endif	/* defined(HASHED_PAGE_VIRTUAL) */
     6.1 --- a/linux-2.6.10-xen-sparse/mm/memory.c	Thu Mar 10 18:12:10 2005 +0000
     6.2 +++ b/linux-2.6.10-xen-sparse/mm/memory.c	Tue Mar 15 14:50:10 2005 +0000
     6.3 @@ -152,6 +152,10 @@ void clear_page_tables(struct mmu_gather
     6.4  		free_one_pgd(tlb, page_dir);
     6.5  		page_dir++;
     6.6  	} while (--nr);
     6.7 +#ifdef CONFIG_XEN_BATCH_MODE2
     6.8 +	XEN_flush_page_update_queue();
     6.9 +#endif
    6.10 +
    6.11  }
    6.12  
    6.13  pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
    6.14 @@ -326,8 +330,15 @@ skip_copy_pte_range:
    6.15  				 * in the parent and the child
    6.16  				 */
    6.17  				if (cow) {
    6.18 +#ifdef CONFIG_XEN_BATCH_MODE2
    6.19 +					/* XEN: reversed order avoids a read-after-write (RaW) hazard with batched updates */
    6.20 +					pte = *src_pte;
    6.21 +					pte = pte_wrprotect(pte);
    6.22 +					ptep_set_wrprotect(src_pte);
    6.23 +#else
    6.24  					ptep_set_wrprotect(src_pte);
    6.25  					pte = *src_pte;
    6.26 +#endif
    6.27  				}
    6.28  
    6.29  				/*
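The reordering above matters because, with a batched update queue, ptep_set_wrprotect() may only queue the write-protect: a subsequent read of *src_pte could still observe the old, writable entry, and the child would inherit a writable PTE for a page that is meant to be COW. Reading first and write-protecting the local copy keeps the copied value correct regardless of when the queued parent update is applied. Condensed, with the batching of ptep_set_wrprotect() under CONFIG_XEN_BATCH_MODE2 as the assumption:

	/* hazardous when the write is merely queued */
	ptep_set_wrprotect(src_pte);	/* queued, PTE not yet changed       */
	pte = *src_pte;			/* may still see the writable entry  */

	/* ordering used by this patch */
	pte = *src_pte;
	pte = pte_wrprotect(pte);	/* child's copy is read-only         */
	ptep_set_wrprotect(src_pte);	/* parent entry updated whenever the */
					/* queue is flushed                  */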
    6.30 @@ -1451,7 +1462,20 @@ static int do_swap_page(struct mm_struct
    6.31  	unlock_page(page);
    6.32  
    6.33  	flush_icache_page(vma, page);
    6.34 +
    6.35 +#ifdef CONFIG_XEN_BATCH_MODE2
    6.36 +	if ( likely(vma->vm_mm == current->mm) ) {
    6.37 +		XEN_flush_page_update_queue();
    6.38 +		HYPERVISOR_update_va_mapping(address, pte, 0);
    6.39 +	} else {
    6.40 +		set_pte(page_table, pte);
    6.41 +		XEN_flush_page_update_queue();
    6.42 +	}
    6.43 +#else
    6.44  	set_pte(page_table, pte);
    6.45 +#endif
    6.46 +
    6.47 +
    6.48  	page_add_anon_rmap(page, vma, address);
    6.49  
    6.50  	if (write_access) {
    6.51 @@ -1516,7 +1540,17 @@ do_anonymous_page(struct mm_struct *mm, 
    6.52  		page_add_anon_rmap(page, vma, addr);
    6.53  	}
    6.54  
    6.55 +#ifdef CONFIG_XEN_BATCH_MODE2
    6.56 +	if ( likely(vma->vm_mm == current->mm) ) {
    6.57 +		XEN_flush_page_update_queue();
    6.58 +		HYPERVISOR_update_va_mapping(addr, entry, 0);
    6.59 +	} else {
    6.60 +		set_pte(page_table, entry);
    6.61 +		XEN_flush_page_update_queue();
    6.62 +	}
    6.63 +#else
    6.64  	ptep_establish_new(vma, addr, page_table, entry);
    6.65 +#endif
    6.66  	pte_unmap(page_table);
    6.67  
    6.68  	/* No need to invalidate - it was non-present before */
    6.69 @@ -1621,7 +1655,17 @@ retry:
    6.70  		entry = mk_pte(new_page, vma->vm_page_prot);
    6.71  		if (write_access)
    6.72  			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
    6.73 +#ifdef CONFIG_XEN_BATCH_MODE2
    6.74 +		if ( likely(vma->vm_mm == current->mm) ) {
    6.75 +			XEN_flush_page_update_queue();
    6.76 +			HYPERVISOR_update_va_mapping(address, entry, 0);
    6.77 +		} else {
    6.78 +			set_pte(page_table, entry);
    6.79 +			XEN_flush_page_update_queue();
    6.80 +		}
    6.81 +#else
    6.82  		ptep_establish_new(vma, address, page_table, entry);
    6.83 +#endif
    6.84  		if (anon) {
    6.85  			lru_cache_add_active(new_page);
    6.86  			page_add_anon_rmap(new_page, vma, address);
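The same pattern is applied at all three fault paths touched above (do_swap_page, do_anonymous_page and the do_no_page hunk): when the fault is against the current address space, the PTE is installed by virtual address with HYPERVISOR_update_va_mapping(), after first draining the update queue so that earlier queued writes cannot land after the direct hypercall; for a foreign mm (e.g. a fault driven through get_user_pages()) the code falls back to set_pte() followed by an explicit flush, since the by-virtual-address hypercall only acts on the currently loaded page tables. Condensed into one illustrative fragment (not a helper introduced by the patch):

	#ifdef CONFIG_XEN_BATCH_MODE2
		if ( likely(vma->vm_mm == current->mm) ) {
			XEN_flush_page_update_queue();	/* keep queued updates ordered  */
			HYPERVISOR_update_va_mapping(address, entry, 0);
		} else {
			set_pte(page_table, entry);	/* foreign mm: write the PTE... */
			XEN_flush_page_update_queue();	/* ...and push it out now       */
		}
	#endif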
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/linux-2.6.10-xen-sparse/mm/swapfile.c	Tue Mar 15 14:50:10 2005 +0000
     7.3 @@ -0,0 +1,1711 @@
     7.4 +/*
     7.5 + *  linux/mm/swapfile.c
     7.6 + *
     7.7 + *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
     7.8 + *  Swap reorganised 29.12.95, Stephen Tweedie
     7.9 + */
    7.10 +
    7.11 +#include <linux/config.h>
    7.12 +#include <linux/mm.h>
    7.13 +#include <linux/hugetlb.h>
    7.14 +#include <linux/mman.h>
    7.15 +#include <linux/slab.h>
    7.16 +#include <linux/kernel_stat.h>
    7.17 +#include <linux/swap.h>
    7.18 +#include <linux/vmalloc.h>
    7.19 +#include <linux/pagemap.h>
    7.20 +#include <linux/namei.h>
    7.21 +#include <linux/shm.h>
    7.22 +#include <linux/blkdev.h>
    7.23 +#include <linux/writeback.h>
    7.24 +#include <linux/proc_fs.h>
    7.25 +#include <linux/seq_file.h>
    7.26 +#include <linux/init.h>
    7.27 +#include <linux/module.h>
    7.28 +#include <linux/rmap.h>
    7.29 +#include <linux/security.h>
    7.30 +#include <linux/backing-dev.h>
    7.31 +#include <linux/syscalls.h>
    7.32 +
    7.33 +#include <asm/pgtable.h>
    7.34 +#include <asm/tlbflush.h>
    7.35 +#include <linux/swapops.h>
    7.36 +
    7.37 +spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
    7.38 +unsigned int nr_swapfiles;
    7.39 +long total_swap_pages;
    7.40 +static int swap_overflow;
    7.41 +
    7.42 +EXPORT_SYMBOL(total_swap_pages);
    7.43 +
    7.44 +static const char Bad_file[] = "Bad swap file entry ";
    7.45 +static const char Unused_file[] = "Unused swap file entry ";
    7.46 +static const char Bad_offset[] = "Bad swap offset entry ";
    7.47 +static const char Unused_offset[] = "Unused swap offset entry ";
    7.48 +
    7.49 +struct swap_list_t swap_list = {-1, -1};
    7.50 +
    7.51 +struct swap_info_struct swap_info[MAX_SWAPFILES];
    7.52 +
    7.53 +static DECLARE_MUTEX(swapon_sem);
    7.54 +
    7.55 +/*
    7.56 + * We need this because the bdev->unplug_fn can sleep and we cannot
    7.57 + * hold swap_list_lock while calling the unplug_fn. And swap_list_lock
    7.58 + * cannot be turned into a semaphore.
    7.59 + */
    7.60 +static DECLARE_RWSEM(swap_unplug_sem);
    7.61 +
    7.62 +#define SWAPFILE_CLUSTER 256
    7.63 +
    7.64 +void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
    7.65 +{
    7.66 +	swp_entry_t entry;
    7.67 +
    7.68 +	down_read(&swap_unplug_sem);
    7.69 +	entry.val = page->private;
    7.70 +	if (PageSwapCache(page)) {
    7.71 +		struct block_device *bdev = swap_info[swp_type(entry)].bdev;
    7.72 +		struct backing_dev_info *bdi;
    7.73 +
    7.74 +		/*
    7.75 +		 * If the page is removed from swapcache from under us (with a
    7.76 +		 * racy try_to_unuse/swapoff) we need an additional reference
    7.77 +		 * count to avoid reading garbage from page->private above. If
    7.78 +		 * the WARN_ON triggers during a swapoff it maybe the race
    7.79 +		 * condition and it's harmless. However if it triggers without
    7.80 +		 * swapoff it signals a problem.
    7.81 +		 */
    7.82 +		WARN_ON(page_count(page) <= 1);
    7.83 +
    7.84 +		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
    7.85 +		bdi->unplug_io_fn(bdi, page);
    7.86 +	}
    7.87 +	up_read(&swap_unplug_sem);
    7.88 +}
    7.89 +
    7.90 +static inline int scan_swap_map(struct swap_info_struct *si)
    7.91 +{
    7.92 +	unsigned long offset;
    7.93 +	/* 
    7.94 +	 * We try to cluster swap pages by allocating them
    7.95 +	 * sequentially in swap.  Once we've allocated
    7.96 +	 * SWAPFILE_CLUSTER pages this way, however, we resort to
    7.97 +	 * first-free allocation, starting a new cluster.  This
    7.98 +	 * prevents us from scattering swap pages all over the entire
    7.99 +	 * swap partition, so that we reduce overall disk seek times
   7.100 +	 * between swap pages.  -- sct */
   7.101 +	if (si->cluster_nr) {
   7.102 +		while (si->cluster_next <= si->highest_bit) {
   7.103 +			offset = si->cluster_next++;
   7.104 +			if (si->swap_map[offset])
   7.105 +				continue;
   7.106 +			si->cluster_nr--;
   7.107 +			goto got_page;
   7.108 +		}
   7.109 +	}
   7.110 +	si->cluster_nr = SWAPFILE_CLUSTER;
   7.111 +
   7.112 +	/* try to find an empty (even not aligned) cluster. */
   7.113 +	offset = si->lowest_bit;
   7.114 + check_next_cluster:
   7.115 +	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
   7.116 +	{
   7.117 +		unsigned long nr;
   7.118 +		for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
   7.119 +			if (si->swap_map[nr])
   7.120 +			{
   7.121 +				offset = nr+1;
   7.122 +				goto check_next_cluster;
   7.123 +			}
   7.124 +		/* We found a completely empty cluster, so start
   7.125 +		 * using it.
   7.126 +		 */
   7.127 +		goto got_page;
   7.128 +	}
   7.129 +	/* No luck, so now go fine-grained as usual. -Andrea */
   7.130 +	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
   7.131 +		if (si->swap_map[offset])
   7.132 +			continue;
   7.133 +		si->lowest_bit = offset+1;
   7.134 +	got_page:
   7.135 +		if (offset == si->lowest_bit)
   7.136 +			si->lowest_bit++;
   7.137 +		if (offset == si->highest_bit)
   7.138 +			si->highest_bit--;
   7.139 +		if (si->lowest_bit > si->highest_bit) {
   7.140 +			si->lowest_bit = si->max;
   7.141 +			si->highest_bit = 0;
   7.142 +		}
   7.143 +		si->swap_map[offset] = 1;
   7.144 +		si->inuse_pages++;
   7.145 +		nr_swap_pages--;
   7.146 +		si->cluster_next = offset+1;
   7.147 +		return offset;
   7.148 +	}
   7.149 +	si->lowest_bit = si->max;
   7.150 +	si->highest_bit = 0;
   7.151 +	return 0;
   7.152 +}
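Concretely, with SWAPFILE_CLUSTER == 256: once a completely free 256-slot run is found (say it starts at offset 512), allocations are served sequentially from 512 upwards through the cluster_next fast path until cluster_nr drops to zero; the allocator then looks for another fully free run, and only if none exists does it fall back to the first-free scan over [lowest_bit, highest_bit]. Pages swapped out together therefore stay close on disk, which is what cuts seek time on swap-in.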
   7.153 +
   7.154 +swp_entry_t get_swap_page(void)
   7.155 +{
   7.156 +	struct swap_info_struct * p;
   7.157 +	unsigned long offset;
   7.158 +	swp_entry_t entry;
   7.159 +	int type, wrapped = 0;
   7.160 +
   7.161 +	entry.val = 0;	/* Out of memory */
   7.162 +	swap_list_lock();
   7.163 +	type = swap_list.next;
   7.164 +	if (type < 0)
   7.165 +		goto out;
   7.166 +	if (nr_swap_pages <= 0)
   7.167 +		goto out;
   7.168 +
   7.169 +	while (1) {
   7.170 +		p = &swap_info[type];
   7.171 +		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
   7.172 +			swap_device_lock(p);
   7.173 +			offset = scan_swap_map(p);
   7.174 +			swap_device_unlock(p);
   7.175 +			if (offset) {
   7.176 +				entry = swp_entry(type,offset);
   7.177 +				type = swap_info[type].next;
   7.178 +				if (type < 0 ||
   7.179 +					p->prio != swap_info[type].prio) {
   7.180 +						swap_list.next = swap_list.head;
   7.181 +				} else {
   7.182 +					swap_list.next = type;
   7.183 +				}
   7.184 +				goto out;
   7.185 +			}
   7.186 +		}
   7.187 +		type = p->next;
   7.188 +		if (!wrapped) {
   7.189 +			if (type < 0 || p->prio != swap_info[type].prio) {
   7.190 +				type = swap_list.head;
   7.191 +				wrapped = 1;
   7.192 +			}
   7.193 +		} else
   7.194 +			if (type < 0)
   7.195 +				goto out;	/* out of swap space */
   7.196 +	}
   7.197 +out:
   7.198 +	swap_list_unlock();
   7.199 +	return entry;
   7.200 +}
   7.201 +
   7.202 +static struct swap_info_struct * swap_info_get(swp_entry_t entry)
   7.203 +{
   7.204 +	struct swap_info_struct * p;
   7.205 +	unsigned long offset, type;
   7.206 +
   7.207 +	if (!entry.val)
   7.208 +		goto out;
   7.209 +	type = swp_type(entry);
   7.210 +	if (type >= nr_swapfiles)
   7.211 +		goto bad_nofile;
   7.212 +	p = & swap_info[type];
   7.213 +	if (!(p->flags & SWP_USED))
   7.214 +		goto bad_device;
   7.215 +	offset = swp_offset(entry);
   7.216 +	if (offset >= p->max)
   7.217 +		goto bad_offset;
   7.218 +	if (!p->swap_map[offset])
   7.219 +		goto bad_free;
   7.220 +	swap_list_lock();
   7.221 +	if (p->prio > swap_info[swap_list.next].prio)
   7.222 +		swap_list.next = type;
   7.223 +	swap_device_lock(p);
   7.224 +	return p;
   7.225 +
   7.226 +bad_free:
   7.227 +	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
   7.228 +	goto out;
   7.229 +bad_offset:
   7.230 +	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
   7.231 +	goto out;
   7.232 +bad_device:
   7.233 +	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
   7.234 +	goto out;
   7.235 +bad_nofile:
   7.236 +	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
   7.237 +out:
   7.238 +	return NULL;
   7.239 +}	
   7.240 +
   7.241 +static void swap_info_put(struct swap_info_struct * p)
   7.242 +{
   7.243 +	swap_device_unlock(p);
   7.244 +	swap_list_unlock();
   7.245 +}
   7.246 +
   7.247 +static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
   7.248 +{
   7.249 +	int count = p->swap_map[offset];
   7.250 +
   7.251 +	if (count < SWAP_MAP_MAX) {
   7.252 +		count--;
   7.253 +		p->swap_map[offset] = count;
   7.254 +		if (!count) {
   7.255 +			if (offset < p->lowest_bit)
   7.256 +				p->lowest_bit = offset;
   7.257 +			if (offset > p->highest_bit)
   7.258 +				p->highest_bit = offset;
   7.259 +			nr_swap_pages++;
   7.260 +			p->inuse_pages--;
   7.261 +		}
   7.262 +	}
   7.263 +	return count;
   7.264 +}
   7.265 +
   7.266 +/*
   7.267 + * Caller has made sure that the swapdevice corresponding to entry
   7.268 + * is still around or has not been recycled.
   7.269 + */
   7.270 +void swap_free(swp_entry_t entry)
   7.271 +{
   7.272 +	struct swap_info_struct * p;
   7.273 +
   7.274 +	p = swap_info_get(entry);
   7.275 +	if (p) {
   7.276 +		swap_entry_free(p, swp_offset(entry));
   7.277 +		swap_info_put(p);
   7.278 +	}
   7.279 +}
   7.280 +
   7.281 +/*
   7.282 + * Check if we're the only user of a swap page,
   7.283 + * when the page is locked.
   7.284 + */
   7.285 +static int exclusive_swap_page(struct page *page)
   7.286 +{
   7.287 +	int retval = 0;
   7.288 +	struct swap_info_struct * p;
   7.289 +	swp_entry_t entry;
   7.290 +
   7.291 +	entry.val = page->private;
   7.292 +	p = swap_info_get(entry);
   7.293 +	if (p) {
   7.294 +		/* Is the only swap cache user the cache itself? */
   7.295 +		if (p->swap_map[swp_offset(entry)] == 1) {
   7.296 +			/* Recheck the page count with the swapcache lock held.. */
   7.297 +			spin_lock_irq(&swapper_space.tree_lock);
   7.298 +			if (page_count(page) == 2)
   7.299 +				retval = 1;
   7.300 +			spin_unlock_irq(&swapper_space.tree_lock);
   7.301 +		}
   7.302 +		swap_info_put(p);
   7.303 +	}
   7.304 +	return retval;
   7.305 +}
   7.306 +
   7.307 +/*
   7.308 + * We can use this swap cache entry directly
   7.309 + * if there are no other references to it.
   7.310 + *
   7.311 + * Here "exclusive_swap_page()" does the real
   7.312 + * work, but we opportunistically check whether
   7.313 + * we need to get all the locks first..
   7.314 + */
   7.315 +int can_share_swap_page(struct page *page)
   7.316 +{
   7.317 +	int retval = 0;
   7.318 +
   7.319 +	if (!PageLocked(page))
   7.320 +		BUG();
   7.321 +	switch (page_count(page)) {
   7.322 +	case 3:
   7.323 +		if (!PagePrivate(page))
   7.324 +			break;
   7.325 +		/* Fallthrough */
   7.326 +	case 2:
   7.327 +		if (!PageSwapCache(page))
   7.328 +			break;
   7.329 +		retval = exclusive_swap_page(page);
   7.330 +		break;
   7.331 +	case 1:
   7.332 +		if (PageReserved(page))
   7.333 +			break;
   7.334 +		retval = 1;
   7.335 +	}
   7.336 +	return retval;
   7.337 +}
   7.338 +
   7.339 +/*
   7.340 + * Work out if there are any other processes sharing this
   7.341 + * swap cache page. Free it if you can. Return success.
   7.342 + */
   7.343 +int remove_exclusive_swap_page(struct page *page)
   7.344 +{
   7.345 +	int retval;
   7.346 +	struct swap_info_struct * p;
   7.347 +	swp_entry_t entry;
   7.348 +
   7.349 +	BUG_ON(PagePrivate(page));
   7.350 +	BUG_ON(!PageLocked(page));
   7.351 +
   7.352 +	if (!PageSwapCache(page))
   7.353 +		return 0;
   7.354 +	if (PageWriteback(page))
   7.355 +		return 0;
   7.356 +	if (page_count(page) != 2) /* 2: us + cache */
   7.357 +		return 0;
   7.358 +
   7.359 +	entry.val = page->private;
   7.360 +	p = swap_info_get(entry);
   7.361 +	if (!p)
   7.362 +		return 0;
   7.363 +
   7.364 +	/* Is the only swap cache user the cache itself? */
   7.365 +	retval = 0;
   7.366 +	if (p->swap_map[swp_offset(entry)] == 1) {
   7.367 +		/* Recheck the page count with the swapcache lock held.. */
   7.368 +		spin_lock_irq(&swapper_space.tree_lock);
   7.369 +		if ((page_count(page) == 2) && !PageWriteback(page)) {
   7.370 +			__delete_from_swap_cache(page);
   7.371 +			SetPageDirty(page);
   7.372 +			retval = 1;
   7.373 +		}
   7.374 +		spin_unlock_irq(&swapper_space.tree_lock);
   7.375 +	}
   7.376 +	swap_info_put(p);
   7.377 +
   7.378 +	if (retval) {
   7.379 +		swap_free(entry);
   7.380 +		page_cache_release(page);
   7.381 +	}
   7.382 +
   7.383 +	return retval;
   7.384 +}
   7.385 +
   7.386 +/*
   7.387 + * Free the swap entry like above, but also try to
   7.388 + * free the page cache entry if it is the last user.
   7.389 + */
   7.390 +void free_swap_and_cache(swp_entry_t entry)
   7.391 +{
   7.392 +	struct swap_info_struct * p;
   7.393 +	struct page *page = NULL;
   7.394 +
   7.395 +	p = swap_info_get(entry);
   7.396 +	if (p) {
   7.397 +		if (swap_entry_free(p, swp_offset(entry)) == 1) {
   7.398 +			spin_lock_irq(&swapper_space.tree_lock);
   7.399 +			page = radix_tree_lookup(&swapper_space.page_tree,
   7.400 +				entry.val);
   7.401 +			if (page && TestSetPageLocked(page))
   7.402 +				page = NULL;
   7.403 +			spin_unlock_irq(&swapper_space.tree_lock);
   7.404 +		}
   7.405 +		swap_info_put(p);
   7.406 +	}
   7.407 +	if (page) {
   7.408 +		int one_user;
   7.409 +
   7.410 +		BUG_ON(PagePrivate(page));
   7.411 +		page_cache_get(page);
   7.412 +		one_user = (page_count(page) == 2);
   7.413 +		/* Only cache user (+us), or swap space full? Free it! */
   7.414 +		if (!PageWriteback(page) && (one_user || vm_swap_full())) {
   7.415 +			delete_from_swap_cache(page);
   7.416 +			SetPageDirty(page);
   7.417 +		}
   7.418 +		unlock_page(page);
   7.419 +		page_cache_release(page);
   7.420 +	}
   7.421 +}
   7.422 +
   7.423 +/*
   7.424 + * The swap entry has been read in advance, and we return 1 to indicate
   7.425 + * that the page has been used or is no longer needed.
   7.426 + *
   7.427 + * Always set the resulting pte to be nowrite (the same as COW pages
   7.428 + * after one process has exited).  We don't know just how many PTEs will
   7.429 + * share this swap entry, so be cautious and let do_wp_page work out
   7.430 + * what to do if a write is requested later.
   7.431 + */
   7.432 +/* vma->vm_mm->page_table_lock is held */
   7.433 +static void
   7.434 +unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
   7.435 +	swp_entry_t entry, struct page *page)
   7.436 +{
   7.437 +	vma->vm_mm->rss++;
   7.438 +	get_page(page);
   7.439 +	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
   7.440 +	page_add_anon_rmap(page, vma, address);
   7.441 +	swap_free(entry);
   7.442 +}
   7.443 +
   7.444 +/* vma->vm_mm->page_table_lock is held */
   7.445 +static unsigned long unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
   7.446 +	unsigned long address, unsigned long size, unsigned long offset,
   7.447 +	swp_entry_t entry, struct page *page)
   7.448 +{
   7.449 +	pte_t * pte;
   7.450 +	unsigned long end;
   7.451 +	pte_t swp_pte = swp_entry_to_pte(entry);
   7.452 +
   7.453 +	if (pmd_none(*dir))
   7.454 +		return 0;
   7.455 +	if (pmd_bad(*dir)) {
   7.456 +		pmd_ERROR(*dir);
   7.457 +		pmd_clear(dir);
   7.458 +		return 0;
   7.459 +	}
   7.460 +	pte = pte_offset_map(dir, address);
   7.461 +	offset += address & PMD_MASK;
   7.462 +	address &= ~PMD_MASK;
   7.463 +	end = address + size;
   7.464 +	if (end > PMD_SIZE)
   7.465 +		end = PMD_SIZE;
   7.466 +	do {
   7.467 +		/*
   7.468 +		 * swapoff spends a _lot_ of time in this loop!
   7.469 +		 * Test inline before going to call unuse_pte.
   7.470 +		 */
   7.471 +		if (unlikely(pte_same(*pte, swp_pte))) {
   7.472 +			unuse_pte(vma, offset + address, pte, entry, page);
   7.473 +			pte_unmap(pte);
   7.474 +
   7.475 +			/*
   7.476 +			 * Move the page to the active list so it is not
   7.477 +			 * immediately swapped out again after swapon.
   7.478 +			 */
   7.479 +			activate_page(page);
   7.480 +
   7.481 +			/* add 1 since address may be 0 */
   7.482 +			return 1 + offset + address;
   7.483 +		}
   7.484 +		address += PAGE_SIZE;
   7.485 +		pte++;
   7.486 +	} while (address && (address < end));
   7.487 +	pte_unmap(pte - 1);
   7.488 +	return 0;
   7.489 +}
   7.490 +
   7.491 +/* vma->vm_mm->page_table_lock is held */
   7.492 +static unsigned long unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
   7.493 +	unsigned long address, unsigned long size,
   7.494 +	swp_entry_t entry, struct page *page)
   7.495 +{
   7.496 +	pmd_t * pmd;
   7.497 +	unsigned long offset, end;
   7.498 +	unsigned long foundaddr;
   7.499 +
   7.500 +	if (pgd_none(*dir))
   7.501 +		return 0;
   7.502 +	if (pgd_bad(*dir)) {
   7.503 +		pgd_ERROR(*dir);
   7.504 +		pgd_clear(dir);
   7.505 +		return 0;
   7.506 +	}
   7.507 +	pmd = pmd_offset(dir, address);
   7.508 +	offset = address & PGDIR_MASK;
   7.509 +	address &= ~PGDIR_MASK;
   7.510 +	end = address + size;
   7.511 +	if (end > PGDIR_SIZE)
   7.512 +		end = PGDIR_SIZE;
   7.513 +	if (address >= end)
   7.514 +		BUG();
   7.515 +	do {
   7.516 +		foundaddr = unuse_pmd(vma, pmd, address, end - address,
   7.517 +						offset, entry, page);
   7.518 +		if (foundaddr)
   7.519 +			return foundaddr;
   7.520 +		address = (address + PMD_SIZE) & PMD_MASK;
   7.521 +		pmd++;
   7.522 +	} while (address && (address < end));
   7.523 +	return 0;
   7.524 +}
   7.525 +
   7.526 +/* vma->vm_mm->page_table_lock is held */
   7.527 +static unsigned long unuse_vma(struct vm_area_struct * vma,
   7.528 +	swp_entry_t entry, struct page *page)
   7.529 +{
   7.530 +	pgd_t *pgdir;
   7.531 +	unsigned long start, end;
   7.532 +	unsigned long foundaddr;
   7.533 +
   7.534 +	if (page->mapping) {
   7.535 +		start = page_address_in_vma(page, vma);
   7.536 +		if (start == -EFAULT)
   7.537 +			return 0;
   7.538 +		else
   7.539 +			end = start + PAGE_SIZE;
   7.540 +	} else {
   7.541 +		start = vma->vm_start;
   7.542 +		end = vma->vm_end;
   7.543 +	}
   7.544 +	pgdir = pgd_offset(vma->vm_mm, start);
   7.545 +	do {
   7.546 +		foundaddr = unuse_pgd(vma, pgdir, start, end - start,
   7.547 +						entry, page);
   7.548 +		if (foundaddr)
   7.549 +			return foundaddr;
   7.550 +		start = (start + PGDIR_SIZE) & PGDIR_MASK;
   7.551 +		pgdir++;
   7.552 +	} while (start && (start < end));
   7.553 +	return 0;
   7.554 +}
   7.555 +
   7.556 +static int unuse_process(struct mm_struct * mm,
   7.557 +			swp_entry_t entry, struct page* page)
   7.558 +{
   7.559 +	struct vm_area_struct* vma;
   7.560 +	unsigned long foundaddr = 0;
   7.561 +
   7.562 +	/*
   7.563 +	 * Go through process' page directory.
   7.564 +	 */
   7.565 +	if (!down_read_trylock(&mm->mmap_sem)) {
   7.566 +		/*
   7.567 +		 * Our reference to the page stops try_to_unmap_one from
   7.568 +		 * unmapping its ptes, so swapoff can make progress.
   7.569 +		 */
   7.570 +		unlock_page(page);
   7.571 +		down_read(&mm->mmap_sem);
   7.572 +		lock_page(page);
   7.573 +	}
   7.574 +	spin_lock(&mm->page_table_lock);
   7.575 +	for (vma = mm->mmap; vma; vma = vma->vm_next) {
   7.576 +		if (vma->anon_vma) {
   7.577 +			foundaddr = unuse_vma(vma, entry, page);
   7.578 +			if (foundaddr)
   7.579 +				break;
   7.580 +		}
   7.581 +	}
   7.582 +#ifdef CONFIG_XEN_BATCH_MODE2
   7.583 +	XEN_flush_page_update_queue();
   7.584 +#endif
   7.585 +	spin_unlock(&mm->page_table_lock);
   7.586 +	up_read(&mm->mmap_sem);
   7.587 +	/*
   7.588 +	 * Currently unuse_process cannot fail, but leave error handling
   7.589 +	 * at call sites for now, since we change it from time to time.
   7.590 +	 */
   7.591 +	return 0;
   7.592 +}
   7.593 +
   7.594 +/*
   7.595 + * Scan swap_map from current position to next entry still in use.
   7.596 + * Recycle to start on reaching the end, returning 0 when empty.
   7.597 + */
   7.598 +static int find_next_to_unuse(struct swap_info_struct *si, int prev)
   7.599 +{
   7.600 +	int max = si->max;
   7.601 +	int i = prev;
   7.602 +	int count;
   7.603 +
   7.604 +	/*
   7.605 +	 * No need for swap_device_lock(si) here: we're just looking
   7.606 +	 * for whether an entry is in use, not modifying it; false
   7.607 +	 * hits are okay, and sys_swapoff() has already prevented new
   7.608 +	 * allocations from this area (while holding swap_list_lock()).
   7.609 +	 */
   7.610 +	for (;;) {
   7.611 +		if (++i >= max) {
   7.612 +			if (!prev) {
   7.613 +				i = 0;
   7.614 +				break;
   7.615 +			}
   7.616 +			/*
   7.617 +			 * No entries in use at top of swap_map,
   7.618 +			 * loop back to start and recheck there.
   7.619 +			 */
   7.620 +			max = prev + 1;
   7.621 +			prev = 0;
   7.622 +			i = 1;
   7.623 +		}
   7.624 +		count = si->swap_map[i];
   7.625 +		if (count && count != SWAP_MAP_BAD)
   7.626 +			break;
   7.627 +	}
   7.628 +	return i;
   7.629 +}
   7.630 +
   7.631 +/*
   7.632 + * We completely avoid races by reading each swap page in advance,
   7.633 + * and then search for the process using it.  All the necessary
   7.634 + * page table adjustments can then be made atomically.
   7.635 + */
   7.636 +static int try_to_unuse(unsigned int type)
   7.637 +{
   7.638 +	struct swap_info_struct * si = &swap_info[type];
   7.639 +	struct mm_struct *start_mm;
   7.640 +	unsigned short *swap_map;
   7.641 +	unsigned short swcount;
   7.642 +	struct page *page;
   7.643 +	swp_entry_t entry;
   7.644 +	int i = 0;
   7.645 +	int retval = 0;
   7.646 +	int reset_overflow = 0;
   7.647 +	int shmem;
   7.648 +
   7.649 +	/*
   7.650 +	 * When searching mms for an entry, a good strategy is to
   7.651 +	 * start at the first mm we freed the previous entry from
   7.652 +	 * (though actually we don't notice whether we or coincidence
   7.653 +	 * freed the entry).  Initialize this start_mm with a hold.
   7.654 +	 *
   7.655 +	 * A simpler strategy would be to start at the last mm we
   7.656 +	 * freed the previous entry from; but that would take less
   7.657 +	 * advantage of mmlist ordering, which clusters forked mms
   7.658 +	 * together, child after parent.  If we race with dup_mmap(), we
   7.659 +	 * prefer to resolve parent before child, lest we miss entries
   7.660 +	 * duplicated after we scanned child: using last mm would invert
   7.661 +	 * that.  Though it's only a serious concern when an overflowed
   7.662 +	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
   7.663 +	 */
   7.664 +	start_mm = &init_mm;
   7.665 +	atomic_inc(&init_mm.mm_users);
   7.666 +
   7.667 +	/*
   7.668 +	 * Keep on scanning until all entries have gone.  Usually,
   7.669 +	 * one pass through swap_map is enough, but not necessarily:
   7.670 +	 * there are races when an instance of an entry might be missed.
   7.671 +	 */
   7.672 +	while ((i = find_next_to_unuse(si, i)) != 0) {
   7.673 +		if (signal_pending(current)) {
   7.674 +			retval = -EINTR;
   7.675 +			break;
   7.676 +		}
   7.677 +
   7.678 +		/* 
   7.679 +		 * Get a page for the entry, using the existing swap
   7.680 +		 * cache page if there is one.  Otherwise, get a clean
   7.681 +		 * page and read the swap into it. 
   7.682 +		 */
   7.683 +		swap_map = &si->swap_map[i];
   7.684 +		entry = swp_entry(type, i);
   7.685 +		page = read_swap_cache_async(entry, NULL, 0);
   7.686 +		if (!page) {
   7.687 +			/*
   7.688 +			 * Either swap_duplicate() failed because entry
   7.689 +			 * has been freed independently, and will not be
   7.690 +			 * reused since sys_swapoff() already disabled
   7.691 +			 * allocation from here, or alloc_page() failed.
   7.692 +			 */
   7.693 +			if (!*swap_map)
   7.694 +				continue;
   7.695 +			retval = -ENOMEM;
   7.696 +			break;
   7.697 +		}
   7.698 +
   7.699 +		/*
   7.700 +		 * Don't hold on to start_mm if it looks like exiting.
   7.701 +		 */
   7.702 +		if (atomic_read(&start_mm->mm_users) == 1) {
   7.703 +			mmput(start_mm);
   7.704 +			start_mm = &init_mm;
   7.705 +			atomic_inc(&init_mm.mm_users);
   7.706 +		}
   7.707 +
   7.708 +		/*
   7.709 +		 * Wait for and lock page.  When do_swap_page races with
   7.710 +		 * try_to_unuse, do_swap_page can handle the fault much
   7.711 +		 * faster than try_to_unuse can locate the entry.  This
   7.712 +		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
   7.713 +		 * defer to do_swap_page in such a case - in some tests,
   7.714 +		 * do_swap_page and try_to_unuse repeatedly compete.
   7.715 +		 */
   7.716 +		wait_on_page_locked(page);
   7.717 +		wait_on_page_writeback(page);
   7.718 +		lock_page(page);
   7.719 +		wait_on_page_writeback(page);
   7.720 +
   7.721 +		/*
   7.722 +		 * Remove all references to entry.
   7.723 +		 * Whenever we reach init_mm, there's no address space
   7.724 +		 * to search, but use it as a reminder to search shmem.
   7.725 +		 */
   7.726 +		shmem = 0;
   7.727 +		swcount = *swap_map;
   7.728 +		if (swcount > 1) {
   7.729 +			if (start_mm == &init_mm)
   7.730 +				shmem = shmem_unuse(entry, page);
   7.731 +			else
   7.732 +				retval = unuse_process(start_mm, entry, page);
   7.733 +		}
   7.734 +		if (*swap_map > 1) {
   7.735 +			int set_start_mm = (*swap_map >= swcount);
   7.736 +			struct list_head *p = &start_mm->mmlist;
   7.737 +			struct mm_struct *new_start_mm = start_mm;
   7.738 +			struct mm_struct *prev_mm = start_mm;
   7.739 +			struct mm_struct *mm;
   7.740 +
   7.741 +			atomic_inc(&new_start_mm->mm_users);
   7.742 +			atomic_inc(&prev_mm->mm_users);
   7.743 +			spin_lock(&mmlist_lock);
   7.744 +			while (*swap_map > 1 && !retval &&
   7.745 +					(p = p->next) != &start_mm->mmlist) {
   7.746 +				mm = list_entry(p, struct mm_struct, mmlist);
   7.747 +				if (atomic_inc_return(&mm->mm_users) == 1) {
   7.748 +					atomic_dec(&mm->mm_users);
   7.749 +					continue;
   7.750 +				}
   7.751 +				spin_unlock(&mmlist_lock);
   7.752 +				mmput(prev_mm);
   7.753 +				prev_mm = mm;
   7.754 +
   7.755 +				cond_resched();
   7.756 +
   7.757 +				swcount = *swap_map;
   7.758 +				if (swcount <= 1)
   7.759 +					;
   7.760 +				else if (mm == &init_mm) {
   7.761 +					set_start_mm = 1;
   7.762 +					shmem = shmem_unuse(entry, page);
   7.763 +				} else
   7.764 +					retval = unuse_process(mm, entry, page);
   7.765 +				if (set_start_mm && *swap_map < swcount) {
   7.766 +					mmput(new_start_mm);
   7.767 +					atomic_inc(&mm->mm_users);
   7.768 +					new_start_mm = mm;
   7.769 +					set_start_mm = 0;
   7.770 +				}
   7.771 +				spin_lock(&mmlist_lock);
   7.772 +			}
   7.773 +			spin_unlock(&mmlist_lock);
   7.774 +			mmput(prev_mm);
   7.775 +			mmput(start_mm);
   7.776 +			start_mm = new_start_mm;
   7.777 +		}
   7.778 +		if (retval) {
   7.779 +			unlock_page(page);
   7.780 +			page_cache_release(page);
   7.781 +			break;
   7.782 +		}
   7.783 +
   7.784 +		/*
   7.785 +		 * How could swap count reach 0x7fff when the maximum
   7.786 +		 * pid is 0x7fff, and there's no way to repeat a swap
   7.787 +		 * page within an mm (except in shmem, where it's the
   7.788 +		 * shared object which takes the reference count)?
   7.789 +		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
   7.790 +		 *
   7.791 +		 * If that's wrong, then we should worry more about
   7.792 +		 * exit_mmap() and do_munmap() cases described above:
   7.793 +		 * we might be resetting SWAP_MAP_MAX too early here.
   7.794 +		 * We know "Undead"s can happen, they're okay, so don't
   7.795 +		 * report them; but do report if we reset SWAP_MAP_MAX.
   7.796 +		 */
   7.797 +		if (*swap_map == SWAP_MAP_MAX) {
   7.798 +			swap_device_lock(si);
   7.799 +			*swap_map = 1;
   7.800 +			swap_device_unlock(si);
   7.801 +			reset_overflow = 1;
   7.802 +		}
   7.803 +
   7.804 +		/*
   7.805 +		 * If a reference remains (rare), we would like to leave
   7.806 +		 * the page in the swap cache; but try_to_unmap could
   7.807 +		 * then re-duplicate the entry once we drop page lock,
   7.808 +		 * so we might loop indefinitely; also, that page could
   7.809 +		 * not be swapped out to other storage meanwhile.  So:
   7.810 +		 * delete from cache even if there's another reference,
   7.811 +		 * after ensuring that the data has been saved to disk -
   7.812 +		 * since if the reference remains (rarer), it will be
   7.813 +		 * read from disk into another page.  Splitting into two
   7.814 +		 * pages would be incorrect if swap supported "shared
   7.815 +		 * private" pages, but they are handled by tmpfs files.
   7.816 +		 *
   7.817 +		 * Note shmem_unuse already deleted a swappage from
   7.818 +		 * the swap cache, unless the move to filepage failed:
   7.819 +		 * in which case it left swappage in cache, lowered its
   7.820 +		 * swap count to pass quickly through the loops above,
   7.821 +		 * and now we must reincrement count to try again later.
   7.822 +		 */
   7.823 +		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
   7.824 +			struct writeback_control wbc = {
   7.825 +				.sync_mode = WB_SYNC_NONE,
   7.826 +			};
   7.827 +
   7.828 +			swap_writepage(page, &wbc);
   7.829 +			lock_page(page);
   7.830 +			wait_on_page_writeback(page);
   7.831 +		}
   7.832 +		if (PageSwapCache(page)) {
   7.833 +			if (shmem)
   7.834 +				swap_duplicate(entry);
   7.835 +			else
   7.836 +				delete_from_swap_cache(page);
   7.837 +		}
   7.838 +
   7.839 +		/*
   7.840 +		 * So that we could skip searching mms once the swap count
   7.841 +		 * went to 1, we did not mark any present ptes as dirty: we
   7.842 +		 * must mark the page dirty so shrink_list will preserve it.
   7.843 +		 */
   7.844 +		SetPageDirty(page);
   7.845 +		unlock_page(page);
   7.846 +		page_cache_release(page);
   7.847 +
   7.848 +		/*
   7.849 +		 * Make sure that we aren't completely killing
   7.850 +		 * interactive performance.
   7.851 +		 */
   7.852 +		cond_resched();
   7.853 +	}
   7.854 +
   7.855 +	mmput(start_mm);
   7.856 +	if (reset_overflow) {
   7.857 +		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
   7.858 +		swap_overflow = 0;
   7.859 +	}
   7.860 +	return retval;
   7.861 +}
   7.862 +
   7.863 +/*
   7.864 + * After a successful try_to_unuse, if no swap is now in use, we know we
   7.865 + * can empty the mmlist.  swap_list_lock must be held on entry and exit.
   7.866 + * Note that mmlist_lock nests inside swap_list_lock, and an mm must be
   7.867 + * added to the mmlist just after page_duplicate - before would be racy.
   7.868 + */
   7.869 +static void drain_mmlist(void)
   7.870 +{
   7.871 +	struct list_head *p, *next;
   7.872 +	unsigned int i;
   7.873 +
   7.874 +	for (i = 0; i < nr_swapfiles; i++)
   7.875 +		if (swap_info[i].inuse_pages)
   7.876 +			return;
   7.877 +	spin_lock(&mmlist_lock);
   7.878 +	list_for_each_safe(p, next, &init_mm.mmlist)
   7.879 +		list_del_init(p);
   7.880 +	spin_unlock(&mmlist_lock);
   7.881 +}
   7.882 +
   7.883 +/*
   7.884 + * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
   7.885 + * corresponds to page offset `offset'.
   7.886 + */
   7.887 +sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
   7.888 +{
   7.889 +	struct swap_extent *se = sis->curr_swap_extent;
   7.890 +	struct swap_extent *start_se = se;
   7.891 +
   7.892 +	for ( ; ; ) {
   7.893 +		struct list_head *lh;
   7.894 +
   7.895 +		if (se->start_page <= offset &&
   7.896 +				offset < (se->start_page + se->nr_pages)) {
   7.897 +			return se->start_block + (offset - se->start_page);
   7.898 +		}
   7.899 +		lh = se->list.prev;
   7.900 +		if (lh == &sis->extent_list)
   7.901 +			lh = lh->prev;
   7.902 +		se = list_entry(lh, struct swap_extent, list);
   7.903 +		sis->curr_swap_extent = se;
   7.904 +		BUG_ON(se == start_se);		/* It *must* be present */
   7.905 +	}
   7.906 +}
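Each extent maps a contiguous run of swap page offsets onto a contiguous run of PAGE_SIZE disk blocks, so once the right extent is found the translation is plain arithmetic. For example, an extent with start_page = 32, nr_pages = 1024 and start_block = 8192 maps page offset 100 to block 8192 + (100 - 32) = 8260. Because curr_swap_extent caches the extent used last time and swap I/O is mostly sequential, the common case is a single range check before returning.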
   7.907 +
   7.908 +/*
   7.909 + * Free all of a swapdev's extent information
   7.910 + */
   7.911 +static void destroy_swap_extents(struct swap_info_struct *sis)
   7.912 +{
   7.913 +	while (!list_empty(&sis->extent_list)) {
   7.914 +		struct swap_extent *se;
   7.915 +
   7.916 +		se = list_entry(sis->extent_list.next,
   7.917 +				struct swap_extent, list);
   7.918 +		list_del(&se->list);
   7.919 +		kfree(se);
   7.920 +	}
   7.921 +	sis->nr_extents = 0;
   7.922 +}
   7.923 +
   7.924 +/*
   7.925 + * Add a block range (and the corresponding page range) into this swapdev's
   7.926 + * extent list.  The extent list is kept sorted in block order.
   7.927 + *
   7.928 + * This function rather assumes that it is called in ascending sector_t order.
   7.929 + * It doesn't look for extent coalescing opportunities.
   7.930 + */
   7.931 +static int
   7.932 +add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
   7.933 +		unsigned long nr_pages, sector_t start_block)
   7.934 +{
   7.935 +	struct swap_extent *se;
   7.936 +	struct swap_extent *new_se;
   7.937 +	struct list_head *lh;
   7.938 +
   7.939 +	lh = sis->extent_list.next;	/* The highest-addressed block */
   7.940 +	while (lh != &sis->extent_list) {
   7.941 +		se = list_entry(lh, struct swap_extent, list);
   7.942 +		if (se->start_block + se->nr_pages == start_block &&
   7.943 +		    se->start_page  + se->nr_pages == start_page) {
   7.944 +			/* Merge it */
   7.945 +			se->nr_pages += nr_pages;
   7.946 +			return 0;
   7.947 +		}
   7.948 +		lh = lh->next;
   7.949 +	}
   7.950 +
   7.951 +	/*
   7.952 +	 * No merge.  Insert a new extent, preserving ordering.
   7.953 +	 */
   7.954 +	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
   7.955 +	if (new_se == NULL)
   7.956 +		return -ENOMEM;
   7.957 +	new_se->start_page = start_page;
   7.958 +	new_se->nr_pages = nr_pages;
   7.959 +	new_se->start_block = start_block;
   7.960 +
   7.961 +	lh = sis->extent_list.prev;	/* The lowest block */
   7.962 +	while (lh != &sis->extent_list) {
   7.963 +		se = list_entry(lh, struct swap_extent, list);
   7.964 +		if (se->start_block > start_block)
   7.965 +			break;
   7.966 +		lh = lh->prev;
   7.967 +	}
   7.968 +	list_add_tail(&new_se->list, lh);
   7.969 +	sis->nr_extents++;
   7.970 +	return 0;
   7.971 +}
   7.972 +
   7.973 +/*
   7.974 + * A `swap extent' is a simple thing which maps a contiguous range of pages
   7.975 + * onto a contiguous range of disk blocks.  An ordered list of swap extents
   7.976 + * is built at swapon time and is then used at swap_writepage/swap_readpage
   7.977 + * time for locating where on disk a page belongs.
   7.978 + *
   7.979 + * If the swapfile is an S_ISBLK block device, a single extent is installed.
   7.980 + * This is done so that the main operating code can treat S_ISBLK and S_ISREG
   7.981 + * swap files identically.
   7.982 + *
   7.983 + * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
   7.984 + * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
   7.985 + * swapfiles are handled *identically* after swapon time.
   7.986 + *
   7.987 + * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
   7.988 + * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
   7.989 + * some stray blocks are found which do not fall within the PAGE_SIZE alignment
   7.990 + * requirements, they are simply tossed out - we will never use those blocks
   7.991 + * for swapping.
   7.992 + *
   7.993 + * For S_ISREG swapfiles we hold i_sem across the life of the swapon.  This
   7.994 + * prevents root from shooting her foot off by ftruncating an in-use swapfile,
   7.995 + * which will scribble on the fs.
   7.996 + *
   7.997 + * The amount of disk space which a single swap extent represents varies.
   7.998 + * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
   7.999 + * extents in the list.  To avoid much list walking, we cache the previous
  7.1000 + * search location in `curr_swap_extent', and start new searches from there.
  7.1001 + * This is extremely effective.  The average number of iterations in
  7.1002 + * map_swap_page() has been measured at about 0.3 per page.  - akpm.
  7.1003 + */
  7.1004 +static int setup_swap_extents(struct swap_info_struct *sis)
  7.1005 +{
  7.1006 +	struct inode *inode;
  7.1007 +	unsigned blocks_per_page;
  7.1008 +	unsigned long page_no;
  7.1009 +	unsigned blkbits;
  7.1010 +	sector_t probe_block;
  7.1011 +	sector_t last_block;
  7.1012 +	int ret;
  7.1013 +
  7.1014 +	inode = sis->swap_file->f_mapping->host;
  7.1015 +	if (S_ISBLK(inode->i_mode)) {
  7.1016 +		ret = add_swap_extent(sis, 0, sis->max, 0);
  7.1017 +		goto done;
  7.1018 +	}
  7.1019 +
  7.1020 +	blkbits = inode->i_blkbits;
  7.1021 +	blocks_per_page = PAGE_SIZE >> blkbits;
  7.1022 +
  7.1023 +	/*
  7.1024 +	 * Map all the blocks into the extent list.  This code doesn't try
  7.1025 +	 * to be very smart.
  7.1026 +	 */
  7.1027 +	probe_block = 0;
  7.1028 +	page_no = 0;
  7.1029 +	last_block = i_size_read(inode) >> blkbits;
  7.1030 +	while ((probe_block + blocks_per_page) <= last_block &&
  7.1031 +			page_no < sis->max) {
  7.1032 +		unsigned block_in_page;
  7.1033 +		sector_t first_block;
  7.1034 +
  7.1035 +		first_block = bmap(inode, probe_block);
  7.1036 +		if (first_block == 0)
  7.1037 +			goto bad_bmap;
  7.1038 +
  7.1039 +		/*
  7.1040 +		 * It must be PAGE_SIZE aligned on-disk
  7.1041 +		 */
  7.1042 +		if (first_block & (blocks_per_page - 1)) {
  7.1043 +			probe_block++;
  7.1044 +			goto reprobe;
  7.1045 +		}
  7.1046 +
  7.1047 +		for (block_in_page = 1; block_in_page < blocks_per_page;
  7.1048 +					block_in_page++) {
  7.1049 +			sector_t block;
  7.1050 +
  7.1051 +			block = bmap(inode, probe_block + block_in_page);
  7.1052 +			if (block == 0)
  7.1053 +				goto bad_bmap;
  7.1054 +			if (block != first_block + block_in_page) {
  7.1055 +				/* Discontiguity */
  7.1056 +				probe_block++;
  7.1057 +				goto reprobe;
  7.1058 +			}
  7.1059 +		}
  7.1060 +
  7.1061 +		/*
  7.1062 +		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
  7.1063 +		 */
  7.1064 +		ret = add_swap_extent(sis, page_no, 1,
  7.1065 +				first_block >> (PAGE_SHIFT - blkbits));
  7.1066 +		if (ret)
  7.1067 +			goto out;
  7.1068 +		page_no++;
  7.1069 +		probe_block += blocks_per_page;
  7.1070 +reprobe:
  7.1071 +		continue;
  7.1072 +	}
  7.1073 +	ret = 0;
  7.1074 +	if (page_no == 0)
  7.1075 +		ret = -EINVAL;
  7.1076 +	sis->max = page_no;
  7.1077 +	sis->highest_bit = page_no - 1;
  7.1078 +done:
  7.1079 +	sis->curr_swap_extent = list_entry(sis->extent_list.prev,
  7.1080 +					struct swap_extent, list);
  7.1081 +	goto out;
  7.1082 +bad_bmap:
  7.1083 +	printk(KERN_ERR "swapon: swapfile has holes\n");
  7.1084 +	ret = -EINVAL;
  7.1085 +out:
  7.1086 +	return ret;
  7.1087 +}
  7.1088 +
  7.1089 +#if 0	/* We don't need this yet */
  7.1090 +#include <linux/backing-dev.h>
  7.1091 +int page_queue_congested(struct page *page)
  7.1092 +{
  7.1093 +	struct backing_dev_info *bdi;
  7.1094 +
  7.1095 +	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
  7.1096 +
  7.1097 +	if (PageSwapCache(page)) {
  7.1098 +		swp_entry_t entry = { .val = page->private };
  7.1099 +		struct swap_info_struct *sis;
  7.1100 +
  7.1101 +		sis = get_swap_info_struct(swp_type(entry));
  7.1102 +		bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
  7.1103 +	} else
  7.1104 +		bdi = page->mapping->backing_dev_info;
  7.1105 +	return bdi_write_congested(bdi);
  7.1106 +}
  7.1107 +#endif
  7.1108 +
  7.1109 +asmlinkage long sys_swapoff(const char __user * specialfile)
  7.1110 +{
  7.1111 +	struct swap_info_struct * p = NULL;
  7.1112 +	unsigned short *swap_map;
  7.1113 +	struct file *swap_file, *victim;
  7.1114 +	struct address_space *mapping;
  7.1115 +	struct inode *inode;
  7.1116 +	char * pathname;
  7.1117 +	int i, type, prev;
  7.1118 +	int err;
  7.1119 +	
  7.1120 +	if (!capable(CAP_SYS_ADMIN))
  7.1121 +		return -EPERM;
  7.1122 +
  7.1123 +	pathname = getname(specialfile);
  7.1124 +	err = PTR_ERR(pathname);
  7.1125 +	if (IS_ERR(pathname))
  7.1126 +		goto out;
  7.1127 +
  7.1128 +	victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
  7.1129 +	putname(pathname);
  7.1130 +	err = PTR_ERR(victim);
  7.1131 +	if (IS_ERR(victim))
  7.1132 +		goto out;
  7.1133 +
  7.1134 +	mapping = victim->f_mapping;
  7.1135 +	prev = -1;
  7.1136 +	swap_list_lock();
  7.1137 +	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
  7.1138 +		p = swap_info + type;
  7.1139 +		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
  7.1140 +			if (p->swap_file->f_mapping == mapping)
  7.1141 +				break;
  7.1142 +		}
  7.1143 +		prev = type;
  7.1144 +	}
  7.1145 +	if (type < 0) {
  7.1146 +		err = -EINVAL;
  7.1147 +		swap_list_unlock();
  7.1148 +		goto out_dput;
  7.1149 +	}
  7.1150 +	if (!security_vm_enough_memory(p->pages))
  7.1151 +		vm_unacct_memory(p->pages);
  7.1152 +	else {
  7.1153 +		err = -ENOMEM;
  7.1154 +		swap_list_unlock();
  7.1155 +		goto out_dput;
  7.1156 +	}
  7.1157 +	if (prev < 0) {
  7.1158 +		swap_list.head = p->next;
  7.1159 +	} else {
  7.1160 +		swap_info[prev].next = p->next;
  7.1161 +	}
  7.1162 +	if (type == swap_list.next) {
  7.1163 +		/* just pick something that's safe... */
  7.1164 +		swap_list.next = swap_list.head;
  7.1165 +	}
  7.1166 +	nr_swap_pages -= p->pages;
  7.1167 +	total_swap_pages -= p->pages;
  7.1168 +	p->flags &= ~SWP_WRITEOK;
  7.1169 +	swap_list_unlock();
  7.1170 +	current->flags |= PF_SWAPOFF;
  7.1171 +	err = try_to_unuse(type);
  7.1172 +	current->flags &= ~PF_SWAPOFF;
  7.1173 +
  7.1174 +	/* wait for any unplug function to finish */
  7.1175 +	down_write(&swap_unplug_sem);
  7.1176 +	up_write(&swap_unplug_sem);
  7.1177 +
  7.1178 +	if (err) {
  7.1179 +		/* re-insert swap space back into swap_list */
  7.1180 +		swap_list_lock();
  7.1181 +		for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
  7.1182 +			if (p->prio >= swap_info[i].prio)
  7.1183 +				break;
  7.1184 +		p->next = i;
  7.1185 +		if (prev < 0)
  7.1186 +			swap_list.head = swap_list.next = p - swap_info;
  7.1187 +		else
  7.1188 +			swap_info[prev].next = p - swap_info;
  7.1189 +		nr_swap_pages += p->pages;
  7.1190 +		total_swap_pages += p->pages;
  7.1191 +		p->flags |= SWP_WRITEOK;
  7.1192 +		swap_list_unlock();
  7.1193 +		goto out_dput;
  7.1194 +	}
  7.1195 +	down(&swapon_sem);
  7.1196 +	swap_list_lock();
  7.1197 +	drain_mmlist();
  7.1198 +	swap_device_lock(p);
  7.1199 +	swap_file = p->swap_file;
  7.1200 +	p->swap_file = NULL;
  7.1201 +	p->max = 0;
  7.1202 +	swap_map = p->swap_map;
  7.1203 +	p->swap_map = NULL;
  7.1204 +	p->flags = 0;
  7.1205 +	destroy_swap_extents(p);
  7.1206 +	swap_device_unlock(p);
  7.1207 +	swap_list_unlock();
  7.1208 +	up(&swapon_sem);
  7.1209 +	vfree(swap_map);
  7.1210 +	inode = mapping->host;
  7.1211 +	if (S_ISBLK(inode->i_mode)) {
  7.1212 +		struct block_device *bdev = I_BDEV(inode);
  7.1213 +		set_blocksize(bdev, p->old_block_size);
  7.1214 +		bd_release(bdev);
  7.1215 +	} else {
  7.1216 +		down(&inode->i_sem);
  7.1217 +		inode->i_flags &= ~S_SWAPFILE;
  7.1218 +		up(&inode->i_sem);
  7.1219 +	}
  7.1220 +	filp_close(swap_file, NULL);
  7.1221 +	err = 0;
  7.1222 +
  7.1223 +out_dput:
  7.1224 +	filp_close(victim, NULL);
  7.1225 +out:
  7.1226 +	return err;
  7.1227 +}
  7.1228 +
  7.1229 +#ifdef CONFIG_PROC_FS
  7.1230 +/* iterator */
  7.1231 +static void *swap_start(struct seq_file *swap, loff_t *pos)
  7.1232 +{
  7.1233 +	struct swap_info_struct *ptr = swap_info;
  7.1234 +	int i;
  7.1235 +	loff_t l = *pos;
  7.1236 +
  7.1237 +	down(&swapon_sem);
  7.1238 +
  7.1239 +	for (i = 0; i < nr_swapfiles; i++, ptr++) {
  7.1240 +		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
  7.1241 +			continue;
  7.1242 +		if (!l--)
  7.1243 +			return ptr;
  7.1244 +	}
  7.1245 +
  7.1246 +	return NULL;
  7.1247 +}
  7.1248 +
  7.1249 +static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
  7.1250 +{
  7.1251 +	struct swap_info_struct *ptr = v;
  7.1252 +	struct swap_info_struct *endptr = swap_info + nr_swapfiles;
  7.1253 +
  7.1254 +	for (++ptr; ptr < endptr; ptr++) {
  7.1255 +		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
  7.1256 +			continue;
  7.1257 +		++*pos;
  7.1258 +		return ptr;
  7.1259 +	}
  7.1260 +
  7.1261 +	return NULL;
  7.1262 +}
  7.1263 +
  7.1264 +static void swap_stop(struct seq_file *swap, void *v)
  7.1265 +{
  7.1266 +	up(&swapon_sem);
  7.1267 +}
  7.1268 +
  7.1269 +static int swap_show(struct seq_file *swap, void *v)
  7.1270 +{
  7.1271 +	struct swap_info_struct *ptr = v;
  7.1272 +	struct file *file;
  7.1273 +	int len;
  7.1274 +
  7.1275 +	if (v == swap_info)
  7.1276 +		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
  7.1277 +
  7.1278 +	file = ptr->swap_file;
  7.1279 +	len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
  7.1280 +	seq_printf(swap, "%*s%s\t%d\t%ld\t%d\n",
  7.1281 +		       len < 40 ? 40 - len : 1, " ",
  7.1282 +		       S_ISBLK(file->f_dentry->d_inode->i_mode) ?
  7.1283 +				"partition" : "file\t",
  7.1284 +		       ptr->pages << (PAGE_SHIFT - 10),
  7.1285 +		       ptr->inuse_pages << (PAGE_SHIFT - 10),
  7.1286 +		       ptr->prio);
  7.1287 +	return 0;
  7.1288 +}
  7.1289 +
  7.1290 +static struct seq_operations swaps_op = {
  7.1291 +	.start =	swap_start,
  7.1292 +	.next =		swap_next,
  7.1293 +	.stop =		swap_stop,
  7.1294 +	.show =		swap_show
  7.1295 +};
  7.1296 +
  7.1297 +static int swaps_open(struct inode *inode, struct file *file)
  7.1298 +{
  7.1299 +	return seq_open(file, &swaps_op);
  7.1300 +}
  7.1301 +
  7.1302 +static struct file_operations proc_swaps_operations = {
  7.1303 +	.open		= swaps_open,
  7.1304 +	.read		= seq_read,
  7.1305 +	.llseek		= seq_lseek,
  7.1306 +	.release	= seq_release,
  7.1307 +};
  7.1308 +
  7.1309 +static int __init procswaps_init(void)
  7.1310 +{
  7.1311 +	struct proc_dir_entry *entry;
  7.1312 +
  7.1313 +	entry = create_proc_entry("swaps", 0, NULL);
  7.1314 +	if (entry)
  7.1315 +		entry->proc_fops = &proc_swaps_operations;
  7.1316 +	return 0;
  7.1317 +}
  7.1318 +__initcall(procswaps_init);
  7.1319 +#endif /* CONFIG_PROC_FS */
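
For reference, the seq_file iterator above renders /proc/swaps in roughly the following shape (paths, sizes and priorities here are illustrative; the Size and Used columns are in KiB, i.e. page counts shifted by PAGE_SHIFT - 10):

    Filename                                Type            Size    Used    Priority
    /dev/hda2                               partition       524280  10532   -1
    /var/swapfile                           file            131064  0       -2

Negative priorities come from the --least_priority default assigned in sys_swapon() below; areas activated with SWAP_FLAG_PREFER show the caller-supplied non-negative priority instead.
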
  7.1320 +
  7.1321 +/*
  7.1322 + * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
  7.1323 + *
  7.1324 + * The swapon system call
  7.1325 + */
  7.1326 +asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
  7.1327 +{
  7.1328 +	struct swap_info_struct * p;
  7.1329 +	char *name = NULL;
  7.1330 +	struct block_device *bdev = NULL;
  7.1331 +	struct file *swap_file = NULL;
  7.1332 +	struct address_space *mapping;
  7.1333 +	unsigned int type;
  7.1334 +	int i, prev;
  7.1335 +	int error;
  7.1336 +	static int least_priority;
  7.1337 +	union swap_header *swap_header = NULL;
  7.1338 +	int swap_header_version;
  7.1339 +	int nr_good_pages = 0;
  7.1340 +	unsigned long maxpages = 1;
  7.1341 +	int swapfilesize;
  7.1342 +	unsigned short *swap_map;
  7.1343 +	struct page *page = NULL;
  7.1344 +	struct inode *inode = NULL;
  7.1345 +	int did_down = 0;
  7.1346 +
  7.1347 +	if (!capable(CAP_SYS_ADMIN))
  7.1348 +		return -EPERM;
  7.1349 +	swap_list_lock();
  7.1350 +	p = swap_info;
  7.1351 +	for (type = 0 ; type < nr_swapfiles ; type++,p++)
  7.1352 +		if (!(p->flags & SWP_USED))
  7.1353 +			break;
  7.1354 +	error = -EPERM;
  7.1355 +	/*
  7.1356 +	 * Test if adding another swap device is possible. There are
  7.1357 +	 * two limiting factors: 1) the number of bits for the swap
  7.1358 +	 * type in the swp_entry_t definition and 2) the number of bits for
  7.1359 +	 * the swap type in the swap ptes as defined by the different
  7.1360 +	 * architectures. To honor both limitations a swap entry
  7.1361 +	 * with swap offset 0 and swap type ~0UL is created, encoded
  7.1362 +	 * to a swap pte, decoded to a swp_entry_t again and finally
  7.1363 +	 * the swap type part is extracted. This will mask all bits
  7.1364 +	 * from the initial ~0UL that can't be encoded in either the
  7.1365 +	 * swp_entry_t or the architecture definition of a swap pte.
  7.1366 +	 */
  7.1367 +	if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
  7.1368 +		swap_list_unlock();
  7.1369 +		goto out;
  7.1370 +	}
  7.1371 +	if (type >= nr_swapfiles)
  7.1372 +		nr_swapfiles = type+1;
  7.1373 +	INIT_LIST_HEAD(&p->extent_list);
  7.1374 +	p->flags = SWP_USED;
  7.1375 +	p->nr_extents = 0;
  7.1376 +	p->swap_file = NULL;
  7.1377 +	p->old_block_size = 0;
  7.1378 +	p->swap_map = NULL;
  7.1379 +	p->lowest_bit = 0;
  7.1380 +	p->highest_bit = 0;
  7.1381 +	p->cluster_nr = 0;
  7.1382 +	p->inuse_pages = 0;
  7.1383 +	spin_lock_init(&p->sdev_lock);
  7.1384 +	p->next = -1;
  7.1385 +	if (swap_flags & SWAP_FLAG_PREFER) {
  7.1386 +		p->prio =
  7.1387 +		  (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
  7.1388 +	} else {
  7.1389 +		p->prio = --least_priority;
  7.1390 +	}
  7.1391 +	swap_list_unlock();
  7.1392 +	name = getname(specialfile);
  7.1393 +	error = PTR_ERR(name);
  7.1394 +	if (IS_ERR(name)) {
  7.1395 +		name = NULL;
  7.1396 +		goto bad_swap_2;
  7.1397 +	}
  7.1398 +	swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
  7.1399 +	error = PTR_ERR(swap_file);
  7.1400 +	if (IS_ERR(swap_file)) {
  7.1401 +		swap_file = NULL;
  7.1402 +		goto bad_swap_2;
  7.1403 +	}
  7.1404 +
  7.1405 +	p->swap_file = swap_file;
  7.1406 +	mapping = swap_file->f_mapping;
  7.1407 +	inode = mapping->host;
  7.1408 +
  7.1409 +	error = -EBUSY;
  7.1410 +	for (i = 0; i < nr_swapfiles; i++) {
  7.1411 +		struct swap_info_struct *q = &swap_info[i];
  7.1412 +
  7.1413 +		if (i == type || !q->swap_file)
  7.1414 +			continue;
  7.1415 +		if (mapping == q->swap_file->f_mapping)
  7.1416 +			goto bad_swap;
  7.1417 +	}
  7.1418 +
  7.1419 +	error = -EINVAL;
  7.1420 +	if (S_ISBLK(inode->i_mode)) {
  7.1421 +		bdev = I_BDEV(inode);
  7.1422 +		error = bd_claim(bdev, sys_swapon);
  7.1423 +		if (error < 0) {
  7.1424 +			bdev = NULL;
  7.1425 +			goto bad_swap;
  7.1426 +		}
  7.1427 +		p->old_block_size = block_size(bdev);
  7.1428 +		error = set_blocksize(bdev, PAGE_SIZE);
  7.1429 +		if (error < 0)
  7.1430 +			goto bad_swap;
  7.1431 +		p->bdev = bdev;
  7.1432 +	} else if (S_ISREG(inode->i_mode)) {
  7.1433 +		p->bdev = inode->i_sb->s_bdev;
  7.1434 +		down(&inode->i_sem);
  7.1435 +		did_down = 1;
  7.1436 +		if (IS_SWAPFILE(inode)) {
  7.1437 +			error = -EBUSY;
  7.1438 +			goto bad_swap;
  7.1439 +		}
  7.1440 +	} else {
  7.1441 +		goto bad_swap;
  7.1442 +	}
  7.1443 +
  7.1444 +	swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
  7.1445 +
  7.1446 +	/*
  7.1447 +	 * Read the swap header.
  7.1448 +	 */
  7.1449 +	if (!mapping->a_ops->readpage) {
  7.1450 +		error = -EINVAL;
  7.1451 +		goto bad_swap;
  7.1452 +	}
  7.1453 +	page = read_cache_page(mapping, 0,
  7.1454 +			(filler_t *)mapping->a_ops->readpage, swap_file);
  7.1455 +	if (IS_ERR(page)) {
  7.1456 +		error = PTR_ERR(page);
  7.1457 +		goto bad_swap;
  7.1458 +	}
  7.1459 +	wait_on_page_locked(page);
  7.1460 +	if (!PageUptodate(page))
  7.1461 +		goto bad_swap;
  7.1462 +	kmap(page);
  7.1463 +	swap_header = page_address(page);
  7.1464 +
  7.1465 +	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
  7.1466 +		swap_header_version = 1;
  7.1467 +	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
  7.1468 +		swap_header_version = 2;
  7.1469 +	else {
  7.1470 +		printk("Unable to find swap-space signature\n");
  7.1471 +		error = -EINVAL;
  7.1472 +		goto bad_swap;
  7.1473 +	}
  7.1474 +	
  7.1475 +	switch (swap_header_version) {
  7.1476 +	case 1:
  7.1477 +		printk(KERN_ERR "version 0 swap is no longer supported. "
  7.1478 +			"Use mkswap -v1 %s\n", name);
  7.1479 +		error = -EINVAL;
  7.1480 +		goto bad_swap;
  7.1481 +	case 2:
  7.1482 +		/* Check the swap header's sub-version and the size of
  7.1483 +                   the swap file and bad block lists */
  7.1484 +		if (swap_header->info.version != 1) {
  7.1485 +			printk(KERN_WARNING
  7.1486 +			       "Unable to handle swap header version %d\n",
  7.1487 +			       swap_header->info.version);
  7.1488 +			error = -EINVAL;
  7.1489 +			goto bad_swap;
  7.1490 +		}
  7.1491 +
  7.1492 +		p->lowest_bit  = 1;
  7.1493 +		/*
  7.1494 +		 * Find out how many pages are allowed for a single swap
  7.1495 +		 * device. There are two limiting factors: 1) the number of
  7.1496 +		 * bits for the swap offset in the swp_entry_t type and
  7.1497 +		 * 2) the number of bits in a swap pte as defined by
  7.1498 +		 * the different architectures. In order to find the
  7.1499 +		 * largest possible bit mask a swap entry with swap type 0
  7.1500 +		 * and swap offset ~0UL is created, encoded to a swap pte,
  7.1501 +		 * decoded to a swp_entry_t again and finally the swap
  7.1502 +		 * offset is extracted. This will mask all the bits from
  7.1503 +		 * the initial ~0UL mask that can't be encoded in either
  7.1504 +		 * the swp_entry_t or the architecture definition of a
  7.1505 +		 * swap pte.
  7.1506 +		 */
  7.1507 +		maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
  7.1508 +		if (maxpages > swap_header->info.last_page)
  7.1509 +			maxpages = swap_header->info.last_page;
  7.1510 +		p->highest_bit = maxpages - 1;
  7.1511 +
  7.1512 +		error = -EINVAL;
  7.1513 +		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
  7.1514 +			goto bad_swap;
  7.1515 +		
  7.1516 +		/* OK, set up the swap map and apply the bad block list */
  7.1517 +		if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
  7.1518 +			error = -ENOMEM;
  7.1519 +			goto bad_swap;
  7.1520 +		}
  7.1521 +
  7.1522 +		error = 0;
  7.1523 +		memset(p->swap_map, 0, maxpages * sizeof(short));
  7.1524 +		for (i=0; i<swap_header->info.nr_badpages; i++) {
  7.1525 +			int page = swap_header->info.badpages[i];
  7.1526 +			if (page <= 0 || page >= swap_header->info.last_page)
  7.1527 +				error = -EINVAL;
  7.1528 +			else
  7.1529 +				p->swap_map[page] = SWAP_MAP_BAD;
  7.1530 +		}
  7.1531 +		nr_good_pages = swap_header->info.last_page -
  7.1532 +				swap_header->info.nr_badpages -
  7.1533 +				1 /* header page */;
  7.1534 +		if (error) 
  7.1535 +			goto bad_swap;
  7.1536 +	}
  7.1537 +	
  7.1538 +	if (swapfilesize && maxpages > swapfilesize) {
  7.1539 +		printk(KERN_WARNING
  7.1540 +		       "Swap area shorter than signature indicates\n");
  7.1541 +		error = -EINVAL;
  7.1542 +		goto bad_swap;
  7.1543 +	}
  7.1544 +	if (!nr_good_pages) {
  7.1545 +		printk(KERN_WARNING "Empty swap-file\n");
  7.1546 +		error = -EINVAL;
  7.1547 +		goto bad_swap;
  7.1548 +	}
  7.1549 +	p->swap_map[0] = SWAP_MAP_BAD;
  7.1550 +	p->max = maxpages;
  7.1551 +	p->pages = nr_good_pages;
  7.1552 +
  7.1553 +	error = setup_swap_extents(p);
  7.1554 +	if (error)
  7.1555 +		goto bad_swap;
  7.1556 +
  7.1557 +	down(&swapon_sem);
  7.1558 +	swap_list_lock();
  7.1559 +	swap_device_lock(p);
  7.1560 +	p->flags = SWP_ACTIVE;
  7.1561 +	nr_swap_pages += nr_good_pages;
  7.1562 +	total_swap_pages += nr_good_pages;
  7.1563 +	printk(KERN_INFO "Adding %dk swap on %s.  Priority:%d extents:%d\n",
  7.1564 +		nr_good_pages<<(PAGE_SHIFT-10), name,
  7.1565 +		p->prio, p->nr_extents);
  7.1566 +
  7.1567 +	/* insert swap space into swap_list: */
  7.1568 +	prev = -1;
  7.1569 +	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
  7.1570 +		if (p->prio >= swap_info[i].prio) {
  7.1571 +			break;
  7.1572 +		}
  7.1573 +		prev = i;
  7.1574 +	}
  7.1575 +	p->next = i;
  7.1576 +	if (prev < 0) {
  7.1577 +		swap_list.head = swap_list.next = p - swap_info;
  7.1578 +	} else {
  7.1579 +		swap_info[prev].next = p - swap_info;
  7.1580 +	}
  7.1581 +	swap_device_unlock(p);
  7.1582 +	swap_list_unlock();
  7.1583 +	up(&swapon_sem);
  7.1584 +	error = 0;
  7.1585 +	goto out;
  7.1586 +bad_swap:
  7.1587 +	if (bdev) {
  7.1588 +		set_blocksize(bdev, p->old_block_size);
  7.1589 +		bd_release(bdev);
  7.1590 +	}
  7.1591 +bad_swap_2:
  7.1592 +	swap_list_lock();
  7.1593 +	swap_map = p->swap_map;
  7.1594 +	p->swap_file = NULL;
  7.1595 +	p->swap_map = NULL;
  7.1596 +	p->flags = 0;
  7.1597 +	if (!(swap_flags & SWAP_FLAG_PREFER))
  7.1598 +		++least_priority;
  7.1599 +	swap_list_unlock();
  7.1600 +	destroy_swap_extents(p);
  7.1601 +	if (swap_map)
  7.1602 +		vfree(swap_map);
  7.1603 +	if (swap_file)
  7.1604 +		filp_close(swap_file, NULL);
  7.1605 +out:
  7.1606 +	if (page && !IS_ERR(page)) {
  7.1607 +		kunmap(page);
  7.1608 +		page_cache_release(page);
  7.1609 +	}
  7.1610 +	if (name)
  7.1611 +		putname(name);
  7.1612 +	if (did_down) {
  7.1613 +		if (!error)
  7.1614 +			inode->i_flags |= S_SWAPFILE;
  7.1615 +		up(&inode->i_sem);
  7.1616 +	}
  7.1617 +	return error;
  7.1618 +}
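
Both capacity probes in sys_swapon() (the swap-type limit checked near the top and the maxpages computation during header parsing) rely on the same round trip: build an entry with all ones in the field of interest, encode it into a swap pte, decode it again, and read back whichever bits survived. A standalone model of that trick, assuming a purely hypothetical pte layout (24 offset bits in bits 0-23, 5 type bits in bits 24-28) chosen only to make the masking visible:

    #include <stdio.h>

    #define TYPE_BITS   5    /* hypothetical layout, not any real architecture */
    #define OFFSET_BITS 24

    typedef struct { unsigned long val; } swp_entry_t;

    static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
    {
        swp_entry_t e = { (type << OFFSET_BITS) | offset };
        return e;
    }

    static unsigned long swp_type(swp_entry_t e)   { return e.val >> OFFSET_BITS; }
    static unsigned long swp_offset(swp_entry_t e) { return e.val & ((1UL << OFFSET_BITS) - 1); }

    /* Model of the pte round trip: only the bits this "architecture" can
     * store in a swap pte survive the conversion. */
    static unsigned long swp_entry_to_pte(swp_entry_t e)
    {
        return e.val & ((1UL << (TYPE_BITS + OFFSET_BITS)) - 1);
    }

    static swp_entry_t pte_to_swp_entry(unsigned long pte)
    {
        swp_entry_t e = { pte };
        return e;
    }

    int main(void)
    {
        unsigned long max_type =
            swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL, 0))));
        unsigned long max_offset =
            swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));

        printf("max swap type   = %lu\n", max_type);    /* 31 with this layout */
        printf("max swap offset = %lu\n", max_offset);  /* 2^24 - 1 with this layout */
        return 0;
    }

With the real macros the surviving offset is further decremented by one to give maxpages and then clamped to swap_header->info.last_page, as done in the header-parsing code above.
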
  7.1619 +
  7.1620 +void si_swapinfo(struct sysinfo *val)
  7.1621 +{
  7.1622 +	unsigned int i;
  7.1623 +	unsigned long nr_to_be_unused = 0;
  7.1624 +
  7.1625 +	swap_list_lock();
  7.1626 +	for (i = 0; i < nr_swapfiles; i++) {
  7.1627 +		if (!(swap_info[i].flags & SWP_USED) ||
  7.1628 +		     (swap_info[i].flags & SWP_WRITEOK))
  7.1629 +			continue;
  7.1630 +		nr_to_be_unused += swap_info[i].inuse_pages;
  7.1631 +	}
  7.1632 +	val->freeswap = nr_swap_pages + nr_to_be_unused;
  7.1633 +	val->totalswap = total_swap_pages + nr_to_be_unused;
  7.1634 +	swap_list_unlock();
  7.1635 +}
  7.1636 +
  7.1637 +/*
  7.1638 + * Verify that a swap entry is valid and increment its swap map count.
  7.1639 + *
  7.1640 + * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
  7.1641 + * "permanent", but will be reclaimed by the next swapoff.
  7.1642 + */
  7.1643 +int swap_duplicate(swp_entry_t entry)
  7.1644 +{
  7.1645 +	struct swap_info_struct * p;
  7.1646 +	unsigned long offset, type;
  7.1647 +	int result = 0;
  7.1648 +
  7.1649 +	type = swp_type(entry);
  7.1650 +	if (type >= nr_swapfiles)
  7.1651 +		goto bad_file;
  7.1652 +	p = type + swap_info;
  7.1653 +	offset = swp_offset(entry);
  7.1654 +
  7.1655 +	swap_device_lock(p);
  7.1656 +	if (offset < p->max && p->swap_map[offset]) {
  7.1657 +		if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
  7.1658 +			p->swap_map[offset]++;
  7.1659 +			result = 1;
  7.1660 +		} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
  7.1661 +			if (swap_overflow++ < 5)
  7.1662 +				printk(KERN_WARNING "swap_dup: swap entry overflow\n");
  7.1663 +			p->swap_map[offset] = SWAP_MAP_MAX;
  7.1664 +			result = 1;
  7.1665 +		}
  7.1666 +	}
  7.1667 +	swap_device_unlock(p);
  7.1668 +out:
  7.1669 +	return result;
  7.1670 +
  7.1671 +bad_file:
  7.1672 +	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
  7.1673 +	goto out;
  7.1674 +}
  7.1675 +
  7.1676 +struct swap_info_struct *
  7.1677 +get_swap_info_struct(unsigned type)
  7.1678 +{
  7.1679 +	return &swap_info[type];
  7.1680 +}
  7.1681 +
  7.1682 +/*
  7.1683 + * swap_device_lock prevents swap_map being freed. Don't grab an extra
  7.1684 + * reference on the swaphandle, it doesn't matter if it becomes unused.
  7.1685 + */
  7.1686 +int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
  7.1687 +{
  7.1688 +	int ret = 0, i = 1 << page_cluster;
  7.1689 +	unsigned long toff;
  7.1690 +	struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
  7.1691 +
  7.1692 +	if (!page_cluster)	/* no readahead */
  7.1693 +		return 0;
  7.1694 +	toff = (swp_offset(entry) >> page_cluster) << page_cluster;
  7.1695 +	if (!toff)		/* first page is swap header */
  7.1696 +		toff++, i--;
  7.1697 +	*offset = toff;
  7.1698 +
  7.1699 +	swap_device_lock(swapdev);
  7.1700 +	do {
  7.1701 +		/* Don't read-ahead past the end of the swap area */
  7.1702 +		if (toff >= swapdev->max)
  7.1703 +			break;
  7.1704 +		/* Don't read in free or bad pages */
  7.1705 +		if (!swapdev->swap_map[toff])
  7.1706 +			break;
  7.1707 +		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
  7.1708 +			break;
  7.1709 +		toff++;
  7.1710 +		ret++;
  7.1711 +	} while (--i);
  7.1712 +	swap_device_unlock(swapdev);
  7.1713 +	return ret;
  7.1714 +}
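
The readahead window computed by valid_swaphandles() is simply the requested offset rounded down to a 2^page_cluster boundary, nudged past slot 0 when it lands on the swap header, and cut short at the end of the area or at the first free or bad slot. A small standalone illustration of the alignment arithmetic (page_cluster = 3, i.e. 8-page clusters, is only an example value):

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_cluster = 3;          /* example value */
        unsigned long offsets[] = { 21, 5 };

        for (int i = 0; i < 2; i++) {
            unsigned long off  = offsets[i];
            unsigned long toff = (off >> page_cluster) << page_cluster;
            int window = 1 << page_cluster;

            if (toff == 0) {                     /* slot 0 holds the swap header */
                toff++;
                window--;
            }
            printf("offset %2lu -> readahead starts at slot %lu, up to %d slots\n",
                   off, toff, window);
        }
        return 0;
    }

So an access at offset 21 yields a window starting at slot 16, while an access at offset 5 starts at slot 1 with a window one slot shorter, matching the toff++, i-- adjustment above.
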