ia64/xen-unstable

changeset 4333:8a798fbdbb60

bitkeeper revision 1.1236.43.15 (42446ea83i0TVEFNdNTE8D6WBPWfaQ)

Move Linux 2.4 to writable pagetables. It doesn't boot, but that bug
is not caused by this changeset (I see exactly the same behaviour
with these changes backed out). Will need some investigation: first
on 2.0-testing to see if any fixes are needed there...
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Fri Mar 25 20:03:52 2005 +0000 (2005-03-25)
parents 320b9cc5155c
children 38fe904ec5b1
files .rootkeys linux-2.4.29-xen-sparse/arch/xen/kernel/head.S linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c linux-2.4.29-xen-sparse/arch/xen/mm/fault.c linux-2.4.29-xen-sparse/arch/xen/mm/init.c linux-2.4.29-xen-sparse/fs/exec.c linux-2.4.29-xen-sparse/include/asm-xen/page.h linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h linux-2.4.29-xen-sparse/mm/highmem.c linux-2.4.29-xen-sparse/mm/memory.c linux-2.4.29-xen-sparse/mm/mremap.c linux-2.4.29-xen-sparse/mm/swapfile.c linux-2.4.29-xen-sparse/mm/vmalloc.c linux-2.6.11-xen-sparse/arch/xen/Kconfig linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h
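
The mechanism behind this changeset, briefly: under the old interface a guest could not store to its own page tables; it queued mmu_update requests (queue_l1_entry_update() and friends) and pushed the batch to Xen with XEN_flush_page_update_queue(). With writable pagetables (the PT_MODE_WRITABLE flag added below) the guest writes page-table entries directly; Xen traps the first write to a page-table page, temporarily unhooks and write-enables it, and revalidates it before it is used again, so set_pte() collapses to an ordinary store. The following user-space sketch contrasts the two disciplines; the names mirror the kernel helpers but the bodies are illustrative, not the real hypercall interface:

    #include <stdint.h>
    #include <stdio.h>

    #define QUEUE_MAX 8

    static struct { uint32_t *ptr; uint32_t val; } queue[QUEUE_MAX];
    static int queue_len;

    /* Old scheme: PTE stores are illegal for the guest, so updates are
     * batched and only take effect when the queue is flushed to Xen. */
    static void queue_l1_update(uint32_t *pte, uint32_t val)
    {
        queue[queue_len].ptr = pte;
        queue[queue_len].val = val;
        queue_len++;
    }

    static void flush_update_queue(void)
    {
        for (int i = 0; i < queue_len; i++)
            *queue[i].ptr = queue[i].val;  /* Xen validates and applies */
        queue_len = 0;
    }

    /* New scheme: the trap-and-revalidate machinery lives in Xen, so
     * the guest-side primitive is just a store. */
    static void set_pte(uint32_t *pte, uint32_t val)
    {
        *pte = val;
    }

    int main(void)
    {
        uint32_t pt[2] = { 0, 0 };
        queue_l1_update(&pt[0], 0x1007);  /* invisible until...      */
        flush_update_queue();             /* ...the explicit flush   */
        set_pte(&pt[1], 0x2007);          /* takes effect at once    */
        printf("%08x %08x\n", pt[0], pt[1]);
        return 0;
    }

Most of the diff below is mechanical fallout of that change: explicit flushes disappear around plain PTE updates, while operations that still go through the queue (read-only remapping, page-table pinning) gain their own flush calls.
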
line diff
     1.1 --- a/.rootkeys	Fri Mar 25 19:30:52 2005 +0000
     1.2 +++ b/.rootkeys	Fri Mar 25 20:03:52 2005 +0000
     1.3 @@ -167,7 +167,6 @@ 3f108aeaLcGDgQdFAANLTUEid0a05w linux-2.4
     1.4  3e5a4e66rw65CxyolW9PKz4GG42RcA linux-2.4.29-xen-sparse/drivers/char/tty_io.c
     1.5  40c9c0c1pPwYE3-4i-oI3ubUu7UgvQ linux-2.4.29-xen-sparse/drivers/scsi/aic7xxx/Makefile
     1.6  41f97f64nW0wmgLxhwzPTzkF4E5ERA linux-2.4.29-xen-sparse/drivers/usb/hcd.c
     1.7 -3e5a4e669uzIE54VwucPYtGwXLAbzA linux-2.4.29-xen-sparse/fs/exec.c
     1.8  3e5a4e66wbeCpsJgVf_U8Jde-CNcsA linux-2.4.29-xen-sparse/include/asm-xen/bugs.h
     1.9  3e5a4e66HdSkvIV6SJ1evG_xmTmXHA linux-2.4.29-xen-sparse/include/asm-xen/desc.h
    1.10  3e5a4e66SYp_UpAVcF8Lc1wa3Qtgzw linux-2.4.29-xen-sparse/include/asm-xen/fixmap.h
    1.11 @@ -205,8 +204,6 @@ 3e5a4e68GxCIaFH4sy01v1wjapetaA linux-2.4
    1.12  3f108af5VxPkLv13tXpXgoRKALQtXQ linux-2.4.29-xen-sparse/mm/mprotect.c
    1.13  3e5a4e681xMPdF9xCMwpyfuYMySU5g linux-2.4.29-xen-sparse/mm/mremap.c
    1.14  409ba2e7akOFqQUg6Qyg2s28xcXiMg linux-2.4.29-xen-sparse/mm/page_alloc.c
    1.15 -3e5a4e683HKVU-sxtagrDasRB8eBVw linux-2.4.29-xen-sparse/mm/swapfile.c
    1.16 -41180721bNns9Na7w1nJ0ZVt8bhUNA linux-2.4.29-xen-sparse/mm/vmalloc.c
    1.17  41505c57WAd5l1rlfCLNSCpx9J13vA linux-2.4.29-xen-sparse/net/core/skbuff.c
    1.18  40f562372u3A7_kfbYYixPHJJxYUxA linux-2.6.11-xen-sparse/arch/xen/Kconfig
    1.19  40f56237utH41NPukqHksuNf29IC9A linux-2.6.11-xen-sparse/arch/xen/Kconfig.drivers
     2.1 --- a/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S	Fri Mar 25 19:30:52 2005 +0000
     2.2 +++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S	Fri Mar 25 20:03:52 2005 +0000
     2.3 @@ -1,6 +1,9 @@
     2.4  
     2.5  .section __xen_guest
     2.6 -    .asciz "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000"
     2.7 +    .ascii "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000"
     2.8 +    .ascii ",LOADER=generic"
     2.9 +    .ascii ",PT_MODE_WRITABLE"
    2.10 +    .byte  0
    2.11  
    2.12  .text
    2.13  #include <linux/config.h>
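
Note on the __xen_guest hunk: .asciz appends a NUL terminator, so the old single-directive form could not be extended piecemeal. The new form concatenates .ascii fragments and supplies one terminating .byte 0, letting LOADER=generic and PT_MODE_WRITABLE join the same string. A small C model of the assembler output (string contents abbreviated, values illustrative):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* Text appended after an .asciz lands beyond its embedded NUL,
         * invisible to a strlen()-style parser of the guest header: */
        static const char broken[] =
            "GUEST_OS=linux,GUEST_VER=2.4\0,PT_MODE_WRITABLE";
        /* Concatenated .ascii fragments plus a single .byte 0: */
        static const char fixed[] =
            "GUEST_OS=linux,GUEST_VER=2.4" ",PT_MODE_WRITABLE";
        printf("broken: %zu visible bytes, fixed: %zu\n",
               strlen(broken), strlen(fixed));
        return 0;
    }
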
     3.1 --- a/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c	Fri Mar 25 19:30:52 2005 +0000
     3.2 +++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c	Fri Mar 25 20:03:52 2005 +0000
     3.3 @@ -84,6 +84,7 @@ static inline int copy_ldt(mm_context_t 
     3.4  	}
     3.5  	memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
     3.6  	make_pages_readonly(new->ldt, (new->size*LDT_ENTRY_SIZE)/PAGE_SIZE);
     3.7 +	flush_page_update_queue();
     3.8  	return 0;
     3.9  }
    3.10  
     4.1 --- a/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c	Fri Mar 25 19:30:52 2005 +0000
     4.2 +++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c	Fri Mar 25 20:03:52 2005 +0000
     4.3 @@ -623,6 +623,7 @@ void __init trap_init(void)
     4.4      set_call_gate(&default_ldt[0],lcall7);
     4.5      set_call_gate(&default_ldt[4],lcall27);
     4.6      __make_page_readonly(&default_ldt[0]);
     4.7 +    flush_page_update_queue();
     4.8  
     4.9      cpu_init();
    4.10  }
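
In both hunks above (copy_ldt() and trap_init()) the pages are made read-only through the still-queued interface, so an explicit flush is now required before the descriptor tables can be handed to the CPU. Restating the ldt.c change with comments (the helper bodies live elsewhere in the sparse tree):

    memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
    make_pages_readonly(new->ldt, (new->size*LDT_ENTRY_SIZE)/PAGE_SIZE);
    flush_page_update_queue();  /* queued R/O remap must reach Xen
                                   before the new LDT is loaded */
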
     5.1 --- a/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c	Fri Mar 25 19:30:52 2005 +0000
     5.2 +++ b/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c	Fri Mar 25 20:03:52 2005 +0000
     5.3 @@ -296,7 +296,6 @@ vmalloc_fault:
     5.4  		if (!pmd_present(*pmd_k))
     5.5  			goto no_context;
     5.6  		set_pmd(pmd, *pmd_k);
     5.7 -                XEN_flush_page_update_queue(); /* flush PMD update */
     5.8  
     5.9  		pte_k = pte_offset(pmd_k, address);
    5.10  		if (!pte_present(*pte_k))
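
The flush dropped here is safe because set_pmd() now expands to xen_l2_entry_update() (see the pgtable-2level.h hunk below), which applies immediately rather than sitting in the update queue:

    #define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
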
     6.1 --- a/linux-2.4.29-xen-sparse/arch/xen/mm/init.c	Fri Mar 25 19:30:52 2005 +0000
     6.2 +++ b/linux-2.4.29-xen-sparse/arch/xen/mm/init.c	Fri Mar 25 20:03:52 2005 +0000
     6.3 @@ -142,7 +142,7 @@ static inline void set_pte_phys (unsigne
     6.4      }
     6.5      pte = pte_offset(pmd, vaddr);
     6.6  
     6.7 -    queue_l1_entry_update(pte, phys | pgprot_val(prot));
     6.8 +    set_pte(pte, (pte_t) { phys | pgprot_val(prot) });
     6.9  
    6.10      /*
    6.11       * It's enough to flush this one mapping.
    6.12 @@ -201,17 +201,13 @@ static void __init fixrange_init (unsign
    6.13                  kpgd = pgd_offset_k((unsigned long)pte);
    6.14                  kpmd = pmd_offset(kpgd, (unsigned long)pte);
    6.15                  kpte = pte_offset(kpmd, (unsigned long)pte);
    6.16 -                queue_l1_entry_update(kpte,
    6.17 -                                      (*(unsigned long *)kpte)&~_PAGE_RW);
    6.18 -
    6.19 +                set_pte(kpte, pte_wrprotect(*kpte));
    6.20                  set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
    6.21              }
    6.22              vaddr += PMD_SIZE;
    6.23          }
    6.24          j = 0;
    6.25      }
    6.26 -	
    6.27 -    XEN_flush_page_update_queue();
    6.28  }
    6.29  
    6.30  
    6.31 @@ -257,10 +253,8 @@ static void __init pagetable_init (void)
    6.32              kpgd = pgd_offset_k((unsigned long)pte_base);
    6.33              kpmd = pmd_offset(kpgd, (unsigned long)pte_base);
    6.34              kpte = pte_offset(kpmd, (unsigned long)pte_base);
    6.35 -            queue_l1_entry_update(kpte,
    6.36 -                                  (*(unsigned long *)kpte)&~_PAGE_RW);
    6.37 +            set_pte(kpte, pte_wrprotect(*kpte));
    6.38              set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base)));
    6.39 -            XEN_flush_page_update_queue();
    6.40          }
    6.41      }
    6.42  
    6.43 @@ -311,6 +305,7 @@ void __init paging_init(void)
    6.44      pagetable_init();
    6.45  
    6.46      zone_sizes_init();
    6.47 +
    6.48      /* Switch to the real shared_info page, and clear the dummy page. */
    6.49      set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info);
    6.50      HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
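
With set_pte() now a plain store, the init-time code above can express "clear _PAGE_RW" as set_pte(kpte, pte_wrprotect(*kpte)) instead of a hand-rolled queue_l1_entry_update(), and the trailing XEN_flush_page_update_queue() calls disappear. A minimal model of the equivalence (flag value as on i386, otherwise illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define _PAGE_RW 0x002u

    typedef struct { uint32_t pte_low; } pte_t;

    static pte_t pte_wrprotect(pte_t p)
    {
        p.pte_low &= ~_PAGE_RW;
        return p;
    }

    /* Under writable pagetables this is an ordinary store. */
    static void set_pte(pte_t *ptep, pte_t v) { *ptep = v; }

    int main(void)
    {
        pte_t e = { 0x1003u };          /* frame | RW | PRESENT       */
        set_pte(&e, pte_wrprotect(e));  /* old: queue update + flush  */
        printf("%08x\n", e.pte_low);    /* prints 00001001            */
        return 0;
    }
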
     7.1 --- a/linux-2.4.29-xen-sparse/fs/exec.c	Fri Mar 25 19:30:52 2005 +0000
     7.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.3 @@ -1,1179 +0,0 @@
     7.4 -/*
     7.5 - *  linux/fs/exec.c
     7.6 - *
     7.7 - *  Copyright (C) 1991, 1992  Linus Torvalds
     7.8 - */
     7.9 -
    7.10 -/*
    7.11 - * #!-checking implemented by tytso.
    7.12 - */
    7.13 -/*
    7.14 - * Demand-loading implemented 01.12.91 - no need to read anything but
    7.15 - * the header into memory. The inode of the executable is put into
    7.16 - * "current->executable", and page faults do the actual loading. Clean.
    7.17 - *
    7.18 - * Once more I can proudly say that linux stood up to being changed: it
    7.19 - * was less than 2 hours work to get demand-loading completely implemented.
    7.20 - *
    7.21 - * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
    7.22 - * current->executable is only used by the procfs.  This allows a dispatch
    7.23 - * table to check for several different types  of binary formats.  We keep
    7.24 - * trying until we recognize the file or we run out of supported binary
    7.25 - * formats. 
    7.26 - */
    7.27 -
    7.28 -#include <linux/config.h>
    7.29 -#include <linux/slab.h>
    7.30 -#include <linux/file.h>
    7.31 -#include <linux/mman.h>
    7.32 -#include <linux/a.out.h>
    7.33 -#include <linux/stat.h>
    7.34 -#include <linux/fcntl.h>
    7.35 -#include <linux/smp_lock.h>
    7.36 -#include <linux/init.h>
    7.37 -#include <linux/pagemap.h>
    7.38 -#include <linux/highmem.h>
    7.39 -#include <linux/spinlock.h>
    7.40 -#include <linux/personality.h>
    7.41 -#include <linux/swap.h>
    7.42 -#include <linux/utsname.h>
    7.43 -#define __NO_VERSION__
    7.44 -#include <linux/module.h>
    7.45 -
    7.46 -#include <asm/uaccess.h>
    7.47 -#include <asm/pgalloc.h>
    7.48 -#include <asm/mmu_context.h>
    7.49 -
    7.50 -#ifdef CONFIG_KMOD
    7.51 -#include <linux/kmod.h>
    7.52 -#endif
    7.53 -
    7.54 -int core_uses_pid;
    7.55 -char core_pattern[65] = "core";
    7.56 -int core_setuid_ok = 0;
    7.57 -/* The maximal length of core_pattern is also specified in sysctl.c */ 
    7.58 -
    7.59 -static struct linux_binfmt *formats;
    7.60 -static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED;
    7.61 -
    7.62 -int register_binfmt(struct linux_binfmt * fmt)
    7.63 -{
    7.64 -	struct linux_binfmt ** tmp = &formats;
    7.65 -
    7.66 -	if (!fmt)
    7.67 -		return -EINVAL;
    7.68 -	if (fmt->next)
    7.69 -		return -EBUSY;
    7.70 -	write_lock(&binfmt_lock);
    7.71 -	while (*tmp) {
    7.72 -		if (fmt == *tmp) {
    7.73 -			write_unlock(&binfmt_lock);
    7.74 -			return -EBUSY;
    7.75 -		}
    7.76 -		tmp = &(*tmp)->next;
    7.77 -	}
    7.78 -	fmt->next = formats;
    7.79 -	formats = fmt;
    7.80 -	write_unlock(&binfmt_lock);
    7.81 -	return 0;	
    7.82 -}
    7.83 -
    7.84 -int unregister_binfmt(struct linux_binfmt * fmt)
    7.85 -{
    7.86 -	struct linux_binfmt ** tmp = &formats;
    7.87 -
    7.88 -	write_lock(&binfmt_lock);
    7.89 -	while (*tmp) {
    7.90 -		if (fmt == *tmp) {
    7.91 -			*tmp = fmt->next;
    7.92 -			write_unlock(&binfmt_lock);
    7.93 -			return 0;
    7.94 -		}
    7.95 -		tmp = &(*tmp)->next;
    7.96 -	}
    7.97 -	write_unlock(&binfmt_lock);
    7.98 -	return -EINVAL;
    7.99 -}
   7.100 -
   7.101 -static inline void put_binfmt(struct linux_binfmt * fmt)
   7.102 -{
   7.103 -	if (fmt->module)
   7.104 -		__MOD_DEC_USE_COUNT(fmt->module);
   7.105 -}
   7.106 -
   7.107 -/*
   7.108 - * Note that a shared library must be both readable and executable due to
   7.109 - * security reasons.
   7.110 - *
    7.111 - * Also note that we take the address to load from the file itself.
   7.112 - */
   7.113 -asmlinkage long sys_uselib(const char * library)
   7.114 -{
   7.115 -	struct file * file;
   7.116 -	struct nameidata nd;
   7.117 -	int error;
   7.118 -
   7.119 -	error = user_path_walk(library, &nd);
   7.120 -	if (error)
   7.121 -		goto out;
   7.122 -
   7.123 -	error = -EINVAL;
   7.124 -	if (!S_ISREG(nd.dentry->d_inode->i_mode))
   7.125 -		goto exit;
   7.126 -
   7.127 -	error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC);
   7.128 -	if (error)
   7.129 -		goto exit;
   7.130 -
   7.131 -	file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
   7.132 -	error = PTR_ERR(file);
   7.133 -	if (IS_ERR(file))
   7.134 -		goto out;
   7.135 -
   7.136 -	error = -ENOEXEC;
   7.137 -	if(file->f_op && file->f_op->read) {
   7.138 -		struct linux_binfmt * fmt;
   7.139 -
   7.140 -		read_lock(&binfmt_lock);
   7.141 -		for (fmt = formats ; fmt ; fmt = fmt->next) {
   7.142 -			if (!fmt->load_shlib)
   7.143 -				continue;
   7.144 -			if (!try_inc_mod_count(fmt->module))
   7.145 -				continue;
   7.146 -			read_unlock(&binfmt_lock);
   7.147 -			error = fmt->load_shlib(file);
   7.148 -			read_lock(&binfmt_lock);
   7.149 -			put_binfmt(fmt);
   7.150 -			if (error != -ENOEXEC)
   7.151 -				break;
   7.152 -		}
   7.153 -		read_unlock(&binfmt_lock);
   7.154 -	}
   7.155 -	fput(file);
   7.156 -out:
   7.157 -  	return error;
   7.158 -exit:
   7.159 -	path_release(&nd);
   7.160 -	goto out;
   7.161 -}
   7.162 -
   7.163 -/*
   7.164 - * count() counts the number of arguments/envelopes
   7.165 - */
   7.166 -static int count(char ** argv, int max)
   7.167 -{
   7.168 -	int i = 0;
   7.169 -
   7.170 -	if (argv != NULL) {
   7.171 -		for (;;) {
   7.172 -			char * p;
   7.173 -
   7.174 -			if (get_user(p, argv))
   7.175 -				return -EFAULT;
   7.176 -			if (!p)
   7.177 -				break;
   7.178 -			argv++;
   7.179 -			if(++i > max)
   7.180 -				return -E2BIG;
   7.181 -		}
   7.182 -	}
   7.183 -	return i;
   7.184 -}
   7.185 -
   7.186 -/*
   7.187 - * 'copy_strings()' copies argument/envelope strings from user
   7.188 - * memory to free pages in kernel mem. These are in a format ready
   7.189 - * to be put directly into the top of new user memory.
   7.190 - */
   7.191 -int copy_strings(int argc,char ** argv, struct linux_binprm *bprm) 
   7.192 -{
   7.193 -	struct page *kmapped_page = NULL;
   7.194 -	char *kaddr = NULL;
   7.195 -	int ret;
   7.196 -
   7.197 -	while (argc-- > 0) {
   7.198 -		char *str;
   7.199 -		int len;
   7.200 -		unsigned long pos;
   7.201 -
   7.202 -		if (get_user(str, argv+argc) ||
   7.203 -				!(len = strnlen_user(str, bprm->p))) {
   7.204 -			ret = -EFAULT;
   7.205 -			goto out;
   7.206 -		}
   7.207 -
   7.208 -		if (bprm->p < len)  {
   7.209 -			ret = -E2BIG;
   7.210 -			goto out;
   7.211 -		}
   7.212 -
   7.213 -		bprm->p -= len;
   7.214 -		/* XXX: add architecture specific overflow check here. */ 
   7.215 -		pos = bprm->p;
   7.216 -
   7.217 -		while (len > 0) {
   7.218 -			int i, new, err;
   7.219 -			int offset, bytes_to_copy;
   7.220 -			struct page *page;
   7.221 -
   7.222 -			offset = pos % PAGE_SIZE;
   7.223 -			i = pos/PAGE_SIZE;
   7.224 -			page = bprm->page[i];
   7.225 -			new = 0;
   7.226 -			if (!page) {
   7.227 -				page = alloc_page(GFP_HIGHUSER);
   7.228 -				bprm->page[i] = page;
   7.229 -				if (!page) {
   7.230 -					ret = -ENOMEM;
   7.231 -					goto out;
   7.232 -				}
   7.233 -				new = 1;
   7.234 -			}
   7.235 -
   7.236 -			if (page != kmapped_page) {
   7.237 -				if (kmapped_page)
   7.238 -					kunmap(kmapped_page);
   7.239 -				kmapped_page = page;
   7.240 -				kaddr = kmap(kmapped_page);
   7.241 -			}
   7.242 -			if (new && offset)
   7.243 -				memset(kaddr, 0, offset);
   7.244 -			bytes_to_copy = PAGE_SIZE - offset;
   7.245 -			if (bytes_to_copy > len) {
   7.246 -				bytes_to_copy = len;
   7.247 -				if (new)
   7.248 -					memset(kaddr+offset+len, 0,
   7.249 -						PAGE_SIZE-offset-len);
   7.250 -			}
   7.251 -			err = copy_from_user(kaddr+offset, str, bytes_to_copy);
   7.252 -			if (err) {
   7.253 -				ret = -EFAULT;
   7.254 -				goto out;
   7.255 -			}
   7.256 -
   7.257 -			pos += bytes_to_copy;
   7.258 -			str += bytes_to_copy;
   7.259 -			len -= bytes_to_copy;
   7.260 -		}
   7.261 -	}
   7.262 -	ret = 0;
   7.263 -out:
   7.264 -	if (kmapped_page)
   7.265 -		kunmap(kmapped_page);
   7.266 -	return ret;
   7.267 -}
   7.268 -
   7.269 -/*
   7.270 - * Like copy_strings, but get argv and its values from kernel memory.
   7.271 - */
   7.272 -int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
   7.273 -{
   7.274 -	int r;
   7.275 -	mm_segment_t oldfs = get_fs();
   7.276 -	set_fs(KERNEL_DS); 
   7.277 -	r = copy_strings(argc, argv, bprm);
   7.278 -	set_fs(oldfs);
   7.279 -	return r; 
   7.280 -}
   7.281 -
   7.282 -/*
   7.283 - * This routine is used to map in a page into an address space: needed by
   7.284 - * execve() for the initial stack and environment pages.
   7.285 - *
   7.286 - * tsk->mmap_sem is held for writing.
   7.287 - */
   7.288 -void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address)
   7.289 -{
   7.290 -	pgd_t * pgd;
   7.291 -	pmd_t * pmd;
   7.292 -	pte_t * pte;
   7.293 -	struct vm_area_struct *vma; 
   7.294 -	pgprot_t prot = PAGE_COPY; 
   7.295 -
   7.296 -	if (page_count(page) != 1)
   7.297 -		printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address);
   7.298 -	pgd = pgd_offset(tsk->mm, address);
   7.299 -
   7.300 -	spin_lock(&tsk->mm->page_table_lock);
   7.301 -	pmd = pmd_alloc(tsk->mm, pgd, address);
   7.302 -	if (!pmd)
   7.303 -		goto out;
   7.304 -	pte = pte_alloc(tsk->mm, pmd, address);
   7.305 -	if (!pte)
   7.306 -		goto out;
   7.307 -	if (!pte_none(*pte))
   7.308 -		goto out;
   7.309 -	lru_cache_add(page);
   7.310 -	flush_dcache_page(page);
   7.311 -	flush_page_to_ram(page);
   7.312 -	/* lookup is cheap because there is only a single entry in the list */
   7.313 -	vma = find_vma(tsk->mm, address);
   7.314 -	if (vma)
   7.315 -		prot = vma->vm_page_prot;
   7.316 -	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))));
   7.317 -	XEN_flush_page_update_queue();
   7.318 -	tsk->mm->rss++;
   7.319 -	spin_unlock(&tsk->mm->page_table_lock);
   7.320 -
   7.321 -	/* no need for flush_tlb */
   7.322 -	return;
   7.323 -out:
   7.324 -	spin_unlock(&tsk->mm->page_table_lock);
   7.325 -	__free_page(page);
   7.326 -	force_sig(SIGKILL, tsk);
   7.327 -	return;
   7.328 -}
   7.329 -
   7.330 -int setup_arg_pages(struct linux_binprm *bprm)
   7.331 -{
   7.332 -	unsigned long stack_base;
   7.333 -	struct vm_area_struct *mpnt;
   7.334 -	int i, ret;
   7.335 -
   7.336 -	stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE;
   7.337 -
   7.338 -	bprm->p += stack_base;
   7.339 -	if (bprm->loader)
   7.340 -		bprm->loader += stack_base;
   7.341 -	bprm->exec += stack_base;
   7.342 -
   7.343 -	mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
   7.344 -	if (!mpnt) 
   7.345 -		return -ENOMEM; 
   7.346 -	
   7.347 -	down_write(&current->mm->mmap_sem);
   7.348 -	{
   7.349 -		mpnt->vm_mm = current->mm;
   7.350 -		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
   7.351 -		mpnt->vm_end = STACK_TOP;
   7.352 -		mpnt->vm_flags = VM_STACK_FLAGS;
   7.353 -		mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7];
   7.354 -		mpnt->vm_ops = NULL;
   7.355 -		mpnt->vm_pgoff = 0;
   7.356 -		mpnt->vm_file = NULL;
   7.357 -		mpnt->vm_private_data = (void *) 0;
   7.358 -		if ((ret = insert_vm_struct(current->mm, mpnt))) {
   7.359 -			up_write(&current->mm->mmap_sem);
   7.360 -			kmem_cache_free(vm_area_cachep, mpnt);
   7.361 -			return ret;
   7.362 -		}
   7.363 -		current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
   7.364 -	} 
   7.365 -
   7.366 -	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
   7.367 -		struct page *page = bprm->page[i];
   7.368 -		if (page) {
   7.369 -			bprm->page[i] = NULL;
   7.370 -			put_dirty_page(current,page,stack_base);
   7.371 -		}
   7.372 -		stack_base += PAGE_SIZE;
   7.373 -	}
   7.374 -	up_write(&current->mm->mmap_sem);
   7.375 -	
   7.376 -	return 0;
   7.377 -}
   7.378 -
   7.379 -struct file *open_exec(const char *name)
   7.380 -{
   7.381 -	struct nameidata nd;
   7.382 -	struct inode *inode;
   7.383 -	struct file *file;
   7.384 -	int err = 0;
   7.385 -
   7.386 -	err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd);
   7.387 -	file = ERR_PTR(err);
   7.388 -	if (!err) {
   7.389 -		inode = nd.dentry->d_inode;
   7.390 -		file = ERR_PTR(-EACCES);
   7.391 -		if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
   7.392 -		    S_ISREG(inode->i_mode)) {
   7.393 -			int err = permission(inode, MAY_EXEC);
   7.394 -			if (!err && !(inode->i_mode & 0111))
   7.395 -				err = -EACCES;
   7.396 -			file = ERR_PTR(err);
   7.397 -			if (!err) {
   7.398 -				file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
   7.399 -				if (!IS_ERR(file)) {
   7.400 -					err = deny_write_access(file);
   7.401 -					if (err) {
   7.402 -						fput(file);
   7.403 -						file = ERR_PTR(err);
   7.404 -					}
   7.405 -				}
   7.406 -out:
   7.407 -				return file;
   7.408 -			}
   7.409 -		}
   7.410 -		path_release(&nd);
   7.411 -	}
   7.412 -	goto out;
   7.413 -}
   7.414 -
   7.415 -int kernel_read(struct file *file, unsigned long offset,
   7.416 -	char * addr, unsigned long count)
   7.417 -{
   7.418 -	mm_segment_t old_fs;
   7.419 -	loff_t pos = offset;
   7.420 -	int result = -ENOSYS;
   7.421 -
   7.422 -	if (!file->f_op->read)
   7.423 -		goto fail;
   7.424 -	old_fs = get_fs();
   7.425 -	set_fs(get_ds());
   7.426 -	result = file->f_op->read(file, addr, count, &pos);
   7.427 -	set_fs(old_fs);
   7.428 -fail:
   7.429 -	return result;
   7.430 -}
   7.431 -
   7.432 -static int exec_mmap(void)
   7.433 -{
   7.434 -	struct mm_struct * mm, * old_mm;
   7.435 -
   7.436 -	old_mm = current->mm;
   7.437 -
   7.438 -	if (old_mm && atomic_read(&old_mm->mm_users) == 1) {
   7.439 -		mm_release();
   7.440 -		down_write(&old_mm->mmap_sem);
   7.441 -		exit_mmap(old_mm);
   7.442 -		up_write(&old_mm->mmap_sem);
   7.443 -		return 0;
   7.444 -	}
   7.445 -
   7.446 -
   7.447 -	mm = mm_alloc();
   7.448 -	if (mm) {
   7.449 -		struct mm_struct *active_mm;
   7.450 -
   7.451 -		if (init_new_context(current, mm)) {
   7.452 -			mmdrop(mm);
   7.453 -			return -ENOMEM;
   7.454 -		}
   7.455 -
   7.456 -		/* Add it to the list of mm's */
   7.457 -		spin_lock(&mmlist_lock);
   7.458 -		list_add(&mm->mmlist, &init_mm.mmlist);
   7.459 -		mmlist_nr++;
   7.460 -		spin_unlock(&mmlist_lock);
   7.461 -
   7.462 -		task_lock(current);
   7.463 -		active_mm = current->active_mm;
   7.464 -		current->mm = mm;
   7.465 -		current->active_mm = mm;
   7.466 -		task_unlock(current);
   7.467 -		activate_mm(active_mm, mm);
   7.468 -		mm_release();
   7.469 -		if (old_mm) {
   7.470 -			if (active_mm != old_mm) BUG();
   7.471 -			mmput(old_mm);
   7.472 -			return 0;
   7.473 -		}
   7.474 -		mmdrop(active_mm);
   7.475 -		return 0;
   7.476 -	}
   7.477 -	return -ENOMEM;
   7.478 -}
   7.479 -
   7.480 -/*
   7.481 - * This function makes sure the current process has its own signal table,
   7.482 - * so that flush_signal_handlers can later reset the handlers without
   7.483 - * disturbing other processes.  (Other processes might share the signal
   7.484 - * table via the CLONE_SIGNAL option to clone().)
   7.485 - */
   7.486 - 
   7.487 -static inline int make_private_signals(void)
   7.488 -{
   7.489 -	struct signal_struct * newsig;
   7.490 -
   7.491 -	if (atomic_read(&current->sig->count) <= 1)
   7.492 -		return 0;
   7.493 -	newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
   7.494 -	if (newsig == NULL)
   7.495 -		return -ENOMEM;
   7.496 -	spin_lock_init(&newsig->siglock);
   7.497 -	atomic_set(&newsig->count, 1);
   7.498 -	memcpy(newsig->action, current->sig->action, sizeof(newsig->action));
   7.499 -	spin_lock_irq(&current->sigmask_lock);
   7.500 -	current->sig = newsig;
   7.501 -	spin_unlock_irq(&current->sigmask_lock);
   7.502 -	return 0;
   7.503 -}
   7.504 -	
   7.505 -/*
   7.506 - * If make_private_signals() made a copy of the signal table, decrement the
   7.507 - * refcount of the original table, and free it if necessary.
   7.508 - * We don't do that in make_private_signals() so that we can back off
   7.509 - * in flush_old_exec() if an error occurs after calling make_private_signals().
   7.510 - */
   7.511 -
   7.512 -static inline void release_old_signals(struct signal_struct * oldsig)
   7.513 -{
   7.514 -	if (current->sig == oldsig)
   7.515 -		return;
   7.516 -	if (atomic_dec_and_test(&oldsig->count))
   7.517 -		kmem_cache_free(sigact_cachep, oldsig);
   7.518 -}
   7.519 -
   7.520 -/*
   7.521 - * These functions flushes out all traces of the currently running executable
   7.522 - * so that a new one can be started
   7.523 - */
   7.524 -
   7.525 -static inline void flush_old_files(struct files_struct * files)
   7.526 -{
   7.527 -	long j = -1;
   7.528 -
   7.529 -	write_lock(&files->file_lock);
   7.530 -	for (;;) {
   7.531 -		unsigned long set, i;
   7.532 -
   7.533 -		j++;
   7.534 -		i = j * __NFDBITS;
   7.535 -		if (i >= files->max_fds || i >= files->max_fdset)
   7.536 -			break;
   7.537 -		set = files->close_on_exec->fds_bits[j];
   7.538 -		if (!set)
   7.539 -			continue;
   7.540 -		files->close_on_exec->fds_bits[j] = 0;
   7.541 -		write_unlock(&files->file_lock);
   7.542 -		for ( ; set ; i++,set >>= 1) {
   7.543 -			if (set & 1) {
   7.544 -				sys_close(i);
   7.545 -			}
   7.546 -		}
   7.547 -		write_lock(&files->file_lock);
   7.548 -
   7.549 -	}
   7.550 -	write_unlock(&files->file_lock);
   7.551 -}
   7.552 -
   7.553 -/*
   7.554 - * An execve() will automatically "de-thread" the process.
   7.555 - * Note: we don't have to hold the tasklist_lock to test
    7.556 - * whether we might need to do this. If we're not part of
   7.557 - * a thread group, there is no way we can become one
   7.558 - * dynamically. And if we are, we only need to protect the
   7.559 - * unlink - even if we race with the last other thread exit,
   7.560 - * at worst the list_del_init() might end up being a no-op.
   7.561 - */
   7.562 -static inline void de_thread(struct task_struct *tsk)
   7.563 -{
   7.564 -	if (!list_empty(&tsk->thread_group)) {
   7.565 -		write_lock_irq(&tasklist_lock);
   7.566 -		list_del_init(&tsk->thread_group);
   7.567 -		write_unlock_irq(&tasklist_lock);
   7.568 -	}
   7.569 -
   7.570 -	/* Minor oddity: this might stay the same. */
   7.571 -	tsk->tgid = tsk->pid;
   7.572 -}
   7.573 -
   7.574 -void get_task_comm(char *buf, struct task_struct *tsk)
   7.575 -{
   7.576 -	/* buf must be at least sizeof(tsk->comm) in size */
   7.577 -	task_lock(tsk);
   7.578 -	memcpy(buf, tsk->comm, sizeof(tsk->comm));
   7.579 -	task_unlock(tsk);
   7.580 -}
   7.581 -
   7.582 -void set_task_comm(struct task_struct *tsk, char *buf)
   7.583 -{
   7.584 -	task_lock(tsk);
   7.585 -	strncpy(tsk->comm, buf, sizeof(tsk->comm));
   7.586 -	tsk->comm[sizeof(tsk->comm)-1]='\0';
   7.587 -	task_unlock(tsk);
   7.588 -}
   7.589 -
   7.590 -int flush_old_exec(struct linux_binprm * bprm)
   7.591 -{
   7.592 -	char * name;
   7.593 -	int i, ch, retval;
   7.594 -	struct signal_struct * oldsig;
   7.595 -	struct files_struct * files;
   7.596 -	char tcomm[sizeof(current->comm)];
   7.597 -
   7.598 -	/*
   7.599 -	 * Make sure we have a private signal table
   7.600 -	 */
   7.601 -	oldsig = current->sig;
   7.602 -	retval = make_private_signals();
   7.603 -	if (retval) goto flush_failed;
   7.604 -
   7.605 -	/*
   7.606 -	 * Make sure we have private file handles. Ask the
   7.607 -	 * fork helper to do the work for us and the exit
   7.608 -	 * helper to do the cleanup of the old one.
   7.609 -	 */
   7.610 -	 
   7.611 -	files = current->files;		/* refcounted so safe to hold */
   7.612 -	retval = unshare_files();
   7.613 -	if(retval)
   7.614 -		goto flush_failed;
   7.615 -	
   7.616 -	/* 
   7.617 -	 * Release all of the old mmap stuff
   7.618 -	 */
   7.619 -	retval = exec_mmap();
   7.620 -	if (retval) goto mmap_failed;
   7.621 -
   7.622 -	/* This is the point of no return */
   7.623 -	steal_locks(files);
   7.624 -	put_files_struct(files);
   7.625 -	release_old_signals(oldsig);
   7.626 -
   7.627 -	current->sas_ss_sp = current->sas_ss_size = 0;
   7.628 -
   7.629 -	if (current->euid == current->uid && current->egid == current->gid) {
   7.630 -		current->mm->dumpable = 1;
   7.631 -		current->task_dumpable = 1;
   7.632 -	}
   7.633 -	name = bprm->filename;
   7.634 -	for (i=0; (ch = *(name++)) != '\0';) {
   7.635 -		if (ch == '/')
   7.636 -			i = 0;
   7.637 -		else
   7.638 -			if (i < (sizeof(tcomm) - 1))
   7.639 -				tcomm[i++] = ch;
   7.640 -	}
   7.641 -	tcomm[i] = '\0';
   7.642 -	set_task_comm(current, tcomm);
   7.643 -
   7.644 -	flush_thread();
   7.645 -
   7.646 -	de_thread(current);
   7.647 -
   7.648 -	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || 
   7.649 -	    permission(bprm->file->f_dentry->d_inode,MAY_READ))
   7.650 -		current->mm->dumpable = 0;
   7.651 -
   7.652 -	/* An exec changes our domain. We are no longer part of the thread
   7.653 -	   group */
   7.654 -	   
   7.655 -	current->self_exec_id++;
   7.656 -			
   7.657 -	flush_signal_handlers(current);
   7.658 -	flush_old_files(current->files);
   7.659 -
   7.660 -	return 0;
   7.661 -
   7.662 -mmap_failed:
   7.663 -	put_files_struct(current->files);
   7.664 -	current->files = files;
   7.665 -flush_failed:
   7.666 -	spin_lock_irq(&current->sigmask_lock);
   7.667 -	if (current->sig != oldsig) {
   7.668 -		kmem_cache_free(sigact_cachep, current->sig);
   7.669 -		current->sig = oldsig;
   7.670 -	}
   7.671 -	spin_unlock_irq(&current->sigmask_lock);
   7.672 -	return retval;
   7.673 -}
   7.674 -
   7.675 -/*
   7.676 - * We mustn't allow tracing of suid binaries, unless
   7.677 - * the tracer has the capability to trace anything..
   7.678 - */
   7.679 -static inline int must_not_trace_exec(struct task_struct * p)
   7.680 -{
   7.681 -	return (p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP);
   7.682 -}
   7.683 -
   7.684 -/* 
   7.685 - * Fill the binprm structure from the inode. 
   7.686 - * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
   7.687 - */
   7.688 -int prepare_binprm(struct linux_binprm *bprm)
   7.689 -{
   7.690 -	int mode;
   7.691 -	struct inode * inode = bprm->file->f_dentry->d_inode;
   7.692 -
   7.693 -	mode = inode->i_mode;
   7.694 -	/*
   7.695 -	 * Check execute perms again - if the caller has CAP_DAC_OVERRIDE,
   7.696 -	 * vfs_permission lets a non-executable through
   7.697 -	 */
   7.698 -	if (!(mode & 0111))	/* with at least _one_ execute bit set */
   7.699 -		return -EACCES;
   7.700 -	if (bprm->file->f_op == NULL)
   7.701 -		return -EACCES;
   7.702 -
   7.703 -	bprm->e_uid = current->euid;
   7.704 -	bprm->e_gid = current->egid;
   7.705 -
   7.706 -	if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) {
   7.707 -		/* Set-uid? */
   7.708 -		if (mode & S_ISUID)
   7.709 -			bprm->e_uid = inode->i_uid;
   7.710 -
   7.711 -		/* Set-gid? */
   7.712 -		/*
   7.713 -		 * If setgid is set but no group execute bit then this
   7.714 -		 * is a candidate for mandatory locking, not a setgid
   7.715 -		 * executable.
   7.716 -		 */
   7.717 -		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
   7.718 -			bprm->e_gid = inode->i_gid;
   7.719 -	}
   7.720 -
   7.721 -	/* We don't have VFS support for capabilities yet */
   7.722 -	cap_clear(bprm->cap_inheritable);
   7.723 -	cap_clear(bprm->cap_permitted);
   7.724 -	cap_clear(bprm->cap_effective);
   7.725 -
   7.726 -	/*  To support inheritance of root-permissions and suid-root
   7.727 -         *  executables under compatibility mode, we raise all three
   7.728 -         *  capability sets for the file.
   7.729 -         *
   7.730 -         *  If only the real uid is 0, we only raise the inheritable
   7.731 -         *  and permitted sets of the executable file.
   7.732 -         */
   7.733 -
   7.734 -	if (!issecure(SECURE_NOROOT)) {
   7.735 -		if (bprm->e_uid == 0 || current->uid == 0) {
   7.736 -			cap_set_full(bprm->cap_inheritable);
   7.737 -			cap_set_full(bprm->cap_permitted);
   7.738 -		}
   7.739 -		if (bprm->e_uid == 0) 
   7.740 -			cap_set_full(bprm->cap_effective);
   7.741 -	}
   7.742 -
   7.743 -	memset(bprm->buf,0,BINPRM_BUF_SIZE);
   7.744 -	return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE);
   7.745 -}
   7.746 -
   7.747 -/*
   7.748 - * This function is used to produce the new IDs and capabilities
   7.749 - * from the old ones and the file's capabilities.
   7.750 - *
   7.751 - * The formula used for evolving capabilities is:
   7.752 - *
   7.753 - *       pI' = pI
   7.754 - * (***) pP' = (fP & X) | (fI & pI)
   7.755 - *       pE' = pP' & fE          [NB. fE is 0 or ~0]
   7.756 - *
   7.757 - * I=Inheritable, P=Permitted, E=Effective // p=process, f=file
   7.758 - * ' indicates post-exec(), and X is the global 'cap_bset'.
   7.759 - *
   7.760 - */
   7.761 -
   7.762 -void compute_creds(struct linux_binprm *bprm) 
   7.763 -{
   7.764 -	kernel_cap_t new_permitted, working;
   7.765 -	int do_unlock = 0;
   7.766 -
   7.767 -	new_permitted = cap_intersect(bprm->cap_permitted, cap_bset);
   7.768 -	working = cap_intersect(bprm->cap_inheritable,
   7.769 -				current->cap_inheritable);
   7.770 -	new_permitted = cap_combine(new_permitted, working);
   7.771 -
   7.772 -	if (bprm->e_uid != current->uid || bprm->e_gid != current->gid ||
   7.773 -	    !cap_issubset(new_permitted, current->cap_permitted)) {
   7.774 -                current->mm->dumpable = 0;
   7.775 -		
   7.776 -		lock_kernel();
   7.777 -		if (must_not_trace_exec(current)
   7.778 -		    || atomic_read(&current->fs->count) > 1
   7.779 -		    || atomic_read(&current->files->count) > 1
   7.780 -		    || atomic_read(&current->sig->count) > 1) {
   7.781 -			if(!capable(CAP_SETUID)) {
   7.782 -				bprm->e_uid = current->uid;
   7.783 -				bprm->e_gid = current->gid;
   7.784 -			}
   7.785 -			if(!capable(CAP_SETPCAP)) {
   7.786 -				new_permitted = cap_intersect(new_permitted,
   7.787 -							current->cap_permitted);
   7.788 -			}
   7.789 -		}
   7.790 -		do_unlock = 1;
   7.791 -	}
   7.792 -
   7.793 -
   7.794 -	/* For init, we want to retain the capabilities set
   7.795 -         * in the init_task struct. Thus we skip the usual
   7.796 -         * capability rules */
   7.797 -	if (current->pid != 1) {
   7.798 -		current->cap_permitted = new_permitted;
   7.799 -		current->cap_effective =
   7.800 -			cap_intersect(new_permitted, bprm->cap_effective);
   7.801 -	}
   7.802 -	
   7.803 -        /* AUD: Audit candidate if current->cap_effective is set */
   7.804 -
   7.805 -        current->suid = current->euid = current->fsuid = bprm->e_uid;
   7.806 -        current->sgid = current->egid = current->fsgid = bprm->e_gid;
   7.807 -
   7.808 -	if(do_unlock)
   7.809 -		unlock_kernel();
   7.810 -	current->keep_capabilities = 0;
   7.811 -}
   7.812 -
   7.813 -
   7.814 -void remove_arg_zero(struct linux_binprm *bprm)
   7.815 -{
   7.816 -	if (bprm->argc) {
   7.817 -		unsigned long offset;
   7.818 -		char * kaddr;
   7.819 -		struct page *page;
   7.820 -
   7.821 -		offset = bprm->p % PAGE_SIZE;
   7.822 -		goto inside;
   7.823 -
   7.824 -		while (bprm->p++, *(kaddr+offset++)) {
   7.825 -			if (offset != PAGE_SIZE)
   7.826 -				continue;
   7.827 -			offset = 0;
   7.828 -			kunmap(page);
   7.829 -inside:
   7.830 -			page = bprm->page[bprm->p/PAGE_SIZE];
   7.831 -			kaddr = kmap(page);
   7.832 -		}
   7.833 -		kunmap(page);
   7.834 -		bprm->argc--;
   7.835 -	}
   7.836 -}
   7.837 -
   7.838 -/*
   7.839 - * cycle the list of binary formats handler, until one recognizes the image
   7.840 - */
   7.841 -int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
   7.842 -{
   7.843 -	int try,retval=0;
   7.844 -	struct linux_binfmt *fmt;
   7.845 -#ifdef __alpha__
   7.846 -	/* handle /sbin/loader.. */
   7.847 -	{
   7.848 -	    struct exec * eh = (struct exec *) bprm->buf;
   7.849 -
   7.850 -	    if (!bprm->loader && eh->fh.f_magic == 0x183 &&
   7.851 -		(eh->fh.f_flags & 0x3000) == 0x3000)
   7.852 -	    {
   7.853 -		struct file * file;
   7.854 -		unsigned long loader;
   7.855 -
   7.856 -		allow_write_access(bprm->file);
   7.857 -		fput(bprm->file);
   7.858 -		bprm->file = NULL;
   7.859 -
   7.860 -	        loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
   7.861 -
   7.862 -		file = open_exec("/sbin/loader");
   7.863 -		retval = PTR_ERR(file);
   7.864 -		if (IS_ERR(file))
   7.865 -			return retval;
   7.866 -
   7.867 -		/* Remember if the application is TASO.  */
   7.868 -		bprm->sh_bang = eh->ah.entry < 0x100000000;
   7.869 -
   7.870 -		bprm->file = file;
   7.871 -		bprm->loader = loader;
   7.872 -		retval = prepare_binprm(bprm);
   7.873 -		if (retval<0)
   7.874 -			return retval;
   7.875 -		/* should call search_binary_handler recursively here,
   7.876 -		   but it does not matter */
   7.877 -	    }
   7.878 -	}
   7.879 -#endif
   7.880 -	/* kernel module loader fixup */
    7.881 -	/* so we don't try to run modprobe in kernel space. */
   7.882 -	set_fs(USER_DS);
   7.883 -	for (try=0; try<2; try++) {
   7.884 -		read_lock(&binfmt_lock);
   7.885 -		for (fmt = formats ; fmt ; fmt = fmt->next) {
   7.886 -			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
   7.887 -			if (!fn)
   7.888 -				continue;
   7.889 -			if (!try_inc_mod_count(fmt->module))
   7.890 -				continue;
   7.891 -			read_unlock(&binfmt_lock);
   7.892 -			retval = fn(bprm, regs);
   7.893 -			if (retval >= 0) {
   7.894 -				put_binfmt(fmt);
   7.895 -				allow_write_access(bprm->file);
   7.896 -				if (bprm->file)
   7.897 -					fput(bprm->file);
   7.898 -				bprm->file = NULL;
   7.899 -				current->did_exec = 1;
   7.900 -				return retval;
   7.901 -			}
   7.902 -			read_lock(&binfmt_lock);
   7.903 -			put_binfmt(fmt);
   7.904 -			if (retval != -ENOEXEC)
   7.905 -				break;
   7.906 -			if (!bprm->file) {
   7.907 -				read_unlock(&binfmt_lock);
   7.908 -				return retval;
   7.909 -			}
   7.910 -		}
   7.911 -		read_unlock(&binfmt_lock);
   7.912 -		if (retval != -ENOEXEC) {
   7.913 -			break;
   7.914 -#ifdef CONFIG_KMOD
   7.915 -		}else{
   7.916 -#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
   7.917 -			char modname[20];
   7.918 -			if (printable(bprm->buf[0]) &&
   7.919 -			    printable(bprm->buf[1]) &&
   7.920 -			    printable(bprm->buf[2]) &&
   7.921 -			    printable(bprm->buf[3]))
   7.922 -				break; /* -ENOEXEC */
   7.923 -			sprintf(modname, "binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
   7.924 -			request_module(modname);
   7.925 -#endif
   7.926 -		}
   7.927 -	}
   7.928 -	return retval;
   7.929 -}
   7.930 -
   7.931 -
   7.932 -/*
   7.933 - * sys_execve() executes a new program.
   7.934 - */
   7.935 -int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs)
   7.936 -{
   7.937 -	struct linux_binprm bprm;
   7.938 -	struct file *file;
   7.939 -	int retval;
   7.940 -	int i;
   7.941 -
   7.942 -	file = open_exec(filename);
   7.943 -
   7.944 -	retval = PTR_ERR(file);
   7.945 -	if (IS_ERR(file))
   7.946 -		return retval;
   7.947 -
   7.948 -	bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
   7.949 -	memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0])); 
   7.950 -
   7.951 -	bprm.file = file;
   7.952 -	bprm.filename = filename;
   7.953 -	bprm.sh_bang = 0;
   7.954 -	bprm.loader = 0;
   7.955 -	bprm.exec = 0;
   7.956 -	if ((bprm.argc = count(argv, bprm.p / sizeof(void *))) < 0) {
   7.957 -		allow_write_access(file);
   7.958 -		fput(file);
   7.959 -		return bprm.argc;
   7.960 -	}
   7.961 -
   7.962 -	if ((bprm.envc = count(envp, bprm.p / sizeof(void *))) < 0) {
   7.963 -		allow_write_access(file);
   7.964 -		fput(file);
   7.965 -		return bprm.envc;
   7.966 -	}
   7.967 -
   7.968 -	retval = prepare_binprm(&bprm);
   7.969 -	if (retval < 0) 
   7.970 -		goto out; 
   7.971 -
   7.972 -	retval = copy_strings_kernel(1, &bprm.filename, &bprm);
   7.973 -	if (retval < 0) 
   7.974 -		goto out; 
   7.975 -
   7.976 -	bprm.exec = bprm.p;
   7.977 -	retval = copy_strings(bprm.envc, envp, &bprm);
   7.978 -	if (retval < 0) 
   7.979 -		goto out; 
   7.980 -
   7.981 -	retval = copy_strings(bprm.argc, argv, &bprm);
   7.982 -	if (retval < 0) 
   7.983 -		goto out; 
   7.984 -
   7.985 -	retval = search_binary_handler(&bprm,regs);
   7.986 -	if (retval >= 0)
   7.987 -		/* execve success */
   7.988 -		return retval;
   7.989 -
   7.990 -out:
   7.991 -	/* Something went wrong, return the inode and free the argument pages*/
   7.992 -	allow_write_access(bprm.file);
   7.993 -	if (bprm.file)
   7.994 -		fput(bprm.file);
   7.995 -
   7.996 -	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
   7.997 -		struct page * page = bprm.page[i];
   7.998 -		if (page)
   7.999 -			__free_page(page);
  7.1000 -	}
  7.1001 -
  7.1002 -	return retval;
  7.1003 -}
  7.1004 -
  7.1005 -void set_binfmt(struct linux_binfmt *new)
  7.1006 -{
  7.1007 -	struct linux_binfmt *old = current->binfmt;
  7.1008 -	if (new && new->module)
  7.1009 -		__MOD_INC_USE_COUNT(new->module);
  7.1010 -	current->binfmt = new;
  7.1011 -	if (old && old->module)
  7.1012 -		__MOD_DEC_USE_COUNT(old->module);
  7.1013 -}
  7.1014 -
  7.1015 -#define CORENAME_MAX_SIZE 64
  7.1016 -
  7.1017 -/* format_corename will inspect the pattern parameter, and output a
  7.1018 - * name into corename, which must have space for at least
  7.1019 - * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
  7.1020 - */
  7.1021 -void format_corename(char *corename, const char *pattern, long signr)
  7.1022 -{
  7.1023 -	const char *pat_ptr = pattern;
  7.1024 -	char *out_ptr = corename;
  7.1025 -	char *const out_end = corename + CORENAME_MAX_SIZE;
  7.1026 -	int rc;
  7.1027 -	int pid_in_pattern = 0;
  7.1028 -
  7.1029 -	/* Repeat as long as we have more pattern to process and more output
  7.1030 -	   space */
  7.1031 -	while (*pat_ptr) {
  7.1032 -		if (*pat_ptr != '%') {
  7.1033 -			if (out_ptr == out_end)
  7.1034 -				goto out;
  7.1035 -			*out_ptr++ = *pat_ptr++;
  7.1036 -		} else {
  7.1037 -			switch (*++pat_ptr) {
  7.1038 -			case 0:
  7.1039 -				goto out;
  7.1040 -			/* Double percent, output one percent */
  7.1041 -			case '%':
  7.1042 -				if (out_ptr == out_end)
  7.1043 -					goto out;
  7.1044 -				*out_ptr++ = '%';
  7.1045 -				break;
  7.1046 -			/* pid */
  7.1047 -			case 'p':
  7.1048 -				pid_in_pattern = 1;
  7.1049 -				rc = snprintf(out_ptr, out_end - out_ptr,
  7.1050 -					      "%d", current->pid);
  7.1051 -				if (rc > out_end - out_ptr)
  7.1052 -					goto out;
  7.1053 -				out_ptr += rc;
  7.1054 -				break;
  7.1055 -			/* uid */
  7.1056 -			case 'u':
  7.1057 -				rc = snprintf(out_ptr, out_end - out_ptr,
  7.1058 -					      "%d", current->uid);
  7.1059 -				if (rc > out_end - out_ptr)
  7.1060 -					goto out;
  7.1061 -				out_ptr += rc;
  7.1062 -				break;
  7.1063 -			/* gid */
  7.1064 -			case 'g':
  7.1065 -				rc = snprintf(out_ptr, out_end - out_ptr,
  7.1066 -					      "%d", current->gid);
  7.1067 -				if (rc > out_end - out_ptr)
  7.1068 -					goto out;
  7.1069 -				out_ptr += rc;
  7.1070 -				break;
  7.1071 -			/* signal that caused the coredump */
  7.1072 -			case 's':
  7.1073 -				rc = snprintf(out_ptr, out_end - out_ptr,
  7.1074 -					      "%ld", signr);
  7.1075 -				if (rc > out_end - out_ptr)
  7.1076 -					goto out;
  7.1077 -				out_ptr += rc;
  7.1078 -				break;
  7.1079 -			/* UNIX time of coredump */
  7.1080 -			case 't': {
  7.1081 -				struct timeval tv;
  7.1082 -				do_gettimeofday(&tv);
  7.1083 -				rc = snprintf(out_ptr, out_end - out_ptr,
  7.1084 -					      "%ld", tv.tv_sec);
  7.1085 -				if (rc > out_end - out_ptr)
  7.1086 -					goto out;
  7.1087 -				out_ptr += rc;
  7.1088 -				break;
  7.1089 -			}
  7.1090 -			/* hostname */
  7.1091 -			case 'h':
  7.1092 -				down_read(&uts_sem);
  7.1093 -				rc = snprintf(out_ptr, out_end - out_ptr,
  7.1094 -					      "%s", system_utsname.nodename);
  7.1095 -				up_read(&uts_sem);
  7.1096 -				if (rc > out_end - out_ptr)
  7.1097 -					goto out;
  7.1098 -				out_ptr += rc;
  7.1099 -				break;
  7.1100 -			/* executable */
  7.1101 -			case 'e':
  7.1102 -				rc = snprintf(out_ptr, out_end - out_ptr,
  7.1103 -					      "%s", current->comm);
  7.1104 -				if (rc > out_end - out_ptr)
  7.1105 -					goto out;
  7.1106 -				out_ptr += rc;
  7.1107 -				break;
  7.1108 -			default:
  7.1109 -				break;
  7.1110 -			}
  7.1111 -			++pat_ptr;
  7.1112 -		}
  7.1113 -	}
  7.1114 -	/* Backward compatibility with core_uses_pid:
  7.1115 -	 *
  7.1116 -	 * If core_pattern does not include a %p (as is the default)
  7.1117 -	 * and core_uses_pid is set, then .%pid will be appended to
  7.1118 -	 * the filename */
  7.1119 -	if (!pid_in_pattern
  7.1120 -            && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
  7.1121 -		rc = snprintf(out_ptr, out_end - out_ptr,
  7.1122 -			      ".%d", current->pid);
  7.1123 -		if (rc > out_end - out_ptr)
  7.1124 -			goto out;
  7.1125 -		out_ptr += rc;
  7.1126 -	}
  7.1127 -      out:
  7.1128 -	*out_ptr = 0;
  7.1129 -}
  7.1130 -
  7.1131 -int do_coredump(long signr, struct pt_regs * regs)
  7.1132 -{
  7.1133 -	struct linux_binfmt * binfmt;
  7.1134 -	char corename[CORENAME_MAX_SIZE + 1];
  7.1135 -	struct file * file;
  7.1136 -	struct inode * inode;
  7.1137 -	int retval = 0;
  7.1138 -	int fsuid = current->fsuid;
  7.1139 -
  7.1140 -	lock_kernel();
  7.1141 -	binfmt = current->binfmt;
  7.1142 -	if (!binfmt || !binfmt->core_dump)
  7.1143 -		goto fail;
  7.1144 -	if (!is_dumpable(current))
  7.1145 -	{
  7.1146 -		if(!core_setuid_ok || !current->task_dumpable)
  7.1147 -			goto fail;
  7.1148 -		current->fsuid = 0;
  7.1149 -	}
  7.1150 -	current->mm->dumpable = 0;
  7.1151 -	if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump)
  7.1152 -		goto fail;
  7.1153 -
  7.1154 - 	format_corename(corename, core_pattern, signr);
  7.1155 -	file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW, 0600);
  7.1156 -	if (IS_ERR(file))
  7.1157 -		goto fail;
  7.1158 -	inode = file->f_dentry->d_inode;
  7.1159 -	if (inode->i_nlink > 1)
  7.1160 -		goto close_fail;	/* multiple links - don't dump */
  7.1161 -	if (d_unhashed(file->f_dentry))
  7.1162 -		goto close_fail;
  7.1163 -
  7.1164 -	if (!S_ISREG(inode->i_mode))
  7.1165 -		goto close_fail;
  7.1166 -	if (!file->f_op)
  7.1167 -		goto close_fail;
  7.1168 -	if (!file->f_op->write)
  7.1169 -		goto close_fail;
  7.1170 -	if (do_truncate(file->f_dentry, 0) != 0)
  7.1171 -		goto close_fail;
  7.1172 -
  7.1173 -	retval = binfmt->core_dump(signr, regs, file);
  7.1174 -
  7.1175 -close_fail:
  7.1176 -	filp_close(file, NULL);
  7.1177 -fail:
  7.1178 -	if (fsuid != current->fsuid)
  7.1179 -		current->fsuid = fsuid;
  7.1180 -	unlock_kernel();
  7.1181 -	return retval;
  7.1182 -}
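
The sparse-tree copy of fs/exec.c above is deleted outright (and removed from .rootkeys in the first hunk). Its only Xen-specific deviation from the stock 2.4 file appears to be the flush in put_dirty_page():

    set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))));
    XEN_flush_page_update_queue();  /* redundant once set_pte is a store */

Once that call is unnecessary, the vanilla kernel file can be used unmodified and the fork no longer needs to be carried.
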
     8.1 --- a/linux-2.4.29-xen-sparse/include/asm-xen/page.h	Fri Mar 25 19:30:52 2005 +0000
     8.2 +++ b/linux-2.4.29-xen-sparse/include/asm-xen/page.h	Fri Mar 25 20:03:52 2005 +0000
     8.3 @@ -85,23 +85,18 @@ typedef struct { unsigned long pgprot; }
     8.4  static inline unsigned long pmd_val(pmd_t x)
     8.5  {
     8.6      unsigned long ret = x.pmd;
     8.7 -    if ( (ret & 1) ) ret = machine_to_phys(ret);
     8.8 +    if ( ret ) ret = machine_to_phys(ret) | 1;
     8.9      return ret;
    8.10  }
    8.11  #define pmd_val_ma(x)   ((x).pmd)
    8.12  #define pgd_val(x)	({ BUG(); (unsigned long)0; })
    8.13  #define pgprot_val(x)	((x).pgprot)
    8.14  
    8.15 -static inline pte_t __pte(unsigned long x)
    8.16 -{
    8.17 -    if ( (x & 1) ) x = phys_to_machine(x);
    8.18 -    return ((pte_t) { (x) });
    8.19 -}
    8.20 -static inline pmd_t __pmd(unsigned long x)
    8.21 -{
    8.22 -    if ( (x & 1) ) x = phys_to_machine(x);
    8.23 -    return ((pmd_t) { (x) });
    8.24 -}
    8.25 +#define __pte(x) ({ unsigned long _x = (x); \
    8.26 +    (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); })
    8.27 +#define __pte_ma(x)     ((pte_t) { (x) } )
    8.28 +#define __pmd(x) ({ unsigned long _x = (x); \
    8.29 +    (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); })
    8.30  #define __pgd(x) ({ BUG(); (pgprot_t) { 0 }; })
    8.31  #define __pgprot(x)	((pgprot_t) { (x) } )
    8.32  
     9.1 --- a/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h	Fri Mar 25 19:30:52 2005 +0000
     9.2 +++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h	Fri Mar 25 20:03:52 2005 +0000
     9.3 @@ -22,7 +22,6 @@
     9.4  #define pmd_populate(mm, pmd, pte) 		  \
     9.5   do {                                             \
     9.6    set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));   \
     9.7 -  XEN_flush_page_update_queue();                 \
     9.8   } while ( 0 )
     9.9  
    9.10  /*
    9.11 @@ -79,8 +78,9 @@ static inline pgd_t *get_pgd_slow(void)
    9.12  		memcpy(pgd + USER_PTRS_PER_PGD,
    9.13  			init_mm.pgd + USER_PTRS_PER_PGD,
    9.14  			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
    9.15 -                __make_page_readonly(pgd);
    9.16 +		__make_page_readonly(pgd);
    9.17  		queue_pgd_pin(__pa(pgd));
    9.18 +		flush_page_update_queue();
    9.19  	}
    9.20  	return pgd;
    9.21  }
    9.22 @@ -111,7 +111,8 @@ static inline void free_pgd_slow(pgd_t *
    9.23  	kmem_cache_free(pae_pgd_cachep, pgd);
    9.24  #else
    9.25  	queue_pgd_unpin(__pa(pgd));
    9.26 -        __make_page_writable(pgd);
    9.27 +	__make_page_writable(pgd);
    9.28 +	flush_page_update_queue();
    9.29  	free_page((unsigned long)pgd);
    9.30  #endif
    9.31  }
    9.32 @@ -135,6 +136,7 @@ static inline pte_t *pte_alloc_one(struc
    9.33          clear_page(pte);
    9.34          __make_page_readonly(pte);
    9.35          queue_pte_pin(__pa(pte));
    9.36 +        flush_page_update_queue();
    9.37      }
    9.38      return pte;
    9.39  
    9.40 @@ -155,6 +157,7 @@ static __inline__ void pte_free_slow(pte
    9.41  {
    9.42      queue_pte_unpin(__pa(pte));
    9.43      __make_page_writable(pte);
    9.44 +    flush_page_update_queue();
    9.45      free_page((unsigned long)pte);
    9.46  }
    9.47  
    9.48 @@ -208,22 +211,19 @@ extern int do_check_pgt_cache(int, int);
    9.49  
    9.50  static inline void flush_tlb_mm(struct mm_struct *mm)
    9.51  {
    9.52 -	if (mm == current->active_mm) queue_tlb_flush();
    9.53 -	XEN_flush_page_update_queue();
    9.54 +	if (mm == current->active_mm) xen_tlb_flush();
    9.55  }
    9.56  
    9.57  static inline void flush_tlb_page(struct vm_area_struct *vma,
    9.58  	unsigned long addr)
    9.59  {
    9.60 -	if (vma->vm_mm == current->active_mm) queue_invlpg(addr);
    9.61 -	XEN_flush_page_update_queue();
    9.62 +	if (vma->vm_mm == current->active_mm) xen_invlpg(addr);
    9.63  }
    9.64  
    9.65  static inline void flush_tlb_range(struct mm_struct *mm,
    9.66  	unsigned long start, unsigned long end)
    9.67  {
    9.68 -	if (mm == current->active_mm) queue_tlb_flush();
    9.69 -	XEN_flush_page_update_queue();
    9.70 +	if (mm == current->active_mm) xen_tlb_flush();
    9.71  }
    9.72  
    9.73  #else
    9.74 @@ -261,7 +261,6 @@ static inline void flush_tlb_pgtables(st
    9.75  				      unsigned long start, unsigned long end)
    9.76  {
    9.77      /* i386 does not keep any page table caches in TLB */
    9.78 -    XEN_flush_page_update_queue();
    9.79  }
    9.80  
    9.81  /*
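
Two distinct things happen in pgalloc.h. Operations that still travel through the update queue (read-only remapping plus pgd/pte pin and unpin) now end with an explicit flush_page_update_queue(), while the flush_tlb_* helpers stop queueing altogether and call the immediate xen_tlb_flush()/xen_invlpg() primitives. The allocation-side discipline, restated with comments (helper bodies are defined elsewhere in the tree):

    __make_page_readonly(pgd);   /* queued: Xen must see the pgd R/O  */
    queue_pgd_pin(__pa(pgd));    /* queued: pin it as a top-level PT  */
    flush_page_update_queue();   /* push both before the pgd is used  */
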
    10.1 --- a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h	Fri Mar 25 19:30:52 2005 +0000
    10.2 +++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h	Fri Mar 25 20:03:52 2005 +0000
    10.3 @@ -34,9 +34,19 @@ static inline int pgd_bad(pgd_t pgd)		{ 
    10.4  static inline int pgd_present(pgd_t pgd)	{ return 1; }
    10.5  #define pgd_clear(xp)				do { } while (0)
    10.6  
    10.7 -#define set_pte(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
    10.8 -#define set_pte_atomic(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
    10.9 -#define set_pmd(pmdptr, pmdval) queue_l2_entry_update((pmdptr), (pmdval))
   10.10 +/*
   10.11 + * Certain architectures need to do special things when PTEs
   10.12 + * within a page table are directly modified.  Thus, the following
   10.13 + * hook is made available.
   10.14 + */
   10.15 +#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
   10.16 +#define set_pte_atomic(pteptr, pteval) (*(pteptr) = pteval)
   10.17 +
   10.18 +/*
    10.19 + * (pmds are folded into pgds so this doesn't actually get called,
   10.20 + * but the define is needed for a generic inline function.)
   10.21 + */
   10.22 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
   10.23  #define set_pgd(pgdptr, pgdval) ((void)0)
   10.24  
   10.25  #define pgd_page(pgd) \
   10.26 @@ -47,6 +57,7 @@ static inline pmd_t * pmd_offset(pgd_t *
   10.27  	return (pmd_t *) dir;
   10.28  }
   10.29  
   10.30 +#define ptep_get_and_clear(xp)	__pte_ma(xchg(&(xp)->pte_low, 0))
   10.31  #define pte_same(a, b)		((a).pte_low == (b).pte_low)
   10.32  
   10.33  /*                                 
   10.34 @@ -83,21 +94,4 @@ static inline pmd_t * pmd_offset(pgd_t *
   10.35  #define pte_none(x)		(!(x).pte_low)
   10.36  #define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
   10.37  
   10.38 -/*
   10.39 - * A note on implementation of this atomic 'get-and-clear' operation.
   10.40 - * This is actually very simple because XenoLinux can only run on a single
   10.41 - * processor. Therefore, we cannot race other processors setting the 'accessed'
   10.42 - * or 'dirty' bits on a page-table entry.
   10.43 - * Even if pages are shared between domains, that is not a problem because
   10.44 - * each domain will have separate page tables, with their own versions of
   10.45 - * accessed & dirty state.
   10.46 - */
   10.47 -static inline pte_t ptep_get_and_clear(pte_t *xp)
   10.48 -{
   10.49 -    pte_t pte = *xp;
   10.50 -    if ( !pte_none(pte) )
   10.51 -        queue_l1_entry_update(xp, 0);
   10.52 -    return pte;
   10.53 -}
   10.54 -
   10.55  #endif /* _I386_PGTABLE_2LEVEL_H */
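
Since PTEs are written in place now, set_pte() and set_pte_atomic() become plain stores, and ptep_get_and_clear() becomes an atomic xchg of pte_low that hands the raw entry back through __pte_ma() (no physical translation, no queue). The deleted single-processor justification for the old read-then-queue version is obsolete with it. A user-space model of the new primitive (a GCC builtin stands in for the x86 xchg instruction):

    #include <stdint.h>
    #include <stdio.h>

    /* Model of ptep_get_and_clear(): atomically swap in zero and
     * return the previous raw entry, exactly once even if a hardware
     * walker is setting ACCESSED/DIRTY bits concurrently. */
    static uint32_t pte_get_and_clear(uint32_t *pte_low)
    {
        return __atomic_exchange_n(pte_low, 0u, __ATOMIC_SEQ_CST);
    }

    int main(void)
    {
        uint32_t pte = 0x2063u;
        printf("old=%08x now=%08x\n", pte_get_and_clear(&pte), pte);
        return 0;
    }
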
    11.1 --- a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h	Fri Mar 25 19:30:52 2005 +0000
    11.2 +++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h	Fri Mar 25 20:03:52 2005 +0000
    11.3 @@ -38,11 +38,11 @@ extern void paging_init(void);
    11.4  
    11.5  extern unsigned long pgkern_mask;
    11.6  
    11.7 -#define __flush_tlb() ({ queue_tlb_flush(); XEN_flush_page_update_queue(); })
    11.8 +#define __flush_tlb() xen_tlb_flush()
    11.9  #define __flush_tlb_global() __flush_tlb()
   11.10  #define __flush_tlb_all() __flush_tlb_global()
   11.11 -#define __flush_tlb_one(addr) ({ queue_invlpg(addr); XEN_flush_page_update_queue(); })
   11.12 -#define __flush_tlb_single(addr) ({ queue_invlpg(addr); XEN_flush_page_update_queue(); })
   11.13 +#define __flush_tlb_one(addr) xen_invlpg(addr)
   11.14 +#define __flush_tlb_single(addr) xen_invlpg(addr)
   11.15  
   11.16  /*
   11.17   * ZERO_PAGE is a global shared page that is always zero: used
   11.18 @@ -179,12 +179,14 @@ extern void * high_memory;
   11.19  #define __S111	PAGE_SHARED
   11.20  
   11.21  #define pte_present(x)	((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
   11.22 -#define pte_clear(xp)	queue_l1_entry_update(xp, 0)
   11.23 +#define pte_clear(xp)	do { set_pte(xp, __pte(0)); } while (0)
   11.24  
   11.25 -#define pmd_none(x)	(!(x).pmd)
   11.26 -#define pmd_present(x)	((x).pmd & _PAGE_PRESENT)
   11.27 +#define pmd_none(x)	(!pmd_val(x))
    11.28 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since writable
    11.29 +   pagetables can temporarily clear it. */
   11.30 +#define pmd_present(x)	(pmd_val(x))
   11.31  #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
   11.32 -#define	pmd_bad(x)	(((x).pmd & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
   11.33 +#define pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
   11.34  
   11.35  
   11.36  #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
   11.37 @@ -212,29 +214,28 @@ static inline pte_t pte_mkwrite(pte_t pt
   11.38  
   11.39  static inline int ptep_test_and_clear_dirty(pte_t *ptep)
   11.40  {
   11.41 -    unsigned long pteval = *(unsigned long *)ptep;
   11.42 -    int ret = pteval & _PAGE_DIRTY;
   11.43 -    if ( ret ) queue_l1_entry_update(ptep, pteval & ~_PAGE_DIRTY);
   11.44 -    return ret;
   11.45 +    if (!pte_dirty(*ptep))
   11.46 +        return 0;
   11.47 +    return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
   11.48  }
   11.49 -static inline  int ptep_test_and_clear_young(pte_t *ptep)
   11.50 +
   11.51 +static inline int ptep_test_and_clear_young(pte_t *ptep)
   11.52  {
   11.53 -    unsigned long pteval = *(unsigned long *)ptep;
   11.54 -    int ret = pteval & _PAGE_ACCESSED;
   11.55 -    if ( ret ) queue_l1_entry_update(ptep, pteval & ~_PAGE_ACCESSED);
   11.56 -    return ret;
   11.57 +    if (!pte_young(*ptep))
   11.58 +        return 0;
   11.59 +    return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low);
   11.60  }
   11.61 +
   11.62  static inline void ptep_set_wrprotect(pte_t *ptep)
   11.63  {
   11.64 -    unsigned long pteval = *(unsigned long *)ptep;
   11.65 -    if ( (pteval & _PAGE_RW) )
   11.66 -        queue_l1_entry_update(ptep, pteval & ~_PAGE_RW);
   11.67 +    if (pte_write(*ptep))
   11.68 +        clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
   11.69  }
   11.70 +
   11.71  static inline void ptep_mkdirty(pte_t *ptep)
   11.72  {
   11.73 -    unsigned long pteval = *(unsigned long *)ptep;
   11.74 -    if ( !(pteval & _PAGE_DIRTY) )
   11.75 -        queue_l1_entry_update(ptep, pteval | _PAGE_DIRTY);
   11.76 +    if (!pte_dirty(*ptep))
   11.77 +        set_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
   11.78  }
   11.79  
   11.80  /*
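
The rewritten helpers above all share one shape: a read-only fast path (loads
from a PTE page never trap) followed by a locked bit operation that faults into
Xen at most once, where the entry is re-validated and the write allowed through.
A sketch of the common pattern, with the flag bit as a hypothetical parameter:

	static inline int ptep_test_and_clear_flag(pte_t *ptep, int bit)
	{
		/* Fast path: PTE pages are mapped read-only under writable
		 * pagetables, so this load is an ordinary memory read. */
		if (!(ptep->pte_low & (1UL << bit)))
			return 0;
		/* Slow path: the locked RMW traps once; Xen validates the
		 * new entry and lets the write through. */
		return test_and_clear_bit(bit, &ptep->pte_low);
	}
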
    12.1 --- a/linux-2.4.29-xen-sparse/mm/highmem.c	Fri Mar 25 19:30:52 2005 +0000
    12.2 +++ b/linux-2.4.29-xen-sparse/mm/highmem.c	Fri Mar 25 20:03:52 2005 +0000
    12.3 @@ -122,7 +122,6 @@ start:
    12.4  	}
    12.5  	vaddr = PKMAP_ADDR(last_pkmap_nr);
    12.6  	set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
    12.7 -	XEN_flush_page_update_queue();
    12.8  
    12.9  	pkmap_count[last_pkmap_nr] = 1;
   12.10  	page->virtual = (void *) vaddr;
    13.1 --- a/linux-2.4.29-xen-sparse/mm/memory.c	Fri Mar 25 19:30:52 2005 +0000
    13.2 +++ b/linux-2.4.29-xen-sparse/mm/memory.c	Fri Mar 25 20:03:52 2005 +0000
    13.3 @@ -153,7 +153,6 @@ void clear_page_tables(struct mm_struct 
    13.4  		free_one_pgd(page_dir);
    13.5  		page_dir++;
    13.6  	} while (--nr);
    13.7 -	XEN_flush_page_update_queue();
    13.8  	spin_unlock(&mm->page_table_lock);
    13.9  
   13.10  	/* keep the page table cache within bounds */
   13.11 @@ -249,10 +248,8 @@ skip_copy_pte_range:		address = (address
   13.12  
   13.13  				/* If it's a COW mapping, write protect it both in the parent and the child */
   13.14  				if (cow && pte_write(pte)) {
   13.15 -					/* XEN modification: modified ordering here to avoid RaW hazard. */
   13.16 +					ptep_set_wrprotect(src_pte);
   13.17  					pte = *src_pte;
   13.18 -					pte = pte_wrprotect(pte);
   13.19 -					ptep_set_wrprotect(src_pte);
   13.20  				}
   13.21  
   13.22  				/* If it's a shared mapping, mark it clean in the child */
   13.23 @@ -914,7 +911,6 @@ static inline void establish_pte(struct 
   13.24  {
   13.25  #ifdef CONFIG_XEN
   13.26  	if ( likely(vma->vm_mm == current->mm) ) {
   13.27 -		XEN_flush_page_update_queue();
   13.28  		HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG);
   13.29  	} else {
   13.30  		set_pte(page_table, entry);
   13.31 @@ -1189,13 +1185,10 @@ static int do_swap_page(struct mm_struct
   13.32  	flush_page_to_ram(page);
   13.33  	flush_icache_page(vma, page);
   13.34  #ifdef CONFIG_XEN
   13.35 -	if ( likely(vma->vm_mm == current->mm) ) {
   13.36 -		XEN_flush_page_update_queue();
   13.37 +	if ( likely(vma->vm_mm == current->mm) )
   13.38  		HYPERVISOR_update_va_mapping(address, pte, 0);
   13.39 -	} else {
   13.40 +	else
   13.41  		set_pte(page_table, pte);
   13.42 -		XEN_flush_page_update_queue();
   13.43 -	}
   13.44  #else
   13.45  	set_pte(page_table, pte);
   13.46  #endif
   13.47 @@ -1245,13 +1238,10 @@ static int do_anonymous_page(struct mm_s
   13.48  	}
   13.49  
   13.50  #ifdef CONFIG_XEN
   13.51 -	if ( likely(vma->vm_mm == current->mm) ) {
   13.52 -		XEN_flush_page_update_queue();
   13.53 +	if ( likely(vma->vm_mm == current->mm) )
   13.54  		HYPERVISOR_update_va_mapping(addr, entry, 0);
   13.55 -	} else {
   13.56 +	else
   13.57  		set_pte(page_table, entry);
   13.58 -		XEN_flush_page_update_queue();
   13.59 -	}
   13.60  #else
   13.61  	set_pte(page_table, entry);
   13.62  #endif
   13.63 @@ -1331,13 +1321,10 @@ static int do_no_page(struct mm_struct *
   13.64  		if (write_access)
   13.65  			entry = pte_mkwrite(pte_mkdirty(entry));
   13.66  #ifdef CONFIG_XEN
   13.67 -		if ( likely(vma->vm_mm == current->mm) ) {
   13.68 -			XEN_flush_page_update_queue();
   13.69 +		if ( likely(vma->vm_mm == current->mm) )
   13.70  			HYPERVISOR_update_va_mapping(address, entry, 0);
   13.71 -		} else {
   13.72 +		else
   13.73  			set_pte(page_table, entry);
   13.74 -			XEN_flush_page_update_queue();
   13.75 -		}
   13.76  #else
   13.77  		set_pte(page_table, entry);
   13.78  #endif
   13.79 @@ -1484,7 +1471,6 @@ pte_t fastcall *pte_alloc(struct mm_stru
   13.80  		/* "fast" allocation can happen without dropping the lock.. */
   13.81  		new = pte_alloc_one_fast(mm, address);
   13.82  		if (!new) {
   13.83 -			XEN_flush_page_update_queue();
   13.84  			spin_unlock(&mm->page_table_lock);
   13.85  			new = pte_alloc_one(mm, address);
   13.86  			spin_lock(&mm->page_table_lock);
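
The memory.c hunks repeat one idiom in do_swap_page(), do_anonymous_page() and
do_no_page(): a fault against the current address space installs the PTE with
HYPERVISOR_update_va_mapping() (optionally flushing the TLB entry in the same
call), while a foreign mm takes a plain set_pte() that Xen traps and validates.
The patch keeps this open-coded; a hypothetical helper capturing the pattern:

	static inline void xen_install_pte(struct vm_area_struct *vma,
					   unsigned long address,
					   pte_t *page_table, pte_t entry,
					   unsigned long flags)
	{
		if (likely(vma->vm_mm == current->mm))
			/* Update and (optional) UVMF_INVLPG in one hypercall. */
			HYPERVISOR_update_va_mapping(address, entry, flags);
		else
			/* Foreign mm: ordinary store, trapped by Xen. */
			set_pte(page_table, entry);
	}
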
    14.1 --- a/linux-2.4.29-xen-sparse/mm/mremap.c	Fri Mar 25 19:30:52 2005 +0000
    14.2 +++ b/linux-2.4.29-xen-sparse/mm/mremap.c	Fri Mar 25 20:03:52 2005 +0000
    14.3 @@ -119,11 +119,9 @@ static int move_page_tables(struct mm_st
    14.4  	 * the old page tables)
    14.5  	 */
    14.6  oops_we_failed:
    14.7 -	XEN_flush_page_update_queue();
    14.8  	flush_cache_range(mm, new_addr, new_addr + len);
    14.9  	while ((offset += PAGE_SIZE) < len)
   14.10  		move_one_page(mm, new_addr + offset, old_addr + offset);
   14.11 -	XEN_flush_page_update_queue();
   14.12  	zap_page_range(mm, new_addr, len);
   14.13  	return -1;
   14.14  }
    15.1 --- a/linux-2.4.29-xen-sparse/mm/swapfile.c	Fri Mar 25 19:30:52 2005 +0000
    15.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.3 @@ -1,1267 +0,0 @@
    15.4 -/*
    15.5 - *  linux/mm/swapfile.c
    15.6 - *
    15.7 - *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
    15.8 - *  Swap reorganised 29.12.95, Stephen Tweedie
    15.9 - */
   15.10 -
   15.11 -#include <linux/slab.h>
   15.12 -#include <linux/smp_lock.h>
   15.13 -#include <linux/kernel_stat.h>
   15.14 -#include <linux/swap.h>
   15.15 -#include <linux/swapctl.h>
   15.16 -#include <linux/blkdev.h> /* for blk_size */
   15.17 -#include <linux/vmalloc.h>
   15.18 -#include <linux/pagemap.h>
   15.19 -#include <linux/shm.h>
   15.20 -
   15.21 -#include <asm/pgtable.h>
   15.22 -
   15.23 -spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
   15.24 -unsigned int nr_swapfiles;
   15.25 -int total_swap_pages;
   15.26 -static int swap_overflow;
   15.27 -
   15.28 -static const char Bad_file[] = "Bad swap file entry ";
   15.29 -static const char Unused_file[] = "Unused swap file entry ";
   15.30 -static const char Bad_offset[] = "Bad swap offset entry ";
   15.31 -static const char Unused_offset[] = "Unused swap offset entry ";
   15.32 -
   15.33 -struct swap_list_t swap_list = {-1, -1};
   15.34 -
   15.35 -struct swap_info_struct swap_info[MAX_SWAPFILES];
   15.36 -
   15.37 -#define SWAPFILE_CLUSTER 256
   15.38 -
   15.39 -static inline int scan_swap_map(struct swap_info_struct *si)
   15.40 -{
   15.41 -	unsigned long offset;
   15.42 -	/* 
   15.43 -	 * We try to cluster swap pages by allocating them
   15.44 -	 * sequentially in swap.  Once we've allocated
   15.45 -	 * SWAPFILE_CLUSTER pages this way, however, we resort to
   15.46 -	 * first-free allocation, starting a new cluster.  This
   15.47 -	 * prevents us from scattering swap pages all over the entire
   15.48 -	 * swap partition, so that we reduce overall disk seek times
   15.49 -	 * between swap pages.  -- sct */
   15.50 -	if (si->cluster_nr) {
   15.51 -		while (si->cluster_next <= si->highest_bit) {
   15.52 -			offset = si->cluster_next++;
   15.53 -			if (si->swap_map[offset])
   15.54 -				continue;
   15.55 -			si->cluster_nr--;
   15.56 -			goto got_page;
   15.57 -		}
   15.58 -	}
   15.59 -	si->cluster_nr = SWAPFILE_CLUSTER;
   15.60 -
   15.61 -	/* try to find an empty (even not aligned) cluster. */
   15.62 -	offset = si->lowest_bit;
   15.63 - check_next_cluster:
   15.64 -	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
   15.65 -	{
   15.66 -		int nr;
   15.67 -		for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
   15.68 -			if (si->swap_map[nr])
   15.69 -			{
   15.70 -				offset = nr+1;
   15.71 -				goto check_next_cluster;
   15.72 -			}
    15.73 -		/* We found a completely empty cluster, so start
   15.74 -		 * using it.
   15.75 -		 */
   15.76 -		goto got_page;
   15.77 -	}
    15.78 -	/* No luck, so now go fine-grained as usual. -Andrea */
   15.79 -	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
   15.80 -		if (si->swap_map[offset])
   15.81 -			continue;
   15.82 -		si->lowest_bit = offset+1;
   15.83 -	got_page:
   15.84 -		if (offset == si->lowest_bit)
   15.85 -			si->lowest_bit++;
   15.86 -		if (offset == si->highest_bit)
   15.87 -			si->highest_bit--;
   15.88 -		if (si->lowest_bit > si->highest_bit) {
   15.89 -			si->lowest_bit = si->max;
   15.90 -			si->highest_bit = 0;
   15.91 -		}
   15.92 -		si->swap_map[offset] = 1;
   15.93 -		nr_swap_pages--;
   15.94 -		si->cluster_next = offset+1;
   15.95 -		return offset;
   15.96 -	}
   15.97 -	si->lowest_bit = si->max;
   15.98 -	si->highest_bit = 0;
   15.99 -	return 0;
  15.100 -}
  15.101 -
  15.102 -swp_entry_t get_swap_page(void)
  15.103 -{
  15.104 -	struct swap_info_struct * p;
  15.105 -	unsigned long offset;
  15.106 -	swp_entry_t entry;
  15.107 -	int type, wrapped = 0;
  15.108 -
  15.109 -	entry.val = 0;	/* Out of memory */
  15.110 -	swap_list_lock();
  15.111 -	type = swap_list.next;
  15.112 -	if (type < 0)
  15.113 -		goto out;
  15.114 -	if (nr_swap_pages <= 0)
  15.115 -		goto out;
  15.116 -
  15.117 -	while (1) {
  15.118 -		p = &swap_info[type];
  15.119 -		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
  15.120 -			swap_device_lock(p);
  15.121 -			offset = scan_swap_map(p);
  15.122 -			swap_device_unlock(p);
  15.123 -			if (offset) {
  15.124 -				entry = SWP_ENTRY(type,offset);
  15.125 -				type = swap_info[type].next;
  15.126 -				if (type < 0 ||
  15.127 -					p->prio != swap_info[type].prio) {
  15.128 -						swap_list.next = swap_list.head;
  15.129 -				} else {
  15.130 -					swap_list.next = type;
  15.131 -				}
  15.132 -				goto out;
  15.133 -			}
  15.134 -		}
  15.135 -		type = p->next;
  15.136 -		if (!wrapped) {
  15.137 -			if (type < 0 || p->prio != swap_info[type].prio) {
  15.138 -				type = swap_list.head;
  15.139 -				wrapped = 1;
  15.140 -			}
  15.141 -		} else
  15.142 -			if (type < 0)
  15.143 -				goto out;	/* out of swap space */
  15.144 -	}
  15.145 -out:
  15.146 -	swap_list_unlock();
  15.147 -	return entry;
  15.148 -}
  15.149 -
  15.150 -static struct swap_info_struct * swap_info_get(swp_entry_t entry)
  15.151 -{
  15.152 -	struct swap_info_struct * p;
  15.153 -	unsigned long offset, type;
  15.154 -
  15.155 -	if (!entry.val)
  15.156 -		goto out;
  15.157 -	type = SWP_TYPE(entry);
  15.158 -	if (type >= nr_swapfiles)
  15.159 -		goto bad_nofile;
  15.160 -	p = & swap_info[type];
  15.161 -	if (!(p->flags & SWP_USED))
  15.162 -		goto bad_device;
  15.163 -	offset = SWP_OFFSET(entry);
  15.164 -	if (offset >= p->max)
  15.165 -		goto bad_offset;
  15.166 -	if (!p->swap_map[offset])
  15.167 -		goto bad_free;
  15.168 -	swap_list_lock();
  15.169 -	if (p->prio > swap_info[swap_list.next].prio)
  15.170 -		swap_list.next = type;
  15.171 -	swap_device_lock(p);
  15.172 -	return p;
  15.173 -
  15.174 -bad_free:
  15.175 -	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
  15.176 -	goto out;
  15.177 -bad_offset:
  15.178 -	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
  15.179 -	goto out;
  15.180 -bad_device:
  15.181 -	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
  15.182 -	goto out;
  15.183 -bad_nofile:
  15.184 -	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
  15.185 -out:
  15.186 -	return NULL;
  15.187 -}	
  15.188 -
  15.189 -static void swap_info_put(struct swap_info_struct * p)
  15.190 -{
  15.191 -	swap_device_unlock(p);
  15.192 -	swap_list_unlock();
  15.193 -}
  15.194 -
  15.195 -static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
  15.196 -{
  15.197 -	int count = p->swap_map[offset];
  15.198 -
  15.199 -	if (count < SWAP_MAP_MAX) {
  15.200 -		count--;
  15.201 -		p->swap_map[offset] = count;
  15.202 -		if (!count) {
  15.203 -			if (offset < p->lowest_bit)
  15.204 -				p->lowest_bit = offset;
  15.205 -			if (offset > p->highest_bit)
  15.206 -				p->highest_bit = offset;
  15.207 -			nr_swap_pages++;
  15.208 -		}
  15.209 -	}
  15.210 -	return count;
  15.211 -}
  15.212 -
  15.213 -/*
  15.214 - * Caller has made sure that the swapdevice corresponding to entry
  15.215 - * is still around or has not been recycled.
  15.216 - */
  15.217 -void swap_free(swp_entry_t entry)
  15.218 -{
  15.219 -	struct swap_info_struct * p;
  15.220 -
  15.221 -	p = swap_info_get(entry);
  15.222 -	if (p) {
  15.223 -		swap_entry_free(p, SWP_OFFSET(entry));
  15.224 -		swap_info_put(p);
  15.225 -	}
  15.226 -}
  15.227 -
  15.228 -/*
  15.229 - * Check if we're the only user of a swap page,
  15.230 - * when the page is locked.
  15.231 - */
  15.232 -static int exclusive_swap_page(struct page *page)
  15.233 -{
  15.234 -	int retval = 0;
  15.235 -	struct swap_info_struct * p;
  15.236 -	swp_entry_t entry;
  15.237 -
  15.238 -	entry.val = page->index;
  15.239 -	p = swap_info_get(entry);
  15.240 -	if (p) {
  15.241 -		/* Is the only swap cache user the cache itself? */
  15.242 -		if (p->swap_map[SWP_OFFSET(entry)] == 1) {
  15.243 -			/* Recheck the page count with the pagecache lock held.. */
  15.244 -			spin_lock(&pagecache_lock);
  15.245 -			if (page_count(page) - !!page->buffers == 2)
  15.246 -				retval = 1;
  15.247 -			spin_unlock(&pagecache_lock);
  15.248 -		}
  15.249 -		swap_info_put(p);
  15.250 -	}
  15.251 -	return retval;
  15.252 -}
  15.253 -
  15.254 -/*
  15.255 - * We can use this swap cache entry directly
  15.256 - * if there are no other references to it.
  15.257 - *
  15.258 - * Here "exclusive_swap_page()" does the real
  15.259 - * work, but we opportunistically check whether
  15.260 - * we need to get all the locks first..
  15.261 - */
  15.262 -int fastcall can_share_swap_page(struct page *page)
  15.263 -{
  15.264 -	int retval = 0;
  15.265 -
  15.266 -	if (!PageLocked(page))
  15.267 -		BUG();
  15.268 -	switch (page_count(page)) {
  15.269 -	case 3:
  15.270 -		if (!page->buffers)
  15.271 -			break;
  15.272 -		/* Fallthrough */
  15.273 -	case 2:
  15.274 -		if (!PageSwapCache(page))
  15.275 -			break;
  15.276 -		retval = exclusive_swap_page(page);
  15.277 -		break;
  15.278 -	case 1:
  15.279 -		if (PageReserved(page))
  15.280 -			break;
  15.281 -		retval = 1;
  15.282 -	}
  15.283 -	return retval;
  15.284 -}
  15.285 -
  15.286 -/*
  15.287 - * Work out if there are any other processes sharing this
  15.288 - * swap cache page. Free it if you can. Return success.
  15.289 - */
  15.290 -int fastcall remove_exclusive_swap_page(struct page *page)
  15.291 -{
  15.292 -	int retval;
  15.293 -	struct swap_info_struct * p;
  15.294 -	swp_entry_t entry;
  15.295 -
  15.296 -	if (!PageLocked(page))
  15.297 -		BUG();
  15.298 -	if (!PageSwapCache(page))
  15.299 -		return 0;
  15.300 -	if (page_count(page) - !!page->buffers != 2)	/* 2: us + cache */
  15.301 -		return 0;
  15.302 -
  15.303 -	entry.val = page->index;
  15.304 -	p = swap_info_get(entry);
  15.305 -	if (!p)
  15.306 -		return 0;
  15.307 -
  15.308 -	/* Is the only swap cache user the cache itself? */
  15.309 -	retval = 0;
  15.310 -	if (p->swap_map[SWP_OFFSET(entry)] == 1) {
  15.311 -		/* Recheck the page count with the pagecache lock held.. */
  15.312 -		spin_lock(&pagecache_lock);
  15.313 -		if (page_count(page) - !!page->buffers == 2) {
  15.314 -			__delete_from_swap_cache(page);
  15.315 -			SetPageDirty(page);
  15.316 -			retval = 1;
  15.317 -		}
  15.318 -		spin_unlock(&pagecache_lock);
  15.319 -	}
  15.320 -	swap_info_put(p);
  15.321 -
  15.322 -	if (retval) {
  15.323 -		block_flushpage(page, 0);
  15.324 -		swap_free(entry);
  15.325 -		page_cache_release(page);
  15.326 -	}
  15.327 -
  15.328 -	return retval;
  15.329 -}
  15.330 -
  15.331 -/*
  15.332 - * Free the swap entry like above, but also try to
  15.333 - * free the page cache entry if it is the last user.
  15.334 - */
  15.335 -void free_swap_and_cache(swp_entry_t entry)
  15.336 -{
  15.337 -	struct swap_info_struct * p;
  15.338 -	struct page *page = NULL;
  15.339 -
  15.340 -	p = swap_info_get(entry);
  15.341 -	if (p) {
  15.342 -		if (swap_entry_free(p, SWP_OFFSET(entry)) == 1)
  15.343 -			page = find_trylock_page(&swapper_space, entry.val);
  15.344 -		swap_info_put(p);
  15.345 -	}
  15.346 -	if (page) {
  15.347 -		page_cache_get(page);
  15.348 -		/* Only cache user (+us), or swap space full? Free it! */
  15.349 -		if (page_count(page) - !!page->buffers == 2 || vm_swap_full()) {
  15.350 -			delete_from_swap_cache(page);
  15.351 -			SetPageDirty(page);
  15.352 -		}
  15.353 -		UnlockPage(page);
  15.354 -		page_cache_release(page);
  15.355 -	}
  15.356 -}
  15.357 -
  15.358 -/*
  15.359 - * The swap entry has been read in advance, and we return 1 to indicate
  15.360 - * that the page has been used or is no longer needed.
  15.361 - *
  15.362 - * Always set the resulting pte to be nowrite (the same as COW pages
  15.363 - * after one process has exited).  We don't know just how many PTEs will
  15.364 - * share this swap entry, so be cautious and let do_wp_page work out
  15.365 - * what to do if a write is requested later.
  15.366 - */
  15.367 -/* mmlist_lock and vma->vm_mm->page_table_lock are held */
  15.368 -static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
  15.369 -	pte_t *dir, swp_entry_t entry, struct page* page)
  15.370 -{
  15.371 -	pte_t pte = *dir;
  15.372 -
  15.373 -	if (likely(pte_to_swp_entry(pte).val != entry.val))
  15.374 -		return;
  15.375 -	if (unlikely(pte_none(pte) || pte_present(pte)))
  15.376 -		return;
  15.377 -	get_page(page);
  15.378 -	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
  15.379 -	swap_free(entry);
  15.380 -	++vma->vm_mm->rss;
  15.381 -}
  15.382 -
  15.383 -/* mmlist_lock and vma->vm_mm->page_table_lock are held */
  15.384 -static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
  15.385 -	unsigned long address, unsigned long size, unsigned long offset,
  15.386 -	swp_entry_t entry, struct page* page)
  15.387 -{
  15.388 -	pte_t * pte;
  15.389 -	unsigned long end;
  15.390 -
  15.391 -	if (pmd_none(*dir))
  15.392 -		return;
  15.393 -	if (pmd_bad(*dir)) {
  15.394 -		pmd_ERROR(*dir);
  15.395 -		pmd_clear(dir);
  15.396 -		return;
  15.397 -	}
  15.398 -	pte = pte_offset(dir, address);
  15.399 -	offset += address & PMD_MASK;
  15.400 -	address &= ~PMD_MASK;
  15.401 -	end = address + size;
  15.402 -	if (end > PMD_SIZE)
  15.403 -		end = PMD_SIZE;
  15.404 -	do {
  15.405 -		unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
  15.406 -		address += PAGE_SIZE;
  15.407 -		pte++;
  15.408 -	} while (address && (address < end));
  15.409 -}
  15.410 -
  15.411 -/* mmlist_lock and vma->vm_mm->page_table_lock are held */
  15.412 -static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
  15.413 -	unsigned long address, unsigned long size,
  15.414 -	swp_entry_t entry, struct page* page)
  15.415 -{
  15.416 -	pmd_t * pmd;
  15.417 -	unsigned long offset, end;
  15.418 -
  15.419 -	if (pgd_none(*dir))
  15.420 -		return;
  15.421 -	if (pgd_bad(*dir)) {
  15.422 -		pgd_ERROR(*dir);
  15.423 -		pgd_clear(dir);
  15.424 -		return;
  15.425 -	}
  15.426 -	pmd = pmd_offset(dir, address);
  15.427 -	offset = address & PGDIR_MASK;
  15.428 -	address &= ~PGDIR_MASK;
  15.429 -	end = address + size;
  15.430 -	if (end > PGDIR_SIZE)
  15.431 -		end = PGDIR_SIZE;
  15.432 -	if (address >= end)
  15.433 -		BUG();
  15.434 -	do {
  15.435 -		unuse_pmd(vma, pmd, address, end - address, offset, entry,
  15.436 -			  page);
  15.437 -		address = (address + PMD_SIZE) & PMD_MASK;
  15.438 -		pmd++;
  15.439 -	} while (address && (address < end));
  15.440 -}
  15.441 -
  15.442 -/* mmlist_lock and vma->vm_mm->page_table_lock are held */
  15.443 -static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
  15.444 -			swp_entry_t entry, struct page* page)
  15.445 -{
  15.446 -	unsigned long start = vma->vm_start, end = vma->vm_end;
  15.447 -
  15.448 -	if (start >= end)
  15.449 -		BUG();
  15.450 -	do {
  15.451 -		unuse_pgd(vma, pgdir, start, end - start, entry, page);
  15.452 -		start = (start + PGDIR_SIZE) & PGDIR_MASK;
  15.453 -		pgdir++;
  15.454 -	} while (start && (start < end));
  15.455 -}
  15.456 -
  15.457 -static void unuse_process(struct mm_struct * mm,
  15.458 -			swp_entry_t entry, struct page* page)
  15.459 -{
  15.460 -	struct vm_area_struct* vma;
  15.461 -
  15.462 -	/*
  15.463 -	 * Go through process' page directory.
  15.464 -	 */
  15.465 -	spin_lock(&mm->page_table_lock);
  15.466 -	for (vma = mm->mmap; vma; vma = vma->vm_next) {
  15.467 -		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
  15.468 -		unuse_vma(vma, pgd, entry, page);
  15.469 -	}
  15.470 -	XEN_flush_page_update_queue();
  15.471 -	spin_unlock(&mm->page_table_lock);
  15.472 -	return;
  15.473 -}
  15.474 -
  15.475 -/*
  15.476 - * Scan swap_map from current position to next entry still in use.
  15.477 - * Recycle to start on reaching the end, returning 0 when empty.
  15.478 - */
  15.479 -static int find_next_to_unuse(struct swap_info_struct *si, int prev)
  15.480 -{
  15.481 -	int max = si->max;
  15.482 -	int i = prev;
  15.483 -	int count;
  15.484 -
  15.485 -	/*
  15.486 -	 * No need for swap_device_lock(si) here: we're just looking
  15.487 -	 * for whether an entry is in use, not modifying it; false
  15.488 -	 * hits are okay, and sys_swapoff() has already prevented new
  15.489 -	 * allocations from this area (while holding swap_list_lock()).
  15.490 -	 */
  15.491 -	for (;;) {
  15.492 -		if (++i >= max) {
  15.493 -			if (!prev) {
  15.494 -				i = 0;
  15.495 -				break;
  15.496 -			}
  15.497 -			/*
  15.498 -			 * No entries in use at top of swap_map,
  15.499 -			 * loop back to start and recheck there.
  15.500 -			 */
  15.501 -			max = prev + 1;
  15.502 -			prev = 0;
  15.503 -			i = 1;
  15.504 -		}
  15.505 -		count = si->swap_map[i];
  15.506 -		if (count && count != SWAP_MAP_BAD)
  15.507 -			break;
  15.508 -	}
  15.509 -	return i;
  15.510 -}
  15.511 -
  15.512 -/*
  15.513 - * We completely avoid races by reading each swap page in advance,
  15.514 - * and then search for the process using it.  All the necessary
  15.515 - * page table adjustments can then be made atomically.
  15.516 - */
  15.517 -static int try_to_unuse(unsigned int type)
  15.518 -{
  15.519 -	struct swap_info_struct * si = &swap_info[type];
  15.520 -	struct mm_struct *start_mm;
  15.521 -	unsigned short *swap_map;
  15.522 -	unsigned short swcount;
  15.523 -	struct page *page;
  15.524 -	swp_entry_t entry;
  15.525 -	int i = 0;
  15.526 -	int retval = 0;
  15.527 -	int reset_overflow = 0;
  15.528 -	int shmem;
  15.529 -
  15.530 -	/*
  15.531 -	 * When searching mms for an entry, a good strategy is to
  15.532 -	 * start at the first mm we freed the previous entry from
  15.533 -	 * (though actually we don't notice whether we or coincidence
  15.534 -	 * freed the entry).  Initialize this start_mm with a hold.
  15.535 -	 *
  15.536 -	 * A simpler strategy would be to start at the last mm we
  15.537 -	 * freed the previous entry from; but that would take less
  15.538 -	 * advantage of mmlist ordering (now preserved by swap_out()),
  15.539 -	 * which clusters forked address spaces together, most recent
  15.540 -	 * child immediately after parent.  If we race with dup_mmap(),
  15.541 -	 * we very much want to resolve parent before child, otherwise
  15.542 -	 * we may miss some entries: using last mm would invert that.
  15.543 -	 */
  15.544 -	start_mm = &init_mm;
  15.545 -	atomic_inc(&init_mm.mm_users);
  15.546 -
  15.547 -	/*
  15.548 -	 * Keep on scanning until all entries have gone.  Usually,
  15.549 -	 * one pass through swap_map is enough, but not necessarily:
  15.550 -	 * mmput() removes mm from mmlist before exit_mmap() and its
  15.551 -	 * zap_page_range().  That's not too bad, those entries are
  15.552 -	 * on their way out, and handled faster there than here.
  15.553 -	 * do_munmap() behaves similarly, taking the range out of mm's
  15.554 -	 * vma list before zap_page_range().  But unfortunately, when
  15.555 -	 * unmapping a part of a vma, it takes the whole out first,
  15.556 -	 * then reinserts what's left after (might even reschedule if
  15.557 -	 * open() method called) - so swap entries may be invisible
  15.558 -	 * to swapoff for a while, then reappear - but that is rare.
  15.559 -	 */
  15.560 -	while ((i = find_next_to_unuse(si, i))) {
  15.561 -		/* 
  15.562 -		 * Get a page for the entry, using the existing swap
  15.563 -		 * cache page if there is one.  Otherwise, get a clean
  15.564 -		 * page and read the swap into it. 
  15.565 -		 */
  15.566 -		swap_map = &si->swap_map[i];
  15.567 -		entry = SWP_ENTRY(type, i);
  15.568 -		page = read_swap_cache_async(entry);
  15.569 -		if (!page) {
  15.570 -			/*
  15.571 -			 * Either swap_duplicate() failed because entry
  15.572 -			 * has been freed independently, and will not be
  15.573 -			 * reused since sys_swapoff() already disabled
  15.574 -			 * allocation from here, or alloc_page() failed.
  15.575 -			 */
  15.576 -			if (!*swap_map)
  15.577 -				continue;
  15.578 -			retval = -ENOMEM;
  15.579 -			break;
  15.580 -		}
  15.581 -
  15.582 -		/*
  15.583 -		 * Don't hold on to start_mm if it looks like exiting.
  15.584 -		 */
  15.585 -		if (atomic_read(&start_mm->mm_users) == 1) {
  15.586 -			mmput(start_mm);
  15.587 -			start_mm = &init_mm;
  15.588 -			atomic_inc(&init_mm.mm_users);
  15.589 -		}
  15.590 -
  15.591 -		/*
  15.592 -		 * Wait for and lock page.  When do_swap_page races with
  15.593 -		 * try_to_unuse, do_swap_page can handle the fault much
  15.594 -		 * faster than try_to_unuse can locate the entry.  This
  15.595 -		 * apparently redundant "wait_on_page" lets try_to_unuse
  15.596 -		 * defer to do_swap_page in such a case - in some tests,
  15.597 -		 * do_swap_page and try_to_unuse repeatedly compete.
  15.598 -		 */
  15.599 -		wait_on_page(page);
  15.600 -		lock_page(page);
  15.601 -
  15.602 -		/*
  15.603 -		 * Remove all references to entry, without blocking.
  15.604 -		 * Whenever we reach init_mm, there's no address space
  15.605 -		 * to search, but use it as a reminder to search shmem.
  15.606 -		 */
  15.607 -		shmem = 0;
  15.608 -		swcount = *swap_map;
  15.609 -		if (swcount > 1) {
  15.610 -			flush_page_to_ram(page);
  15.611 -			if (start_mm == &init_mm)
  15.612 -				shmem = shmem_unuse(entry, page);
  15.613 -			else
  15.614 -				unuse_process(start_mm, entry, page);
  15.615 -		}
  15.616 -		if (*swap_map > 1) {
  15.617 -			int set_start_mm = (*swap_map >= swcount);
  15.618 -			struct list_head *p = &start_mm->mmlist;
  15.619 -			struct mm_struct *new_start_mm = start_mm;
  15.620 -			struct mm_struct *mm;
  15.621 -
  15.622 -			spin_lock(&mmlist_lock);
  15.623 -			while (*swap_map > 1 &&
  15.624 -					(p = p->next) != &start_mm->mmlist) {
  15.625 -				mm = list_entry(p, struct mm_struct, mmlist);
  15.626 -				swcount = *swap_map;
  15.627 -				if (mm == &init_mm) {
  15.628 -					set_start_mm = 1;
  15.629 -					spin_unlock(&mmlist_lock);
  15.630 -					shmem = shmem_unuse(entry, page);
  15.631 -					spin_lock(&mmlist_lock);
  15.632 -				} else
  15.633 -					unuse_process(mm, entry, page);
  15.634 -				if (set_start_mm && *swap_map < swcount) {
  15.635 -					new_start_mm = mm;
  15.636 -					set_start_mm = 0;
  15.637 -				}
  15.638 -			}
  15.639 -			atomic_inc(&new_start_mm->mm_users);
  15.640 -			spin_unlock(&mmlist_lock);
  15.641 -			mmput(start_mm);
  15.642 -			start_mm = new_start_mm;
  15.643 -		}
  15.644 -
  15.645 -		/*
  15.646 -		 * How could swap count reach 0x7fff when the maximum
  15.647 -		 * pid is 0x7fff, and there's no way to repeat a swap
  15.648 -		 * page within an mm (except in shmem, where it's the
  15.649 -		 * shared object which takes the reference count)?
  15.650 -		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
  15.651 -		 *
  15.652 -		 * If that's wrong, then we should worry more about
  15.653 -		 * exit_mmap() and do_munmap() cases described above:
  15.654 -		 * we might be resetting SWAP_MAP_MAX too early here.
  15.655 -		 * We know "Undead"s can happen, they're okay, so don't
  15.656 -		 * report them; but do report if we reset SWAP_MAP_MAX.
  15.657 -		 */
  15.658 -		if (*swap_map == SWAP_MAP_MAX) {
  15.659 -			swap_list_lock();
  15.660 -			swap_device_lock(si);
  15.661 -			nr_swap_pages++;
  15.662 -			*swap_map = 1;
  15.663 -			swap_device_unlock(si);
  15.664 -			swap_list_unlock();
  15.665 -			reset_overflow = 1;
  15.666 -		}
  15.667 -
  15.668 -		/*
  15.669 -		 * If a reference remains (rare), we would like to leave
  15.670 -		 * the page in the swap cache; but try_to_swap_out could
  15.671 -		 * then re-duplicate the entry once we drop page lock,
  15.672 -		 * so we might loop indefinitely; also, that page could
  15.673 -		 * not be swapped out to other storage meanwhile.  So:
  15.674 -		 * delete from cache even if there's another reference,
  15.675 -		 * after ensuring that the data has been saved to disk -
  15.676 -		 * since if the reference remains (rarer), it will be
  15.677 -		 * read from disk into another page.  Splitting into two
  15.678 -		 * pages would be incorrect if swap supported "shared
  15.679 -		 * private" pages, but they are handled by tmpfs files.
  15.680 -		 *
  15.681 -		 * Note shmem_unuse already deleted swappage from cache,
  15.682 -		 * unless corresponding filepage found already in cache:
  15.683 -		 * in which case it left swappage in cache, lowered its
  15.684 -		 * swap count to pass quickly through the loops above,
  15.685 -		 * and now we must reincrement count to try again later.
  15.686 -		 */
  15.687 -		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
  15.688 -			rw_swap_page(WRITE, page);
  15.689 -			lock_page(page);
  15.690 -		}
  15.691 -		if (PageSwapCache(page)) {
  15.692 -			if (shmem)
  15.693 -				swap_duplicate(entry);
  15.694 -			else
  15.695 -				delete_from_swap_cache(page);
  15.696 -		}
  15.697 -
  15.698 -		/*
  15.699 -		 * So we could skip searching mms once swap count went
  15.700 -		 * to 1, we did not mark any present ptes as dirty: must
  15.701 -		 * mark page dirty so try_to_swap_out will preserve it.
  15.702 -		 */
  15.703 -		SetPageDirty(page);
  15.704 -		UnlockPage(page);
  15.705 -		page_cache_release(page);
  15.706 -
  15.707 -		/*
  15.708 -		 * Make sure that we aren't completely killing
  15.709 -		 * interactive performance.  Interruptible check on
  15.710 -		 * signal_pending() would be nice, but changes the spec?
  15.711 -		 */
  15.712 -		if (current->need_resched)
  15.713 -			schedule();
  15.714 -	}
  15.715 -
  15.716 -	mmput(start_mm);
  15.717 -	if (reset_overflow) {
  15.718 -		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
  15.719 -		swap_overflow = 0;
  15.720 -	}
  15.721 -	return retval;
  15.722 -}
  15.723 -
  15.724 -asmlinkage long sys_swapoff(const char * specialfile)
  15.725 -{
  15.726 -	struct swap_info_struct * p = NULL;
  15.727 -	unsigned short *swap_map;
  15.728 -	struct nameidata nd;
  15.729 -	int i, type, prev;
  15.730 -	int err;
  15.731 -	
  15.732 -	if (!capable(CAP_SYS_ADMIN))
  15.733 -		return -EPERM;
  15.734 -
  15.735 -	err = user_path_walk(specialfile, &nd);
  15.736 -	if (err)
  15.737 -		goto out;
  15.738 -
  15.739 -	lock_kernel();
  15.740 -	prev = -1;
  15.741 -	swap_list_lock();
  15.742 -	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
  15.743 -		p = swap_info + type;
  15.744 -		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
  15.745 -			if (p->swap_file == nd.dentry)
  15.746 -			  break;
  15.747 -		}
  15.748 -		prev = type;
  15.749 -	}
  15.750 -	err = -EINVAL;
  15.751 -	if (type < 0) {
  15.752 -		swap_list_unlock();
  15.753 -		goto out_dput;
  15.754 -	}
  15.755 -
  15.756 -	if (prev < 0) {
  15.757 -		swap_list.head = p->next;
  15.758 -	} else {
  15.759 -		swap_info[prev].next = p->next;
  15.760 -	}
  15.761 -	if (type == swap_list.next) {
  15.762 -		/* just pick something that's safe... */
  15.763 -		swap_list.next = swap_list.head;
  15.764 -	}
  15.765 -	nr_swap_pages -= p->pages;
  15.766 -	total_swap_pages -= p->pages;
  15.767 -	p->flags = SWP_USED;
  15.768 -	swap_list_unlock();
  15.769 -	unlock_kernel();
  15.770 -	err = try_to_unuse(type);
  15.771 -	lock_kernel();
  15.772 -	if (err) {
  15.773 -		/* re-insert swap space back into swap_list */
  15.774 -		swap_list_lock();
  15.775 -		for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
  15.776 -			if (p->prio >= swap_info[i].prio)
  15.777 -				break;
  15.778 -		p->next = i;
  15.779 -		if (prev < 0)
  15.780 -			swap_list.head = swap_list.next = p - swap_info;
  15.781 -		else
  15.782 -			swap_info[prev].next = p - swap_info;
  15.783 -		nr_swap_pages += p->pages;
  15.784 -		total_swap_pages += p->pages;
  15.785 -		p->flags = SWP_WRITEOK;
  15.786 -		swap_list_unlock();
  15.787 -		goto out_dput;
  15.788 -	}
  15.789 -	if (p->swap_device)
  15.790 -		blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP);
  15.791 -	path_release(&nd);
  15.792 -
  15.793 -	swap_list_lock();
  15.794 -	swap_device_lock(p);
  15.795 -	nd.mnt = p->swap_vfsmnt;
  15.796 -	nd.dentry = p->swap_file;
  15.797 -	p->swap_vfsmnt = NULL;
  15.798 -	p->swap_file = NULL;
  15.799 -	p->swap_device = 0;
  15.800 -	p->max = 0;
  15.801 -	swap_map = p->swap_map;
  15.802 -	p->swap_map = NULL;
  15.803 -	p->flags = 0;
  15.804 -	swap_device_unlock(p);
  15.805 -	swap_list_unlock();
  15.806 -	vfree(swap_map);
  15.807 -	err = 0;
  15.808 -
  15.809 -out_dput:
  15.810 -	unlock_kernel();
  15.811 -	path_release(&nd);
  15.812 -out:
  15.813 -	return err;
  15.814 -}
  15.815 -
  15.816 -int get_swaparea_info(char *buf)
  15.817 -{
  15.818 -	char * page = (char *) __get_free_page(GFP_KERNEL);
  15.819 -	struct swap_info_struct *ptr = swap_info;
  15.820 -	int i, j, len = 0, usedswap;
  15.821 -
  15.822 -	if (!page)
  15.823 -		return -ENOMEM;
  15.824 -
  15.825 -	len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
  15.826 -	for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
  15.827 -		if ((ptr->flags & SWP_USED) && ptr->swap_map) {
  15.828 -			char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt,
  15.829 -						page, PAGE_SIZE);
  15.830 -
  15.831 -			len += sprintf(buf + len, "%-31s ", path);
  15.832 -
  15.833 -			if (!ptr->swap_device)
  15.834 -				len += sprintf(buf + len, "file\t\t");
  15.835 -			else
  15.836 -				len += sprintf(buf + len, "partition\t");
  15.837 -
  15.838 -			usedswap = 0;
  15.839 -			for (j = 0; j < ptr->max; ++j)
  15.840 -				switch (ptr->swap_map[j]) {
  15.841 -					case SWAP_MAP_BAD:
  15.842 -					case 0:
  15.843 -						continue;
  15.844 -					default:
  15.845 -						usedswap++;
  15.846 -				}
  15.847 -			len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), 
  15.848 -				usedswap << (PAGE_SHIFT - 10), ptr->prio);
  15.849 -		}
  15.850 -	}
  15.851 -	free_page((unsigned long) page);
  15.852 -	return len;
  15.853 -}
  15.854 -
  15.855 -int is_swap_partition(kdev_t dev) {
  15.856 -	struct swap_info_struct *ptr = swap_info;
  15.857 -	int i;
  15.858 -
  15.859 -	for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
  15.860 -		if (ptr->flags & SWP_USED)
  15.861 -			if (ptr->swap_device == dev)
  15.862 -				return 1;
  15.863 -	}
  15.864 -	return 0;
  15.865 -}
  15.866 -
  15.867 -/*
  15.868 - * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
  15.869 - *
  15.870 - * The swapon system call
  15.871 - */
  15.872 -asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
  15.873 -{
  15.874 -	struct swap_info_struct * p;
  15.875 -	struct nameidata nd;
  15.876 -	struct inode * swap_inode;
  15.877 -	unsigned int type;
  15.878 -	int i, j, prev;
  15.879 -	int error;
  15.880 -	static int least_priority = 0;
  15.881 -	union swap_header *swap_header = 0;
  15.882 -	int swap_header_version;
  15.883 -	int nr_good_pages = 0;
  15.884 -	unsigned long maxpages = 1;
  15.885 -	int swapfilesize;
  15.886 -	struct block_device *bdev = NULL;
  15.887 -	unsigned short *swap_map;
  15.888 -	
  15.889 -	if (!capable(CAP_SYS_ADMIN))
  15.890 -		return -EPERM;
  15.891 -	lock_kernel();
  15.892 -	swap_list_lock();
  15.893 -	p = swap_info;
  15.894 -	for (type = 0 ; type < nr_swapfiles ; type++,p++)
  15.895 -		if (!(p->flags & SWP_USED))
  15.896 -			break;
  15.897 -	error = -EPERM;
  15.898 -	if (type >= MAX_SWAPFILES) {
  15.899 -		swap_list_unlock();
  15.900 -		goto out;
  15.901 -	}
  15.902 -	if (type >= nr_swapfiles)
  15.903 -		nr_swapfiles = type+1;
  15.904 -	p->flags = SWP_USED;
  15.905 -	p->swap_file = NULL;
  15.906 -	p->swap_vfsmnt = NULL;
  15.907 -	p->swap_device = 0;
  15.908 -	p->swap_map = NULL;
  15.909 -	p->lowest_bit = 0;
  15.910 -	p->highest_bit = 0;
  15.911 -	p->cluster_nr = 0;
  15.912 -	p->sdev_lock = SPIN_LOCK_UNLOCKED;
  15.913 -	p->next = -1;
  15.914 -	if (swap_flags & SWAP_FLAG_PREFER) {
  15.915 -		p->prio =
  15.916 -		  (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
  15.917 -	} else {
  15.918 -		p->prio = --least_priority;
  15.919 -	}
  15.920 -	swap_list_unlock();
  15.921 -	error = user_path_walk(specialfile, &nd);
  15.922 -	if (error)
  15.923 -		goto bad_swap_2;
  15.924 -
  15.925 -	p->swap_file = nd.dentry;
  15.926 -	p->swap_vfsmnt = nd.mnt;
  15.927 -	swap_inode = nd.dentry->d_inode;
  15.928 -	error = -EINVAL;
  15.929 -
  15.930 -	if (S_ISBLK(swap_inode->i_mode)) {
  15.931 -		kdev_t dev = swap_inode->i_rdev;
  15.932 -		struct block_device_operations *bdops;
  15.933 -		devfs_handle_t de;
  15.934 -
  15.935 -		if (is_mounted(dev)) {
  15.936 -			error = -EBUSY;
  15.937 -			goto bad_swap_2;
  15.938 -		}
  15.939 -
  15.940 -		p->swap_device = dev;
  15.941 -		set_blocksize(dev, PAGE_SIZE);
  15.942 -		
  15.943 -		bd_acquire(swap_inode);
  15.944 -		bdev = swap_inode->i_bdev;
  15.945 -		de = devfs_get_handle_from_inode(swap_inode);
  15.946 -		bdops = devfs_get_ops(de);  /*  Increments module use count  */
  15.947 -		if (bdops) bdev->bd_op = bdops;
  15.948 -
  15.949 -		error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP);
  15.950 -		devfs_put_ops(de);/*Decrement module use count now we're safe*/
  15.951 -		if (error)
  15.952 -			goto bad_swap_2;
  15.953 -		set_blocksize(dev, PAGE_SIZE);
  15.954 -		error = -ENODEV;
  15.955 -		if (!dev || (blk_size[MAJOR(dev)] &&
  15.956 -		     !blk_size[MAJOR(dev)][MINOR(dev)]))
  15.957 -			goto bad_swap;
  15.958 -		swapfilesize = 0;
  15.959 -		if (blk_size[MAJOR(dev)])
  15.960 -			swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
  15.961 -				>> (PAGE_SHIFT - 10);
  15.962 -	} else if (S_ISREG(swap_inode->i_mode))
  15.963 -		swapfilesize = swap_inode->i_size >> PAGE_SHIFT;
  15.964 -	else
  15.965 -		goto bad_swap;
  15.966 -
  15.967 -	error = -EBUSY;
  15.968 -	for (i = 0 ; i < nr_swapfiles ; i++) {
  15.969 -		struct swap_info_struct *q = &swap_info[i];
  15.970 -		if (i == type || !q->swap_file)
  15.971 -			continue;
  15.972 -		if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping)
  15.973 -			goto bad_swap;
  15.974 -	}
  15.975 -
  15.976 -	swap_header = (void *) __get_free_page(GFP_USER);
  15.977 -	if (!swap_header) {
  15.978 -		printk("Unable to start swapping: out of memory :-)\n");
  15.979 -		error = -ENOMEM;
  15.980 -		goto bad_swap;
  15.981 -	}
  15.982 -
  15.983 -	lock_page(virt_to_page(swap_header));
  15.984 -	rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header);
  15.985 -
  15.986 -	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
  15.987 -		swap_header_version = 1;
  15.988 -	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
  15.989 -		swap_header_version = 2;
  15.990 -	else {
  15.991 -		printk("Unable to find swap-space signature\n");
  15.992 -		error = -EINVAL;
  15.993 -		goto bad_swap;
  15.994 -	}
  15.995 -	
  15.996 -	switch (swap_header_version) {
  15.997 -	case 1:
  15.998 -		memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
  15.999 -		j = 0;
 15.1000 -		p->lowest_bit = 0;
 15.1001 -		p->highest_bit = 0;
 15.1002 -		for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
 15.1003 -			if (test_bit(i,(char *) swap_header)) {
 15.1004 -				if (!p->lowest_bit)
 15.1005 -					p->lowest_bit = i;
 15.1006 -				p->highest_bit = i;
 15.1007 -				maxpages = i+1;
 15.1008 -				j++;
 15.1009 -			}
 15.1010 -		}
 15.1011 -		nr_good_pages = j;
 15.1012 -		p->swap_map = vmalloc(maxpages * sizeof(short));
 15.1013 -		if (!p->swap_map) {
 15.1014 -			error = -ENOMEM;		
 15.1015 -			goto bad_swap;
 15.1016 -		}
 15.1017 -		for (i = 1 ; i < maxpages ; i++) {
 15.1018 -			if (test_bit(i,(char *) swap_header))
 15.1019 -				p->swap_map[i] = 0;
 15.1020 -			else
 15.1021 -				p->swap_map[i] = SWAP_MAP_BAD;
 15.1022 -		}
 15.1023 -		break;
 15.1024 -
 15.1025 -	case 2:
 15.1026 -		/* Check the swap header's sub-version and the size of
 15.1027 -                   the swap file and bad block lists */
 15.1028 -		if (swap_header->info.version != 1) {
 15.1029 -			printk(KERN_WARNING
 15.1030 -			       "Unable to handle swap header version %d\n",
 15.1031 -			       swap_header->info.version);
 15.1032 -			error = -EINVAL;
 15.1033 -			goto bad_swap;
 15.1034 -		}
 15.1035 -
 15.1036 -		p->lowest_bit  = 1;
 15.1037 -		maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1;
 15.1038 -		if (maxpages > swap_header->info.last_page)
 15.1039 -			maxpages = swap_header->info.last_page;
 15.1040 -		p->highest_bit = maxpages - 1;
 15.1041 -
 15.1042 -		error = -EINVAL;
 15.1043 -		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
 15.1044 -			goto bad_swap;
 15.1045 -		
 15.1046 -		/* OK, set up the swap map and apply the bad block list */
 15.1047 -		if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
 15.1048 -			error = -ENOMEM;
 15.1049 -			goto bad_swap;
 15.1050 -		}
 15.1051 -
 15.1052 -		error = 0;
 15.1053 -		memset(p->swap_map, 0, maxpages * sizeof(short));
 15.1054 -		for (i=0; i<swap_header->info.nr_badpages; i++) {
 15.1055 -			int page = swap_header->info.badpages[i];
 15.1056 -			if (page <= 0 || page >= swap_header->info.last_page)
 15.1057 -				error = -EINVAL;
 15.1058 -			else
 15.1059 -				p->swap_map[page] = SWAP_MAP_BAD;
 15.1060 -		}
 15.1061 -		nr_good_pages = swap_header->info.last_page -
 15.1062 -				swap_header->info.nr_badpages -
 15.1063 -				1 /* header page */;
 15.1064 -		if (error) 
 15.1065 -			goto bad_swap;
 15.1066 -	}
 15.1067 -	
 15.1068 -	if (swapfilesize && maxpages > swapfilesize) {
 15.1069 -		printk(KERN_WARNING
 15.1070 -		       "Swap area shorter than signature indicates\n");
 15.1071 -		error = -EINVAL;
 15.1072 -		goto bad_swap;
 15.1073 -	}
 15.1074 -	if (!nr_good_pages) {
 15.1075 -		printk(KERN_WARNING "Empty swap-file\n");
 15.1076 -		error = -EINVAL;
 15.1077 -		goto bad_swap;
 15.1078 -	}
 15.1079 -	p->swap_map[0] = SWAP_MAP_BAD;
 15.1080 -	swap_list_lock();
 15.1081 -	swap_device_lock(p);
 15.1082 -	p->max = maxpages;
 15.1083 -	p->flags = SWP_WRITEOK;
 15.1084 -	p->pages = nr_good_pages;
 15.1085 -	nr_swap_pages += nr_good_pages;
 15.1086 -	total_swap_pages += nr_good_pages;
 15.1087 -	printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
 15.1088 -	       nr_good_pages<<(PAGE_SHIFT-10), p->prio);
 15.1089 -
 15.1090 -	/* insert swap space into swap_list: */
 15.1091 -	prev = -1;
 15.1092 -	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
 15.1093 -		if (p->prio >= swap_info[i].prio) {
 15.1094 -			break;
 15.1095 -		}
 15.1096 -		prev = i;
 15.1097 -	}
 15.1098 -	p->next = i;
 15.1099 -	if (prev < 0) {
 15.1100 -		swap_list.head = swap_list.next = p - swap_info;
 15.1101 -	} else {
 15.1102 -		swap_info[prev].next = p - swap_info;
 15.1103 -	}
 15.1104 -	swap_device_unlock(p);
 15.1105 -	swap_list_unlock();
 15.1106 -	error = 0;
 15.1107 -	goto out;
 15.1108 -bad_swap:
 15.1109 -	if (bdev)
 15.1110 -		blkdev_put(bdev, BDEV_SWAP);
 15.1111 -bad_swap_2:
 15.1112 -	swap_list_lock();
 15.1113 -	swap_map = p->swap_map;
 15.1114 -	nd.mnt = p->swap_vfsmnt;
 15.1115 -	nd.dentry = p->swap_file;
 15.1116 -	p->swap_device = 0;
 15.1117 -	p->swap_file = NULL;
 15.1118 -	p->swap_vfsmnt = NULL;
 15.1119 -	p->swap_map = NULL;
 15.1120 -	p->flags = 0;
 15.1121 -	if (!(swap_flags & SWAP_FLAG_PREFER))
 15.1122 -		++least_priority;
 15.1123 -	swap_list_unlock();
 15.1124 -	if (swap_map)
 15.1125 -		vfree(swap_map);
 15.1126 -	path_release(&nd);
 15.1127 -out:
 15.1128 -	if (swap_header)
 15.1129 -		free_page((long) swap_header);
 15.1130 -	unlock_kernel();
 15.1131 -	return error;
 15.1132 -}
 15.1133 -
 15.1134 -void si_swapinfo(struct sysinfo *val)
 15.1135 -{
 15.1136 -	unsigned int i;
 15.1137 -	unsigned long nr_to_be_unused = 0;
 15.1138 -
 15.1139 -	swap_list_lock();
 15.1140 -	for (i = 0; i < nr_swapfiles; i++) {
 15.1141 -		unsigned int j;
 15.1142 -		if (swap_info[i].flags != SWP_USED)
 15.1143 -			continue;
 15.1144 -		for (j = 0; j < swap_info[i].max; ++j) {
 15.1145 -			switch (swap_info[i].swap_map[j]) {
 15.1146 -				case 0:
 15.1147 -				case SWAP_MAP_BAD:
 15.1148 -					continue;
 15.1149 -				default:
 15.1150 -					nr_to_be_unused++;
 15.1151 -			}
 15.1152 -		}
 15.1153 -	}
 15.1154 -	val->freeswap = nr_swap_pages + nr_to_be_unused;
 15.1155 -	val->totalswap = total_swap_pages + nr_to_be_unused;
 15.1156 -	swap_list_unlock();
 15.1157 -}
 15.1158 -
 15.1159 -/*
 15.1160 - * Verify that a swap entry is valid and increment its swap map count.
 15.1161 - *
 15.1162 - * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
 15.1163 - * "permanent", but will be reclaimed by the next swapoff.
 15.1164 - */
 15.1165 -int swap_duplicate(swp_entry_t entry)
 15.1166 -{
 15.1167 -	struct swap_info_struct * p;
 15.1168 -	unsigned long offset, type;
 15.1169 -	int result = 0;
 15.1170 -
 15.1171 -	type = SWP_TYPE(entry);
 15.1172 -	if (type >= nr_swapfiles)
 15.1173 -		goto bad_file;
 15.1174 -	p = type + swap_info;
 15.1175 -	offset = SWP_OFFSET(entry);
 15.1176 -
 15.1177 -	swap_device_lock(p);
 15.1178 -	if (offset < p->max && p->swap_map[offset]) {
 15.1179 -		if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
 15.1180 -			p->swap_map[offset]++;
 15.1181 -			result = 1;
 15.1182 -		} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
 15.1183 -			if (swap_overflow++ < 5)
 15.1184 -				printk(KERN_WARNING "swap_dup: swap entry overflow\n");
 15.1185 -			p->swap_map[offset] = SWAP_MAP_MAX;
 15.1186 -			result = 1;
 15.1187 -		}
 15.1188 -	}
 15.1189 -	swap_device_unlock(p);
 15.1190 -out:
 15.1191 -	return result;
 15.1192 -
 15.1193 -bad_file:
 15.1194 -	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
 15.1195 -	goto out;
 15.1196 -}
 15.1197 -
 15.1198 -/*
 15.1199 - * Prior swap_duplicate protects against swap device deletion.
 15.1200 - */
 15.1201 -void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, 
 15.1202 -			kdev_t *dev, struct inode **swapf)
 15.1203 -{
 15.1204 -	unsigned long type;
 15.1205 -	struct swap_info_struct *p;
 15.1206 -
 15.1207 -	type = SWP_TYPE(entry);
 15.1208 -	if (type >= nr_swapfiles) {
 15.1209 -		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val);
 15.1210 -		return;
 15.1211 -	}
 15.1212 -
 15.1213 -	p = &swap_info[type];
 15.1214 -	*offset = SWP_OFFSET(entry);
 15.1215 -	if (*offset >= p->max && *offset != 0) {
 15.1216 -		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val);
 15.1217 -		return;
 15.1218 -	}
 15.1219 -	if (p->swap_map && !p->swap_map[*offset]) {
 15.1220 -		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val);
 15.1221 -		return;
 15.1222 -	}
 15.1223 -	if (!(p->flags & SWP_USED)) {
 15.1224 -		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val);
 15.1225 -		return;
 15.1226 -	}
 15.1227 -
 15.1228 -	if (p->swap_device) {
 15.1229 -		*dev = p->swap_device;
 15.1230 -	} else if (p->swap_file) {
 15.1231 -		*swapf = p->swap_file->d_inode;
 15.1232 -	} else {
 15.1233 -		printk(KERN_ERR "rw_swap_page: no swap file or device\n");
 15.1234 -	}
 15.1235 -	return;
 15.1236 -}
 15.1237 -
 15.1238 -/*
 15.1239 - * swap_device_lock prevents swap_map being freed. Don't grab an extra
 15.1240 - * reference on the swaphandle, it doesn't matter if it becomes unused.
 15.1241 - */
 15.1242 -int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 15.1243 -{
 15.1244 -	int ret = 0, i = 1 << page_cluster;
 15.1245 -	unsigned long toff;
 15.1246 -	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
 15.1247 -
 15.1248 -	if (!page_cluster)	/* no readahead */
 15.1249 -		return 0;
 15.1250 -	toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster;
 15.1251 -	if (!toff)		/* first page is swap header */
 15.1252 -		toff++, i--;
 15.1253 -	*offset = toff;
 15.1254 -
 15.1255 -	swap_device_lock(swapdev);
 15.1256 -	do {
 15.1257 -		/* Don't read-ahead past the end of the swap area */
 15.1258 -		if (toff >= swapdev->max)
 15.1259 -			break;
 15.1260 -		/* Don't read in free or bad pages */
 15.1261 -		if (!swapdev->swap_map[toff])
 15.1262 -			break;
 15.1263 -		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
 15.1264 -			break;
 15.1265 -		toff++;
 15.1266 -		ret++;
 15.1267 -	} while (--i);
 15.1268 -	swap_device_unlock(swapdev);
 15.1269 -	return ret;
 15.1270 -}
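
The sparse override of mm/swapfile.c is deleted outright: the only Xen-specific
line in it is the XEN_flush_page_update_queue() call in unuse_process() (15.470
above), which is meaningless once there is no queue. The build now picks up the
pristine 2.4.29 file, whose unuse_process() differs only in dropping that flush:

	static void unuse_process(struct mm_struct * mm,
				swp_entry_t entry, struct page* page)
	{
		struct vm_area_struct* vma;

		/*
		 * Go through process' page directory.
		 */
		spin_lock(&mm->page_table_lock);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			pgd_t * pgd = pgd_offset(mm, vma->vm_start);
			unuse_vma(vma, pgd, entry, page);
		}
		spin_unlock(&mm->page_table_lock);
	}
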
    16.1 --- a/linux-2.4.29-xen-sparse/mm/vmalloc.c	Fri Mar 25 19:30:52 2005 +0000
    16.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.3 @@ -1,385 +0,0 @@
    16.4 -/*
    16.5 - *  linux/mm/vmalloc.c
    16.6 - *
    16.7 - *  Copyright (C) 1993  Linus Torvalds
    16.8 - *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
    16.9 - *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
   16.10 - */
   16.11 -
   16.12 -#include <linux/config.h>
   16.13 -#include <linux/slab.h>
   16.14 -#include <linux/vmalloc.h>
   16.15 -#include <linux/spinlock.h>
   16.16 -#include <linux/highmem.h>
   16.17 -#include <linux/smp_lock.h>
   16.18 -
   16.19 -#include <asm/uaccess.h>
   16.20 -#include <asm/pgalloc.h>
   16.21 -
   16.22 -rwlock_t vmlist_lock = RW_LOCK_UNLOCKED;
   16.23 -struct vm_struct * vmlist;
   16.24 -
   16.25 -static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size)
   16.26 -{
   16.27 -	pte_t * pte;
   16.28 -	unsigned long end;
   16.29 -
   16.30 -	if (pmd_none(*pmd))
   16.31 -		return;
   16.32 -	if (pmd_bad(*pmd)) {
   16.33 -		pmd_ERROR(*pmd);
   16.34 -		pmd_clear(pmd);
   16.35 -		return;
   16.36 -	}
   16.37 -	pte = pte_offset(pmd, address);
   16.38 -	address &= ~PMD_MASK;
   16.39 -	end = address + size;
   16.40 -	if (end > PMD_SIZE)
   16.41 -		end = PMD_SIZE;
   16.42 -	do {
   16.43 -		pte_t page;
   16.44 -		page = ptep_get_and_clear(pte);
   16.45 -		address += PAGE_SIZE;
   16.46 -		pte++;
   16.47 -		if (pte_none(page))
   16.48 -			continue;
   16.49 -		if (pte_present(page)) {
   16.50 -			struct page *ptpage = pte_page(page);
   16.51 -			if (VALID_PAGE(ptpage) && (!PageReserved(ptpage)))
   16.52 -				__free_page(ptpage);
   16.53 -			continue;
   16.54 -		}
   16.55 -		printk(KERN_CRIT "Whee.. Swapped out page in kernel page table\n");
   16.56 -	} while (address < end);
   16.57 -}
   16.58 -
   16.59 -static inline void free_area_pmd(pgd_t * dir, unsigned long address, unsigned long size)
   16.60 -{
   16.61 -	pmd_t * pmd;
   16.62 -	unsigned long end;
   16.63 -
   16.64 -	if (pgd_none(*dir))
   16.65 -		return;
   16.66 -	if (pgd_bad(*dir)) {
   16.67 -		pgd_ERROR(*dir);
   16.68 -		pgd_clear(dir);
   16.69 -		return;
   16.70 -	}
   16.71 -	pmd = pmd_offset(dir, address);
   16.72 -	address &= ~PGDIR_MASK;
   16.73 -	end = address + size;
   16.74 -	if (end > PGDIR_SIZE)
   16.75 -		end = PGDIR_SIZE;
   16.76 -	do {
   16.77 -		free_area_pte(pmd, address, end - address);
   16.78 -		address = (address + PMD_SIZE) & PMD_MASK;
   16.79 -		pmd++;
   16.80 -	} while (address < end);
   16.81 -}
   16.82 -
   16.83 -void vmfree_area_pages(unsigned long address, unsigned long size)
   16.84 -{
   16.85 -	pgd_t * dir;
   16.86 -	unsigned long end = address + size;
   16.87 -
   16.88 -	dir = pgd_offset_k(address);
   16.89 -	flush_cache_all();
   16.90 -	do {
   16.91 -		free_area_pmd(dir, address, end - address);
   16.92 -		address = (address + PGDIR_SIZE) & PGDIR_MASK;
   16.93 -		dir++;
   16.94 -	} while (address && (address < end));
   16.95 -	flush_tlb_all();
   16.96 -}
   16.97 -
   16.98 -static inline int alloc_area_pte (pte_t * pte, unsigned long address,
   16.99 -			unsigned long size, int gfp_mask,
  16.100 -			pgprot_t prot, struct page ***pages)
  16.101 -{
  16.102 -	unsigned long end;
  16.103 -
  16.104 -	address &= ~PMD_MASK;
  16.105 -	end = address + size;
  16.106 -	if (end > PMD_SIZE)
  16.107 -		end = PMD_SIZE;
  16.108 -	do {
  16.109 -		struct page * page;
  16.110 -
  16.111 -		if (!pages) {
  16.112 -			spin_unlock(&init_mm.page_table_lock);
  16.113 -			page = alloc_page(gfp_mask);
  16.114 -			spin_lock(&init_mm.page_table_lock);
  16.115 -		} else {
  16.116 -			page = (**pages);
  16.117 -			(*pages)++;
  16.118 -
  16.119 -			/* Add a reference to the page so we can free later */
  16.120 -			if (page)
  16.121 -				atomic_inc(&page->count);
  16.122 -
  16.123 -		}
  16.124 -		if (!pte_none(*pte))
  16.125 -			printk(KERN_ERR "alloc_area_pte: page already exists\n");
  16.126 -		if (!page)
  16.127 -			return -ENOMEM;
  16.128 -		set_pte(pte, mk_pte(page, prot));
  16.129 -		address += PAGE_SIZE;
  16.130 -		pte++;
  16.131 -	} while (address < end);
  16.132 -	return 0;
  16.133 -}
  16.134 -
  16.135 -static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address,
  16.136 -			unsigned long size, int gfp_mask,
  16.137 -			pgprot_t prot, struct page ***pages)
  16.138 -{
  16.139 -	unsigned long end;
  16.140 -
  16.141 -	address &= ~PGDIR_MASK;
  16.142 -	end = address + size;
  16.143 -	if (end > PGDIR_SIZE)
  16.144 -		end = PGDIR_SIZE;
  16.145 -	do {
  16.146 -		pte_t * pte = pte_alloc(&init_mm, pmd, address);
  16.147 -		if (!pte)
  16.148 -			return -ENOMEM;
  16.149 -		if (alloc_area_pte(pte, address, end - address,
  16.150 -					gfp_mask, prot, pages))
  16.151 -			return -ENOMEM;
  16.152 -		address = (address + PMD_SIZE) & PMD_MASK;
  16.153 -		pmd++;
  16.154 -	} while (address < end);
  16.155 -	return 0;
  16.156 -}
  16.157 -
  16.158 -/*static inline*/ int __vmalloc_area_pages (unsigned long address,
  16.159 -					unsigned long size,
  16.160 -					int gfp_mask,
  16.161 -					pgprot_t prot,
  16.162 -					struct page ***pages)
  16.163 -{
  16.164 -	pgd_t * dir;
  16.165 -	unsigned long start = address;
  16.166 -	unsigned long end = address + size;
  16.167 -
  16.168 -	dir = pgd_offset_k(address);
  16.169 -	spin_lock(&init_mm.page_table_lock);
  16.170 -	do {
  16.171 -		pmd_t *pmd;
  16.172 -		
  16.173 -		pmd = pmd_alloc(&init_mm, dir, address);
  16.174 -		if (!pmd)
  16.175 -			goto err;
  16.176 -
  16.177 -		if (alloc_area_pmd(pmd, address, end - address, gfp_mask, prot, pages))
  16.178 -			goto err;	// The kernel NEVER reclaims pmds, so no need to undo pmd_alloc() here
  16.179 -
  16.180 -		address = (address + PGDIR_SIZE) & PGDIR_MASK;
  16.181 -		dir++;
  16.182 -	} while (address && (address < end));
  16.183 -	spin_unlock(&init_mm.page_table_lock);
  16.184 -	flush_cache_all();
  16.185 -	XEN_flush_page_update_queue();
  16.186 -	return 0;
  16.187 -err:
  16.188 -	spin_unlock(&init_mm.page_table_lock);
  16.189 -	flush_cache_all();
  16.190 -	if (address > start)
  16.191 -		vmfree_area_pages(start, address - start);
  16.192 -	return -ENOMEM;
  16.193 -}
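The only Xen-specific line in __vmalloc_area_pages() is the XEN_flush_page_update_queue() call: on the queued page-table interface, set_pte() records an mmu_update request rather than writing the PTE, and the queue must be pushed to the hypervisor before the new mappings are relied upon. A minimal user-space sketch of that batch-then-flush pattern (all names invented for illustration):

#include <stdio.h>

#define QUEUE_MAX 8

struct update { unsigned long ptr, val; };
static struct update queue[QUEUE_MAX];
static int idx;

static void flush_queue(void)          /* stands in for the hypercall */
{
    for (int i = 0; i < idx; i++)
        printf("apply: *%#lx = %#lx\n", queue[i].ptr, queue[i].val);
    idx = 0;
}

static void queue_update(unsigned long ptr, unsigned long val)
{
    queue[idx].ptr = ptr;
    queue[idx].val = val;
    if (++idx == QUEUE_MAX)            /* auto-flush when full */
        flush_queue();
}

int main(void)
{
    queue_update(0x1000, 0x2007);
    queue_update(0x1008, 0x3007);
    flush_queue();   /* explicit flush before the mappings are used */
    return 0;
}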
  16.194 -
  16.195 -int vmalloc_area_pages(unsigned long address, unsigned long size,
  16.196 -		       int gfp_mask, pgprot_t prot)
  16.197 -{
  16.198 -	return __vmalloc_area_pages(address, size, gfp_mask, prot, NULL);
  16.199 -}
  16.200 -
  16.201 -struct vm_struct * get_vm_area(unsigned long size, unsigned long flags)
  16.202 -{
  16.203 -	unsigned long addr, next;
  16.204 -	struct vm_struct **p, *tmp, *area;
  16.205 -
  16.206 -	area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
  16.207 -	if (!area)
  16.208 -		return NULL;
  16.209 -
  16.210 -	size += PAGE_SIZE;
  16.211 -	if (!size) {
  16.212 -		kfree (area);
  16.213 -		return NULL;
  16.214 -	}
  16.215 -
  16.216 -	addr = VMALLOC_START;
  16.217 -	write_lock(&vmlist_lock);
  16.218 -	for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
  16.219 -		if ((size + addr) < addr)
  16.220 -			goto out;
  16.221 -		if (size + addr <= (unsigned long) tmp->addr)
  16.222 -			break;
  16.223 -		next = tmp->size + (unsigned long) tmp->addr;
  16.224 -		if (next > addr) 
  16.225 -			addr = next;
  16.226 -		if (addr > VMALLOC_END-size)
  16.227 -			goto out;
  16.228 -	}
  16.229 -	area->flags = flags;
  16.230 -	area->addr = (void *)addr;
  16.231 -	area->size = size;
  16.232 -	area->next = *p;
  16.233 -	*p = area;
  16.234 -	write_unlock(&vmlist_lock);
  16.235 -	return area;
  16.236 -
  16.237 -out:
  16.238 -	write_unlock(&vmlist_lock);
  16.239 -	kfree(area);
  16.240 -	return NULL;
  16.241 -}
  16.242 -
  16.243 -void __vfree(void * addr, int free_area_pages)
  16.244 -{
  16.245 -	struct vm_struct **p, *tmp;
  16.246 -
  16.247 -	if (!addr)
  16.248 -		return;
  16.249 -	if ((PAGE_SIZE-1) & (unsigned long) addr) {
  16.250 -		printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
  16.251 -		return;
  16.252 -	}
  16.253 -	write_lock(&vmlist_lock);
  16.254 -	for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
  16.255 -		if (tmp->addr == addr) {
  16.256 -			*p = tmp->next;
  16.257 -			if (free_area_pages)
  16.258 -				vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size);
  16.259 -			write_unlock(&vmlist_lock);
  16.260 -			kfree(tmp);
  16.261 -			return;
  16.262 -		}
  16.263 -	}
  16.264 -	write_unlock(&vmlist_lock);
  16.265 -	printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr);
  16.266 -}
  16.267 -
  16.268 -void vfree(void * addr)
  16.269 -{
  16.270 -	__vfree(addr,1);
  16.271 -}
  16.272 -
  16.273 -void * __vmalloc (unsigned long size, int gfp_mask, pgprot_t prot)
  16.274 -{
  16.275 -	void * addr;
  16.276 -	struct vm_struct *area;
  16.277 -
  16.278 -	size = PAGE_ALIGN(size);
  16.279 -	if (!size || (size >> PAGE_SHIFT) > num_physpages)
  16.280 -		return NULL;
  16.281 -	area = get_vm_area(size, VM_ALLOC);
  16.282 -	if (!area)
  16.283 -		return NULL;
  16.284 -	addr = area->addr;
  16.285 -	if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, gfp_mask,
  16.286 -				 prot, NULL)) {
  16.287 -		__vfree(addr, 0);
  16.288 -		return NULL;
  16.289 -	}
  16.290 -	return addr;
  16.291 -}
  16.292 -
  16.293 -void * vmap(struct page **pages, int count,
  16.294 -	    unsigned long flags, pgprot_t prot)
  16.295 -{
  16.296 -	void * addr;
  16.297 -	struct vm_struct *area;
  16.298 -	unsigned long size = count << PAGE_SHIFT;
  16.299 -
  16.300 -	if (!size || size > (max_mapnr << PAGE_SHIFT))
  16.301 -		return NULL;
  16.302 -	area = get_vm_area(size, flags);
  16.303 -	if (!area) {
  16.304 -		return NULL;
  16.305 -	}
  16.306 -	addr = area->addr;
  16.307 -	if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, 0,
  16.308 -				 prot, &pages)) {
  16.309 -		__vfree(addr, 0);
  16.310 -		return NULL;
  16.311 -	}
  16.312 -	return addr;
  16.313 -}
  16.314 -
  16.315 -long vread(char *buf, char *addr, unsigned long count)
  16.316 -{
  16.317 -	struct vm_struct *tmp;
  16.318 -	char *vaddr, *buf_start = buf;
  16.319 -	unsigned long n;
  16.320 -
  16.321 -	/* Don't allow overflow */
  16.322 -	if ((unsigned long) addr + count < count)
  16.323 -		count = -(unsigned long) addr;
  16.324 -
  16.325 -	read_lock(&vmlist_lock);
  16.326 -	for (tmp = vmlist; tmp; tmp = tmp->next) {
  16.327 -		vaddr = (char *) tmp->addr;
  16.328 -		if (addr >= vaddr + tmp->size - PAGE_SIZE)
  16.329 -			continue;
  16.330 -		while (addr < vaddr) {
  16.331 -			if (count == 0)
  16.332 -				goto finished;
  16.333 -			*buf = '\0';
  16.334 -			buf++;
  16.335 -			addr++;
  16.336 -			count--;
  16.337 -		}
  16.338 -		n = vaddr + tmp->size - PAGE_SIZE - addr;
  16.339 -		do {
  16.340 -			if (count == 0)
  16.341 -				goto finished;
  16.342 -			*buf = *addr;
  16.343 -			buf++;
  16.344 -			addr++;
  16.345 -			count--;
  16.346 -		} while (--n > 0);
  16.347 -	}
  16.348 -finished:
  16.349 -	read_unlock(&vmlist_lock);
  16.350 -	return buf - buf_start;
  16.351 -}
  16.352 -
  16.353 -long vwrite(char *buf, char *addr, unsigned long count)
  16.354 -{
  16.355 -	struct vm_struct *tmp;
  16.356 -	char *vaddr, *buf_start = buf;
  16.357 -	unsigned long n;
  16.358 -
  16.359 -	/* Don't allow overflow */
  16.360 -	if ((unsigned long) addr + count < count)
  16.361 -		count = -(unsigned long) addr;
  16.362 -
  16.363 -	read_lock(&vmlist_lock);
  16.364 -	for (tmp = vmlist; tmp; tmp = tmp->next) {
  16.365 -		vaddr = (char *) tmp->addr;
  16.366 -		if (addr >= vaddr + tmp->size - PAGE_SIZE)
  16.367 -			continue;
  16.368 -		while (addr < vaddr) {
  16.369 -			if (count == 0)
  16.370 -				goto finished;
  16.371 -			buf++;
  16.372 -			addr++;
  16.373 -			count--;
  16.374 -		}
  16.375 -		n = vaddr + tmp->size - PAGE_SIZE - addr;
  16.376 -		do {
  16.377 -			if (count == 0)
  16.378 -				goto finished;
  16.379 -			*addr = *buf;
  16.380 -			buf++;
  16.381 -			addr++;
  16.382 -			count--;
  16.383 -		} while (--n > 0);
  16.384 -	}
  16.385 -finished:
  16.386 -	read_unlock(&vmlist_lock);
  16.387 -	return buf - buf_start;
  16.388 -}
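That is the last of the vmalloc.c override: with the writable-pagetables assist enabled, the stock 2.4 implementation works unmodified because direct PTE stores are trapped and emulated by Xen, which is why this file (and its .rootkeys entry, along with fs/exec.c and mm/swapfile.c) can be dropped from the sparse tree. A rough user-space analogue of the trap-and-emulate idea, with mprotect() and SIGSEGV standing in for the hypervisor (illustrative only; mprotect() is not formally async-signal-safe, and a real implementation would re-protect and validate the page afterwards):

#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>

static unsigned long *pt;   /* stands in for a guest page-table page */

static void handler(int sig, siginfo_t *si, void *uc)
{
    (void)sig; (void)uc;
    /* "hypervisor" grants write access so the faulting store can
     * complete when retried */
    mprotect((void *)((unsigned long)si->si_addr & ~0xfffUL),
             4096, PROT_READ | PROT_WRITE);
}

int main(void)
{
    struct sigaction sa = { 0 };
    sa.sa_sigaction = handler;
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &sa, NULL);

    pt = mmap(NULL, 4096, PROT_READ,
              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (pt == MAP_FAILED)
        return 1;
    pt[0] = 0x2007;            /* faults; handler grants access and
                                  the store is retried and succeeds */
    printf("pte[0] = %#lx\n", pt[0]);
    return 0;
}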
    17.1 --- a/linux-2.6.11-xen-sparse/arch/xen/Kconfig	Fri Mar 25 19:30:52 2005 +0000
    17.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/Kconfig	Fri Mar 25 20:03:52 2005 +0000
    17.3 @@ -114,10 +114,6 @@ config XEN_BLKDEV_TAP
    17.4  	  to a character device, allowing device prototyping in application
    17.5  	  space.  Odds are that you want to say N here.
    17.6  
    17.7 -config XEN_WRITABLE_PAGETABLES
    17.8 -	bool
    17.9 -	default y
   17.10 -
   17.11  config XEN_SCRUB_PAGES
   17.12  	bool "Scrub memory before freeing it to Xen"
   17.13  	default y
    18.1 --- a/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig	Fri Mar 25 19:30:52 2005 +0000
    18.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig	Fri Mar 25 20:03:52 2005 +0000
    18.3 @@ -19,7 +19,6 @@ CONFIG_XEN_BLKDEV_FRONTEND=y
    18.4  CONFIG_XEN_NETDEV_FRONTEND=y
    18.5  # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set
    18.6  # CONFIG_XEN_BLKDEV_TAP is not set
    18.7 -CONFIG_XEN_WRITABLE_PAGETABLES=y
    18.8  CONFIG_XEN_SCRUB_PAGES=y
    18.9  CONFIG_X86=y
   18.10  # CONFIG_X86_64 is not set
    19.1 --- a/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig	Fri Mar 25 19:30:52 2005 +0000
    19.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig	Fri Mar 25 20:03:52 2005 +0000
    19.3 @@ -16,7 +16,6 @@ CONFIG_XEN_BLKDEV_FRONTEND=y
    19.4  CONFIG_XEN_NETDEV_FRONTEND=y
    19.5  # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set
    19.6  # CONFIG_XEN_BLKDEV_TAP is not set
    19.7 -CONFIG_XEN_WRITABLE_PAGETABLES=y
    19.8  CONFIG_XEN_SCRUB_PAGES=y
    19.9  CONFIG_X86=y
   19.10  # CONFIG_X86_64 is not set
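CONFIG_XEN_WRITABLE_PAGETABLES was a promptless bool with `default y`, so it was defined in every build; deleting it from Kconfig and both defconfigs therefore changes nothing functionally, it simply removes the pretence that the mode is optional. Sketch of the equivalence:

#include <stdio.h>

#define CONFIG_XEN_WRITABLE_PAGETABLES 1   /* the old, fixed state */

int main(void)
{
#ifdef CONFIG_XEN_WRITABLE_PAGETABLES
    puts("writable-pagetables code compiled in (always the case)");
#else
    puts("unreachable: a promptless default-y bool is never unset");
#endif
    return 0;
}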
    20.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c	Fri Mar 25 19:30:52 2005 +0000
    20.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c	Fri Mar 25 20:03:52 2005 +0000
    20.3 @@ -963,7 +963,7 @@ void __init trap_init(void)
    20.4  	 * and a callgate to lcall27 for Solaris/x86 binaries
    20.5  	 */
    20.6  	make_lowmem_page_readonly(&default_ldt[0]);
    20.7 -	xen_flush_page_update_queue();
    20.8 +	flush_page_update_queue();
    20.9  
   20.10  	/*
   20.11  	 * Should be a barrier for any external CPU state.
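The rename from xen_flush_page_update_queue() to flush_page_update_queue() is not merely cosmetic: the former was an unconditional wrapper around _flush_page_update_queue() (deleted from hypervisor.h at the end of this changeset), while the surviving macro only issues the hypercall when the per-CPU queue is non-empty. A compilable model of the kept, conditional flavour (bodies invented; the real index is per-CPU):

#include <stdio.h>

static int mmu_update_queue_idx;

static void _flush_page_update_queue(void)
{
    printf("MMU_UPDATE hypercall, %d entries\n", mmu_update_queue_idx);
    mmu_update_queue_idx = 0;
}

#define flush_page_update_queue() do {     \
    if (mmu_update_queue_idx)              \
        _flush_page_update_queue();        \
} while (0)

int main(void)
{
    flush_page_update_queue();             /* empty: no hypercall  */
    mmu_update_queue_idx = 2;
    flush_page_update_queue();             /* non-empty: one call  */
    return 0;
}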
    21.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c	Fri Mar 25 19:30:52 2005 +0000
    21.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c	Fri Mar 25 20:03:52 2005 +0000
    21.3 @@ -553,7 +553,6 @@ vmalloc_fault:
    21.4  		if (!pmd_present(*pmd_k))
    21.5  			goto no_context;
    21.6  		set_pmd(pmd, *pmd_k);
    21.7 -		xen_flush_page_update_queue(); /* flush PMD update */
    21.8  
    21.9  		pte_k = pte_offset_kernel(pmd_k, address);
   21.10  		if (!pte_present(*pte_k))
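The vmalloc_fault path copies a kernel-space pmd entry from the reference page tables into the faulting directory; under writable pagetables the set_pmd() store is trapped and applied synchronously by Xen, so the explicit queue flush that used to follow it is dead code. The fix-up itself, modelled in plain C with toy tables:

#include <stdio.h>

#define PTRS_PER_PGD 4

static unsigned long init_pgd[PTRS_PER_PGD] = { 0, 0x2063, 0, 0 };
static unsigned long task_pgd[PTRS_PER_PGD];   /* missing the entry */

int main(void)
{
    int i = 1;                                 /* pgd_index(address) */
    if (!task_pgd[i] && init_pgd[i]) {
        /* set_pmd(pmd, *pmd_k): under writable pagetables this plain
         * store is trapped and validated; no flush required */
        task_pgd[i] = init_pgd[i];
        printf("propagated entry %d: %#lx\n", i, task_pgd[i]);
    }
    return 0;
}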
    22.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c	Fri Mar 25 19:30:52 2005 +0000
    22.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c	Fri Mar 25 20:03:52 2005 +0000
    22.3 @@ -48,19 +48,12 @@
    22.4   */
    22.5  static spinlock_t update_lock = SPIN_LOCK_UNLOCKED;
    22.6  
    22.7 -/* Linux 2.6 isn't using the traditional batched interface. */
    22.8 +#define QUEUE_SIZE 1 /*128*/
    22.9  #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
   22.10 -#define QUEUE_SIZE 2048
   22.11  #define pte_offset_kernel pte_offset
   22.12 -#define pmd_val_ma(v) (v).pmd;
   22.13  #define pud_t pgd_t
   22.14  #define pud_offset(d, va) d
   22.15  #else
   22.16 -#ifdef CONFIG_SMP
   22.17 -#define QUEUE_SIZE 1
   22.18 -#else
   22.19 -#define QUEUE_SIZE 128
   22.20 -#endif
   22.21  #define pmd_val_ma(v) (v).pud.pgd.pgd;
   22.22  #endif
   22.23  
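QUEUE_SIZE collapses from three variants (2048 on 2.4, 1 on 2.6/SMP, 128 on 2.6/UP) to a single definition, currently pinned at 1 with the intended 128 parked in a comment, presumably a debugging aid while the 2.4 boot failure from the changelog is chased: a queue of one flushes on every insertion, so batching is effectively disabled. (Aside: the surviving pmd_val_ma() keeps a stray trailing semicolon, apparently tolerable only because callers expand it at the end of a statement.) Sketch of the degenerate queue:

#include <stdio.h>

#define QUEUE_SIZE 1 /*128*/                /* as in the hunk above */
static int idx;

static void queue_update(unsigned long ptr, unsigned long val)
{
    printf("queue *%#lx = %#lx\n", ptr, val);
    if (++idx == QUEUE_SIZE) {              /* full after one entry */
        printf("  flush\n");
        idx = 0;
    }
}

int main(void)
{
    queue_update(0x1000, 0x1063);           /* flushes immediately */
    queue_update(0x1008, 0x2063);           /* and again           */
    return 0;
}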
    23.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c	Fri Mar 25 19:30:52 2005 +0000
    23.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c	Fri Mar 25 20:03:52 2005 +0000
    23.3 @@ -195,7 +195,7 @@ pte_t *pte_alloc_one_kernel(struct mm_st
    23.4  	pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
    23.5  	if (pte) {
    23.6  		make_page_readonly(pte);
    23.7 -		xen_flush_page_update_queue();
    23.8 +		flush_page_update_queue();
    23.9  	}
   23.10  	return pte;
   23.11  }
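Same rename as in traps.c, but the surrounding code is the interesting part for this changeset: a freshly allocated PTE page must be mapped read-only in the guest before Xen will accept it as a page table, and with writable pagetables it is then Xen, not the kernel, that mediates stores to it. A user-space model of the allocate-zero-protect sequence, with mmap()/mprotect() standing in for __get_free_page()/make_page_readonly():

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    /* MAP_ANONYMOUS memory arrives zeroed, like __GFP_ZERO */
    unsigned long *pte = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (pte == MAP_FAILED)
        return 1;
    /* make_page_readonly() analogue: no writable mapping may remain
     * once the page is in use as a page table */
    mprotect(pte, 4096, PROT_READ);
    printf("pte page ready (read-only) at %p\n", (void *)pte);
    return 0;
}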
    24.1 --- a/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c	Fri Mar 25 19:30:52 2005 +0000
    24.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c	Fri Mar 25 20:03:52 2005 +0000
    24.3 @@ -109,10 +109,8 @@ static void __do_suspend(void)
    24.4  
    24.5      HYPERVISOR_vm_assist(VMASST_CMD_enable,
    24.6  			 VMASST_TYPE_4gb_segments);
    24.7 -#ifdef CONFIG_XEN_WRITABLE_PAGETABLES
    24.8      HYPERVISOR_vm_assist(VMASST_CMD_enable,
    24.9  			 VMASST_TYPE_writable_pagetables);
   24.10 -#endif
   24.11  
   24.12      shutting_down = -1; 
   24.13  
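vm_assist settings are per-domain state that does not survive a save/restore cycle, so __do_suspend() re-enables them on resume; with the config symbol gone, writable pagetables is re-enabled unconditionally alongside 4GB segments, before any page-table activity resumes. Sketch of the resume ordering (stand-in names):

#include <stdio.h>

enum { ASSIST_4GB_SEGMENTS, ASSIST_WRITABLE_PAGETABLES };

static void vm_assist_enable(int type)      /* hypercall stand-in */
{
    printf("re-enabled assist %d after restore\n", type);
}

int main(void)
{
    /* restore has reset all per-domain assists, so: */
    vm_assist_enable(ASSIST_4GB_SEGMENTS);
    vm_assist_enable(ASSIST_WRITABLE_PAGETABLES);   /* now always */
    return 0;
}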
    25.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h	Fri Mar 25 19:30:52 2005 +0000
    25.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h	Fri Mar 25 20:03:52 2005 +0000
    25.3 @@ -111,7 +111,7 @@ typedef struct { unsigned long pgprot; }
    25.4  static inline unsigned long pgd_val(pgd_t x)
    25.5  {
    25.6  	unsigned long ret = x.pgd;
    25.7 -	if (ret) ret = machine_to_phys(ret);
    25.8 +	if (ret) ret = machine_to_phys(ret) | 1;
    25.9  	return ret;
   25.10  }
   25.11  #define pgprot_val(x)	((x).pgprot)
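pgd_val() now forces bit 0 (_PAGE_PRESENT) back on after translating machine to pseudo-physical, presumably because Xen can transiently clear the present bit of a directory entry while the L1 page beneath it is unhooked for writing, and generic code reading the entry back should still see it as present. Toy model with a made-up M2P table (4K pages, low 12 bits are flags):

#include <stdio.h>

#define PAGE_SHIFT 12

static const unsigned long m2p[] = { 7, 3, 9, 1 };   /* toy M2P table */

static unsigned long machine_to_phys(unsigned long maddr)
{
    unsigned long pfn = m2p[maddr >> PAGE_SHIFT];
    return (pfn << PAGE_SHIFT) | (maddr & 0xfff);    /* keep flags */
}

static unsigned long pgd_val(unsigned long pgd)
{
    if (pgd)
        pgd = machine_to_phys(pgd) | 1;  /* re-assert _PAGE_PRESENT */
    return pgd;
}

int main(void)
{
    /* entry whose present bit the hypervisor has cleared: 0x2066 */
    printf("pgd_val = %#lx\n", pgd_val(0x2066));     /* prints 0x9067 */
    return 0;
}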
    26.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Fri Mar 25 19:30:52 2005 +0000
    26.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Fri Mar 25 20:03:52 2005 +0000
    26.3 @@ -407,7 +407,6 @@ extern void noexec_setup(const char *str
    26.4  	do {								  \
    26.5  		if (__dirty) {						  \
    26.6  		        if ( likely((__vma)->vm_mm == current->mm) ) {    \
    26.7 -			    xen_flush_page_update_queue();                \
    26.8  			    HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG); \
    26.9  			} else {                                          \
   26.10                              xen_l1_entry_update((__ptep), (__entry).pte_low); \
   26.11 @@ -426,7 +425,6 @@ do {				  					\
   26.12  #define ptep_establish_new(__vma, __address, __ptep, __entry)		\
   26.13  do {				  					\
   26.14  	if (likely((__vma)->vm_mm == current->mm)) {			\
   26.15 -		xen_flush_page_update_queue();				\
   26.16  		HYPERVISOR_update_va_mapping((__address),		\
   26.17  					     __entry, 0);		\
   26.18  	} else {							\
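Both flushes dropped here guarded a direct HYPERVISOR_update_va_mapping() call: that hypercall bypasses the update queue, so any queued PTE writes had to be pushed first to preserve ordering. With page-table writes now applied directly via the writable-pagetables trap, nothing can be pending on these paths and the pre-flush is redundant. The ordering hazard, modelled in a few lines:

#include <stdio.h>

static int queued;

static void queue_pte_write(void)   { queued++; }

static void flush(void)
{
    if (queued) {
        printf("flush %d queued update(s)\n", queued);
        queued = 0;
    }
}

static void update_va_mapping(void) { printf("direct hypercall\n"); }

int main(void)
{
    /* old scheme: a queued write might still be pending, so it had
     * to be flushed before the queue-bypassing hypercall */
    queue_pte_write();
    flush();
    update_va_mapping();

    /* new scheme: page-table writes go straight to the (trapped)
     * tables, the queue stays empty here, and no flush is needed */
    update_va_mapping();
    return 0;
}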
    27.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h	Fri Mar 25 19:30:52 2005 +0000
    27.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h	Fri Mar 25 20:03:52 2005 +0000
    27.3 @@ -117,8 +117,6 @@ void _flush_page_update_queue(void);
    27.4      if (per_cpu(mmu_update_queue_idx, smp_processor_id()))	\
    27.5  	_flush_page_update_queue();				\
    27.6  } while (0)
    27.7 -#define xen_flush_page_update_queue() (_flush_page_update_queue())
    27.8 -#define XEN_flush_page_update_queue() (_flush_page_update_queue())
    27.9  void MULTICALL_flush_page_update_queue(void);
   27.10  
   27.11  #ifdef CONFIG_XEN_PHYSDEV_ACCESS