direct-io.hg

changeset 4344:38c9523f3621

bitkeeper revision 1.1236.1.133 (4244ed84WExuoqfU6If778CyipoE6g)

Merge bk://xen.bkbits.net/xeno-unstable.bk
into bkbits.net:/repos/x/xen-ia64/xeno-unstable-ia64.bk
author xen-ia64.adm@bkbits.net
date Fri Mar 25 22:52:21 2005 +0000 (2005-03-25)
parents 11d6e8d32546 38fe904ec5b1
children a18e1426d4c8 0111710ed751
files .rootkeys BitKeeper/etc/logging_ok linux-2.4.29-xen-sparse/arch/xen/kernel/head.S linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c linux-2.4.29-xen-sparse/arch/xen/mm/fault.c linux-2.4.29-xen-sparse/arch/xen/mm/init.c linux-2.4.29-xen-sparse/fs/exec.c linux-2.4.29-xen-sparse/include/asm-xen/page.h linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h linux-2.4.29-xen-sparse/mm/highmem.c linux-2.4.29-xen-sparse/mm/memory.c linux-2.4.29-xen-sparse/mm/mremap.c linux-2.4.29-xen-sparse/mm/swapfile.c linux-2.4.29-xen-sparse/mm/vmalloc.c linux-2.6.11-xen-sparse/arch/xen/Kconfig linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h tools/blktap/Makefile tools/blktap/blktaplib.c tools/blktap/blockstore.c tools/blktap/parallax-threaded.h
line diff
     1.1 --- a/.rootkeys	Sat Mar 26 05:05:07 2005 +0000
     1.2 +++ b/.rootkeys	Fri Mar 25 22:52:21 2005 +0000
     1.3 @@ -167,7 +167,6 @@ 3f108aeaLcGDgQdFAANLTUEid0a05w linux-2.4
     1.4  3e5a4e66rw65CxyolW9PKz4GG42RcA linux-2.4.29-xen-sparse/drivers/char/tty_io.c
     1.5  40c9c0c1pPwYE3-4i-oI3ubUu7UgvQ linux-2.4.29-xen-sparse/drivers/scsi/aic7xxx/Makefile
     1.6  41f97f64nW0wmgLxhwzPTzkF4E5ERA linux-2.4.29-xen-sparse/drivers/usb/hcd.c
     1.7 -3e5a4e669uzIE54VwucPYtGwXLAbzA linux-2.4.29-xen-sparse/fs/exec.c
     1.8  3e5a4e66wbeCpsJgVf_U8Jde-CNcsA linux-2.4.29-xen-sparse/include/asm-xen/bugs.h
     1.9  3e5a4e66HdSkvIV6SJ1evG_xmTmXHA linux-2.4.29-xen-sparse/include/asm-xen/desc.h
    1.10  3e5a4e66SYp_UpAVcF8Lc1wa3Qtgzw linux-2.4.29-xen-sparse/include/asm-xen/fixmap.h
    1.11 @@ -205,8 +204,6 @@ 3e5a4e68GxCIaFH4sy01v1wjapetaA linux-2.4
    1.12  3f108af5VxPkLv13tXpXgoRKALQtXQ linux-2.4.29-xen-sparse/mm/mprotect.c
    1.13  3e5a4e681xMPdF9xCMwpyfuYMySU5g linux-2.4.29-xen-sparse/mm/mremap.c
    1.14  409ba2e7akOFqQUg6Qyg2s28xcXiMg linux-2.4.29-xen-sparse/mm/page_alloc.c
    1.15 -3e5a4e683HKVU-sxtagrDasRB8eBVw linux-2.4.29-xen-sparse/mm/swapfile.c
    1.16 -41180721bNns9Na7w1nJ0ZVt8bhUNA linux-2.4.29-xen-sparse/mm/vmalloc.c
    1.17  41505c57WAd5l1rlfCLNSCpx9J13vA linux-2.4.29-xen-sparse/net/core/skbuff.c
    1.18  40f562372u3A7_kfbYYixPHJJxYUxA linux-2.6.11-xen-sparse/arch/xen/Kconfig
    1.19  40f56237utH41NPukqHksuNf29IC9A linux-2.6.11-xen-sparse/arch/xen/Kconfig.drivers
     2.1 --- a/BitKeeper/etc/logging_ok	Sat Mar 26 05:05:07 2005 +0000
     2.2 +++ b/BitKeeper/etc/logging_ok	Fri Mar 25 22:52:21 2005 +0000
     2.3 @@ -74,6 +74,7 @@ rneugeba@wyvis.research.intel-research.n
     2.4  sd386@font.cl.cam.ac.uk
     2.5  shand@spidean.research.intel-research.net
     2.6  smh22@boulderdash.cl.cam.ac.uk
     2.7 +smh22@firebug.cl.cam.ac.uk
     2.8  smh22@labyrinth.cl.cam.ac.uk
     2.9  smh22@tempest.cl.cam.ac.uk
    2.10  smh22@uridium.cl.cam.ac.uk
     3.1 --- a/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S	Sat Mar 26 05:05:07 2005 +0000
     3.2 +++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S	Fri Mar 25 22:52:21 2005 +0000
     3.3 @@ -1,6 +1,9 @@
     3.4  
     3.5  .section __xen_guest
     3.6 -    .asciz "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000"
     3.7 +    .ascii "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000"
     3.8 +    .ascii ",LOADER=generic"
     3.9 +    .ascii ",PT_MODE_WRITABLE"
    3.10 +    .byte  0
    3.11  
    3.12  .text
    3.13  #include <linux/config.h>
     4.1 --- a/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c	Sat Mar 26 05:05:07 2005 +0000
     4.2 +++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c	Fri Mar 25 22:52:21 2005 +0000
     4.3 @@ -84,6 +84,7 @@ static inline int copy_ldt(mm_context_t 
     4.4  	}
     4.5  	memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
     4.6  	make_pages_readonly(new->ldt, (new->size*LDT_ENTRY_SIZE)/PAGE_SIZE);
     4.7 +	flush_page_update_queue();
     4.8  	return 0;
     4.9  }
    4.10  
     5.1 --- a/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c	Sat Mar 26 05:05:07 2005 +0000
     5.2 +++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c	Fri Mar 25 22:52:21 2005 +0000
     5.3 @@ -623,6 +623,7 @@ void __init trap_init(void)
     5.4      set_call_gate(&default_ldt[0],lcall7);
     5.5      set_call_gate(&default_ldt[4],lcall27);
     5.6      __make_page_readonly(&default_ldt[0]);
     5.7 +    flush_page_update_queue();
     5.8  
     5.9      cpu_init();
    5.10  }
     6.1 --- a/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c	Sat Mar 26 05:05:07 2005 +0000
     6.2 +++ b/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c	Fri Mar 25 22:52:21 2005 +0000
     6.3 @@ -296,7 +296,6 @@ vmalloc_fault:
     6.4  		if (!pmd_present(*pmd_k))
     6.5  			goto no_context;
     6.6  		set_pmd(pmd, *pmd_k);
     6.7 -                XEN_flush_page_update_queue(); /* flush PMD update */
     6.8  
     6.9  		pte_k = pte_offset(pmd_k, address);
    6.10  		if (!pte_present(*pte_k))
     7.1 --- a/linux-2.4.29-xen-sparse/arch/xen/mm/init.c	Sat Mar 26 05:05:07 2005 +0000
     7.2 +++ b/linux-2.4.29-xen-sparse/arch/xen/mm/init.c	Fri Mar 25 22:52:21 2005 +0000
     7.3 @@ -142,7 +142,7 @@ static inline void set_pte_phys (unsigne
     7.4      }
     7.5      pte = pte_offset(pmd, vaddr);
     7.6  
     7.7 -    queue_l1_entry_update(pte, phys | pgprot_val(prot));
     7.8 +    set_pte(pte, (pte_t) { phys | pgprot_val(prot) });
     7.9  
    7.10      /*
    7.11       * It's enough to flush this one mapping.
    7.12 @@ -201,17 +201,13 @@ static void __init fixrange_init (unsign
    7.13                  kpgd = pgd_offset_k((unsigned long)pte);
    7.14                  kpmd = pmd_offset(kpgd, (unsigned long)pte);
    7.15                  kpte = pte_offset(kpmd, (unsigned long)pte);
    7.16 -                queue_l1_entry_update(kpte,
    7.17 -                                      (*(unsigned long *)kpte)&~_PAGE_RW);
    7.18 -
    7.19 +                set_pte(kpte, pte_wrprotect(*kpte));
    7.20                  set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte)));
    7.21              }
    7.22              vaddr += PMD_SIZE;
    7.23          }
    7.24          j = 0;
    7.25      }
    7.26 -	
    7.27 -    XEN_flush_page_update_queue();
    7.28  }
    7.29  
    7.30  
    7.31 @@ -257,10 +253,8 @@ static void __init pagetable_init (void)
    7.32              kpgd = pgd_offset_k((unsigned long)pte_base);
    7.33              kpmd = pmd_offset(kpgd, (unsigned long)pte_base);
    7.34              kpte = pte_offset(kpmd, (unsigned long)pte_base);
    7.35 -            queue_l1_entry_update(kpte,
    7.36 -                                  (*(unsigned long *)kpte)&~_PAGE_RW);
    7.37 +            set_pte(kpte, pte_wrprotect(*kpte));
    7.38              set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base)));
    7.39 -            XEN_flush_page_update_queue();
    7.40          }
    7.41      }
    7.42  
    7.43 @@ -311,6 +305,7 @@ void __init paging_init(void)
    7.44      pagetable_init();
    7.45  
    7.46      zone_sizes_init();
    7.47 +
    7.48      /* Switch to the real shared_info page, and clear the dummy page. */
    7.49      set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info);
    7.50      HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
     8.1 --- a/linux-2.4.29-xen-sparse/fs/exec.c	Sat Mar 26 05:05:07 2005 +0000
     8.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.3 @@ -1,1179 +0,0 @@
     8.4 -/*
     8.5 - *  linux/fs/exec.c
     8.6 - *
     8.7 - *  Copyright (C) 1991, 1992  Linus Torvalds
     8.8 - */
     8.9 -
    8.10 -/*
    8.11 - * #!-checking implemented by tytso.
    8.12 - */
    8.13 -/*
    8.14 - * Demand-loading implemented 01.12.91 - no need to read anything but
    8.15 - * the header into memory. The inode of the executable is put into
    8.16 - * "current->executable", and page faults do the actual loading. Clean.
    8.17 - *
    8.18 - * Once more I can proudly say that linux stood up to being changed: it
    8.19 - * was less than 2 hours work to get demand-loading completely implemented.
    8.20 - *
    8.21 - * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
    8.22 - * current->executable is only used by the procfs.  This allows a dispatch
    8.23 - * table to check for several different types  of binary formats.  We keep
    8.24 - * trying until we recognize the file or we run out of supported binary
    8.25 - * formats. 
    8.26 - */
    8.27 -
    8.28 -#include <linux/config.h>
    8.29 -#include <linux/slab.h>
    8.30 -#include <linux/file.h>
    8.31 -#include <linux/mman.h>
    8.32 -#include <linux/a.out.h>
    8.33 -#include <linux/stat.h>
    8.34 -#include <linux/fcntl.h>
    8.35 -#include <linux/smp_lock.h>
    8.36 -#include <linux/init.h>
    8.37 -#include <linux/pagemap.h>
    8.38 -#include <linux/highmem.h>
    8.39 -#include <linux/spinlock.h>
    8.40 -#include <linux/personality.h>
    8.41 -#include <linux/swap.h>
    8.42 -#include <linux/utsname.h>
    8.43 -#define __NO_VERSION__
    8.44 -#include <linux/module.h>
    8.45 -
    8.46 -#include <asm/uaccess.h>
    8.47 -#include <asm/pgalloc.h>
    8.48 -#include <asm/mmu_context.h>
    8.49 -
    8.50 -#ifdef CONFIG_KMOD
    8.51 -#include <linux/kmod.h>
    8.52 -#endif
    8.53 -
    8.54 -int core_uses_pid;
    8.55 -char core_pattern[65] = "core";
    8.56 -int core_setuid_ok = 0;
    8.57 -/* The maximal length of core_pattern is also specified in sysctl.c */ 
    8.58 -
    8.59 -static struct linux_binfmt *formats;
    8.60 -static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED;
    8.61 -
    8.62 -int register_binfmt(struct linux_binfmt * fmt)
    8.63 -{
    8.64 -	struct linux_binfmt ** tmp = &formats;
    8.65 -
    8.66 -	if (!fmt)
    8.67 -		return -EINVAL;
    8.68 -	if (fmt->next)
    8.69 -		return -EBUSY;
    8.70 -	write_lock(&binfmt_lock);
    8.71 -	while (*tmp) {
    8.72 -		if (fmt == *tmp) {
    8.73 -			write_unlock(&binfmt_lock);
    8.74 -			return -EBUSY;
    8.75 -		}
    8.76 -		tmp = &(*tmp)->next;
    8.77 -	}
    8.78 -	fmt->next = formats;
    8.79 -	formats = fmt;
    8.80 -	write_unlock(&binfmt_lock);
    8.81 -	return 0;	
    8.82 -}
    8.83 -
    8.84 -int unregister_binfmt(struct linux_binfmt * fmt)
    8.85 -{
    8.86 -	struct linux_binfmt ** tmp = &formats;
    8.87 -
    8.88 -	write_lock(&binfmt_lock);
    8.89 -	while (*tmp) {
    8.90 -		if (fmt == *tmp) {
    8.91 -			*tmp = fmt->next;
    8.92 -			write_unlock(&binfmt_lock);
    8.93 -			return 0;
    8.94 -		}
    8.95 -		tmp = &(*tmp)->next;
    8.96 -	}
    8.97 -	write_unlock(&binfmt_lock);
    8.98 -	return -EINVAL;
    8.99 -}
   8.100 -
   8.101 -static inline void put_binfmt(struct linux_binfmt * fmt)
   8.102 -{
   8.103 -	if (fmt->module)
   8.104 -		__MOD_DEC_USE_COUNT(fmt->module);
   8.105 -}
   8.106 -
   8.107 -/*
   8.108 - * Note that a shared library must be both readable and executable due to
   8.109 - * security reasons.
   8.110 - *
   8.111 - * Also note that we take the address to load from from the file itself.
   8.112 - */
   8.113 -asmlinkage long sys_uselib(const char * library)
   8.114 -{
   8.115 -	struct file * file;
   8.116 -	struct nameidata nd;
   8.117 -	int error;
   8.118 -
   8.119 -	error = user_path_walk(library, &nd);
   8.120 -	if (error)
   8.121 -		goto out;
   8.122 -
   8.123 -	error = -EINVAL;
   8.124 -	if (!S_ISREG(nd.dentry->d_inode->i_mode))
   8.125 -		goto exit;
   8.126 -
   8.127 -	error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC);
   8.128 -	if (error)
   8.129 -		goto exit;
   8.130 -
   8.131 -	file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
   8.132 -	error = PTR_ERR(file);
   8.133 -	if (IS_ERR(file))
   8.134 -		goto out;
   8.135 -
   8.136 -	error = -ENOEXEC;
   8.137 -	if(file->f_op && file->f_op->read) {
   8.138 -		struct linux_binfmt * fmt;
   8.139 -
   8.140 -		read_lock(&binfmt_lock);
   8.141 -		for (fmt = formats ; fmt ; fmt = fmt->next) {
   8.142 -			if (!fmt->load_shlib)
   8.143 -				continue;
   8.144 -			if (!try_inc_mod_count(fmt->module))
   8.145 -				continue;
   8.146 -			read_unlock(&binfmt_lock);
   8.147 -			error = fmt->load_shlib(file);
   8.148 -			read_lock(&binfmt_lock);
   8.149 -			put_binfmt(fmt);
   8.150 -			if (error != -ENOEXEC)
   8.151 -				break;
   8.152 -		}
   8.153 -		read_unlock(&binfmt_lock);
   8.154 -	}
   8.155 -	fput(file);
   8.156 -out:
   8.157 -  	return error;
   8.158 -exit:
   8.159 -	path_release(&nd);
   8.160 -	goto out;
   8.161 -}
   8.162 -
   8.163 -/*
   8.164 - * count() counts the number of arguments/envelopes
   8.165 - */
   8.166 -static int count(char ** argv, int max)
   8.167 -{
   8.168 -	int i = 0;
   8.169 -
   8.170 -	if (argv != NULL) {
   8.171 -		for (;;) {
   8.172 -			char * p;
   8.173 -
   8.174 -			if (get_user(p, argv))
   8.175 -				return -EFAULT;
   8.176 -			if (!p)
   8.177 -				break;
   8.178 -			argv++;
   8.179 -			if(++i > max)
   8.180 -				return -E2BIG;
   8.181 -		}
   8.182 -	}
   8.183 -	return i;
   8.184 -}
   8.185 -
   8.186 -/*
   8.187 - * 'copy_strings()' copies argument/envelope strings from user
   8.188 - * memory to free pages in kernel mem. These are in a format ready
   8.189 - * to be put directly into the top of new user memory.
   8.190 - */
   8.191 -int copy_strings(int argc,char ** argv, struct linux_binprm *bprm) 
   8.192 -{
   8.193 -	struct page *kmapped_page = NULL;
   8.194 -	char *kaddr = NULL;
   8.195 -	int ret;
   8.196 -
   8.197 -	while (argc-- > 0) {
   8.198 -		char *str;
   8.199 -		int len;
   8.200 -		unsigned long pos;
   8.201 -
   8.202 -		if (get_user(str, argv+argc) ||
   8.203 -				!(len = strnlen_user(str, bprm->p))) {
   8.204 -			ret = -EFAULT;
   8.205 -			goto out;
   8.206 -		}
   8.207 -
   8.208 -		if (bprm->p < len)  {
   8.209 -			ret = -E2BIG;
   8.210 -			goto out;
   8.211 -		}
   8.212 -
   8.213 -		bprm->p -= len;
   8.214 -		/* XXX: add architecture specific overflow check here. */ 
   8.215 -		pos = bprm->p;
   8.216 -
   8.217 -		while (len > 0) {
   8.218 -			int i, new, err;
   8.219 -			int offset, bytes_to_copy;
   8.220 -			struct page *page;
   8.221 -
   8.222 -			offset = pos % PAGE_SIZE;
   8.223 -			i = pos/PAGE_SIZE;
   8.224 -			page = bprm->page[i];
   8.225 -			new = 0;
   8.226 -			if (!page) {
   8.227 -				page = alloc_page(GFP_HIGHUSER);
   8.228 -				bprm->page[i] = page;
   8.229 -				if (!page) {
   8.230 -					ret = -ENOMEM;
   8.231 -					goto out;
   8.232 -				}
   8.233 -				new = 1;
   8.234 -			}
   8.235 -
   8.236 -			if (page != kmapped_page) {
   8.237 -				if (kmapped_page)
   8.238 -					kunmap(kmapped_page);
   8.239 -				kmapped_page = page;
   8.240 -				kaddr = kmap(kmapped_page);
   8.241 -			}
   8.242 -			if (new && offset)
   8.243 -				memset(kaddr, 0, offset);
   8.244 -			bytes_to_copy = PAGE_SIZE - offset;
   8.245 -			if (bytes_to_copy > len) {
   8.246 -				bytes_to_copy = len;
   8.247 -				if (new)
   8.248 -					memset(kaddr+offset+len, 0,
   8.249 -						PAGE_SIZE-offset-len);
   8.250 -			}
   8.251 -			err = copy_from_user(kaddr+offset, str, bytes_to_copy);
   8.252 -			if (err) {
   8.253 -				ret = -EFAULT;
   8.254 -				goto out;
   8.255 -			}
   8.256 -
   8.257 -			pos += bytes_to_copy;
   8.258 -			str += bytes_to_copy;
   8.259 -			len -= bytes_to_copy;
   8.260 -		}
   8.261 -	}
   8.262 -	ret = 0;
   8.263 -out:
   8.264 -	if (kmapped_page)
   8.265 -		kunmap(kmapped_page);
   8.266 -	return ret;
   8.267 -}
   8.268 -
   8.269 -/*
   8.270 - * Like copy_strings, but get argv and its values from kernel memory.
   8.271 - */
   8.272 -int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
   8.273 -{
   8.274 -	int r;
   8.275 -	mm_segment_t oldfs = get_fs();
   8.276 -	set_fs(KERNEL_DS); 
   8.277 -	r = copy_strings(argc, argv, bprm);
   8.278 -	set_fs(oldfs);
   8.279 -	return r; 
   8.280 -}
   8.281 -
   8.282 -/*
   8.283 - * This routine is used to map in a page into an address space: needed by
   8.284 - * execve() for the initial stack and environment pages.
   8.285 - *
   8.286 - * tsk->mmap_sem is held for writing.
   8.287 - */
   8.288 -void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address)
   8.289 -{
   8.290 -	pgd_t * pgd;
   8.291 -	pmd_t * pmd;
   8.292 -	pte_t * pte;
   8.293 -	struct vm_area_struct *vma; 
   8.294 -	pgprot_t prot = PAGE_COPY; 
   8.295 -
   8.296 -	if (page_count(page) != 1)
   8.297 -		printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address);
   8.298 -	pgd = pgd_offset(tsk->mm, address);
   8.299 -
   8.300 -	spin_lock(&tsk->mm->page_table_lock);
   8.301 -	pmd = pmd_alloc(tsk->mm, pgd, address);
   8.302 -	if (!pmd)
   8.303 -		goto out;
   8.304 -	pte = pte_alloc(tsk->mm, pmd, address);
   8.305 -	if (!pte)
   8.306 -		goto out;
   8.307 -	if (!pte_none(*pte))
   8.308 -		goto out;
   8.309 -	lru_cache_add(page);
   8.310 -	flush_dcache_page(page);
   8.311 -	flush_page_to_ram(page);
   8.312 -	/* lookup is cheap because there is only a single entry in the list */
   8.313 -	vma = find_vma(tsk->mm, address);
   8.314 -	if (vma)
   8.315 -		prot = vma->vm_page_prot;
   8.316 -	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))));
   8.317 -	XEN_flush_page_update_queue();
   8.318 -	tsk->mm->rss++;
   8.319 -	spin_unlock(&tsk->mm->page_table_lock);
   8.320 -
   8.321 -	/* no need for flush_tlb */
   8.322 -	return;
   8.323 -out:
   8.324 -	spin_unlock(&tsk->mm->page_table_lock);
   8.325 -	__free_page(page);
   8.326 -	force_sig(SIGKILL, tsk);
   8.327 -	return;
   8.328 -}
   8.329 -
   8.330 -int setup_arg_pages(struct linux_binprm *bprm)
   8.331 -{
   8.332 -	unsigned long stack_base;
   8.333 -	struct vm_area_struct *mpnt;
   8.334 -	int i, ret;
   8.335 -
   8.336 -	stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE;
   8.337 -
   8.338 -	bprm->p += stack_base;
   8.339 -	if (bprm->loader)
   8.340 -		bprm->loader += stack_base;
   8.341 -	bprm->exec += stack_base;
   8.342 -
   8.343 -	mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
   8.344 -	if (!mpnt) 
   8.345 -		return -ENOMEM; 
   8.346 -	
   8.347 -	down_write(&current->mm->mmap_sem);
   8.348 -	{
   8.349 -		mpnt->vm_mm = current->mm;
   8.350 -		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
   8.351 -		mpnt->vm_end = STACK_TOP;
   8.352 -		mpnt->vm_flags = VM_STACK_FLAGS;
   8.353 -		mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7];
   8.354 -		mpnt->vm_ops = NULL;
   8.355 -		mpnt->vm_pgoff = 0;
   8.356 -		mpnt->vm_file = NULL;
   8.357 -		mpnt->vm_private_data = (void *) 0;
   8.358 -		if ((ret = insert_vm_struct(current->mm, mpnt))) {
   8.359 -			up_write(&current->mm->mmap_sem);
   8.360 -			kmem_cache_free(vm_area_cachep, mpnt);
   8.361 -			return ret;
   8.362 -		}
   8.363 -		current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
   8.364 -	} 
   8.365 -
   8.366 -	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
   8.367 -		struct page *page = bprm->page[i];
   8.368 -		if (page) {
   8.369 -			bprm->page[i] = NULL;
   8.370 -			put_dirty_page(current,page,stack_base);
   8.371 -		}
   8.372 -		stack_base += PAGE_SIZE;
   8.373 -	}
   8.374 -	up_write(&current->mm->mmap_sem);
   8.375 -	
   8.376 -	return 0;
   8.377 -}
   8.378 -
   8.379 -struct file *open_exec(const char *name)
   8.380 -{
   8.381 -	struct nameidata nd;
   8.382 -	struct inode *inode;
   8.383 -	struct file *file;
   8.384 -	int err = 0;
   8.385 -
   8.386 -	err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd);
   8.387 -	file = ERR_PTR(err);
   8.388 -	if (!err) {
   8.389 -		inode = nd.dentry->d_inode;
   8.390 -		file = ERR_PTR(-EACCES);
   8.391 -		if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
   8.392 -		    S_ISREG(inode->i_mode)) {
   8.393 -			int err = permission(inode, MAY_EXEC);
   8.394 -			if (!err && !(inode->i_mode & 0111))
   8.395 -				err = -EACCES;
   8.396 -			file = ERR_PTR(err);
   8.397 -			if (!err) {
   8.398 -				file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
   8.399 -				if (!IS_ERR(file)) {
   8.400 -					err = deny_write_access(file);
   8.401 -					if (err) {
   8.402 -						fput(file);
   8.403 -						file = ERR_PTR(err);
   8.404 -					}
   8.405 -				}
   8.406 -out:
   8.407 -				return file;
   8.408 -			}
   8.409 -		}
   8.410 -		path_release(&nd);
   8.411 -	}
   8.412 -	goto out;
   8.413 -}
   8.414 -
   8.415 -int kernel_read(struct file *file, unsigned long offset,
   8.416 -	char * addr, unsigned long count)
   8.417 -{
   8.418 -	mm_segment_t old_fs;
   8.419 -	loff_t pos = offset;
   8.420 -	int result = -ENOSYS;
   8.421 -
   8.422 -	if (!file->f_op->read)
   8.423 -		goto fail;
   8.424 -	old_fs = get_fs();
   8.425 -	set_fs(get_ds());
   8.426 -	result = file->f_op->read(file, addr, count, &pos);
   8.427 -	set_fs(old_fs);
   8.428 -fail:
   8.429 -	return result;
   8.430 -}
   8.431 -
   8.432 -static int exec_mmap(void)
   8.433 -{
   8.434 -	struct mm_struct * mm, * old_mm;
   8.435 -
   8.436 -	old_mm = current->mm;
   8.437 -
   8.438 -	if (old_mm && atomic_read(&old_mm->mm_users) == 1) {
   8.439 -		mm_release();
   8.440 -		down_write(&old_mm->mmap_sem);
   8.441 -		exit_mmap(old_mm);
   8.442 -		up_write(&old_mm->mmap_sem);
   8.443 -		return 0;
   8.444 -	}
   8.445 -
   8.446 -
   8.447 -	mm = mm_alloc();
   8.448 -	if (mm) {
   8.449 -		struct mm_struct *active_mm;
   8.450 -
   8.451 -		if (init_new_context(current, mm)) {
   8.452 -			mmdrop(mm);
   8.453 -			return -ENOMEM;
   8.454 -		}
   8.455 -
   8.456 -		/* Add it to the list of mm's */
   8.457 -		spin_lock(&mmlist_lock);
   8.458 -		list_add(&mm->mmlist, &init_mm.mmlist);
   8.459 -		mmlist_nr++;
   8.460 -		spin_unlock(&mmlist_lock);
   8.461 -
   8.462 -		task_lock(current);
   8.463 -		active_mm = current->active_mm;
   8.464 -		current->mm = mm;
   8.465 -		current->active_mm = mm;
   8.466 -		task_unlock(current);
   8.467 -		activate_mm(active_mm, mm);
   8.468 -		mm_release();
   8.469 -		if (old_mm) {
   8.470 -			if (active_mm != old_mm) BUG();
   8.471 -			mmput(old_mm);
   8.472 -			return 0;
   8.473 -		}
   8.474 -		mmdrop(active_mm);
   8.475 -		return 0;
   8.476 -	}
   8.477 -	return -ENOMEM;
   8.478 -}
   8.479 -
   8.480 -/*
   8.481 - * This function makes sure the current process has its own signal table,
   8.482 - * so that flush_signal_handlers can later reset the handlers without
   8.483 - * disturbing other processes.  (Other processes might share the signal
   8.484 - * table via the CLONE_SIGNAL option to clone().)
   8.485 - */
   8.486 - 
   8.487 -static inline int make_private_signals(void)
   8.488 -{
   8.489 -	struct signal_struct * newsig;
   8.490 -
   8.491 -	if (atomic_read(&current->sig->count) <= 1)
   8.492 -		return 0;
   8.493 -	newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
   8.494 -	if (newsig == NULL)
   8.495 -		return -ENOMEM;
   8.496 -	spin_lock_init(&newsig->siglock);
   8.497 -	atomic_set(&newsig->count, 1);
   8.498 -	memcpy(newsig->action, current->sig->action, sizeof(newsig->action));
   8.499 -	spin_lock_irq(&current->sigmask_lock);
   8.500 -	current->sig = newsig;
   8.501 -	spin_unlock_irq(&current->sigmask_lock);
   8.502 -	return 0;
   8.503 -}
   8.504 -	
   8.505 -/*
   8.506 - * If make_private_signals() made a copy of the signal table, decrement the
   8.507 - * refcount of the original table, and free it if necessary.
   8.508 - * We don't do that in make_private_signals() so that we can back off
   8.509 - * in flush_old_exec() if an error occurs after calling make_private_signals().
   8.510 - */
   8.511 -
   8.512 -static inline void release_old_signals(struct signal_struct * oldsig)
   8.513 -{
   8.514 -	if (current->sig == oldsig)
   8.515 -		return;
   8.516 -	if (atomic_dec_and_test(&oldsig->count))
   8.517 -		kmem_cache_free(sigact_cachep, oldsig);
   8.518 -}
   8.519 -
   8.520 -/*
   8.521 - * These functions flushes out all traces of the currently running executable
   8.522 - * so that a new one can be started
   8.523 - */
   8.524 -
   8.525 -static inline void flush_old_files(struct files_struct * files)
   8.526 -{
   8.527 -	long j = -1;
   8.528 -
   8.529 -	write_lock(&files->file_lock);
   8.530 -	for (;;) {
   8.531 -		unsigned long set, i;
   8.532 -
   8.533 -		j++;
   8.534 -		i = j * __NFDBITS;
   8.535 -		if (i >= files->max_fds || i >= files->max_fdset)
   8.536 -			break;
   8.537 -		set = files->close_on_exec->fds_bits[j];
   8.538 -		if (!set)
   8.539 -			continue;
   8.540 -		files->close_on_exec->fds_bits[j] = 0;
   8.541 -		write_unlock(&files->file_lock);
   8.542 -		for ( ; set ; i++,set >>= 1) {
   8.543 -			if (set & 1) {
   8.544 -				sys_close(i);
   8.545 -			}
   8.546 -		}
   8.547 -		write_lock(&files->file_lock);
   8.548 -
   8.549 -	}
   8.550 -	write_unlock(&files->file_lock);
   8.551 -}
   8.552 -
   8.553 -/*
   8.554 - * An execve() will automatically "de-thread" the process.
   8.555 - * Note: we don't have to hold the tasklist_lock to test
   8.556 - * whether we migth need to do this. If we're not part of
   8.557 - * a thread group, there is no way we can become one
   8.558 - * dynamically. And if we are, we only need to protect the
   8.559 - * unlink - even if we race with the last other thread exit,
   8.560 - * at worst the list_del_init() might end up being a no-op.
   8.561 - */
   8.562 -static inline void de_thread(struct task_struct *tsk)
   8.563 -{
   8.564 -	if (!list_empty(&tsk->thread_group)) {
   8.565 -		write_lock_irq(&tasklist_lock);
   8.566 -		list_del_init(&tsk->thread_group);
   8.567 -		write_unlock_irq(&tasklist_lock);
   8.568 -	}
   8.569 -
   8.570 -	/* Minor oddity: this might stay the same. */
   8.571 -	tsk->tgid = tsk->pid;
   8.572 -}
   8.573 -
   8.574 -void get_task_comm(char *buf, struct task_struct *tsk)
   8.575 -{
   8.576 -	/* buf must be at least sizeof(tsk->comm) in size */
   8.577 -	task_lock(tsk);
   8.578 -	memcpy(buf, tsk->comm, sizeof(tsk->comm));
   8.579 -	task_unlock(tsk);
   8.580 -}
   8.581 -
   8.582 -void set_task_comm(struct task_struct *tsk, char *buf)
   8.583 -{
   8.584 -	task_lock(tsk);
   8.585 -	strncpy(tsk->comm, buf, sizeof(tsk->comm));
   8.586 -	tsk->comm[sizeof(tsk->comm)-1]='\0';
   8.587 -	task_unlock(tsk);
   8.588 -}
   8.589 -
   8.590 -int flush_old_exec(struct linux_binprm * bprm)
   8.591 -{
   8.592 -	char * name;
   8.593 -	int i, ch, retval;
   8.594 -	struct signal_struct * oldsig;
   8.595 -	struct files_struct * files;
   8.596 -	char tcomm[sizeof(current->comm)];
   8.597 -
   8.598 -	/*
   8.599 -	 * Make sure we have a private signal table
   8.600 -	 */
   8.601 -	oldsig = current->sig;
   8.602 -	retval = make_private_signals();
   8.603 -	if (retval) goto flush_failed;
   8.604 -
   8.605 -	/*
   8.606 -	 * Make sure we have private file handles. Ask the
   8.607 -	 * fork helper to do the work for us and the exit
   8.608 -	 * helper to do the cleanup of the old one.
   8.609 -	 */
   8.610 -	 
   8.611 -	files = current->files;		/* refcounted so safe to hold */
   8.612 -	retval = unshare_files();
   8.613 -	if(retval)
   8.614 -		goto flush_failed;
   8.615 -	
   8.616 -	/* 
   8.617 -	 * Release all of the old mmap stuff
   8.618 -	 */
   8.619 -	retval = exec_mmap();
   8.620 -	if (retval) goto mmap_failed;
   8.621 -
   8.622 -	/* This is the point of no return */
   8.623 -	steal_locks(files);
   8.624 -	put_files_struct(files);
   8.625 -	release_old_signals(oldsig);
   8.626 -
   8.627 -	current->sas_ss_sp = current->sas_ss_size = 0;
   8.628 -
   8.629 -	if (current->euid == current->uid && current->egid == current->gid) {
   8.630 -		current->mm->dumpable = 1;
   8.631 -		current->task_dumpable = 1;
   8.632 -	}
   8.633 -	name = bprm->filename;
   8.634 -	for (i=0; (ch = *(name++)) != '\0';) {
   8.635 -		if (ch == '/')
   8.636 -			i = 0;
   8.637 -		else
   8.638 -			if (i < (sizeof(tcomm) - 1))
   8.639 -				tcomm[i++] = ch;
   8.640 -	}
   8.641 -	tcomm[i] = '\0';
   8.642 -	set_task_comm(current, tcomm);
   8.643 -
   8.644 -	flush_thread();
   8.645 -
   8.646 -	de_thread(current);
   8.647 -
   8.648 -	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || 
   8.649 -	    permission(bprm->file->f_dentry->d_inode,MAY_READ))
   8.650 -		current->mm->dumpable = 0;
   8.651 -
   8.652 -	/* An exec changes our domain. We are no longer part of the thread
   8.653 -	   group */
   8.654 -	   
   8.655 -	current->self_exec_id++;
   8.656 -			
   8.657 -	flush_signal_handlers(current);
   8.658 -	flush_old_files(current->files);
   8.659 -
   8.660 -	return 0;
   8.661 -
   8.662 -mmap_failed:
   8.663 -	put_files_struct(current->files);
   8.664 -	current->files = files;
   8.665 -flush_failed:
   8.666 -	spin_lock_irq(&current->sigmask_lock);
   8.667 -	if (current->sig != oldsig) {
   8.668 -		kmem_cache_free(sigact_cachep, current->sig);
   8.669 -		current->sig = oldsig;
   8.670 -	}
   8.671 -	spin_unlock_irq(&current->sigmask_lock);
   8.672 -	return retval;
   8.673 -}
   8.674 -
   8.675 -/*
   8.676 - * We mustn't allow tracing of suid binaries, unless
   8.677 - * the tracer has the capability to trace anything..
   8.678 - */
   8.679 -static inline int must_not_trace_exec(struct task_struct * p)
   8.680 -{
   8.681 -	return (p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP);
   8.682 -}
   8.683 -
   8.684 -/* 
   8.685 - * Fill the binprm structure from the inode. 
   8.686 - * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
   8.687 - */
   8.688 -int prepare_binprm(struct linux_binprm *bprm)
   8.689 -{
   8.690 -	int mode;
   8.691 -	struct inode * inode = bprm->file->f_dentry->d_inode;
   8.692 -
   8.693 -	mode = inode->i_mode;
   8.694 -	/*
   8.695 -	 * Check execute perms again - if the caller has CAP_DAC_OVERRIDE,
   8.696 -	 * vfs_permission lets a non-executable through
   8.697 -	 */
   8.698 -	if (!(mode & 0111))	/* with at least _one_ execute bit set */
   8.699 -		return -EACCES;
   8.700 -	if (bprm->file->f_op == NULL)
   8.701 -		return -EACCES;
   8.702 -
   8.703 -	bprm->e_uid = current->euid;
   8.704 -	bprm->e_gid = current->egid;
   8.705 -
   8.706 -	if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) {
   8.707 -		/* Set-uid? */
   8.708 -		if (mode & S_ISUID)
   8.709 -			bprm->e_uid = inode->i_uid;
   8.710 -
   8.711 -		/* Set-gid? */
   8.712 -		/*
   8.713 -		 * If setgid is set but no group execute bit then this
   8.714 -		 * is a candidate for mandatory locking, not a setgid
   8.715 -		 * executable.
   8.716 -		 */
   8.717 -		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
   8.718 -			bprm->e_gid = inode->i_gid;
   8.719 -	}
   8.720 -
   8.721 -	/* We don't have VFS support for capabilities yet */
   8.722 -	cap_clear(bprm->cap_inheritable);
   8.723 -	cap_clear(bprm->cap_permitted);
   8.724 -	cap_clear(bprm->cap_effective);
   8.725 -
   8.726 -	/*  To support inheritance of root-permissions and suid-root
   8.727 -         *  executables under compatibility mode, we raise all three
   8.728 -         *  capability sets for the file.
   8.729 -         *
   8.730 -         *  If only the real uid is 0, we only raise the inheritable
   8.731 -         *  and permitted sets of the executable file.
   8.732 -         */
   8.733 -
   8.734 -	if (!issecure(SECURE_NOROOT)) {
   8.735 -		if (bprm->e_uid == 0 || current->uid == 0) {
   8.736 -			cap_set_full(bprm->cap_inheritable);
   8.737 -			cap_set_full(bprm->cap_permitted);
   8.738 -		}
   8.739 -		if (bprm->e_uid == 0) 
   8.740 -			cap_set_full(bprm->cap_effective);
   8.741 -	}
   8.742 -
   8.743 -	memset(bprm->buf,0,BINPRM_BUF_SIZE);
   8.744 -	return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE);
   8.745 -}
   8.746 -
   8.747 -/*
   8.748 - * This function is used to produce the new IDs and capabilities
   8.749 - * from the old ones and the file's capabilities.
   8.750 - *
   8.751 - * The formula used for evolving capabilities is:
   8.752 - *
   8.753 - *       pI' = pI
   8.754 - * (***) pP' = (fP & X) | (fI & pI)
   8.755 - *       pE' = pP' & fE          [NB. fE is 0 or ~0]
   8.756 - *
   8.757 - * I=Inheritable, P=Permitted, E=Effective // p=process, f=file
   8.758 - * ' indicates post-exec(), and X is the global 'cap_bset'.
   8.759 - *
   8.760 - */
   8.761 -
   8.762 -void compute_creds(struct linux_binprm *bprm) 
   8.763 -{
   8.764 -	kernel_cap_t new_permitted, working;
   8.765 -	int do_unlock = 0;
   8.766 -
   8.767 -	new_permitted = cap_intersect(bprm->cap_permitted, cap_bset);
   8.768 -	working = cap_intersect(bprm->cap_inheritable,
   8.769 -				current->cap_inheritable);
   8.770 -	new_permitted = cap_combine(new_permitted, working);
   8.771 -
   8.772 -	if (bprm->e_uid != current->uid || bprm->e_gid != current->gid ||
   8.773 -	    !cap_issubset(new_permitted, current->cap_permitted)) {
   8.774 -                current->mm->dumpable = 0;
   8.775 -		
   8.776 -		lock_kernel();
   8.777 -		if (must_not_trace_exec(current)
   8.778 -		    || atomic_read(&current->fs->count) > 1
   8.779 -		    || atomic_read(&current->files->count) > 1
   8.780 -		    || atomic_read(&current->sig->count) > 1) {
   8.781 -			if(!capable(CAP_SETUID)) {
   8.782 -				bprm->e_uid = current->uid;
   8.783 -				bprm->e_gid = current->gid;
   8.784 -			}
   8.785 -			if(!capable(CAP_SETPCAP)) {
   8.786 -				new_permitted = cap_intersect(new_permitted,
   8.787 -							current->cap_permitted);
   8.788 -			}
   8.789 -		}
   8.790 -		do_unlock = 1;
   8.791 -	}
   8.792 -
   8.793 -
   8.794 -	/* For init, we want to retain the capabilities set
   8.795 -         * in the init_task struct. Thus we skip the usual
   8.796 -         * capability rules */
   8.797 -	if (current->pid != 1) {
   8.798 -		current->cap_permitted = new_permitted;
   8.799 -		current->cap_effective =
   8.800 -			cap_intersect(new_permitted, bprm->cap_effective);
   8.801 -	}
   8.802 -	
   8.803 -        /* AUD: Audit candidate if current->cap_effective is set */
   8.804 -
   8.805 -        current->suid = current->euid = current->fsuid = bprm->e_uid;
   8.806 -        current->sgid = current->egid = current->fsgid = bprm->e_gid;
   8.807 -
   8.808 -	if(do_unlock)
   8.809 -		unlock_kernel();
   8.810 -	current->keep_capabilities = 0;
   8.811 -}
   8.812 -
   8.813 -
   8.814 -void remove_arg_zero(struct linux_binprm *bprm)
   8.815 -{
   8.816 -	if (bprm->argc) {
   8.817 -		unsigned long offset;
   8.818 -		char * kaddr;
   8.819 -		struct page *page;
   8.820 -
   8.821 -		offset = bprm->p % PAGE_SIZE;
   8.822 -		goto inside;
   8.823 -
   8.824 -		while (bprm->p++, *(kaddr+offset++)) {
   8.825 -			if (offset != PAGE_SIZE)
   8.826 -				continue;
   8.827 -			offset = 0;
   8.828 -			kunmap(page);
   8.829 -inside:
   8.830 -			page = bprm->page[bprm->p/PAGE_SIZE];
   8.831 -			kaddr = kmap(page);
   8.832 -		}
   8.833 -		kunmap(page);
   8.834 -		bprm->argc--;
   8.835 -	}
   8.836 -}
   8.837 -
   8.838 -/*
   8.839 - * cycle the list of binary formats handler, until one recognizes the image
   8.840 - */
   8.841 -int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
   8.842 -{
   8.843 -	int try,retval=0;
   8.844 -	struct linux_binfmt *fmt;
   8.845 -#ifdef __alpha__
   8.846 -	/* handle /sbin/loader.. */
   8.847 -	{
   8.848 -	    struct exec * eh = (struct exec *) bprm->buf;
   8.849 -
   8.850 -	    if (!bprm->loader && eh->fh.f_magic == 0x183 &&
   8.851 -		(eh->fh.f_flags & 0x3000) == 0x3000)
   8.852 -	    {
   8.853 -		struct file * file;
   8.854 -		unsigned long loader;
   8.855 -
   8.856 -		allow_write_access(bprm->file);
   8.857 -		fput(bprm->file);
   8.858 -		bprm->file = NULL;
   8.859 -
   8.860 -	        loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
   8.861 -
   8.862 -		file = open_exec("/sbin/loader");
   8.863 -		retval = PTR_ERR(file);
   8.864 -		if (IS_ERR(file))
   8.865 -			return retval;
   8.866 -
   8.867 -		/* Remember if the application is TASO.  */
   8.868 -		bprm->sh_bang = eh->ah.entry < 0x100000000;
   8.869 -
   8.870 -		bprm->file = file;
   8.871 -		bprm->loader = loader;
   8.872 -		retval = prepare_binprm(bprm);
   8.873 -		if (retval<0)
   8.874 -			return retval;
   8.875 -		/* should call search_binary_handler recursively here,
   8.876 -		   but it does not matter */
   8.877 -	    }
   8.878 -	}
   8.879 -#endif
   8.880 -	/* kernel module loader fixup */
   8.881 -	/* so we don't try to load run modprobe in kernel space. */
   8.882 -	set_fs(USER_DS);
   8.883 -	for (try=0; try<2; try++) {
   8.884 -		read_lock(&binfmt_lock);
   8.885 -		for (fmt = formats ; fmt ; fmt = fmt->next) {
   8.886 -			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
   8.887 -			if (!fn)
   8.888 -				continue;
   8.889 -			if (!try_inc_mod_count(fmt->module))
   8.890 -				continue;
   8.891 -			read_unlock(&binfmt_lock);
   8.892 -			retval = fn(bprm, regs);
   8.893 -			if (retval >= 0) {
   8.894 -				put_binfmt(fmt);
   8.895 -				allow_write_access(bprm->file);
   8.896 -				if (bprm->file)
   8.897 -					fput(bprm->file);
   8.898 -				bprm->file = NULL;
   8.899 -				current->did_exec = 1;
   8.900 -				return retval;
   8.901 -			}
   8.902 -			read_lock(&binfmt_lock);
   8.903 -			put_binfmt(fmt);
   8.904 -			if (retval != -ENOEXEC)
   8.905 -				break;
   8.906 -			if (!bprm->file) {
   8.907 -				read_unlock(&binfmt_lock);
   8.908 -				return retval;
   8.909 -			}
   8.910 -		}
   8.911 -		read_unlock(&binfmt_lock);
   8.912 -		if (retval != -ENOEXEC) {
   8.913 -			break;
   8.914 -#ifdef CONFIG_KMOD
   8.915 -		}else{
   8.916 -#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
   8.917 -			char modname[20];
   8.918 -			if (printable(bprm->buf[0]) &&
   8.919 -			    printable(bprm->buf[1]) &&
   8.920 -			    printable(bprm->buf[2]) &&
   8.921 -			    printable(bprm->buf[3]))
   8.922 -				break; /* -ENOEXEC */
   8.923 -			sprintf(modname, "binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
   8.924 -			request_module(modname);
   8.925 -#endif
   8.926 -		}
   8.927 -	}
   8.928 -	return retval;
   8.929 -}
   8.930 -
   8.931 -
   8.932 -/*
   8.933 - * sys_execve() executes a new program.
   8.934 - */
   8.935 -int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs)
   8.936 -{
   8.937 -	struct linux_binprm bprm;
   8.938 -	struct file *file;
   8.939 -	int retval;
   8.940 -	int i;
   8.941 -
   8.942 -	file = open_exec(filename);
   8.943 -
   8.944 -	retval = PTR_ERR(file);
   8.945 -	if (IS_ERR(file))
   8.946 -		return retval;
   8.947 -
   8.948 -	bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
   8.949 -	memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0])); 
   8.950 -
   8.951 -	bprm.file = file;
   8.952 -	bprm.filename = filename;
   8.953 -	bprm.sh_bang = 0;
   8.954 -	bprm.loader = 0;
   8.955 -	bprm.exec = 0;
   8.956 -	if ((bprm.argc = count(argv, bprm.p / sizeof(void *))) < 0) {
   8.957 -		allow_write_access(file);
   8.958 -		fput(file);
   8.959 -		return bprm.argc;
   8.960 -	}
   8.961 -
   8.962 -	if ((bprm.envc = count(envp, bprm.p / sizeof(void *))) < 0) {
   8.963 -		allow_write_access(file);
   8.964 -		fput(file);
   8.965 -		return bprm.envc;
   8.966 -	}
   8.967 -
   8.968 -	retval = prepare_binprm(&bprm);
   8.969 -	if (retval < 0) 
   8.970 -		goto out; 
   8.971 -
   8.972 -	retval = copy_strings_kernel(1, &bprm.filename, &bprm);
   8.973 -	if (retval < 0) 
   8.974 -		goto out; 
   8.975 -
   8.976 -	bprm.exec = bprm.p;
   8.977 -	retval = copy_strings(bprm.envc, envp, &bprm);
   8.978 -	if (retval < 0) 
   8.979 -		goto out; 
   8.980 -
   8.981 -	retval = copy_strings(bprm.argc, argv, &bprm);
   8.982 -	if (retval < 0) 
   8.983 -		goto out; 
   8.984 -
   8.985 -	retval = search_binary_handler(&bprm,regs);
   8.986 -	if (retval >= 0)
   8.987 -		/* execve success */
   8.988 -		return retval;
   8.989 -
   8.990 -out:
   8.991 -	/* Something went wrong, return the inode and free the argument pages*/
   8.992 -	allow_write_access(bprm.file);
   8.993 -	if (bprm.file)
   8.994 -		fput(bprm.file);
   8.995 -
   8.996 -	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
   8.997 -		struct page * page = bprm.page[i];
   8.998 -		if (page)
   8.999 -			__free_page(page);
  8.1000 -	}
  8.1001 -
  8.1002 -	return retval;
  8.1003 -}
  8.1004 -
  8.1005 -void set_binfmt(struct linux_binfmt *new)
  8.1006 -{
  8.1007 -	struct linux_binfmt *old = current->binfmt;
  8.1008 -	if (new && new->module)
  8.1009 -		__MOD_INC_USE_COUNT(new->module);
  8.1010 -	current->binfmt = new;
  8.1011 -	if (old && old->module)
  8.1012 -		__MOD_DEC_USE_COUNT(old->module);
  8.1013 -}
  8.1014 -
  8.1015 -#define CORENAME_MAX_SIZE 64
  8.1016 -
  8.1017 -/* format_corename will inspect the pattern parameter, and output a
  8.1018 - * name into corename, which must have space for at least
  8.1019 - * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
  8.1020 - */
  8.1021 -void format_corename(char *corename, const char *pattern, long signr)
  8.1022 -{
  8.1023 -	const char *pat_ptr = pattern;
  8.1024 -	char *out_ptr = corename;
  8.1025 -	char *const out_end = corename + CORENAME_MAX_SIZE;
  8.1026 -	int rc;
  8.1027 -	int pid_in_pattern = 0;
  8.1028 -
  8.1029 -	/* Repeat as long as we have more pattern to process and more output
  8.1030 -	   space */
  8.1031 -	while (*pat_ptr) {
  8.1032 -		if (*pat_ptr != '%') {
  8.1033 -			if (out_ptr == out_end)
  8.1034 -				goto out;
  8.1035 -			*out_ptr++ = *pat_ptr++;
  8.1036 -		} else {
  8.1037 -			switch (*++pat_ptr) {
  8.1038 -			case 0:
  8.1039 -				goto out;
  8.1040 -			/* Double percent, output one percent */
  8.1041 -			case '%':
  8.1042 -				if (out_ptr == out_end)
  8.1043 -					goto out;
  8.1044 -				*out_ptr++ = '%';
  8.1045 -				break;
  8.1046 -			/* pid */
  8.1047 -			case 'p':
  8.1048 -				pid_in_pattern = 1;
  8.1049 -				rc = snprintf(out_ptr, out_end - out_ptr,
  8.1050 -					      "%d", current->pid);
  8.1051 -				if (rc > out_end - out_ptr)
  8.1052 -					goto out;
  8.1053 -				out_ptr += rc;
  8.1054 -				break;
  8.1055 -			/* uid */
  8.1056 -			case 'u':
  8.1057 -				rc = snprintf(out_ptr, out_end - out_ptr,
  8.1058 -					      "%d", current->uid);
  8.1059 -				if (rc > out_end - out_ptr)
  8.1060 -					goto out;
  8.1061 -				out_ptr += rc;
  8.1062 -				break;
  8.1063 -			/* gid */
  8.1064 -			case 'g':
  8.1065 -				rc = snprintf(out_ptr, out_end - out_ptr,
  8.1066 -					      "%d", current->gid);
  8.1067 -				if (rc > out_end - out_ptr)
  8.1068 -					goto out;
  8.1069 -				out_ptr += rc;
  8.1070 -				break;
  8.1071 -			/* signal that caused the coredump */
  8.1072 -			case 's':
  8.1073 -				rc = snprintf(out_ptr, out_end - out_ptr,
  8.1074 -					      "%ld", signr);
  8.1075 -				if (rc > out_end - out_ptr)
  8.1076 -					goto out;
  8.1077 -				out_ptr += rc;
  8.1078 -				break;
  8.1079 -			/* UNIX time of coredump */
  8.1080 -			case 't': {
  8.1081 -				struct timeval tv;
  8.1082 -				do_gettimeofday(&tv);
  8.1083 -				rc = snprintf(out_ptr, out_end - out_ptr,
  8.1084 -					      "%ld", tv.tv_sec);
  8.1085 -				if (rc > out_end - out_ptr)
  8.1086 -					goto out;
  8.1087 -				out_ptr += rc;
  8.1088 -				break;
  8.1089 -			}
  8.1090 -			/* hostname */
  8.1091 -			case 'h':
  8.1092 -				down_read(&uts_sem);
  8.1093 -				rc = snprintf(out_ptr, out_end - out_ptr,
  8.1094 -					      "%s", system_utsname.nodename);
  8.1095 -				up_read(&uts_sem);
  8.1096 -				if (rc > out_end - out_ptr)
  8.1097 -					goto out;
  8.1098 -				out_ptr += rc;
  8.1099 -				break;
  8.1100 -			/* executable */
  8.1101 -			case 'e':
  8.1102 -				rc = snprintf(out_ptr, out_end - out_ptr,
  8.1103 -					      "%s", current->comm);
  8.1104 -				if (rc > out_end - out_ptr)
  8.1105 -					goto out;
  8.1106 -				out_ptr += rc;
  8.1107 -				break;
  8.1108 -			default:
  8.1109 -				break;
  8.1110 -			}
  8.1111 -			++pat_ptr;
  8.1112 -		}
  8.1113 -	}
  8.1114 -	/* Backward compatibility with core_uses_pid:
  8.1115 -	 *
  8.1116 -	 * If core_pattern does not include a %p (as is the default)
  8.1117 -	 * and core_uses_pid is set, then .%pid will be appended to
  8.1118 -	 * the filename */
  8.1119 -	if (!pid_in_pattern
  8.1120 -            && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
  8.1121 -		rc = snprintf(out_ptr, out_end - out_ptr,
  8.1122 -			      ".%d", current->pid);
  8.1123 -		if (rc > out_end - out_ptr)
  8.1124 -			goto out;
  8.1125 -		out_ptr += rc;
  8.1126 -	}
  8.1127 -      out:
  8.1128 -	*out_ptr = 0;
  8.1129 -}
  8.1130 -
  8.1131 -int do_coredump(long signr, struct pt_regs * regs)
  8.1132 -{
  8.1133 -	struct linux_binfmt * binfmt;
  8.1134 -	char corename[CORENAME_MAX_SIZE + 1];
  8.1135 -	struct file * file;
  8.1136 -	struct inode * inode;
  8.1137 -	int retval = 0;
  8.1138 -	int fsuid = current->fsuid;
  8.1139 -
  8.1140 -	lock_kernel();
  8.1141 -	binfmt = current->binfmt;
  8.1142 -	if (!binfmt || !binfmt->core_dump)
  8.1143 -		goto fail;
  8.1144 -	if (!is_dumpable(current))
  8.1145 -	{
  8.1146 -		if(!core_setuid_ok || !current->task_dumpable)
  8.1147 -			goto fail;
  8.1148 -		current->fsuid = 0;
  8.1149 -	}
  8.1150 -	current->mm->dumpable = 0;
  8.1151 -	if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump)
  8.1152 -		goto fail;
  8.1153 -
  8.1154 - 	format_corename(corename, core_pattern, signr);
  8.1155 -	file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW, 0600);
  8.1156 -	if (IS_ERR(file))
  8.1157 -		goto fail;
  8.1158 -	inode = file->f_dentry->d_inode;
  8.1159 -	if (inode->i_nlink > 1)
  8.1160 -		goto close_fail;	/* multiple links - don't dump */
  8.1161 -	if (d_unhashed(file->f_dentry))
  8.1162 -		goto close_fail;
  8.1163 -
  8.1164 -	if (!S_ISREG(inode->i_mode))
  8.1165 -		goto close_fail;
  8.1166 -	if (!file->f_op)
  8.1167 -		goto close_fail;
  8.1168 -	if (!file->f_op->write)
  8.1169 -		goto close_fail;
  8.1170 -	if (do_truncate(file->f_dentry, 0) != 0)
  8.1171 -		goto close_fail;
  8.1172 -
  8.1173 -	retval = binfmt->core_dump(signr, regs, file);
  8.1174 -
  8.1175 -close_fail:
  8.1176 -	filp_close(file, NULL);
  8.1177 -fail:
  8.1178 -	if (fsuid != current->fsuid)
  8.1179 -		current->fsuid = fsuid;
  8.1180 -	unlock_kernel();
  8.1181 -	return retval;
  8.1182 -}
     9.1 --- a/linux-2.4.29-xen-sparse/include/asm-xen/page.h	Sat Mar 26 05:05:07 2005 +0000
     9.2 +++ b/linux-2.4.29-xen-sparse/include/asm-xen/page.h	Fri Mar 25 22:52:21 2005 +0000
     9.3 @@ -85,23 +85,18 @@ typedef struct { unsigned long pgprot; }
     9.4  static inline unsigned long pmd_val(pmd_t x)
     9.5  {
     9.6      unsigned long ret = x.pmd;
     9.7 -    if ( (ret & 1) ) ret = machine_to_phys(ret);
     9.8 +    if ( ret ) ret = machine_to_phys(ret) | 1;
     9.9      return ret;
    9.10  }
    9.11  #define pmd_val_ma(x)   ((x).pmd)
    9.12  #define pgd_val(x)	({ BUG(); (unsigned long)0; })
    9.13  #define pgprot_val(x)	((x).pgprot)
    9.14  
    9.15 -static inline pte_t __pte(unsigned long x)
    9.16 -{
    9.17 -    if ( (x & 1) ) x = phys_to_machine(x);
    9.18 -    return ((pte_t) { (x) });
    9.19 -}
    9.20 -static inline pmd_t __pmd(unsigned long x)
    9.21 -{
    9.22 -    if ( (x & 1) ) x = phys_to_machine(x);
    9.23 -    return ((pmd_t) { (x) });
    9.24 -}
    9.25 +#define __pte(x) ({ unsigned long _x = (x); \
    9.26 +    (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); })
    9.27 +#define __pte_ma(x)     ((pte_t) { (x) } )
    9.28 +#define __pmd(x) ({ unsigned long _x = (x); \
    9.29 +    (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); })
    9.30  #define __pgd(x) ({ BUG(); (pgprot_t) { 0 }; })
    9.31  #define __pgprot(x)	((pgprot_t) { (x) } )
    9.32  
    10.1 --- a/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h	Sat Mar 26 05:05:07 2005 +0000
    10.2 +++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h	Fri Mar 25 22:52:21 2005 +0000
    10.3 @@ -22,7 +22,6 @@
    10.4  #define pmd_populate(mm, pmd, pte) 		  \
    10.5   do {                                             \
    10.6    set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));   \
    10.7 -  XEN_flush_page_update_queue();                 \
    10.8   } while ( 0 )
    10.9  
   10.10  /*
   10.11 @@ -79,8 +78,9 @@ static inline pgd_t *get_pgd_slow(void)
   10.12  		memcpy(pgd + USER_PTRS_PER_PGD,
   10.13  			init_mm.pgd + USER_PTRS_PER_PGD,
   10.14  			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
   10.15 -                __make_page_readonly(pgd);
   10.16 +		__make_page_readonly(pgd);
   10.17  		queue_pgd_pin(__pa(pgd));
   10.18 +		flush_page_update_queue();
   10.19  	}
   10.20  	return pgd;
   10.21  }
   10.22 @@ -111,7 +111,8 @@ static inline void free_pgd_slow(pgd_t *
   10.23  	kmem_cache_free(pae_pgd_cachep, pgd);
   10.24  #else
   10.25  	queue_pgd_unpin(__pa(pgd));
   10.26 -        __make_page_writable(pgd);
   10.27 +	__make_page_writable(pgd);
   10.28 +	flush_page_update_queue();
   10.29  	free_page((unsigned long)pgd);
   10.30  #endif
   10.31  }
   10.32 @@ -135,6 +136,7 @@ static inline pte_t *pte_alloc_one(struc
   10.33          clear_page(pte);
   10.34          __make_page_readonly(pte);
   10.35          queue_pte_pin(__pa(pte));
   10.36 +        flush_page_update_queue();
   10.37      }
   10.38      return pte;
   10.39  
   10.40 @@ -155,6 +157,7 @@ static __inline__ void pte_free_slow(pte
   10.41  {
   10.42      queue_pte_unpin(__pa(pte));
   10.43      __make_page_writable(pte);
   10.44 +    flush_page_update_queue();
   10.45      free_page((unsigned long)pte);
   10.46  }
   10.47  
   10.48 @@ -208,22 +211,19 @@ extern int do_check_pgt_cache(int, int);
   10.49  
   10.50  static inline void flush_tlb_mm(struct mm_struct *mm)
   10.51  {
   10.52 -	if (mm == current->active_mm) queue_tlb_flush();
   10.53 -	XEN_flush_page_update_queue();
   10.54 +	if (mm == current->active_mm) xen_tlb_flush();
   10.55  }
   10.56  
   10.57  static inline void flush_tlb_page(struct vm_area_struct *vma,
   10.58  	unsigned long addr)
   10.59  {
   10.60 -	if (vma->vm_mm == current->active_mm) queue_invlpg(addr);
   10.61 -	XEN_flush_page_update_queue();
   10.62 +	if (vma->vm_mm == current->active_mm) xen_invlpg(addr);
   10.63  }
   10.64  
   10.65  static inline void flush_tlb_range(struct mm_struct *mm,
   10.66  	unsigned long start, unsigned long end)
   10.67  {
   10.68 -	if (mm == current->active_mm) queue_tlb_flush();
   10.69 -	XEN_flush_page_update_queue();
   10.70 +	if (mm == current->active_mm) xen_tlb_flush();
   10.71  }
   10.72  
   10.73  #else
   10.74 @@ -261,7 +261,6 @@ static inline void flush_tlb_pgtables(st
   10.75  				      unsigned long start, unsigned long end)
   10.76  {
   10.77      /* i386 does not keep any page table caches in TLB */
   10.78 -    XEN_flush_page_update_queue();
   10.79  }
   10.80  
   10.81  /*
    11.1 --- a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h	Sat Mar 26 05:05:07 2005 +0000
    11.2 +++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h	Fri Mar 25 22:52:21 2005 +0000
    11.3 @@ -34,9 +34,19 @@ static inline int pgd_bad(pgd_t pgd)		{ 
    11.4  static inline int pgd_present(pgd_t pgd)	{ return 1; }
    11.5  #define pgd_clear(xp)				do { } while (0)
    11.6  
    11.7 -#define set_pte(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
    11.8 -#define set_pte_atomic(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
    11.9 -#define set_pmd(pmdptr, pmdval) queue_l2_entry_update((pmdptr), (pmdval))
   11.10 +/*
   11.11 + * Certain architectures need to do special things when PTEs
   11.12 + * within a page table are directly modified.  Thus, the following
   11.13 + * hook is made available.
   11.14 + */
   11.15 +#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
   11.16 +#define set_pte_atomic(pteptr, pteval) (*(pteptr) = pteval)
   11.17 +
   11.18 +/*
   11.19 + * (pmds are folded into pgds so this doesn't actually get called,
   11.20 + * but the define is needed for a generic inline function.)
   11.21 + */
   11.22 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
   11.23  #define set_pgd(pgdptr, pgdval) ((void)0)
   11.24  
   11.25  #define pgd_page(pgd) \
   11.26 @@ -47,6 +57,7 @@ static inline pmd_t * pmd_offset(pgd_t *
   11.27  	return (pmd_t *) dir;
   11.28  }
   11.29  
   11.30 +#define ptep_get_and_clear(xp)	__pte_ma(xchg(&(xp)->pte_low, 0))
   11.31  #define pte_same(a, b)		((a).pte_low == (b).pte_low)
   11.32  
   11.33  /*                                 
   11.34 @@ -83,21 +94,4 @@ static inline pmd_t * pmd_offset(pgd_t *
   11.35  #define pte_none(x)		(!(x).pte_low)
   11.36  #define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
   11.37  
   11.38 -/*
   11.39 - * A note on implementation of this atomic 'get-and-clear' operation.
   11.40 - * This is actually very simple because XenoLinux can only run on a single
   11.41 - * processor. Therefore, we cannot race other processors setting the 'accessed'
   11.42 - * or 'dirty' bits on a page-table entry.
   11.43 - * Even if pages are shared between domains, that is not a problem because
   11.44 - * each domain will have separate page tables, with their own versions of
   11.45 - * accessed & dirty state.
   11.46 - */
   11.47 -static inline pte_t ptep_get_and_clear(pte_t *xp)
   11.48 -{
   11.49 -    pte_t pte = *xp;
   11.50 -    if ( !pte_none(pte) )
   11.51 -        queue_l1_entry_update(xp, 0);
   11.52 -    return pte;
   11.53 -}
   11.54 -
   11.55  #endif /* _I386_PGTABLE_2LEVEL_H */
    12.1 --- a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h	Sat Mar 26 05:05:07 2005 +0000
    12.2 +++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h	Fri Mar 25 22:52:21 2005 +0000
    12.3 @@ -38,11 +38,11 @@ extern void paging_init(void);
    12.4  
    12.5  extern unsigned long pgkern_mask;
    12.6  
    12.7 -#define __flush_tlb() ({ queue_tlb_flush(); XEN_flush_page_update_queue(); })
    12.8 +#define __flush_tlb() xen_tlb_flush()
    12.9  #define __flush_tlb_global() __flush_tlb()
   12.10  #define __flush_tlb_all() __flush_tlb_global()
   12.11 -#define __flush_tlb_one(addr) ({ queue_invlpg(addr); XEN_flush_page_update_queue(); })
   12.12 -#define __flush_tlb_single(addr) ({ queue_invlpg(addr); XEN_flush_page_update_queue(); })
   12.13 +#define __flush_tlb_one(addr) xen_invlpg(addr)
   12.14 +#define __flush_tlb_single(addr) xen_invlpg(addr)
   12.15  
   12.16  /*
   12.17   * ZERO_PAGE is a global shared page that is always zero: used
   12.18 @@ -179,12 +179,14 @@ extern void * high_memory;
   12.19  #define __S111	PAGE_SHARED
   12.20  
   12.21  #define pte_present(x)	((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
   12.22 -#define pte_clear(xp)	queue_l1_entry_update(xp, 0)
   12.23 +#define pte_clear(xp)	do { set_pte(xp, __pte(0)); } while (0)
   12.24  
   12.25 -#define pmd_none(x)	(!(x).pmd)
   12.26 -#define pmd_present(x)	((x).pmd & _PAGE_PRESENT)
   12.27 +#define pmd_none(x)	(!pmd_val(x))
   12.28 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since writable
   12.29 +   page tables can temporarily clear it. */
   12.30 +#define pmd_present(x)	(pmd_val(x))
   12.31  #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
   12.32 -#define	pmd_bad(x)	(((x).pmd & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
   12.33 +#define pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
   12.34  
   12.35  
   12.36  #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
   12.37 @@ -212,29 +214,28 @@ static inline pte_t pte_mkwrite(pte_t pt
   12.38  
   12.39  static inline int ptep_test_and_clear_dirty(pte_t *ptep)
   12.40  {
   12.41 -    unsigned long pteval = *(unsigned long *)ptep;
   12.42 -    int ret = pteval & _PAGE_DIRTY;
   12.43 -    if ( ret ) queue_l1_entry_update(ptep, pteval & ~_PAGE_DIRTY);
   12.44 -    return ret;
   12.45 +    if (!pte_dirty(*ptep))
   12.46 +        return 0;
   12.47 +    return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
   12.48  }
   12.49 -static inline  int ptep_test_and_clear_young(pte_t *ptep)
   12.50 +
   12.51 +static inline int ptep_test_and_clear_young(pte_t *ptep)
   12.52  {
   12.53 -    unsigned long pteval = *(unsigned long *)ptep;
   12.54 -    int ret = pteval & _PAGE_ACCESSED;
   12.55 -    if ( ret ) queue_l1_entry_update(ptep, pteval & ~_PAGE_ACCESSED);
   12.56 -    return ret;
   12.57 +    if (!pte_young(*ptep))
   12.58 +        return 0;
   12.59 +    return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low);
   12.60  }
   12.61 +
   12.62  static inline void ptep_set_wrprotect(pte_t *ptep)
   12.63  {
   12.64 -    unsigned long pteval = *(unsigned long *)ptep;
   12.65 -    if ( (pteval & _PAGE_RW) )
   12.66 -        queue_l1_entry_update(ptep, pteval & ~_PAGE_RW);
   12.67 +    if (pte_write(*ptep))
   12.68 +        clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
   12.69  }
   12.70 +
   12.71  static inline void ptep_mkdirty(pte_t *ptep)
   12.72  {
   12.73 -    unsigned long pteval = *(unsigned long *)ptep;
   12.74 -    if ( !(pteval & _PAGE_DIRTY) )
   12.75 -        queue_l1_entry_update(ptep, pteval | _PAGE_DIRTY);
   12.76 +    if (!pte_dirty(*ptep))
   12.77 +        set_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
   12.78  }
   12.79  
   12.80  /*
    13.1 --- a/linux-2.4.29-xen-sparse/mm/highmem.c	Sat Mar 26 05:05:07 2005 +0000
    13.2 +++ b/linux-2.4.29-xen-sparse/mm/highmem.c	Fri Mar 25 22:52:21 2005 +0000
    13.3 @@ -122,7 +122,6 @@ start:
    13.4  	}
    13.5  	vaddr = PKMAP_ADDR(last_pkmap_nr);
    13.6  	set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
    13.7 -	XEN_flush_page_update_queue();
    13.8  
    13.9  	pkmap_count[last_pkmap_nr] = 1;
   13.10  	page->virtual = (void *) vaddr;
    14.1 --- a/linux-2.4.29-xen-sparse/mm/memory.c	Sat Mar 26 05:05:07 2005 +0000
    14.2 +++ b/linux-2.4.29-xen-sparse/mm/memory.c	Fri Mar 25 22:52:21 2005 +0000
    14.3 @@ -153,7 +153,6 @@ void clear_page_tables(struct mm_struct 
    14.4  		free_one_pgd(page_dir);
    14.5  		page_dir++;
    14.6  	} while (--nr);
    14.7 -	XEN_flush_page_update_queue();
    14.8  	spin_unlock(&mm->page_table_lock);
    14.9  
   14.10  	/* keep the page table cache within bounds */
   14.11 @@ -249,10 +248,8 @@ skip_copy_pte_range:		address = (address
   14.12  
   14.13  				/* If it's a COW mapping, write protect it both in the parent and the child */
   14.14  				if (cow && pte_write(pte)) {
   14.15 -					/* XEN modification: modified ordering here to avoid RaW hazard. */
   14.16 +					ptep_set_wrprotect(src_pte);
   14.17  					pte = *src_pte;
   14.18 -					pte = pte_wrprotect(pte);
   14.19 -					ptep_set_wrprotect(src_pte);
   14.20  				}
   14.21  
   14.22  				/* If it's a shared mapping, mark it clean in the child */
   14.23 @@ -914,7 +911,6 @@ static inline void establish_pte(struct 
   14.24  {
   14.25  #ifdef CONFIG_XEN
   14.26  	if ( likely(vma->vm_mm == current->mm) ) {
   14.27 -		XEN_flush_page_update_queue();
   14.28  		HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG);
   14.29  	} else {
   14.30  		set_pte(page_table, entry);
   14.31 @@ -1189,13 +1185,10 @@ static int do_swap_page(struct mm_struct
   14.32  	flush_page_to_ram(page);
   14.33  	flush_icache_page(vma, page);
   14.34  #ifdef CONFIG_XEN
   14.35 -	if ( likely(vma->vm_mm == current->mm) ) {
   14.36 -		XEN_flush_page_update_queue();
   14.37 +	if ( likely(vma->vm_mm == current->mm) )
   14.38  		HYPERVISOR_update_va_mapping(address, pte, 0);
   14.39 -	} else {
   14.40 +	else
   14.41  		set_pte(page_table, pte);
   14.42 -		XEN_flush_page_update_queue();
   14.43 -	}
   14.44  #else
   14.45  	set_pte(page_table, pte);
   14.46  #endif
   14.47 @@ -1245,13 +1238,10 @@ static int do_anonymous_page(struct mm_s
   14.48  	}
   14.49  
   14.50  #ifdef CONFIG_XEN
   14.51 -	if ( likely(vma->vm_mm == current->mm) ) {
   14.52 -		XEN_flush_page_update_queue();
   14.53 +	if ( likely(vma->vm_mm == current->mm) )
   14.54  		HYPERVISOR_update_va_mapping(addr, entry, 0);
   14.55 -	} else {
   14.56 +	else
   14.57  		set_pte(page_table, entry);
   14.58 -		XEN_flush_page_update_queue();
   14.59 -	}
   14.60  #else
   14.61  	set_pte(page_table, entry);
   14.62  #endif
   14.63 @@ -1331,13 +1321,10 @@ static int do_no_page(struct mm_struct *
   14.64  		if (write_access)
   14.65  			entry = pte_mkwrite(pte_mkdirty(entry));
   14.66  #ifdef CONFIG_XEN
   14.67 -		if ( likely(vma->vm_mm == current->mm) ) {
   14.68 -			XEN_flush_page_update_queue();
   14.69 +		if ( likely(vma->vm_mm == current->mm) )
   14.70  			HYPERVISOR_update_va_mapping(address, entry, 0);
   14.71 -		} else {
   14.72 +		else
   14.73  			set_pte(page_table, entry);
   14.74 -			XEN_flush_page_update_queue();
   14.75 -		}
   14.76  #else
   14.77  		set_pte(page_table, entry);
   14.78  #endif
   14.79 @@ -1484,7 +1471,6 @@ pte_t fastcall *pte_alloc(struct mm_stru
   14.80  		/* "fast" allocation can happen without dropping the lock.. */
   14.81  		new = pte_alloc_one_fast(mm, address);
   14.82  		if (!new) {
   14.83 -			XEN_flush_page_update_queue();
   14.84  			spin_unlock(&mm->page_table_lock);
   14.85  			new = pte_alloc_one(mm, address);
   14.86  			spin_lock(&mm->page_table_lock);
    15.1 --- a/linux-2.4.29-xen-sparse/mm/mremap.c	Sat Mar 26 05:05:07 2005 +0000
    15.2 +++ b/linux-2.4.29-xen-sparse/mm/mremap.c	Fri Mar 25 22:52:21 2005 +0000
    15.3 @@ -119,11 +119,9 @@ static int move_page_tables(struct mm_st
    15.4  	 * the old page tables)
    15.5  	 */
    15.6  oops_we_failed:
    15.7 -	XEN_flush_page_update_queue();
    15.8  	flush_cache_range(mm, new_addr, new_addr + len);
    15.9  	while ((offset += PAGE_SIZE) < len)
   15.10  		move_one_page(mm, new_addr + offset, old_addr + offset);
   15.11 -	XEN_flush_page_update_queue();
   15.12  	zap_page_range(mm, new_addr, len);
   15.13  	return -1;
   15.14  }
    16.1 --- a/linux-2.4.29-xen-sparse/mm/swapfile.c	Sat Mar 26 05:05:07 2005 +0000
    16.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.3 @@ -1,1267 +0,0 @@
    16.4 -/*
    16.5 - *  linux/mm/swapfile.c
    16.6 - *
    16.7 - *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
    16.8 - *  Swap reorganised 29.12.95, Stephen Tweedie
    16.9 - */
   16.10 -
   16.11 -#include <linux/slab.h>
   16.12 -#include <linux/smp_lock.h>
   16.13 -#include <linux/kernel_stat.h>
   16.14 -#include <linux/swap.h>
   16.15 -#include <linux/swapctl.h>
   16.16 -#include <linux/blkdev.h> /* for blk_size */
   16.17 -#include <linux/vmalloc.h>
   16.18 -#include <linux/pagemap.h>
   16.19 -#include <linux/shm.h>
   16.20 -
   16.21 -#include <asm/pgtable.h>
   16.22 -
   16.23 -spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
   16.24 -unsigned int nr_swapfiles;
   16.25 -int total_swap_pages;
   16.26 -static int swap_overflow;
   16.27 -
   16.28 -static const char Bad_file[] = "Bad swap file entry ";
   16.29 -static const char Unused_file[] = "Unused swap file entry ";
   16.30 -static const char Bad_offset[] = "Bad swap offset entry ";
   16.31 -static const char Unused_offset[] = "Unused swap offset entry ";
   16.32 -
   16.33 -struct swap_list_t swap_list = {-1, -1};
   16.34 -
   16.35 -struct swap_info_struct swap_info[MAX_SWAPFILES];
   16.36 -
   16.37 -#define SWAPFILE_CLUSTER 256
   16.38 -
   16.39 -static inline int scan_swap_map(struct swap_info_struct *si)
   16.40 -{
   16.41 -	unsigned long offset;
   16.42 -	/* 
   16.43 -	 * We try to cluster swap pages by allocating them
   16.44 -	 * sequentially in swap.  Once we've allocated
   16.45 -	 * SWAPFILE_CLUSTER pages this way, however, we resort to
   16.46 -	 * first-free allocation, starting a new cluster.  This
   16.47 -	 * prevents us from scattering swap pages all over the entire
   16.48 -	 * swap partition, so that we reduce overall disk seek times
   16.49 -	 * between swap pages.  -- sct */
   16.50 -	if (si->cluster_nr) {
   16.51 -		while (si->cluster_next <= si->highest_bit) {
   16.52 -			offset = si->cluster_next++;
   16.53 -			if (si->swap_map[offset])
   16.54 -				continue;
   16.55 -			si->cluster_nr--;
   16.56 -			goto got_page;
   16.57 -		}
   16.58 -	}
   16.59 -	si->cluster_nr = SWAPFILE_CLUSTER;
   16.60 -
   16.61 -	/* try to find an empty (even not aligned) cluster. */
   16.62 -	offset = si->lowest_bit;
   16.63 - check_next_cluster:
   16.64 -	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
   16.65 -	{
   16.66 -		int nr;
   16.67 -		for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
   16.68 -			if (si->swap_map[nr])
   16.69 -			{
   16.70 -				offset = nr+1;
   16.71 -				goto check_next_cluster;
   16.72 -			}
   16.73 -		/* We found a completly empty cluster, so start
   16.74 -		 * using it.
   16.75 -		 */
   16.76 -		goto got_page;
   16.77 -	}
   16.78 -	/* No luck, so now go finegrined as usual. -Andrea */
   16.79 -	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
   16.80 -		if (si->swap_map[offset])
   16.81 -			continue;
   16.82 -		si->lowest_bit = offset+1;
   16.83 -	got_page:
   16.84 -		if (offset == si->lowest_bit)
   16.85 -			si->lowest_bit++;
   16.86 -		if (offset == si->highest_bit)
   16.87 -			si->highest_bit--;
   16.88 -		if (si->lowest_bit > si->highest_bit) {
   16.89 -			si->lowest_bit = si->max;
   16.90 -			si->highest_bit = 0;
   16.91 -		}
   16.92 -		si->swap_map[offset] = 1;
   16.93 -		nr_swap_pages--;
   16.94 -		si->cluster_next = offset+1;
   16.95 -		return offset;
   16.96 -	}
   16.97 -	si->lowest_bit = si->max;
   16.98 -	si->highest_bit = 0;
   16.99 -	return 0;
  16.100 -}
  16.101 -
  16.102 -swp_entry_t get_swap_page(void)
  16.103 -{
  16.104 -	struct swap_info_struct * p;
  16.105 -	unsigned long offset;
  16.106 -	swp_entry_t entry;
  16.107 -	int type, wrapped = 0;
  16.108 -
  16.109 -	entry.val = 0;	/* Out of memory */
  16.110 -	swap_list_lock();
  16.111 -	type = swap_list.next;
  16.112 -	if (type < 0)
  16.113 -		goto out;
  16.114 -	if (nr_swap_pages <= 0)
  16.115 -		goto out;
  16.116 -
  16.117 -	while (1) {
  16.118 -		p = &swap_info[type];
  16.119 -		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
  16.120 -			swap_device_lock(p);
  16.121 -			offset = scan_swap_map(p);
  16.122 -			swap_device_unlock(p);
  16.123 -			if (offset) {
  16.124 -				entry = SWP_ENTRY(type,offset);
  16.125 -				type = swap_info[type].next;
  16.126 -				if (type < 0 ||
  16.127 -					p->prio != swap_info[type].prio) {
  16.128 -						swap_list.next = swap_list.head;
  16.129 -				} else {
  16.130 -					swap_list.next = type;
  16.131 -				}
  16.132 -				goto out;
  16.133 -			}
  16.134 -		}
  16.135 -		type = p->next;
  16.136 -		if (!wrapped) {
  16.137 -			if (type < 0 || p->prio != swap_info[type].prio) {
  16.138 -				type = swap_list.head;
  16.139 -				wrapped = 1;
  16.140 -			}
  16.141 -		} else
  16.142 -			if (type < 0)
  16.143 -				goto out;	/* out of swap space */
  16.144 -	}
  16.145 -out:
  16.146 -	swap_list_unlock();
  16.147 -	return entry;
  16.148 -}
  16.149 -
  16.150 -static struct swap_info_struct * swap_info_get(swp_entry_t entry)
  16.151 -{
  16.152 -	struct swap_info_struct * p;
  16.153 -	unsigned long offset, type;
  16.154 -
  16.155 -	if (!entry.val)
  16.156 -		goto out;
  16.157 -	type = SWP_TYPE(entry);
  16.158 -	if (type >= nr_swapfiles)
  16.159 -		goto bad_nofile;
  16.160 -	p = & swap_info[type];
  16.161 -	if (!(p->flags & SWP_USED))
  16.162 -		goto bad_device;
  16.163 -	offset = SWP_OFFSET(entry);
  16.164 -	if (offset >= p->max)
  16.165 -		goto bad_offset;
  16.166 -	if (!p->swap_map[offset])
  16.167 -		goto bad_free;
  16.168 -	swap_list_lock();
  16.169 -	if (p->prio > swap_info[swap_list.next].prio)
  16.170 -		swap_list.next = type;
  16.171 -	swap_device_lock(p);
  16.172 -	return p;
  16.173 -
  16.174 -bad_free:
  16.175 -	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
  16.176 -	goto out;
  16.177 -bad_offset:
  16.178 -	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
  16.179 -	goto out;
  16.180 -bad_device:
  16.181 -	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
  16.182 -	goto out;
  16.183 -bad_nofile:
  16.184 -	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
  16.185 -out:
  16.186 -	return NULL;
  16.187 -}	
  16.188 -
  16.189 -static void swap_info_put(struct swap_info_struct * p)
  16.190 -{
  16.191 -	swap_device_unlock(p);
  16.192 -	swap_list_unlock();
  16.193 -}
  16.194 -
  16.195 -static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
  16.196 -{
  16.197 -	int count = p->swap_map[offset];
  16.198 -
  16.199 -	if (count < SWAP_MAP_MAX) {
  16.200 -		count--;
  16.201 -		p->swap_map[offset] = count;
  16.202 -		if (!count) {
  16.203 -			if (offset < p->lowest_bit)
  16.204 -				p->lowest_bit = offset;
  16.205 -			if (offset > p->highest_bit)
  16.206 -				p->highest_bit = offset;
  16.207 -			nr_swap_pages++;
  16.208 -		}
  16.209 -	}
  16.210 -	return count;
  16.211 -}
  16.212 -
  16.213 -/*
  16.214 - * Caller has made sure that the swapdevice corresponding to entry
  16.215 - * is still around or has not been recycled.
  16.216 - */
  16.217 -void swap_free(swp_entry_t entry)
  16.218 -{
  16.219 -	struct swap_info_struct * p;
  16.220 -
  16.221 -	p = swap_info_get(entry);
  16.222 -	if (p) {
  16.223 -		swap_entry_free(p, SWP_OFFSET(entry));
  16.224 -		swap_info_put(p);
  16.225 -	}
  16.226 -}
  16.227 -
  16.228 -/*
  16.229 - * Check if we're the only user of a swap page,
  16.230 - * when the page is locked.
  16.231 - */
  16.232 -static int exclusive_swap_page(struct page *page)
  16.233 -{
  16.234 -	int retval = 0;
  16.235 -	struct swap_info_struct * p;
  16.236 -	swp_entry_t entry;
  16.237 -
  16.238 -	entry.val = page->index;
  16.239 -	p = swap_info_get(entry);
  16.240 -	if (p) {
  16.241 -		/* Is the only swap cache user the cache itself? */
  16.242 -		if (p->swap_map[SWP_OFFSET(entry)] == 1) {
  16.243 -			/* Recheck the page count with the pagecache lock held.. */
  16.244 -			spin_lock(&pagecache_lock);
  16.245 -			if (page_count(page) - !!page->buffers == 2)
  16.246 -				retval = 1;
  16.247 -			spin_unlock(&pagecache_lock);
  16.248 -		}
  16.249 -		swap_info_put(p);
  16.250 -	}
  16.251 -	return retval;
  16.252 -}
  16.253 -
  16.254 -/*
  16.255 - * We can use this swap cache entry directly
  16.256 - * if there are no other references to it.
  16.257 - *
  16.258 - * Here "exclusive_swap_page()" does the real
  16.259 - * work, but we opportunistically check whether
  16.260 - * we need to get all the locks first..
  16.261 - */
  16.262 -int fastcall can_share_swap_page(struct page *page)
  16.263 -{
  16.264 -	int retval = 0;
  16.265 -
  16.266 -	if (!PageLocked(page))
  16.267 -		BUG();
  16.268 -	switch (page_count(page)) {
  16.269 -	case 3:
  16.270 -		if (!page->buffers)
  16.271 -			break;
  16.272 -		/* Fallthrough */
  16.273 -	case 2:
  16.274 -		if (!PageSwapCache(page))
  16.275 -			break;
  16.276 -		retval = exclusive_swap_page(page);
  16.277 -		break;
  16.278 -	case 1:
  16.279 -		if (PageReserved(page))
  16.280 -			break;
  16.281 -		retval = 1;
  16.282 -	}
  16.283 -	return retval;
  16.284 -}
  16.285 -
  16.286 -/*
  16.287 - * Work out if there are any other processes sharing this
  16.288 - * swap cache page. Free it if you can. Return success.
  16.289 - */
  16.290 -int fastcall remove_exclusive_swap_page(struct page *page)
  16.291 -{
  16.292 -	int retval;
  16.293 -	struct swap_info_struct * p;
  16.294 -	swp_entry_t entry;
  16.295 -
  16.296 -	if (!PageLocked(page))
  16.297 -		BUG();
  16.298 -	if (!PageSwapCache(page))
  16.299 -		return 0;
  16.300 -	if (page_count(page) - !!page->buffers != 2)	/* 2: us + cache */
  16.301 -		return 0;
  16.302 -
  16.303 -	entry.val = page->index;
  16.304 -	p = swap_info_get(entry);
  16.305 -	if (!p)
  16.306 -		return 0;
  16.307 -
  16.308 -	/* Is the only swap cache user the cache itself? */
  16.309 -	retval = 0;
  16.310 -	if (p->swap_map[SWP_OFFSET(entry)] == 1) {
  16.311 -		/* Recheck the page count with the pagecache lock held.. */
  16.312 -		spin_lock(&pagecache_lock);
  16.313 -		if (page_count(page) - !!page->buffers == 2) {
  16.314 -			__delete_from_swap_cache(page);
  16.315 -			SetPageDirty(page);
  16.316 -			retval = 1;
  16.317 -		}
  16.318 -		spin_unlock(&pagecache_lock);
  16.319 -	}
  16.320 -	swap_info_put(p);
  16.321 -
  16.322 -	if (retval) {
  16.323 -		block_flushpage(page, 0);
  16.324 -		swap_free(entry);
  16.325 -		page_cache_release(page);
  16.326 -	}
  16.327 -
  16.328 -	return retval;
  16.329 -}
  16.330 -
  16.331 -/*
  16.332 - * Free the swap entry like above, but also try to
  16.333 - * free the page cache entry if it is the last user.
  16.334 - */
  16.335 -void free_swap_and_cache(swp_entry_t entry)
  16.336 -{
  16.337 -	struct swap_info_struct * p;
  16.338 -	struct page *page = NULL;
  16.339 -
  16.340 -	p = swap_info_get(entry);
  16.341 -	if (p) {
  16.342 -		if (swap_entry_free(p, SWP_OFFSET(entry)) == 1)
  16.343 -			page = find_trylock_page(&swapper_space, entry.val);
  16.344 -		swap_info_put(p);
  16.345 -	}
  16.346 -	if (page) {
  16.347 -		page_cache_get(page);
  16.348 -		/* Only cache user (+us), or swap space full? Free it! */
  16.349 -		if (page_count(page) - !!page->buffers == 2 || vm_swap_full()) {
  16.350 -			delete_from_swap_cache(page);
  16.351 -			SetPageDirty(page);
  16.352 -		}
  16.353 -		UnlockPage(page);
  16.354 -		page_cache_release(page);
  16.355 -	}
  16.356 -}
  16.357 -
  16.358 -/*
  16.359 - * The swap entry has been read in advance, and we return 1 to indicate
  16.360 - * that the page has been used or is no longer needed.
  16.361 - *
  16.362 - * Always set the resulting pte to be nowrite (the same as COW pages
  16.363 - * after one process has exited).  We don't know just how many PTEs will
  16.364 - * share this swap entry, so be cautious and let do_wp_page work out
  16.365 - * what to do if a write is requested later.
  16.366 - */
  16.367 -/* mmlist_lock and vma->vm_mm->page_table_lock are held */
  16.368 -static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
  16.369 -	pte_t *dir, swp_entry_t entry, struct page* page)
  16.370 -{
  16.371 -	pte_t pte = *dir;
  16.372 -
  16.373 -	if (likely(pte_to_swp_entry(pte).val != entry.val))
  16.374 -		return;
  16.375 -	if (unlikely(pte_none(pte) || pte_present(pte)))
  16.376 -		return;
  16.377 -	get_page(page);
  16.378 -	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
  16.379 -	swap_free(entry);
  16.380 -	++vma->vm_mm->rss;
  16.381 -}
  16.382 -
  16.383 -/* mmlist_lock and vma->vm_mm->page_table_lock are held */
  16.384 -static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
  16.385 -	unsigned long address, unsigned long size, unsigned long offset,
  16.386 -	swp_entry_t entry, struct page* page)
  16.387 -{
  16.388 -	pte_t * pte;
  16.389 -	unsigned long end;
  16.390 -
  16.391 -	if (pmd_none(*dir))
  16.392 -		return;
  16.393 -	if (pmd_bad(*dir)) {
  16.394 -		pmd_ERROR(*dir);
  16.395 -		pmd_clear(dir);
  16.396 -		return;
  16.397 -	}
  16.398 -	pte = pte_offset(dir, address);
  16.399 -	offset += address & PMD_MASK;
  16.400 -	address &= ~PMD_MASK;
  16.401 -	end = address + size;
  16.402 -	if (end > PMD_SIZE)
  16.403 -		end = PMD_SIZE;
  16.404 -	do {
  16.405 -		unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
  16.406 -		address += PAGE_SIZE;
  16.407 -		pte++;
  16.408 -	} while (address && (address < end));
  16.409 -}
  16.410 -
  16.411 -/* mmlist_lock and vma->vm_mm->page_table_lock are held */
  16.412 -static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
  16.413 -	unsigned long address, unsigned long size,
  16.414 -	swp_entry_t entry, struct page* page)
  16.415 -{
  16.416 -	pmd_t * pmd;
  16.417 -	unsigned long offset, end;
  16.418 -
  16.419 -	if (pgd_none(*dir))
  16.420 -		return;
  16.421 -	if (pgd_bad(*dir)) {
  16.422 -		pgd_ERROR(*dir);
  16.423 -		pgd_clear(dir);
  16.424 -		return;
  16.425 -	}
  16.426 -	pmd = pmd_offset(dir, address);
  16.427 -	offset = address & PGDIR_MASK;
  16.428 -	address &= ~PGDIR_MASK;
  16.429 -	end = address + size;
  16.430 -	if (end > PGDIR_SIZE)
  16.431 -		end = PGDIR_SIZE;
  16.432 -	if (address >= end)
  16.433 -		BUG();
  16.434 -	do {
  16.435 -		unuse_pmd(vma, pmd, address, end - address, offset, entry,
  16.436 -			  page);
  16.437 -		address = (address + PMD_SIZE) & PMD_MASK;
  16.438 -		pmd++;
  16.439 -	} while (address && (address < end));
  16.440 -}
  16.441 -
  16.442 -/* mmlist_lock and vma->vm_mm->page_table_lock are held */
  16.443 -static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
  16.444 -			swp_entry_t entry, struct page* page)
  16.445 -{
  16.446 -	unsigned long start = vma->vm_start, end = vma->vm_end;
  16.447 -
  16.448 -	if (start >= end)
  16.449 -		BUG();
  16.450 -	do {
  16.451 -		unuse_pgd(vma, pgdir, start, end - start, entry, page);
  16.452 -		start = (start + PGDIR_SIZE) & PGDIR_MASK;
  16.453 -		pgdir++;
  16.454 -	} while (start && (start < end));
  16.455 -}
  16.456 -
  16.457 -static void unuse_process(struct mm_struct * mm,
  16.458 -			swp_entry_t entry, struct page* page)
  16.459 -{
  16.460 -	struct vm_area_struct* vma;
  16.461 -
  16.462 -	/*
  16.463 -	 * Go through process' page directory.
  16.464 -	 */
  16.465 -	spin_lock(&mm->page_table_lock);
  16.466 -	for (vma = mm->mmap; vma; vma = vma->vm_next) {
  16.467 -		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
  16.468 -		unuse_vma(vma, pgd, entry, page);
  16.469 -	}
  16.470 -	XEN_flush_page_update_queue();
  16.471 -	spin_unlock(&mm->page_table_lock);
  16.472 -	return;
  16.473 -}
  16.474 -
  16.475 -/*
  16.476 - * Scan swap_map from current position to next entry still in use.
  16.477 - * Recycle to start on reaching the end, returning 0 when empty.
  16.478 - */
  16.479 -static int find_next_to_unuse(struct swap_info_struct *si, int prev)
  16.480 -{
  16.481 -	int max = si->max;
  16.482 -	int i = prev;
  16.483 -	int count;
  16.484 -
  16.485 -	/*
  16.486 -	 * No need for swap_device_lock(si) here: we're just looking
  16.487 -	 * for whether an entry is in use, not modifying it; false
  16.488 -	 * hits are okay, and sys_swapoff() has already prevented new
  16.489 -	 * allocations from this area (while holding swap_list_lock()).
  16.490 -	 */
  16.491 -	for (;;) {
  16.492 -		if (++i >= max) {
  16.493 -			if (!prev) {
  16.494 -				i = 0;
  16.495 -				break;
  16.496 -			}
  16.497 -			/*
  16.498 -			 * No entries in use at top of swap_map,
  16.499 -			 * loop back to start and recheck there.
  16.500 -			 */
  16.501 -			max = prev + 1;
  16.502 -			prev = 0;
  16.503 -			i = 1;
  16.504 -		}
  16.505 -		count = si->swap_map[i];
  16.506 -		if (count && count != SWAP_MAP_BAD)
  16.507 -			break;
  16.508 -	}
  16.509 -	return i;
  16.510 -}
  16.511 -
  16.512 -/*
  16.513 - * We completely avoid races by reading each swap page in advance,
  16.514 - * and then search for the process using it.  All the necessary
  16.515 - * page table adjustments can then be made atomically.
  16.516 - */
  16.517 -static int try_to_unuse(unsigned int type)
  16.518 -{
  16.519 -	struct swap_info_struct * si = &swap_info[type];
  16.520 -	struct mm_struct *start_mm;
  16.521 -	unsigned short *swap_map;
  16.522 -	unsigned short swcount;
  16.523 -	struct page *page;
  16.524 -	swp_entry_t entry;
  16.525 -	int i = 0;
  16.526 -	int retval = 0;
  16.527 -	int reset_overflow = 0;
  16.528 -	int shmem;
  16.529 -
  16.530 -	/*
  16.531 -	 * When searching mms for an entry, a good strategy is to
  16.532 -	 * start at the first mm we freed the previous entry from
  16.533 -	 * (though actually we don't notice whether we or coincidence
  16.534 -	 * freed the entry).  Initialize this start_mm with a hold.
  16.535 -	 *
  16.536 -	 * A simpler strategy would be to start at the last mm we
  16.537 -	 * freed the previous entry from; but that would take less
  16.538 -	 * advantage of mmlist ordering (now preserved by swap_out()),
  16.539 -	 * which clusters forked address spaces together, most recent
  16.540 -	 * child immediately after parent.  If we race with dup_mmap(),
  16.541 -	 * we very much want to resolve parent before child, otherwise
  16.542 -	 * we may miss some entries: using last mm would invert that.
  16.543 -	 */
  16.544 -	start_mm = &init_mm;
  16.545 -	atomic_inc(&init_mm.mm_users);
  16.546 -
  16.547 -	/*
  16.548 -	 * Keep on scanning until all entries have gone.  Usually,
  16.549 -	 * one pass through swap_map is enough, but not necessarily:
  16.550 -	 * mmput() removes mm from mmlist before exit_mmap() and its
  16.551 -	 * zap_page_range().  That's not too bad, those entries are
  16.552 -	 * on their way out, and handled faster there than here.
  16.553 -	 * do_munmap() behaves similarly, taking the range out of mm's
  16.554 -	 * vma list before zap_page_range().  But unfortunately, when
  16.555 -	 * unmapping a part of a vma, it takes the whole out first,
  16.556 -	 * then reinserts what's left after (might even reschedule if
  16.557 -	 * open() method called) - so swap entries may be invisible
  16.558 -	 * to swapoff for a while, then reappear - but that is rare.
  16.559 -	 */
  16.560 -	while ((i = find_next_to_unuse(si, i))) {
  16.561 -		/* 
  16.562 -		 * Get a page for the entry, using the existing swap
  16.563 -		 * cache page if there is one.  Otherwise, get a clean
  16.564 -		 * page and read the swap into it. 
  16.565 -		 */
  16.566 -		swap_map = &si->swap_map[i];
  16.567 -		entry = SWP_ENTRY(type, i);
  16.568 -		page = read_swap_cache_async(entry);
  16.569 -		if (!page) {
  16.570 -			/*
  16.571 -			 * Either swap_duplicate() failed because entry
  16.572 -			 * has been freed independently, and will not be
  16.573 -			 * reused since sys_swapoff() already disabled
  16.574 -			 * allocation from here, or alloc_page() failed.
  16.575 -			 */
  16.576 -			if (!*swap_map)
  16.577 -				continue;
  16.578 -			retval = -ENOMEM;
  16.579 -			break;
  16.580 -		}
  16.581 -
  16.582 -		/*
  16.583 -		 * Don't hold on to start_mm if it looks like exiting.
  16.584 -		 */
  16.585 -		if (atomic_read(&start_mm->mm_users) == 1) {
  16.586 -			mmput(start_mm);
  16.587 -			start_mm = &init_mm;
  16.588 -			atomic_inc(&init_mm.mm_users);
  16.589 -		}
  16.590 -
  16.591 -		/*
  16.592 -		 * Wait for and lock page.  When do_swap_page races with
  16.593 -		 * try_to_unuse, do_swap_page can handle the fault much
  16.594 -		 * faster than try_to_unuse can locate the entry.  This
  16.595 -		 * apparently redundant "wait_on_page" lets try_to_unuse
  16.596 -		 * defer to do_swap_page in such a case - in some tests,
  16.597 -		 * do_swap_page and try_to_unuse repeatedly compete.
  16.598 -		 */
  16.599 -		wait_on_page(page);
  16.600 -		lock_page(page);
  16.601 -
  16.602 -		/*
  16.603 -		 * Remove all references to entry, without blocking.
  16.604 -		 * Whenever we reach init_mm, there's no address space
  16.605 -		 * to search, but use it as a reminder to search shmem.
  16.606 -		 */
  16.607 -		shmem = 0;
  16.608 -		swcount = *swap_map;
  16.609 -		if (swcount > 1) {
  16.610 -			flush_page_to_ram(page);
  16.611 -			if (start_mm == &init_mm)
  16.612 -				shmem = shmem_unuse(entry, page);
  16.613 -			else
  16.614 -				unuse_process(start_mm, entry, page);
  16.615 -		}
  16.616 -		if (*swap_map > 1) {
  16.617 -			int set_start_mm = (*swap_map >= swcount);
  16.618 -			struct list_head *p = &start_mm->mmlist;
  16.619 -			struct mm_struct *new_start_mm = start_mm;
  16.620 -			struct mm_struct *mm;
  16.621 -
  16.622 -			spin_lock(&mmlist_lock);
  16.623 -			while (*swap_map > 1 &&
  16.624 -					(p = p->next) != &start_mm->mmlist) {
  16.625 -				mm = list_entry(p, struct mm_struct, mmlist);
  16.626 -				swcount = *swap_map;
  16.627 -				if (mm == &init_mm) {
  16.628 -					set_start_mm = 1;
  16.629 -					spin_unlock(&mmlist_lock);
  16.630 -					shmem = shmem_unuse(entry, page);
  16.631 -					spin_lock(&mmlist_lock);
  16.632 -				} else
  16.633 -					unuse_process(mm, entry, page);
  16.634 -				if (set_start_mm && *swap_map < swcount) {
  16.635 -					new_start_mm = mm;
  16.636 -					set_start_mm = 0;
  16.637 -				}
  16.638 -			}
  16.639 -			atomic_inc(&new_start_mm->mm_users);
  16.640 -			spin_unlock(&mmlist_lock);
  16.641 -			mmput(start_mm);
  16.642 -			start_mm = new_start_mm;
  16.643 -		}
  16.644 -
  16.645 -		/*
  16.646 -		 * How could swap count reach 0x7fff when the maximum
  16.647 -		 * pid is 0x7fff, and there's no way to repeat a swap
  16.648 -		 * page within an mm (except in shmem, where it's the
  16.649 -		 * shared object which takes the reference count)?
  16.650 -		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
  16.651 -		 *
  16.652 -		 * If that's wrong, then we should worry more about
  16.653 -		 * exit_mmap() and do_munmap() cases described above:
  16.654 -		 * we might be resetting SWAP_MAP_MAX too early here.
  16.655 -		 * We know "Undead"s can happen, they're okay, so don't
  16.656 -		 * report them; but do report if we reset SWAP_MAP_MAX.
  16.657 -		 */
  16.658 -		if (*swap_map == SWAP_MAP_MAX) {
  16.659 -			swap_list_lock();
  16.660 -			swap_device_lock(si);
  16.661 -			nr_swap_pages++;
  16.662 -			*swap_map = 1;
  16.663 -			swap_device_unlock(si);
  16.664 -			swap_list_unlock();
  16.665 -			reset_overflow = 1;
  16.666 -		}
  16.667 -
  16.668 -		/*
  16.669 -		 * If a reference remains (rare), we would like to leave
  16.670 -		 * the page in the swap cache; but try_to_swap_out could
  16.671 -		 * then re-duplicate the entry once we drop page lock,
  16.672 -		 * so we might loop indefinitely; also, that page could
  16.673 -		 * not be swapped out to other storage meanwhile.  So:
  16.674 -		 * delete from cache even if there's another reference,
  16.675 -		 * after ensuring that the data has been saved to disk -
  16.676 -		 * since if the reference remains (rarer), it will be
  16.677 -		 * read from disk into another page.  Splitting into two
  16.678 -		 * pages would be incorrect if swap supported "shared
  16.679 -		 * private" pages, but they are handled by tmpfs files.
  16.680 -		 *
  16.681 -		 * Note shmem_unuse already deleted swappage from cache,
  16.682 -		 * unless corresponding filepage found already in cache:
  16.683 -		 * in which case it left swappage in cache, lowered its
  16.684 -		 * swap count to pass quickly through the loops above,
  16.685 -		 * and now we must reincrement count to try again later.
  16.686 -		 */
  16.687 -		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
  16.688 -			rw_swap_page(WRITE, page);
  16.689 -			lock_page(page);
  16.690 -		}
  16.691 -		if (PageSwapCache(page)) {
  16.692 -			if (shmem)
  16.693 -				swap_duplicate(entry);
  16.694 -			else
  16.695 -				delete_from_swap_cache(page);
  16.696 -		}
  16.697 -
  16.698 -		/*
  16.699 -		 * So we could skip searching mms once swap count went
  16.700 -		 * to 1, we did not mark any present ptes as dirty: must
  16.701 -		 * mark page dirty so try_to_swap_out will preserve it.
  16.702 -		 */
  16.703 -		SetPageDirty(page);
  16.704 -		UnlockPage(page);
  16.705 -		page_cache_release(page);
  16.706 -
  16.707 -		/*
  16.708 -		 * Make sure that we aren't completely killing
  16.709 -		 * interactive performance.  Interruptible check on
  16.710 -		 * signal_pending() would be nice, but changes the spec?
  16.711 -		 */
  16.712 -		if (current->need_resched)
  16.713 -			schedule();
  16.714 -	}
  16.715 -
  16.716 -	mmput(start_mm);
  16.717 -	if (reset_overflow) {
  16.718 -		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
  16.719 -		swap_overflow = 0;
  16.720 -	}
  16.721 -	return retval;
  16.722 -}
  16.723 -
  16.724 -asmlinkage long sys_swapoff(const char * specialfile)
  16.725 -{
  16.726 -	struct swap_info_struct * p = NULL;
  16.727 -	unsigned short *swap_map;
  16.728 -	struct nameidata nd;
  16.729 -	int i, type, prev;
  16.730 -	int err;
  16.731 -	
  16.732 -	if (!capable(CAP_SYS_ADMIN))
  16.733 -		return -EPERM;
  16.734 -
  16.735 -	err = user_path_walk(specialfile, &nd);
  16.736 -	if (err)
  16.737 -		goto out;
  16.738 -
  16.739 -	lock_kernel();
  16.740 -	prev = -1;
  16.741 -	swap_list_lock();
  16.742 -	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
  16.743 -		p = swap_info + type;
  16.744 -		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
  16.745 -			if (p->swap_file == nd.dentry)
  16.746 -			  break;
  16.747 -		}
  16.748 -		prev = type;
  16.749 -	}
  16.750 -	err = -EINVAL;
  16.751 -	if (type < 0) {
  16.752 -		swap_list_unlock();
  16.753 -		goto out_dput;
  16.754 -	}
  16.755 -
  16.756 -	if (prev < 0) {
  16.757 -		swap_list.head = p->next;
  16.758 -	} else {
  16.759 -		swap_info[prev].next = p->next;
  16.760 -	}
  16.761 -	if (type == swap_list.next) {
  16.762 -		/* just pick something that's safe... */
  16.763 -		swap_list.next = swap_list.head;
  16.764 -	}
  16.765 -	nr_swap_pages -= p->pages;
  16.766 -	total_swap_pages -= p->pages;
  16.767 -	p->flags = SWP_USED;
  16.768 -	swap_list_unlock();
  16.769 -	unlock_kernel();
  16.770 -	err = try_to_unuse(type);
  16.771 -	lock_kernel();
  16.772 -	if (err) {
  16.773 -		/* re-insert swap space back into swap_list */
  16.774 -		swap_list_lock();
  16.775 -		for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
  16.776 -			if (p->prio >= swap_info[i].prio)
  16.777 -				break;
  16.778 -		p->next = i;
  16.779 -		if (prev < 0)
  16.780 -			swap_list.head = swap_list.next = p - swap_info;
  16.781 -		else
  16.782 -			swap_info[prev].next = p - swap_info;
  16.783 -		nr_swap_pages += p->pages;
  16.784 -		total_swap_pages += p->pages;
  16.785 -		p->flags = SWP_WRITEOK;
  16.786 -		swap_list_unlock();
  16.787 -		goto out_dput;
  16.788 -	}
  16.789 -	if (p->swap_device)
  16.790 -		blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP);
  16.791 -	path_release(&nd);
  16.792 -
  16.793 -	swap_list_lock();
  16.794 -	swap_device_lock(p);
  16.795 -	nd.mnt = p->swap_vfsmnt;
  16.796 -	nd.dentry = p->swap_file;
  16.797 -	p->swap_vfsmnt = NULL;
  16.798 -	p->swap_file = NULL;
  16.799 -	p->swap_device = 0;
  16.800 -	p->max = 0;
  16.801 -	swap_map = p->swap_map;
  16.802 -	p->swap_map = NULL;
  16.803 -	p->flags = 0;
  16.804 -	swap_device_unlock(p);
  16.805 -	swap_list_unlock();
  16.806 -	vfree(swap_map);
  16.807 -	err = 0;
  16.808 -
  16.809 -out_dput:
  16.810 -	unlock_kernel();
  16.811 -	path_release(&nd);
  16.812 -out:
  16.813 -	return err;
  16.814 -}
  16.815 -
  16.816 -int get_swaparea_info(char *buf)
  16.817 -{
  16.818 -	char * page = (char *) __get_free_page(GFP_KERNEL);
  16.819 -	struct swap_info_struct *ptr = swap_info;
  16.820 -	int i, j, len = 0, usedswap;
  16.821 -
  16.822 -	if (!page)
  16.823 -		return -ENOMEM;
  16.824 -
  16.825 -	len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
  16.826 -	for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
  16.827 -		if ((ptr->flags & SWP_USED) && ptr->swap_map) {
  16.828 -			char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt,
  16.829 -						page, PAGE_SIZE);
  16.830 -
  16.831 -			len += sprintf(buf + len, "%-31s ", path);
  16.832 -
  16.833 -			if (!ptr->swap_device)
  16.834 -				len += sprintf(buf + len, "file\t\t");
  16.835 -			else
  16.836 -				len += sprintf(buf + len, "partition\t");
  16.837 -
  16.838 -			usedswap = 0;
  16.839 -			for (j = 0; j < ptr->max; ++j)
  16.840 -				switch (ptr->swap_map[j]) {
  16.841 -					case SWAP_MAP_BAD:
  16.842 -					case 0:
  16.843 -						continue;
  16.844 -					default:
  16.845 -						usedswap++;
  16.846 -				}
  16.847 -			len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), 
  16.848 -				usedswap << (PAGE_SHIFT - 10), ptr->prio);
  16.849 -		}
  16.850 -	}
  16.851 -	free_page((unsigned long) page);
  16.852 -	return len;
  16.853 -}
  16.854 -
  16.855 -int is_swap_partition(kdev_t dev) {
  16.856 -	struct swap_info_struct *ptr = swap_info;
  16.857 -	int i;
  16.858 -
  16.859 -	for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
  16.860 -		if (ptr->flags & SWP_USED)
  16.861 -			if (ptr->swap_device == dev)
  16.862 -				return 1;
  16.863 -	}
  16.864 -	return 0;
  16.865 -}
  16.866 -
  16.867 -/*
  16.868 - * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
  16.869 - *
  16.870 - * The swapon system call
  16.871 - */
  16.872 -asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
  16.873 -{
  16.874 -	struct swap_info_struct * p;
  16.875 -	struct nameidata nd;
  16.876 -	struct inode * swap_inode;
  16.877 -	unsigned int type;
  16.878 -	int i, j, prev;
  16.879 -	int error;
  16.880 -	static int least_priority = 0;
  16.881 -	union swap_header *swap_header = 0;
  16.882 -	int swap_header_version;
  16.883 -	int nr_good_pages = 0;
  16.884 -	unsigned long maxpages = 1;
  16.885 -	int swapfilesize;
  16.886 -	struct block_device *bdev = NULL;
  16.887 -	unsigned short *swap_map;
  16.888 -	
  16.889 -	if (!capable(CAP_SYS_ADMIN))
  16.890 -		return -EPERM;
  16.891 -	lock_kernel();
  16.892 -	swap_list_lock();
  16.893 -	p = swap_info;
  16.894 -	for (type = 0 ; type < nr_swapfiles ; type++,p++)
  16.895 -		if (!(p->flags & SWP_USED))
  16.896 -			break;
  16.897 -	error = -EPERM;
  16.898 -	if (type >= MAX_SWAPFILES) {
  16.899 -		swap_list_unlock();
  16.900 -		goto out;
  16.901 -	}
  16.902 -	if (type >= nr_swapfiles)
  16.903 -		nr_swapfiles = type+1;
  16.904 -	p->flags = SWP_USED;
  16.905 -	p->swap_file = NULL;
  16.906 -	p->swap_vfsmnt = NULL;
  16.907 -	p->swap_device = 0;
  16.908 -	p->swap_map = NULL;
  16.909 -	p->lowest_bit = 0;
  16.910 -	p->highest_bit = 0;
  16.911 -	p->cluster_nr = 0;
  16.912 -	p->sdev_lock = SPIN_LOCK_UNLOCKED;
  16.913 -	p->next = -1;
  16.914 -	if (swap_flags & SWAP_FLAG_PREFER) {
  16.915 -		p->prio =
  16.916 -		  (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
  16.917 -	} else {
  16.918 -		p->prio = --least_priority;
  16.919 -	}
  16.920 -	swap_list_unlock();
  16.921 -	error = user_path_walk(specialfile, &nd);
  16.922 -	if (error)
  16.923 -		goto bad_swap_2;
  16.924 -
  16.925 -	p->swap_file = nd.dentry;
  16.926 -	p->swap_vfsmnt = nd.mnt;
  16.927 -	swap_inode = nd.dentry->d_inode;
  16.928 -	error = -EINVAL;
  16.929 -
  16.930 -	if (S_ISBLK(swap_inode->i_mode)) {
  16.931 -		kdev_t dev = swap_inode->i_rdev;
  16.932 -		struct block_device_operations *bdops;
  16.933 -		devfs_handle_t de;
  16.934 -
  16.935 -		if (is_mounted(dev)) {
  16.936 -			error = -EBUSY;
  16.937 -			goto bad_swap_2;
  16.938 -		}
  16.939 -
  16.940 -		p->swap_device = dev;
  16.941 -		set_blocksize(dev, PAGE_SIZE);
  16.942 -		
  16.943 -		bd_acquire(swap_inode);
  16.944 -		bdev = swap_inode->i_bdev;
  16.945 -		de = devfs_get_handle_from_inode(swap_inode);
  16.946 -		bdops = devfs_get_ops(de);  /*  Increments module use count  */
  16.947 -		if (bdops) bdev->bd_op = bdops;
  16.948 -
  16.949 -		error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP);
  16.950 -		devfs_put_ops(de);/*Decrement module use count now we're safe*/
  16.951 -		if (error)
  16.952 -			goto bad_swap_2;
  16.953 -		set_blocksize(dev, PAGE_SIZE);
  16.954 -		error = -ENODEV;
  16.955 -		if (!dev || (blk_size[MAJOR(dev)] &&
  16.956 -		     !blk_size[MAJOR(dev)][MINOR(dev)]))
  16.957 -			goto bad_swap;
  16.958 -		swapfilesize = 0;
  16.959 -		if (blk_size[MAJOR(dev)])
  16.960 -			swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
  16.961 -				>> (PAGE_SHIFT - 10);
  16.962 -	} else if (S_ISREG(swap_inode->i_mode))
  16.963 -		swapfilesize = swap_inode->i_size >> PAGE_SHIFT;
  16.964 -	else
  16.965 -		goto bad_swap;
  16.966 -
  16.967 -	error = -EBUSY;
  16.968 -	for (i = 0 ; i < nr_swapfiles ; i++) {
  16.969 -		struct swap_info_struct *q = &swap_info[i];
  16.970 -		if (i == type || !q->swap_file)
  16.971 -			continue;
  16.972 -		if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping)
  16.973 -			goto bad_swap;
  16.974 -	}
  16.975 -
  16.976 -	swap_header = (void *) __get_free_page(GFP_USER);
  16.977 -	if (!swap_header) {
  16.978 -		printk("Unable to start swapping: out of memory :-)\n");
  16.979 -		error = -ENOMEM;
  16.980 -		goto bad_swap;
  16.981 -	}
  16.982 -
  16.983 -	lock_page(virt_to_page(swap_header));
  16.984 -	rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header);
  16.985 -
  16.986 -	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
  16.987 -		swap_header_version = 1;
  16.988 -	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
  16.989 -		swap_header_version = 2;
  16.990 -	else {
  16.991 -		printk("Unable to find swap-space signature\n");
  16.992 -		error = -EINVAL;
  16.993 -		goto bad_swap;
  16.994 -	}
  16.995 -	
  16.996 -	switch (swap_header_version) {
  16.997 -	case 1:
  16.998 -		memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
  16.999 -		j = 0;
 16.1000 -		p->lowest_bit = 0;
 16.1001 -		p->highest_bit = 0;
 16.1002 -		for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
 16.1003 -			if (test_bit(i,(char *) swap_header)) {
 16.1004 -				if (!p->lowest_bit)
 16.1005 -					p->lowest_bit = i;
 16.1006 -				p->highest_bit = i;
 16.1007 -				maxpages = i+1;
 16.1008 -				j++;
 16.1009 -			}
 16.1010 -		}
 16.1011 -		nr_good_pages = j;
 16.1012 -		p->swap_map = vmalloc(maxpages * sizeof(short));
 16.1013 -		if (!p->swap_map) {
 16.1014 -			error = -ENOMEM;		
 16.1015 -			goto bad_swap;
 16.1016 -		}
 16.1017 -		for (i = 1 ; i < maxpages ; i++) {
 16.1018 -			if (test_bit(i,(char *) swap_header))
 16.1019 -				p->swap_map[i] = 0;
 16.1020 -			else
 16.1021 -				p->swap_map[i] = SWAP_MAP_BAD;
 16.1022 -		}
 16.1023 -		break;
 16.1024 -
 16.1025 -	case 2:
 16.1026 -		/* Check the swap header's sub-version and the size of
 16.1027 -                   the swap file and bad block lists */
 16.1028 -		if (swap_header->info.version != 1) {
 16.1029 -			printk(KERN_WARNING
 16.1030 -			       "Unable to handle swap header version %d\n",
 16.1031 -			       swap_header->info.version);
 16.1032 -			error = -EINVAL;
 16.1033 -			goto bad_swap;
 16.1034 -		}
 16.1035 -
 16.1036 -		p->lowest_bit  = 1;
 16.1037 -		maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1;
 16.1038 -		if (maxpages > swap_header->info.last_page)
 16.1039 -			maxpages = swap_header->info.last_page;
 16.1040 -		p->highest_bit = maxpages - 1;
 16.1041 -
 16.1042 -		error = -EINVAL;
 16.1043 -		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
 16.1044 -			goto bad_swap;
 16.1045 -		
 16.1046 -		/* OK, set up the swap map and apply the bad block list */
 16.1047 -		if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
 16.1048 -			error = -ENOMEM;
 16.1049 -			goto bad_swap;
 16.1050 -		}
 16.1051 -
 16.1052 -		error = 0;
 16.1053 -		memset(p->swap_map, 0, maxpages * sizeof(short));
 16.1054 -		for (i=0; i<swap_header->info.nr_badpages; i++) {
 16.1055 -			int page = swap_header->info.badpages[i];
 16.1056 -			if (page <= 0 || page >= swap_header->info.last_page)
 16.1057 -				error = -EINVAL;
 16.1058 -			else
 16.1059 -				p->swap_map[page] = SWAP_MAP_BAD;
 16.1060 -		}
 16.1061 -		nr_good_pages = swap_header->info.last_page -
 16.1062 -				swap_header->info.nr_badpages -
 16.1063 -				1 /* header page */;
 16.1064 -		if (error) 
 16.1065 -			goto bad_swap;
 16.1066 -	}
 16.1067 -	
 16.1068 -	if (swapfilesize && maxpages > swapfilesize) {
 16.1069 -		printk(KERN_WARNING
 16.1070 -		       "Swap area shorter than signature indicates\n");
 16.1071 -		error = -EINVAL;
 16.1072 -		goto bad_swap;
 16.1073 -	}
 16.1074 -	if (!nr_good_pages) {
 16.1075 -		printk(KERN_WARNING "Empty swap-file\n");
 16.1076 -		error = -EINVAL;
 16.1077 -		goto bad_swap;
 16.1078 -	}
 16.1079 -	p->swap_map[0] = SWAP_MAP_BAD;
 16.1080 -	swap_list_lock();
 16.1081 -	swap_device_lock(p);
 16.1082 -	p->max = maxpages;
 16.1083 -	p->flags = SWP_WRITEOK;
 16.1084 -	p->pages = nr_good_pages;
 16.1085 -	nr_swap_pages += nr_good_pages;
 16.1086 -	total_swap_pages += nr_good_pages;
 16.1087 -	printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
 16.1088 -	       nr_good_pages<<(PAGE_SHIFT-10), p->prio);
 16.1089 -
 16.1090 -	/* insert swap space into swap_list: */
 16.1091 -	prev = -1;
 16.1092 -	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
 16.1093 -		if (p->prio >= swap_info[i].prio) {
 16.1094 -			break;
 16.1095 -		}
 16.1096 -		prev = i;
 16.1097 -	}
 16.1098 -	p->next = i;
 16.1099 -	if (prev < 0) {
 16.1100 -		swap_list.head = swap_list.next = p - swap_info;
 16.1101 -	} else {
 16.1102 -		swap_info[prev].next = p - swap_info;
 16.1103 -	}
 16.1104 -	swap_device_unlock(p);
 16.1105 -	swap_list_unlock();
 16.1106 -	error = 0;
 16.1107 -	goto out;
 16.1108 -bad_swap:
 16.1109 -	if (bdev)
 16.1110 -		blkdev_put(bdev, BDEV_SWAP);
 16.1111 -bad_swap_2:
 16.1112 -	swap_list_lock();
 16.1113 -	swap_map = p->swap_map;
 16.1114 -	nd.mnt = p->swap_vfsmnt;
 16.1115 -	nd.dentry = p->swap_file;
 16.1116 -	p->swap_device = 0;
 16.1117 -	p->swap_file = NULL;
 16.1118 -	p->swap_vfsmnt = NULL;
 16.1119 -	p->swap_map = NULL;
 16.1120 -	p->flags = 0;
 16.1121 -	if (!(swap_flags & SWAP_FLAG_PREFER))
 16.1122 -		++least_priority;
 16.1123 -	swap_list_unlock();
 16.1124 -	if (swap_map)
 16.1125 -		vfree(swap_map);
 16.1126 -	path_release(&nd);
 16.1127 -out:
 16.1128 -	if (swap_header)
 16.1129 -		free_page((long) swap_header);
 16.1130 -	unlock_kernel();
 16.1131 -	return error;
 16.1132 -}
 16.1133 -
 16.1134 -void si_swapinfo(struct sysinfo *val)
 16.1135 -{
 16.1136 -	unsigned int i;
 16.1137 -	unsigned long nr_to_be_unused = 0;
 16.1138 -
 16.1139 -	swap_list_lock();
 16.1140 -	for (i = 0; i < nr_swapfiles; i++) {
 16.1141 -		unsigned int j;
 16.1142 -		if (swap_info[i].flags != SWP_USED)
 16.1143 -			continue;
 16.1144 -		for (j = 0; j < swap_info[i].max; ++j) {
 16.1145 -			switch (swap_info[i].swap_map[j]) {
 16.1146 -				case 0:
 16.1147 -				case SWAP_MAP_BAD:
 16.1148 -					continue;
 16.1149 -				default:
 16.1150 -					nr_to_be_unused++;
 16.1151 -			}
 16.1152 -		}
 16.1153 -	}
 16.1154 -	val->freeswap = nr_swap_pages + nr_to_be_unused;
 16.1155 -	val->totalswap = total_swap_pages + nr_to_be_unused;
 16.1156 -	swap_list_unlock();
 16.1157 -}
 16.1158 -
 16.1159 -/*
 16.1160 - * Verify that a swap entry is valid and increment its swap map count.
 16.1161 - *
 16.1162 - * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
 16.1163 - * "permanent", but will be reclaimed by the next swapoff.
 16.1164 - */
 16.1165 -int swap_duplicate(swp_entry_t entry)
 16.1166 -{
 16.1167 -	struct swap_info_struct * p;
 16.1168 -	unsigned long offset, type;
 16.1169 -	int result = 0;
 16.1170 -
 16.1171 -	type = SWP_TYPE(entry);
 16.1172 -	if (type >= nr_swapfiles)
 16.1173 -		goto bad_file;
 16.1174 -	p = type + swap_info;
 16.1175 -	offset = SWP_OFFSET(entry);
 16.1176 -
 16.1177 -	swap_device_lock(p);
 16.1178 -	if (offset < p->max && p->swap_map[offset]) {
 16.1179 -		if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
 16.1180 -			p->swap_map[offset]++;
 16.1181 -			result = 1;
 16.1182 -		} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
 16.1183 -			if (swap_overflow++ < 5)
 16.1184 -				printk(KERN_WARNING "swap_dup: swap entry overflow\n");
 16.1185 -			p->swap_map[offset] = SWAP_MAP_MAX;
 16.1186 -			result = 1;
 16.1187 -		}
 16.1188 -	}
 16.1189 -	swap_device_unlock(p);
 16.1190 -out:
 16.1191 -	return result;
 16.1192 -
 16.1193 -bad_file:
 16.1194 -	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
 16.1195 -	goto out;
 16.1196 -}
 16.1197 -
 16.1198 -/*
 16.1199 - * Prior swap_duplicate protects against swap device deletion.
 16.1200 - */
 16.1201 -void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, 
 16.1202 -			kdev_t *dev, struct inode **swapf)
 16.1203 -{
 16.1204 -	unsigned long type;
 16.1205 -	struct swap_info_struct *p;
 16.1206 -
 16.1207 -	type = SWP_TYPE(entry);
 16.1208 -	if (type >= nr_swapfiles) {
 16.1209 -		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val);
 16.1210 -		return;
 16.1211 -	}
 16.1212 -
 16.1213 -	p = &swap_info[type];
 16.1214 -	*offset = SWP_OFFSET(entry);
 16.1215 -	if (*offset >= p->max && *offset != 0) {
 16.1216 -		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val);
 16.1217 -		return;
 16.1218 -	}
 16.1219 -	if (p->swap_map && !p->swap_map[*offset]) {
 16.1220 -		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val);
 16.1221 -		return;
 16.1222 -	}
 16.1223 -	if (!(p->flags & SWP_USED)) {
 16.1224 -		printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val);
 16.1225 -		return;
 16.1226 -	}
 16.1227 -
 16.1228 -	if (p->swap_device) {
 16.1229 -		*dev = p->swap_device;
 16.1230 -	} else if (p->swap_file) {
 16.1231 -		*swapf = p->swap_file->d_inode;
 16.1232 -	} else {
 16.1233 -		printk(KERN_ERR "rw_swap_page: no swap file or device\n");
 16.1234 -	}
 16.1235 -	return;
 16.1236 -}
 16.1237 -
 16.1238 -/*
 16.1239 - * swap_device_lock prevents swap_map being freed. Don't grab an extra
 16.1240 - * reference on the swaphandle, it doesn't matter if it becomes unused.
 16.1241 - */
 16.1242 -int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 16.1243 -{
 16.1244 -	int ret = 0, i = 1 << page_cluster;
 16.1245 -	unsigned long toff;
 16.1246 -	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
 16.1247 -
 16.1248 -	if (!page_cluster)	/* no readahead */
 16.1249 -		return 0;
 16.1250 -	toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster;
 16.1251 -	if (!toff)		/* first page is swap header */
 16.1252 -		toff++, i--;
 16.1253 -	*offset = toff;
 16.1254 -
 16.1255 -	swap_device_lock(swapdev);
 16.1256 -	do {
 16.1257 -		/* Don't read-ahead past the end of the swap area */
 16.1258 -		if (toff >= swapdev->max)
 16.1259 -			break;
 16.1260 -		/* Don't read in free or bad pages */
 16.1261 -		if (!swapdev->swap_map[toff])
 16.1262 -			break;
 16.1263 -		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
 16.1264 -			break;
 16.1265 -		toff++;
 16.1266 -		ret++;
 16.1267 -	} while (--i);
 16.1268 -	swap_device_unlock(swapdev);
 16.1269 -	return ret;
 16.1270 -}
    17.1 --- a/linux-2.4.29-xen-sparse/mm/vmalloc.c	Sat Mar 26 05:05:07 2005 +0000
    17.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.3 @@ -1,385 +0,0 @@
    17.4 -/*
    17.5 - *  linux/mm/vmalloc.c
    17.6 - *
    17.7 - *  Copyright (C) 1993  Linus Torvalds
    17.8 - *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
    17.9 - *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
   17.10 - */
   17.11 -
   17.12 -#include <linux/config.h>
   17.13 -#include <linux/slab.h>
   17.14 -#include <linux/vmalloc.h>
   17.15 -#include <linux/spinlock.h>
   17.16 -#include <linux/highmem.h>
   17.17 -#include <linux/smp_lock.h>
   17.18 -
   17.19 -#include <asm/uaccess.h>
   17.20 -#include <asm/pgalloc.h>
   17.21 -
   17.22 -rwlock_t vmlist_lock = RW_LOCK_UNLOCKED;
   17.23 -struct vm_struct * vmlist;
   17.24 -
   17.25 -static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size)
   17.26 -{
   17.27 -	pte_t * pte;
   17.28 -	unsigned long end;
   17.29 -
   17.30 -	if (pmd_none(*pmd))
   17.31 -		return;
   17.32 -	if (pmd_bad(*pmd)) {
   17.33 -		pmd_ERROR(*pmd);
   17.34 -		pmd_clear(pmd);
   17.35 -		return;
   17.36 -	}
   17.37 -	pte = pte_offset(pmd, address);
   17.38 -	address &= ~PMD_MASK;
   17.39 -	end = address + size;
   17.40 -	if (end > PMD_SIZE)
   17.41 -		end = PMD_SIZE;
   17.42 -	do {
   17.43 -		pte_t page;
   17.44 -		page = ptep_get_and_clear(pte);
   17.45 -		address += PAGE_SIZE;
   17.46 -		pte++;
   17.47 -		if (pte_none(page))
   17.48 -			continue;
   17.49 -		if (pte_present(page)) {
   17.50 -			struct page *ptpage = pte_page(page);
   17.51 -			if (VALID_PAGE(ptpage) && (!PageReserved(ptpage)))
   17.52 -				__free_page(ptpage);
   17.53 -			continue;
   17.54 -		}
   17.55 -		printk(KERN_CRIT "Whee.. Swapped out page in kernel page table\n");
   17.56 -	} while (address < end);
   17.57 -}
   17.58 -
   17.59 -static inline void free_area_pmd(pgd_t * dir, unsigned long address, unsigned long size)
   17.60 -{
   17.61 -	pmd_t * pmd;
   17.62 -	unsigned long end;
   17.63 -
   17.64 -	if (pgd_none(*dir))
   17.65 -		return;
   17.66 -	if (pgd_bad(*dir)) {
   17.67 -		pgd_ERROR(*dir);
   17.68 -		pgd_clear(dir);
   17.69 -		return;
   17.70 -	}
   17.71 -	pmd = pmd_offset(dir, address);
   17.72 -	address &= ~PGDIR_MASK;
   17.73 -	end = address + size;
   17.74 -	if (end > PGDIR_SIZE)
   17.75 -		end = PGDIR_SIZE;
   17.76 -	do {
   17.77 -		free_area_pte(pmd, address, end - address);
   17.78 -		address = (address + PMD_SIZE) & PMD_MASK;
   17.79 -		pmd++;
   17.80 -	} while (address < end);
   17.81 -}
   17.82 -
   17.83 -void vmfree_area_pages(unsigned long address, unsigned long size)
   17.84 -{
   17.85 -	pgd_t * dir;
   17.86 -	unsigned long end = address + size;
   17.87 -
   17.88 -	dir = pgd_offset_k(address);
   17.89 -	flush_cache_all();
   17.90 -	do {
   17.91 -		free_area_pmd(dir, address, end - address);
   17.92 -		address = (address + PGDIR_SIZE) & PGDIR_MASK;
   17.93 -		dir++;
   17.94 -	} while (address && (address < end));
   17.95 -	flush_tlb_all();
   17.96 -}
   17.97 -
   17.98 -static inline int alloc_area_pte (pte_t * pte, unsigned long address,
   17.99 -			unsigned long size, int gfp_mask,
  17.100 -			pgprot_t prot, struct page ***pages)
  17.101 -{
  17.102 -	unsigned long end;
  17.103 -
  17.104 -	address &= ~PMD_MASK;
  17.105 -	end = address + size;
  17.106 -	if (end > PMD_SIZE)
  17.107 -		end = PMD_SIZE;
  17.108 -	do {
  17.109 -		struct page * page;
  17.110 -
  17.111 -		if (!pages) {
  17.112 -			spin_unlock(&init_mm.page_table_lock);
  17.113 -			page = alloc_page(gfp_mask);
  17.114 -			spin_lock(&init_mm.page_table_lock);
  17.115 -		} else {
  17.116 -			page = (**pages);
  17.117 -			(*pages)++;
  17.118 -
  17.119 -			/* Add a reference to the page so we can free later */
  17.120 -			if (page)
  17.121 -				atomic_inc(&page->count);
  17.122 -
  17.123 -		}
  17.124 -		if (!pte_none(*pte))
  17.125 -			printk(KERN_ERR "alloc_area_pte: page already exists\n");
  17.126 -		if (!page)
  17.127 -			return -ENOMEM;
  17.128 -		set_pte(pte, mk_pte(page, prot));
  17.129 -		address += PAGE_SIZE;
  17.130 -		pte++;
  17.131 -	} while (address < end);
  17.132 -	return 0;
  17.133 -}
  17.134 -
  17.135 -static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address,
  17.136 -			unsigned long size, int gfp_mask,
  17.137 -			pgprot_t prot, struct page ***pages)
  17.138 -{
  17.139 -	unsigned long end;
  17.140 -
  17.141 -	address &= ~PGDIR_MASK;
  17.142 -	end = address + size;
  17.143 -	if (end > PGDIR_SIZE)
  17.144 -		end = PGDIR_SIZE;
  17.145 -	do {
  17.146 -		pte_t * pte = pte_alloc(&init_mm, pmd, address);
  17.147 -		if (!pte)
  17.148 -			return -ENOMEM;
  17.149 -		if (alloc_area_pte(pte, address, end - address,
  17.150 -					gfp_mask, prot, pages))
  17.151 -			return -ENOMEM;
  17.152 -		address = (address + PMD_SIZE) & PMD_MASK;
  17.153 -		pmd++;
  17.154 -	} while (address < end);
  17.155 -	return 0;
  17.156 -}
  17.157 -
  17.158 -/*static inline*/ int __vmalloc_area_pages (unsigned long address,
  17.159 -					unsigned long size,
  17.160 -					int gfp_mask,
  17.161 -					pgprot_t prot,
  17.162 -					struct page ***pages)
  17.163 -{
  17.164 -	pgd_t * dir;
  17.165 -	unsigned long start = address;
  17.166 -	unsigned long end = address + size;
  17.167 -
  17.168 -	dir = pgd_offset_k(address);
  17.169 -	spin_lock(&init_mm.page_table_lock);
  17.170 -	do {
  17.171 -		pmd_t *pmd;
  17.172 -		
  17.173 -		pmd = pmd_alloc(&init_mm, dir, address);
  17.174 -		if (!pmd)
  17.175 -			goto err;
  17.176 -
  17.177 -		if (alloc_area_pmd(pmd, address, end - address, gfp_mask, prot, pages))
  17.178 -			goto err;	// The kernel NEVER reclaims pmds, so no need to undo pmd_alloc() here
  17.179 -
  17.180 -		address = (address + PGDIR_SIZE) & PGDIR_MASK;
  17.181 -		dir++;
  17.182 -	} while (address && (address < end));
  17.183 -	spin_unlock(&init_mm.page_table_lock);
  17.184 -	flush_cache_all();
  17.185 -	XEN_flush_page_update_queue();
  17.186 -	return 0;
  17.187 -err:
  17.188 -	spin_unlock(&init_mm.page_table_lock);
  17.189 -	flush_cache_all();
  17.190 -	if (address > start)
  17.191 -		vmfree_area_pages(start, address - start);
  17.192 -	return -ENOMEM;
  17.193 -}
  17.194 -
  17.195 -int vmalloc_area_pages(unsigned long address, unsigned long size,
  17.196 -		       int gfp_mask, pgprot_t prot)
  17.197 -{
  17.198 -	return __vmalloc_area_pages(address, size, gfp_mask, prot, NULL);
  17.199 -}
  17.200 -
  17.201 -struct vm_struct * get_vm_area(unsigned long size, unsigned long flags)
  17.202 -{
  17.203 -	unsigned long addr, next;
  17.204 -	struct vm_struct **p, *tmp, *area;
  17.205 -
  17.206 -	area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
  17.207 -	if (!area)
  17.208 -		return NULL;
  17.209 -
  17.210 -	size += PAGE_SIZE;
  17.211 -	if (!size) {
  17.212 -		kfree (area);
  17.213 -		return NULL;
  17.214 -	}
  17.215 -
  17.216 -	addr = VMALLOC_START;
  17.217 -	write_lock(&vmlist_lock);
  17.218 -	for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
  17.219 -		if ((size + addr) < addr)
  17.220 -			goto out;
  17.221 -		if (size + addr <= (unsigned long) tmp->addr)
  17.222 -			break;
  17.223 -		next = tmp->size + (unsigned long) tmp->addr;
  17.224 -		if (next > addr) 
  17.225 -			addr = next;
  17.226 -		if (addr > VMALLOC_END-size)
  17.227 -			goto out;
  17.228 -	}
  17.229 -	area->flags = flags;
  17.230 -	area->addr = (void *)addr;
  17.231 -	area->size = size;
  17.232 -	area->next = *p;
  17.233 -	*p = area;
  17.234 -	write_unlock(&vmlist_lock);
  17.235 -	return area;
  17.236 -
  17.237 -out:
  17.238 -	write_unlock(&vmlist_lock);
  17.239 -	kfree(area);
  17.240 -	return NULL;
  17.241 -}
  17.242 -
  17.243 -void __vfree(void * addr, int free_area_pages)
  17.244 -{
  17.245 -	struct vm_struct **p, *tmp;
  17.246 -
  17.247 -	if (!addr)
  17.248 -		return;
  17.249 -	if ((PAGE_SIZE-1) & (unsigned long) addr) {
  17.250 -		printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
  17.251 -		return;
  17.252 -	}
  17.253 -	write_lock(&vmlist_lock);
  17.254 -	for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
  17.255 -		if (tmp->addr == addr) {
  17.256 -			*p = tmp->next;
  17.257 -			if (free_area_pages)
  17.258 -				vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size);
  17.259 -			write_unlock(&vmlist_lock);
  17.260 -			kfree(tmp);
  17.261 -			return;
  17.262 -		}
  17.263 -	}
  17.264 -	write_unlock(&vmlist_lock);
  17.265 -	printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr);
  17.266 -}
  17.267 -
  17.268 -void vfree(void * addr)
  17.269 -{
  17.270 -	__vfree(addr,1);
  17.271 -}
  17.272 -
  17.273 -void * __vmalloc (unsigned long size, int gfp_mask, pgprot_t prot)
  17.274 -{
  17.275 -	void * addr;
  17.276 -	struct vm_struct *area;
  17.277 -
  17.278 -	size = PAGE_ALIGN(size);
  17.279 -	if (!size || (size >> PAGE_SHIFT) > num_physpages)
  17.280 -		return NULL;
  17.281 -	area = get_vm_area(size, VM_ALLOC);
  17.282 -	if (!area)
  17.283 -		return NULL;
  17.284 -	addr = area->addr;
  17.285 -	if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, gfp_mask,
  17.286 -				 prot, NULL)) {
  17.287 -		__vfree(addr, 0);
  17.288 -		return NULL;
  17.289 -	}
  17.290 -	return addr;
  17.291 -}
  17.292 -
  17.293 -void * vmap(struct page **pages, int count,
  17.294 -	    unsigned long flags, pgprot_t prot)
  17.295 -{
  17.296 -	void * addr;
  17.297 -	struct vm_struct *area;
  17.298 -	unsigned long size = count << PAGE_SHIFT;
  17.299 -
  17.300 -	if (!size || size > (max_mapnr << PAGE_SHIFT))
  17.301 -		return NULL;
  17.302 -	area = get_vm_area(size, flags);
  17.303 -	if (!area) {
  17.304 -		return NULL;
  17.305 -	}
  17.306 -	addr = area->addr;
  17.307 -	if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, 0,
  17.308 -				 prot, &pages)) {
  17.309 -		__vfree(addr, 0);
  17.310 -		return NULL;
  17.311 -	}
  17.312 -	return addr;
  17.313 -}
  17.314 -
  17.315 -long vread(char *buf, char *addr, unsigned long count)
  17.316 -{
  17.317 -	struct vm_struct *tmp;
  17.318 -	char *vaddr, *buf_start = buf;
  17.319 -	unsigned long n;
  17.320 -
  17.321 -	/* Don't allow overflow */
  17.322 -	if ((unsigned long) addr + count < count)
  17.323 -		count = -(unsigned long) addr;
  17.324 -
  17.325 -	read_lock(&vmlist_lock);
  17.326 -	for (tmp = vmlist; tmp; tmp = tmp->next) {
  17.327 -		vaddr = (char *) tmp->addr;
  17.328 -		if (addr >= vaddr + tmp->size - PAGE_SIZE)
  17.329 -			continue;
  17.330 -		while (addr < vaddr) {
  17.331 -			if (count == 0)
  17.332 -				goto finished;
  17.333 -			*buf = '\0';
  17.334 -			buf++;
  17.335 -			addr++;
  17.336 -			count--;
  17.337 -		}
  17.338 -		n = vaddr + tmp->size - PAGE_SIZE - addr;
  17.339 -		do {
  17.340 -			if (count == 0)
  17.341 -				goto finished;
  17.342 -			*buf = *addr;
  17.343 -			buf++;
  17.344 -			addr++;
  17.345 -			count--;
  17.346 -		} while (--n > 0);
  17.347 -	}
  17.348 -finished:
  17.349 -	read_unlock(&vmlist_lock);
  17.350 -	return buf - buf_start;
  17.351 -}
  17.352 -
  17.353 -long vwrite(char *buf, char *addr, unsigned long count)
  17.354 -{
  17.355 -	struct vm_struct *tmp;
  17.356 -	char *vaddr, *buf_start = buf;
  17.357 -	unsigned long n;
  17.358 -
  17.359 -	/* Don't allow overflow */
  17.360 -	if ((unsigned long) addr + count < count)
  17.361 -		count = -(unsigned long) addr;
  17.362 -
  17.363 -	read_lock(&vmlist_lock);
  17.364 -	for (tmp = vmlist; tmp; tmp = tmp->next) {
  17.365 -		vaddr = (char *) tmp->addr;
  17.366 -		if (addr >= vaddr + tmp->size - PAGE_SIZE)
  17.367 -			continue;
  17.368 -		while (addr < vaddr) {
  17.369 -			if (count == 0)
  17.370 -				goto finished;
  17.371 -			buf++;
  17.372 -			addr++;
  17.373 -			count--;
  17.374 -		}
  17.375 -		n = vaddr + tmp->size - PAGE_SIZE - addr;
  17.376 -		do {
  17.377 -			if (count == 0)
  17.378 -				goto finished;
  17.379 -			*addr = *buf;
  17.380 -			buf++;
  17.381 -			addr++;
  17.382 -			count--;
  17.383 -		} while (--n > 0);
  17.384 -	}
  17.385 -finished:
  17.386 -	read_unlock(&vmlist_lock);
  17.387 -	return buf - buf_start;
  17.388 -}
    18.1 --- a/linux-2.6.11-xen-sparse/arch/xen/Kconfig	Sat Mar 26 05:05:07 2005 +0000
    18.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/Kconfig	Fri Mar 25 22:52:21 2005 +0000
    18.3 @@ -114,10 +114,6 @@ config XEN_BLKDEV_TAP
    18.4  	  to a character device, allowing device prototyping in application
    18.5  	  space.  Odds are that you want to say N here.
    18.6  
    18.7 -config XEN_WRITABLE_PAGETABLES
    18.8 -	bool
    18.9 -	default y
   18.10 -
   18.11  config XEN_SCRUB_PAGES
   18.12  	bool "Scrub memory before freeing it to Xen"
   18.13  	default y
    19.1 --- a/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig	Sat Mar 26 05:05:07 2005 +0000
    19.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig	Fri Mar 25 22:52:21 2005 +0000
    19.3 @@ -19,7 +19,6 @@ CONFIG_XEN_BLKDEV_FRONTEND=y
    19.4  CONFIG_XEN_NETDEV_FRONTEND=y
    19.5  # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set
    19.6  # CONFIG_XEN_BLKDEV_TAP is not set
    19.7 -CONFIG_XEN_WRITABLE_PAGETABLES=y
    19.8  CONFIG_XEN_SCRUB_PAGES=y
    19.9  CONFIG_X86=y
   19.10  # CONFIG_X86_64 is not set
    20.1 --- a/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig	Sat Mar 26 05:05:07 2005 +0000
    20.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig	Fri Mar 25 22:52:21 2005 +0000
    20.3 @@ -16,7 +16,6 @@ CONFIG_XEN_BLKDEV_FRONTEND=y
    20.4  CONFIG_XEN_NETDEV_FRONTEND=y
    20.5  # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set
    20.6  # CONFIG_XEN_BLKDEV_TAP is not set
    20.7 -CONFIG_XEN_WRITABLE_PAGETABLES=y
    20.8  CONFIG_XEN_SCRUB_PAGES=y
    20.9  CONFIG_X86=y
   20.10  # CONFIG_X86_64 is not set
    21.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c	Sat Mar 26 05:05:07 2005 +0000
    21.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c	Fri Mar 25 22:52:21 2005 +0000
    21.3 @@ -963,7 +963,7 @@ void __init trap_init(void)
    21.4  	 * and a callgate to lcall27 for Solaris/x86 binaries
    21.5  	 */
    21.6  	make_lowmem_page_readonly(&default_ldt[0]);
    21.7 -	xen_flush_page_update_queue();
    21.8 +	flush_page_update_queue();
    21.9  
   21.10  	/*
   21.11  	 * Should be a barrier for any external CPU state.
    22.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c	Sat Mar 26 05:05:07 2005 +0000
    22.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c	Fri Mar 25 22:52:21 2005 +0000
    22.3 @@ -553,7 +553,6 @@ vmalloc_fault:
    22.4  		if (!pmd_present(*pmd_k))
    22.5  			goto no_context;
    22.6  		set_pmd(pmd, *pmd_k);
    22.7 -		xen_flush_page_update_queue(); /* flush PMD update */
    22.8  
    22.9  		pte_k = pte_offset_kernel(pmd_k, address);
   22.10  		if (!pte_present(*pte_k))
    23.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c	Sat Mar 26 05:05:07 2005 +0000
    23.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c	Fri Mar 25 22:52:21 2005 +0000
    23.3 @@ -48,19 +48,12 @@
    23.4   */
    23.5  static spinlock_t update_lock = SPIN_LOCK_UNLOCKED;
    23.6  
    23.7 -/* Linux 2.6 isn't using the traditional batched interface. */
    23.8 +#define QUEUE_SIZE 128
    23.9  #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
   23.10 -#define QUEUE_SIZE 2048
   23.11  #define pte_offset_kernel pte_offset
   23.12 -#define pmd_val_ma(v) (v).pmd;
   23.13  #define pud_t pgd_t
   23.14  #define pud_offset(d, va) d
   23.15  #else
   23.16 -#ifdef CONFIG_SMP
   23.17 -#define QUEUE_SIZE 1
   23.18 -#else
   23.19 -#define QUEUE_SIZE 128
   23.20 -#endif
   23.21  #define pmd_val_ma(v) (v).pud.pgd.pgd;
   23.22  #endif
   23.23  
    24.1 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c	Sat Mar 26 05:05:07 2005 +0000
    24.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c	Fri Mar 25 22:52:21 2005 +0000
    24.3 @@ -195,7 +195,7 @@ pte_t *pte_alloc_one_kernel(struct mm_st
    24.4  	pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
    24.5  	if (pte) {
    24.6  		make_page_readonly(pte);
    24.7 -		xen_flush_page_update_queue();
    24.8 +		flush_page_update_queue();
    24.9  	}
   24.10  	return pte;
   24.11  }
    25.1 --- a/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c	Sat Mar 26 05:05:07 2005 +0000
    25.2 +++ b/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c	Fri Mar 25 22:52:21 2005 +0000
    25.3 @@ -109,10 +109,8 @@ static void __do_suspend(void)
    25.4  
    25.5      HYPERVISOR_vm_assist(VMASST_CMD_enable,
    25.6  			 VMASST_TYPE_4gb_segments);
    25.7 -#ifdef CONFIG_XEN_WRITABLE_PAGETABLES
    25.8      HYPERVISOR_vm_assist(VMASST_CMD_enable,
    25.9  			 VMASST_TYPE_writable_pagetables);
   25.10 -#endif
   25.11  
   25.12      shutting_down = -1; 
   25.13  
    26.1 --- a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h	Sat Mar 26 05:05:07 2005 +0000
    26.2 +++ b/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h	Fri Mar 25 22:52:21 2005 +0000
    26.3 @@ -30,6 +30,8 @@
    26.4  
    26.5  /* -------[ debug / pretty printing ]--------------------------------- */
    26.6  
    26.7 +#define PRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
    26.8 +                           __FILE__ , __LINE__ , ## _a )
    26.9  #if 0
   26.10  #define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
   26.11                             __FILE__ , __LINE__ , ## _a )
    27.1 --- a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c	Sat Mar 26 05:05:07 2005 +0000
    27.2 +++ b/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c	Fri Mar 25 22:52:21 2005 +0000
    27.3 @@ -299,7 +299,7 @@ int blktap_write_fe_ring(blkif_request_t
    27.4      }
    27.5  
    27.6      if ( RING_FULL(&blktap_ufe_ring) ) {
    27.7 -        DPRINTK("blktap: fe_ring is full, can't add.\n");
    27.8 +        PRINTK("blktap: fe_ring is full, can't add.\n");
    27.9          return 0;
   27.10      }
   27.11  
   27.12 @@ -383,10 +383,9 @@ static int blktap_read_fe_ring(void)
   27.13              zap_page_range(blktap_vma, MMAP_VADDR(ID_TO_IDX(resp_s->id), 0), 
   27.14                      ar->nr_pages << PAGE_SHIFT, NULL);
   27.15              write_resp_to_fe_ring(blkif, resp_s);
   27.16 +            blktap_ufe_ring.rsp_cons = i + 1;
   27.17              kick_fe_domain(blkif);
   27.18          }
   27.19 -        
   27.20 -        blktap_ufe_ring.rsp_cons = i;
   27.21      }
   27.22      return 0;
   27.23  }
    28.1 --- a/linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c	Sat Mar 26 05:05:07 2005 +0000
    28.2 +++ b/linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c	Fri Mar 25 22:52:21 2005 +0000
    28.3 @@ -88,6 +88,8 @@ static int privcmd_ioctl(struct inode *i
    28.4          {
    28.5              int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)?
    28.6                  PRIVCMD_MMAP_SZ:(mmapcmd.num-i);
    28.7 +
    28.8 +
    28.9              if ( copy_from_user(&msg, p, n*sizeof(privcmd_mmap_entry_t)) )
   28.10                  return -EFAULT;
   28.11       
   28.12 @@ -96,6 +98,7 @@ static int privcmd_ioctl(struct inode *i
   28.13                  struct vm_area_struct *vma = 
   28.14                      find_vma( current->mm, msg[j].va );
   28.15  
   28.16 +
   28.17                  if ( !vma )
   28.18                      return -EINVAL;
   28.19  
   28.20 @@ -151,6 +154,7 @@ static int privcmd_ioctl(struct inode *i
   28.21          addr = m.addr;
   28.22          for ( i = 0; i < m.num; i++, addr += PAGE_SIZE, p++ )
   28.23          {
   28.24 +
   28.25              if ( get_user(mfn, p) )
   28.26                  return -EFAULT;
   28.27  
   28.28 @@ -166,10 +170,12 @@ static int privcmd_ioctl(struct inode *i
   28.29  
   28.30              v = w;
   28.31          }
   28.32 +
   28.33          ret = 0;
   28.34          break;
   28.35  
   28.36      batch_err:
   28.37 +        printk(KERN_ALERT "XXX SMH: ERROR IN MMAPBATCH\n"); 
   28.38          printk("batch_err ret=%d vma=%p addr=%lx num=%d arr=%p %lx-%lx\n", 
   28.39                 ret, vma, m.addr, m.num, m.arr, vma->vm_start, vma->vm_end);
   28.40          break;
   28.41 @@ -183,7 +189,7 @@ static int privcmd_ioctl(struct inode *i
   28.42          pgd_t *pgd = pgd_offset_k(m2pv);
   28.43          pud_t *pud = pud_offset(pgd, m2pv);
   28.44          pmd_t *pmd = pmd_offset(pud, m2pv);
   28.45 -        unsigned long m2p_start_mfn = pfn_to_mfn(pmd_val(*pmd) >> PAGE_SHIFT);
   28.46 +        unsigned long m2p_start_mfn = (*(unsigned long *)pmd) >> PAGE_SHIFT; 
   28.47          ret = put_user(m2p_start_mfn, (unsigned long *)data) ? -EFAULT: 0;
   28.48      }
   28.49      break;
    29.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h	Sat Mar 26 05:05:07 2005 +0000
    29.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h	Fri Mar 25 22:52:21 2005 +0000
    29.3 @@ -111,7 +111,7 @@ typedef struct { unsigned long pgprot; }
    29.4  static inline unsigned long pgd_val(pgd_t x)
    29.5  {
    29.6  	unsigned long ret = x.pgd;
    29.7 -	if (ret) ret = machine_to_phys(ret);
    29.8 +	if (ret) ret = machine_to_phys(ret) | 1;
    29.9  	return ret;
   29.10  }
   29.11  #define pgprot_val(x)	((x).pgprot)
    30.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Sat Mar 26 05:05:07 2005 +0000
    30.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Fri Mar 25 22:52:21 2005 +0000
    30.3 @@ -407,7 +407,6 @@ extern void noexec_setup(const char *str
    30.4  	do {								  \
    30.5  		if (__dirty) {						  \
    30.6  		        if ( likely((__vma)->vm_mm == current->mm) ) {    \
    30.7 -			    xen_flush_page_update_queue();                \
    30.8  			    HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG); \
    30.9  			} else {                                          \
   30.10                              xen_l1_entry_update((__ptep), (__entry).pte_low); \
   30.11 @@ -426,7 +425,6 @@ do {				  					\
   30.12  #define ptep_establish_new(__vma, __address, __ptep, __entry)		\
   30.13  do {				  					\
   30.14  	if (likely((__vma)->vm_mm == current->mm)) {			\
   30.15 -		xen_flush_page_update_queue();				\
   30.16  		HYPERVISOR_update_va_mapping((__address),		\
   30.17  					     __entry, 0);		\
   30.18  	} else {							\
    31.1 --- a/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h	Sat Mar 26 05:05:07 2005 +0000
    31.2 +++ b/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h	Fri Mar 25 22:52:21 2005 +0000
    31.3 @@ -117,8 +117,6 @@ void _flush_page_update_queue(void);
    31.4      if (per_cpu(mmu_update_queue_idx, smp_processor_id()))	\
    31.5  	_flush_page_update_queue();				\
    31.6  } while (0)
    31.7 -#define xen_flush_page_update_queue() (_flush_page_update_queue())
    31.8 -#define XEN_flush_page_update_queue() (_flush_page_update_queue())
    31.9  void MULTICALL_flush_page_update_queue(void);
   31.10  
   31.11  #ifdef CONFIG_XEN_PHYSDEV_ACCESS
    32.1 --- a/tools/blktap/Makefile	Sat Mar 26 05:05:07 2005 +0000
    32.2 +++ b/tools/blktap/Makefile	Fri Mar 25 22:52:21 2005 +0000
    32.3 @@ -58,7 +58,7 @@ OBJS     = $(patsubst %.c,%.o,$(SRCS))
    32.4  
    32.5  LIB      = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)
    32.6  
    32.7 -all: mk-symlinks blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd $(VDI_TOOLS) parallax 
    32.8 +all: mk-symlinks blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd $(VDI_TOOLS) parallax parallax-threaded blockstored
    32.9  	$(MAKE) $(LIB)
   32.10  
   32.11  LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse)
   32.12 @@ -120,42 +120,42 @@ blkaio: $(LIB) blkaio.c blkaiolib.c
   32.13  	$(CC) $(CFLAGS) -o blkaio -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap blkaio.c blkaiolib.c -laio -lpthread
   32.14  
   32.15  parallax: $(LIB) $(PLX_SRCS)
   32.16 -	$(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap $(PLX_SRCS) libgnbd/libgnbd.a
   32.17 +	$(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap -lpthread $(PLX_SRCS) libgnbd/libgnbd.a
   32.18  
   32.19  parallax-threaded: $(LIB) $(PLXT_SRCS)
   32.20  	$(CC) $(CFLAGS) -o parallax-threaded -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lpthread -lblktap $(PLXT_SRCS) libgnbd/libgnbd.a
   32.21  
   32.22  vdi_test: $(LIB) $(VDI_SRCS)
   32.23 -	$(CC) $(CFLAGS) -g3 -o vdi_test -DVDI_STANDALONE $(VDI_SRCS)
   32.24 +	$(CC) $(CFLAGS) -g3 -o vdi_test -DVDI_STANDALONE -lpthread $(VDI_SRCS)
   32.25  
   32.26  vdi_list: $(LIB) vdi_list.c $(VDI_SRCS)
   32.27 -	$(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c $(VDI_SRCS)
   32.28 +	$(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c -lpthread $(VDI_SRCS)
   32.29  
   32.30  vdi_create: $(LIB) vdi_create.c $(VDI_SRCS)
   32.31 -	$(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c $(VDI_SRCS)
   32.32 +	$(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c -lpthread $(VDI_SRCS)
   32.33  
   32.34  vdi_snap: $(LIB) vdi_snap.c $(VDI_SRCS)
   32.35 -	$(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c $(VDI_SRCS)
   32.36 +	$(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c -lpthread $(VDI_SRCS)
   32.37  
   32.38  vdi_snap_list: $(LIB) vdi_snap_list.c $(VDI_SRCS)
   32.39 -	$(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c $(VDI_SRCS)
   32.40 +	$(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c -lpthread $(VDI_SRCS)
   32.41  
   32.42  vdi_snap_delete: $(LIB) vdi_snap_delete.c $(VDI_SRCS)
   32.43 -	$(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c $(VDI_SRCS)
   32.44 +	$(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c -lpthread $(VDI_SRCS)
   32.45  
   32.46  vdi_tree: $(LIB) vdi_tree.c $(VDI_SRCS)
   32.47 -	$(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c $(VDI_SRCS)
   32.48 +	$(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c -lpthread $(VDI_SRCS)
   32.49  
   32.50  vdi_fill: $(LIB) vdi_fill.c $(VDI_SRCS)
   32.51 -	$(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c $(VDI_SRCS)
   32.52 +	$(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c -lpthread $(VDI_SRCS)
   32.53  
   32.54  vdi_validate: $(LIB) vdi_validate.c $(VDI_SRCS)
   32.55 -	$(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c $(VDI_SRCS)
   32.56 +	$(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c -lpthread $(VDI_SRCS)
   32.57  
   32.58  blockstored: blockstored.c
   32.59 -	$(CC) $(CFLAGS) -g3 -o blockstored blockstored.c
   32.60 +	$(CC) $(CFLAGS) -g3 -o blockstored -lpthread blockstored.c
   32.61  bstest: bstest.c blockstore.c
   32.62 -	$(CC) $(CFLAGS) -g3 -o bstest bstest.c blockstore.c
   32.63 +	$(CC) $(CFLAGS) -g3 -o bstest bstest.c -lpthread blockstore.c
   32.64  
   32.65  .PHONY: TAGS clean install mk-symlinks rpm
   32.66  TAGS:
    33.1 --- a/tools/blktap/blktaplib.c	Sat Mar 26 05:05:07 2005 +0000
    33.2 +++ b/tools/blktap/blktaplib.c	Fri Mar 25 22:52:21 2005 +0000
    33.3 @@ -248,12 +248,21 @@ static void apply_rsp_hooks(blkif_respon
    33.4      }
    33.5  }
    33.6  
    33.7 +static pthread_mutex_t push_mutex = PTHREAD_MUTEX_INITIALIZER;
    33.8 +
    33.9  void blktap_inject_response(blkif_response_t *rsp)
   33.10  {
   33.11 +    
   33.12      apply_rsp_hooks(rsp);
   33.13 +    
   33.14      write_rsp_to_fe_ring(rsp);
   33.15 +    
   33.16 +    pthread_mutex_lock(&push_mutex);
   33.17 +    
   33.18      RING_PUSH_RESPONSES(&fe_ring);
   33.19      ioctl(fd, BLKTAP_IOCTL_KICK_FE);
   33.20 +    
   33.21 +    pthread_mutex_unlock(&push_mutex);
   33.22  }
   33.23  
   33.24  /*-----[ Polling fd listeners ]------------------------------------------*/
   33.25 @@ -449,7 +458,9 @@ int blktap_listen(void)
   33.26              }
   33.27              /* Using this as a unidirectional ring. */
   33.28              ctrl_ring.req_cons = ctrl_ring.rsp_prod_pvt = i;
   33.29 +pthread_mutex_lock(&push_mutex);
   33.30              RING_PUSH_RESPONSES(&ctrl_ring);
   33.31 +pthread_mutex_unlock(&push_mutex);
   33.32              
   33.33              /* empty the fe_ring */
   33.34              notify_fe = 0;
   33.35 @@ -517,14 +528,18 @@ int blktap_listen(void)
   33.36  
   33.37              if (notify_be) {
   33.38                  DPRINTF("notifying be\n");
   33.39 +pthread_mutex_lock(&push_mutex);
   33.40                  RING_PUSH_REQUESTS(&be_ring);
   33.41                  ioctl(fd, BLKTAP_IOCTL_KICK_BE);
   33.42 +pthread_mutex_unlock(&push_mutex);
   33.43              }
   33.44  
   33.45              if (notify_fe) {
   33.46                  DPRINTF("notifying fe\n");
   33.47 +pthread_mutex_lock(&push_mutex);
   33.48                  RING_PUSH_RESPONSES(&fe_ring);
   33.49                  ioctl(fd, BLKTAP_IOCTL_KICK_FE);
   33.50 +pthread_mutex_unlock(&push_mutex);
   33.51              }
   33.52          }        
   33.53      }
    34.1 --- a/tools/blktap/blockstore.c	Sat Mar 26 05:05:07 2005 +0000
    34.2 +++ b/tools/blktap/blockstore.c	Fri Mar 25 22:52:21 2005 +0000
    34.3 @@ -13,13 +13,16 @@
    34.4  #include <string.h>
    34.5  #include <sys/types.h>
    34.6  #include <sys/stat.h>
    34.7 +#include <sys/time.h>
    34.8  #include <stdarg.h>
    34.9  #include "blockstore.h"
   34.10  #include <pthread.h>
   34.11  #include "parallax-threaded.h"
   34.12  
   34.13  #define BLOCKSTORE_REMOTE
   34.14 -#define BSDEBUG
   34.15 +//#define BSDEBUG
   34.16 +
   34.17 +#define RETRY_TIMEOUT 1000000 /* microseconds */
   34.18  
   34.19  /*****************************************************************************
   34.20   * Debugging
   34.21 @@ -63,6 +66,37 @@ struct sockaddr_in sin_local;
   34.22  int bssock = 0;
   34.23  
   34.24  /*****************************************************************************
   34.25 + * Notification                                                              *
   34.26 + *****************************************************************************/
   34.27 +
   34.28 +typedef struct pool_thread_t_struct {
   34.29 +    pthread_mutex_t ptmutex;
   34.30 +    pthread_cond_t ptcv;
   34.31 +    int newdata;
   34.32 +} pool_thread_t;
   34.33 +
   34.34 +pool_thread_t pool_thread[READ_POOL_SIZE+1];
   34.35 +
   34.36 +#define RECV_NOTIFY(tid) { \
   34.37 +    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
   34.38 +    pool_thread[tid].newdata = 1; \
   34.39 +    DB("CV Waking %u", tid); \
   34.40 +    pthread_cond_signal(&(pool_thread[tid].ptcv)); \
   34.41 +    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
   34.42 +#define RECV_AWAIT(tid) { \
   34.43 +    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
   34.44 +    if (pool_thread[tid].newdata) { \
   34.45 +        pool_thread[tid].newdata = 0; \
   34.46 +        DB("CV Woken %u", tid); \
   34.47 +    } \
   34.48 +    else { \
   34.49 +        DB("CV Waiting %u", tid); \
   34.50 +        pthread_cond_wait(&(pool_thread[tid].ptcv), \
   34.51 +                          &(pool_thread[tid].ptmutex)); \
   34.52 +    } \
   34.53 +    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
   34.54 +
   34.55 +/*****************************************************************************
   34.56   * Message queue management                                                  *
   34.57   *****************************************************************************/
   34.58  
   34.59 @@ -76,23 +110,6 @@ pthread_mutex_t ptmutex_recv;
   34.60  #define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv)
   34.61  #define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv)
   34.62  
   34.63 -int notify = 0;
   34.64 -pthread_mutex_t ptmutex_notify;
   34.65 -pthread_cond_t ptcv_notify;
   34.66 -#define RECV_NOTIFY { \
   34.67 -    pthread_mutex_lock(&ptmutex_notify); \
   34.68 -    notify = 1; \
   34.69 -    pthread_cond_signal(&ptcv_notify); \
   34.70 -    pthread_mutex_unlock(&ptmutex_notify); }
   34.71 -#define RECV_AWAIT { \
   34.72 -    pthread_mutex_lock(&ptmutex_notify); \
   34.73 -    if (notify) \
   34.74 -        notify = 0; \
   34.75 -    else \
   34.76 -        pthread_cond_wait(&ptcv_notify, &ptmutex_notify); \
   34.77 -    pthread_mutex_unlock(&ptmutex_notify); }
   34.78 -    
   34.79 -
   34.80  /* A message queue entry. We allocate one of these for every request we send.
   34.81   * Asynchronous reply reception also used one of these.
   34.82   */
   34.83 @@ -104,6 +121,8 @@ typedef struct bsq_t_struct {
   34.84      int length;
   34.85      struct msghdr msghdr;
   34.86      struct iovec iov[2];
   34.87 +    int tid;
   34.88 +    struct timeval tv_sent;
   34.89      bshdr_t message;
   34.90      void *block;
   34.91  } bsq_t;
   34.92 @@ -267,11 +286,13 @@ int send_message(bsq_t *qe) {
   34.93      qe->message.luid = new_luid();
   34.94  
   34.95      qe->status = 0;
   34.96 +    qe->tid = (int)pthread_getspecific(tid_key);
   34.97      if (enqueue(qe) < 0) {
   34.98          fprintf(stderr, "Error enqueuing request.\n");
   34.99          return -1;
  34.100      }
  34.101  
  34.102 +    gettimeofday(&(qe->tv_sent), NULL);
  34.103      DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
  34.104      rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
  34.105      //rc = sendto(bssock, (void *)&(qe->message), qe->length, 0,
  34.106 @@ -407,6 +428,7 @@ void recv_recycle_buffer(bsq_t *q) {
  34.107  int wait_recv(bsq_t **reqs, int numreqs) {
  34.108      bsq_t *q, *m;
  34.109      unsigned int x, i;
  34.110 +    int tid = (int)pthread_getspecific(tid_key);
  34.111  
  34.112      DB("ENTER wait_recv %u\n", numreqs);
  34.113  
  34.114 @@ -420,7 +442,7 @@ int wait_recv(bsq_t **reqs, int numreqs)
  34.115          return numreqs;
  34.116      }
  34.117  
  34.118 -    RECV_AWAIT;
  34.119 +    RECV_AWAIT(tid);
  34.120  
  34.121      /*
  34.122      rxagain:
  34.123 @@ -442,6 +464,52 @@ int wait_recv(bsq_t **reqs, int numreqs)
  34.124  
  34.125  }
  34.126  
  34.127 +/* retry
  34.128 + */
  34.129 +static int retry_count = 0;
  34.130 +int retry(bsq_t *qe)
  34.131 +{
  34.132 +    int rc;
  34.133 +    gettimeofday(&(qe->tv_sent), NULL);
  34.134 +    DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid);
  34.135 +    retry_count++;
  34.136 +    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
  34.137 +    if (rc < 0)
  34.138 +        return rc;
  34.139 +    return 0;
  34.140 +}
  34.141 +
  34.142 +/* queue runner
  34.143 + */
  34.144 +void *queue_runner(void *arg)
  34.145 +{
  34.146 +    for (;;) {
  34.147 +        struct timeval now;
  34.148 +        long long nowus, sus;
  34.149 +        bsq_t *q;
  34.150 +        int r;
  34.151 +
  34.152 +        sleep(1);
  34.153 +
  34.154 +        gettimeofday(&now, NULL);
  34.155 +        nowus = now.tv_usec + now.tv_sec * 1000000;
  34.156 +        ENTER_QUEUE_CR;
  34.157 +        r = retry_count;
  34.158 +        for (q = bs_head; q; q = q->next) {
  34.159 +            sus = q->tv_sent.tv_usec + q->tv_sent.tv_sec * 1000000;
  34.160 +            if ((nowus - sus) > RETRY_TIMEOUT) {
  34.161 +                if (retry(q) < 0) {
  34.162 +                    fprintf(stderr, "Error on sendmsg retry.\n");
  34.163 +                }
  34.164 +            }
  34.165 +        }
  34.166 +        if (r != retry_count) {
  34.167 +            fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count);
  34.168 +        }
  34.169 +        LEAVE_QUEUE_CR;
  34.170 +    }
  34.171 +}
  34.172 +
  34.173  /* receive loop
  34.174   */
  34.175  void *receive_loop(void *arg)
  34.176 @@ -461,7 +529,7 @@ void *receive_loop(void *arg)
  34.177              }
  34.178              else {
  34.179                  DB("RX MATCH");
  34.180 -                RECV_NOTIFY;
  34.181 +                RECV_NOTIFY(m->tid);
  34.182              }
  34.183          }
  34.184      }
  34.185 @@ -1146,8 +1214,12 @@ int __init_blockstore(void)
  34.186      pthread_mutex_init(&ptmutex_queue, NULL);
  34.187      pthread_mutex_init(&ptmutex_luid, NULL);
  34.188      pthread_mutex_init(&ptmutex_recv, NULL);
  34.189 -    pthread_mutex_init(&ptmutex_notify, NULL);
  34.190 -    pthread_cond_init(&ptcv_notify, NULL);
  34.191 +    /*pthread_mutex_init(&ptmutex_notify, NULL);*/
  34.192 +    for (i = 0; i <= READ_POOL_SIZE; i++) {
  34.193 +        pool_thread[i].newdata = 0;
  34.194 +        pthread_mutex_init(&(pool_thread[i].ptmutex), NULL);
  34.195 +        pthread_cond_init(&(pool_thread[i].ptcv), NULL);
  34.196 +    }
  34.197  
  34.198      bsservers[0].hostname = "firebug.cl.cam.ac.uk";
  34.199      bsservers[1].hostname = "planb.cl.cam.ac.uk";
  34.200 @@ -1225,6 +1297,7 @@ int __init_blockstore(void)
  34.201      }
  34.202  
  34.203      pthread_create(&pthread_recv, NULL, receive_loop, NULL);
  34.204 +    pthread_create(&pthread_recv, NULL, queue_runner, NULL);
  34.205  
  34.206  #else /* /BLOCKSTORE_REMOTE */
  34.207      block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
  34.208 @@ -1262,9 +1335,14 @@ int __init_blockstore(void)
  34.209  
  34.210  void __exit_blockstore(void)
  34.211  {
  34.212 +    int i;
  34.213      pthread_mutex_destroy(&ptmutex_recv);
  34.214      pthread_mutex_destroy(&ptmutex_luid);
  34.215      pthread_mutex_destroy(&ptmutex_queue);
  34.216 -    pthread_mutex_destroy(&ptmutex_notify);
  34.217 -    pthread_cond_destroy(&ptcv_notify);
  34.218 +    /*pthread_mutex_destroy(&ptmutex_notify);
  34.219 +      pthread_cond_destroy(&ptcv_notify);*/
  34.220 +    for (i = 0; i <= READ_POOL_SIZE; i++) {
  34.221 +        pthread_mutex_destroy(&(pool_thread[i].ptmutex));
  34.222 +        pthread_cond_destroy(&(pool_thread[i].ptcv));
  34.223 +    }
  34.224  }
    35.1 --- a/tools/blktap/parallax-threaded.h	Sat Mar 26 05:05:07 2005 +0000
    35.2 +++ b/tools/blktap/parallax-threaded.h	Fri Mar 25 22:52:21 2005 +0000
    35.3 @@ -14,7 +14,8 @@
    35.4  #define NOTHREADS
    35.5  #endif
    35.6  
    35.7 -#define READ_POOL_SIZE 128
    35.8 +//#define READ_POOL_SIZE 128
    35.9 +#define READ_POOL_SIZE 8
   35.10  
   35.11  /* per-thread identifier */
   35.12  pthread_key_t tid_key;