ia64/xen-unstable

changeset 2231:193f9a5f845c

bitkeeper revision 1.1159.1.56 (411ce9a6v81rvYXz3nv5gJ5LNweItA)

Merge scramble.cl.cam.ac.uk:/auto/groups/xeno/BK/xeno.bk
into scramble.cl.cam.ac.uk:/local/scratch/kaf24/xeno
author kaf24@scramble.cl.cam.ac.uk
date Fri Aug 13 16:17:42 2004 +0000 (2004-08-13)
parents c5441fce57f0 350a3acbb13f
children 29d3d09a420c
files      .rootkeys
           linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c
           linux-2.4.26-xen-sparse/arch/xen/kernel/Makefile
           linux-2.4.26-xen-sparse/arch/xen/kernel/entry.S
           linux-2.4.26-xen-sparse/arch/xen/kernel/setup.c
           linux-2.4.26-xen-sparse/arch/xen/kernel/traps.c
           linux-2.4.26-xen-sparse/arch/xen/mm/Makefile
           linux-2.4.26-xen-sparse/arch/xen/mm/fault.c
           linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h
           linux-2.4.26-xen-sparse/include/asm-xen/pgtable.h
           linux-2.4.26-xen-sparse/mkbuildtree
           linux-2.4.26-xen-sparse/mm/mmap.c
           linux-2.4.26-xen-sparse/mm/vmalloc.c
           linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c
           linux-2.6.7-xen-sparse/arch/xen/kernel/fixup.c
line diff
     1.1 --- a/.rootkeys	Fri Aug 13 15:46:15 2004 +0000
     1.2 +++ b/.rootkeys	Fri Aug 13 16:17:42 2004 +0000
     1.3 @@ -118,6 +118,7 @@ 401c059bjLBFYHRD4Py2uM3eA1D4zQ linux-2.4
     1.4  3e6e7c1efbQe93xCvOpOVCnXTMmQ5w linux-2.4.26-xen-sparse/mkbuildtree
     1.5  406aeeafkrnCuIVWLFv3kfn4uAD5Eg linux-2.4.26-xen-sparse/mm/highmem.c
     1.6  3e5a4e68GxCIaFH4sy01v1wjapetaA linux-2.4.26-xen-sparse/mm/memory.c
     1.7 +411ce99d_uOUTK61pkqbdIAi1CIaSA linux-2.4.26-xen-sparse/mm/mmap.c
     1.8  3f108af5VxPkLv13tXpXgoRKALQtXQ linux-2.4.26-xen-sparse/mm/mprotect.c
     1.9  3e5a4e681xMPdF9xCMwpyfuYMySU5g linux-2.4.26-xen-sparse/mm/mremap.c
    1.10  409ba2e7akOFqQUg6Qyg2s28xcXiMg linux-2.4.26-xen-sparse/mm/page_alloc.c
     2.1 --- a/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c	Fri Aug 13 15:46:15 2004 +0000
     2.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c	Fri Aug 13 16:17:42 2004 +0000
     2.3 @@ -65,7 +65,6 @@ static unsigned long inflate_balloon(uns
     2.4      unsigned long *currp;
     2.5      unsigned long curraddr;
     2.6      unsigned long ret = 0;
     2.7 -    unsigned long vaddr;
     2.8      unsigned long i, j;
     2.9  
    2.10      parray = (unsigned long *)vmalloc(num_pages * sizeof(unsigned long));
    2.11 @@ -102,7 +101,7 @@ static unsigned long inflate_balloon(uns
    2.12      for ( i = 0, currp = parray; i < num_pages; i++, currp++ )
    2.13      {
    2.14  	unsigned long mfn = phys_to_machine_mapping[*currp];
    2.15 -        curraddr = page_address(mem_map + *currp);
    2.16 +        curraddr = (unsigned long)page_address(mem_map + *currp);
    2.17  	if (curraddr)
    2.18              queue_l1_entry_update(get_ptep(curraddr), 0);
    2.19  
    2.20 @@ -178,7 +177,7 @@ unsigned long deflate_balloon(unsigned l
    2.21  
    2.22      if ( num_pages > credit )
    2.23      {
    2.24 -        printk(KERN_ERR "deflate_balloon: %d pages > %d credit.\n",
    2.25 +        printk(KERN_ERR "deflate_balloon: %lu pages > %lu credit.\n",
    2.26  			num_pages, credit);
    2.27          return -EAGAIN;
    2.28      }
     3.1 --- a/linux-2.4.26-xen-sparse/arch/xen/kernel/Makefile	Fri Aug 13 15:46:15 2004 +0000
     3.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/kernel/Makefile	Fri Aug 13 16:17:42 2004 +0000
     3.3 @@ -11,7 +11,7 @@ export-objs     := i386_ksyms.o
     3.4  obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o  \
     3.5  		ptrace.o ioport.o ldt.o setup.o time.o sys_i386.o \
     3.6  		i386_ksyms.o i387.o evtchn.o ctrl_if.o pci-dma.o \
     3.7 -		reboot.o
     3.8 +		reboot.o fixup.o
     3.9  
    3.10  ifdef CONFIG_PCI
    3.11  obj-y	+= pci-i386.o pci-pc.o
     4.1 --- a/linux-2.4.26-xen-sparse/arch/xen/kernel/entry.S	Fri Aug 13 15:46:15 2004 +0000
     4.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/kernel/entry.S	Fri Aug 13 16:17:42 2004 +0000
     4.3 @@ -518,9 +518,8 @@ ENTRY(machine_check)
     4.4  	pushl $ SYMBOL_NAME(do_machine_check)
     4.5  	jmp error_code
     4.6  
     4.7 -ENTRY(spurious_interrupt_bug)
     4.8 -	pushl $0
     4.9 -	pushl $ SYMBOL_NAME(do_spurious_interrupt_bug)
    4.10 +ENTRY(fixup_4gb_segment)
    4.11 +	pushl $ SYMBOL_NAME(do_fixup_4gb_segment)
    4.12  	jmp error_code
    4.13  
    4.14  .data
     5.1 --- a/linux-2.4.26-xen-sparse/arch/xen/kernel/setup.c	Fri Aug 13 15:46:15 2004 +0000
     5.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/kernel/setup.c	Fri Aug 13 16:17:42 2004 +0000
     5.3 @@ -230,6 +230,9 @@ void __init setup_arch(char **cmdline_p)
     5.4      blk_nohighio = 1;
     5.5  #endif
     5.6  
     5.7 +    HYPERVISOR_vm_assist(VMASST_CMD_enable,
     5.8 +                         VMASST_TYPE_4gb_segments);
     5.9 +        
    5.10      HYPERVISOR_set_callbacks(
    5.11          __KERNEL_CS, (unsigned long)hypervisor_callback,
    5.12          __KERNEL_CS, (unsigned long)failsafe_callback);
     6.1 --- a/linux-2.4.26-xen-sparse/arch/xen/kernel/traps.c	Fri Aug 13 15:46:15 2004 +0000
     6.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/kernel/traps.c	Fri Aug 13 16:17:42 2004 +0000
     6.3 @@ -63,7 +63,7 @@ asmlinkage void safe_page_fault(void);
     6.4  asmlinkage void coprocessor_error(void);
     6.5  asmlinkage void simd_coprocessor_error(void);
     6.6  asmlinkage void alignment_check(void);
     6.7 -asmlinkage void spurious_interrupt_bug(void);
     6.8 +asmlinkage void fixup_4gb_segment(void);
     6.9  asmlinkage void machine_check(void);
    6.10  
    6.11  int kstack_depth_to_print = 24;
    6.12 @@ -539,11 +539,6 @@ asmlinkage void do_simd_coprocessor_erro
    6.13  	}
    6.14  }
    6.15  
    6.16 -asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs,
    6.17 -					  long error_code)
    6.18 -{
    6.19 -}
    6.20 -
    6.21  /*
    6.22   *  'math_state_restore()' saves the current math information in the
    6.23   * old math state array, and gets the new ones from the current task
    6.24 @@ -605,7 +600,7 @@ static trap_info_t trap_table[] = {
    6.25      { 12, 0, __KERNEL_CS, (unsigned long)stack_segment               },
    6.26      { 13, 0, __KERNEL_CS, (unsigned long)general_protection          },
    6.27      { 14, 0, __KERNEL_CS, (unsigned long)page_fault                  },
    6.28 -    { 15, 0, __KERNEL_CS, (unsigned long)spurious_interrupt_bug      },
    6.29 +    { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment           },
    6.30      { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error           },
    6.31      { 17, 0, __KERNEL_CS, (unsigned long)alignment_check             },
    6.32      { 18, 0, __KERNEL_CS, (unsigned long)machine_check               },
     7.1 --- a/linux-2.4.26-xen-sparse/arch/xen/mm/Makefile	Fri Aug 13 15:46:15 2004 +0000
     7.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/mm/Makefile	Fri Aug 13 16:17:42 2004 +0000
     7.3 @@ -9,7 +9,7 @@
     7.4  
     7.5  O_TARGET := mm.o
     7.6  
     7.7 -obj-y	 := init.o fault.o extable.o pageattr.o hypervisor.o ioremap.o
     7.8 +obj-y	 := init.o fault.o extable.o pageattr.o hypervisor.o ioremap.o mmap.o
     7.9  
    7.10  export-objs := pageattr.o
    7.11  
     8.1 --- a/linux-2.4.26-xen-sparse/arch/xen/mm/fault.c	Fri Aug 13 15:46:15 2004 +0000
     8.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/mm/fault.c	Fri Aug 13 16:17:42 2004 +0000
     8.3 @@ -105,7 +105,8 @@ asmlinkage void do_page_fault(struct pt_
     8.4          }
     8.5  #endif
     8.6  
     8.7 -        if ( flush_page_update_queue() != 0 ) return;
     8.8 +	if ( flush_page_update_queue() != 0 )
     8.9 +		return;
    8.10  
    8.11  	/*
    8.12  	 * We fault-in kernel-space virtual memory on-demand. The
    8.13 @@ -120,8 +121,10 @@ asmlinkage void do_page_fault(struct pt_
    8.14  	 * (error_code & 4) == 0, and that the fault was not a
    8.15  	 * protection error (error_code & 1) == 0.
    8.16  	 */
    8.17 -	if (address >= TASK_SIZE && !(error_code & 5))
    8.18 -		goto vmalloc_fault;
    8.19 +	if (unlikely(address >= TASK_SIZE) ||
    8.20 +	    unlikely(address < (FIRST_USER_PGD_NR<<PGDIR_SHIFT)))
    8.21 +		if (!(error_code & 5))
    8.22 +			goto vmalloc_fault;
    8.23  
    8.24  	mm = tsk->mm;
    8.25  	info.si_code = SEGV_MAPERR;
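
The fault.c change above widens the kernel page-table fast path: in addition to addresses at or above TASK_SIZE, faulting addresses below the first user PGD slot (reserved elsewhere in this changeset, see the pgtable.h hunk further down) are now sent to vmalloc_fault, provided the fault was raised in kernel mode ((error_code & 4) == 0) and was not a protection violation ((error_code & 1) == 0). The standalone sketch of that predicate below is illustrative only and not part of the patch; it assumes the usual i386 non-PAE constants (3GB TASK_SIZE, 4MB page-directory entries) and the FIRST_USER_PGD_NR value introduced by this changeset.

    #include <stdio.h>

    #define TASK_SIZE          0xC0000000UL   /* assumed 3GB/1GB split */
    #define PGDIR_SHIFT        22             /* 4MB per PGD entry, non-PAE */
    #define FIRST_USER_PGD_NR  1UL            /* value set by this changeset */

    /* Mirrors the new do_page_fault() test: kernel-space or reserved-low
     * addresses take the vmalloc_fault path unless the fault came from user
     * mode (error_code & 4) or was a protection error (error_code & 1). */
    static int takes_vmalloc_fault_path(unsigned long address,
                                        unsigned long error_code)
    {
        return (address >= TASK_SIZE ||
                address < (FIRST_USER_PGD_NR << PGDIR_SHIFT)) &&
               !(error_code & 5);
    }

    int main(void)
    {
        printf("0x00100000, ec=0 -> %d (reserved low region)\n",
               takes_vmalloc_fault_path(0x00100000UL, 0));
        printf("0xd0000000, ec=0 -> %d (vmalloc region)\n",
               takes_vmalloc_fault_path(0xd0000000UL, 0));
        printf("0x08048000, ec=4 -> %d (ordinary user-mode fault)\n",
               takes_vmalloc_fault_path(0x08048000UL, 4));
        return 0;
    }
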
     9.1 --- a/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h	Fri Aug 13 15:46:15 2004 +0000
     9.2 +++ b/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h	Fri Aug 13 16:17:42 2004 +0000
     9.3 @@ -54,11 +54,15 @@ static inline pgd_t *get_pgd_slow(void)
     9.4  			if (!pmd)
     9.5  				goto out_oom;
     9.6  			clear_page(pmd);
     9.7 -			set_pgd(pgd + i, __pgd(1 + __pa(pmd)));
     9.8 +			set_pgd(pgd + FIRST_USER_PGD_NR, __pgd(1 + __pa(pmd)));
     9.9  		}
    9.10 -		memcpy(pgd + USER_PTRS_PER_PGD,
    9.11 -			swapper_pg_dir + USER_PTRS_PER_PGD,
    9.12 -			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
    9.13 +		memcpy(pgd,
    9.14 +			swapper_pg_dir,
    9.15 +			FIRST_USER_PGD_NR * sizeof(pgd_t));
    9.16 +		memcpy(pgd + FIRST_USER_PGD_NR + USER_PTRS_PER_PGD,
    9.17 +			swapper_pg_dir + FIRST_USER_PGD_NR + USER_PTRS_PER_PGD,
    9.18 +			(PTRS_PER_PGD - USER_PTRS_PER_PGD -
    9.19 +			 FIRST_USER_PGD_NR) * sizeof(pgd_t));
    9.20  	}
    9.21  	return pgd;
    9.22  out_oom:
    9.23 @@ -75,13 +79,17 @@ static inline pgd_t *get_pgd_slow(void)
    9.24  	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
    9.25  
    9.26  	if (pgd) {
    9.27 -		memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
    9.28 -		memcpy(pgd + USER_PTRS_PER_PGD,
    9.29 -			init_mm.pgd + USER_PTRS_PER_PGD,
    9.30 -			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
    9.31 +		memset(pgd + FIRST_USER_PGD_NR,
    9.32 +			0, USER_PTRS_PER_PGD*sizeof(pgd_t));
    9.33 +		memcpy(pgd,
    9.34 +			init_mm.pgd,
    9.35 +			FIRST_USER_PGD_NR * sizeof(pgd_t));
    9.36 +		memcpy(pgd + FIRST_USER_PGD_NR + USER_PTRS_PER_PGD,
    9.37 +			init_mm.pgd + FIRST_USER_PGD_NR + USER_PTRS_PER_PGD,
    9.38 +			(PTRS_PER_PGD - USER_PTRS_PER_PGD -
    9.39 +			 FIRST_USER_PGD_NR) * sizeof(pgd_t));
    9.40                  __make_page_readonly(pgd);
    9.41  		queue_pgd_pin(__pa(pgd));
    9.42 -
    9.43  	}
    9.44  	return pgd;
    9.45  }
    10.1 --- a/linux-2.4.26-xen-sparse/include/asm-xen/pgtable.h	Fri Aug 13 15:46:15 2004 +0000
    10.2 +++ b/linux-2.4.26-xen-sparse/include/asm-xen/pgtable.h	Fri Aug 13 16:17:42 2004 +0000
    10.3 @@ -83,16 +83,16 @@ extern void pgtable_cache_init(void);
    10.4  #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
    10.5  #define PGDIR_MASK	(~(PGDIR_SIZE-1))
    10.6  
    10.7 -#define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
    10.8 -#define FIRST_USER_PGD_NR	0
    10.9 +#define FIRST_USER_PGD_NR	(1)
   10.10 +#define USER_PTRS_PER_PGD	((TASK_SIZE/PGDIR_SIZE)-FIRST_USER_PGD_NR)
   10.11  
   10.12 +#if 0 /* XEN */
   10.13  #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
   10.14  #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
   10.15 -
   10.16  #define TWOLEVEL_PGDIR_SHIFT	22
   10.17  #define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
   10.18  #define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
   10.19 -
   10.20 +#endif
   10.21  
   10.22  #ifndef __ASSEMBLY__
   10.23  /* 4MB is just a nice "safety zone". Also, we align to a fresh pde. */
   10.24 @@ -367,4 +367,7 @@ static inline unsigned long arbitrary_vi
   10.25  
   10.26  #define io_remap_page_range remap_page_range
   10.27  
   10.28 +#define HAVE_ARCH_UNMAPPED_AREA
   10.29 +#define HAVE_ARCH_CHECK_FIXED_MAPPING
   10.30 +
   10.31  #endif /* _I386_PGTABLE_H */
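
Taken together, the pgalloc.h and pgtable.h hunks reserve page-directory slot 0 for the 4GB-segment fixup machinery: FIRST_USER_PGD_NR moves from 0 to 1, USER_PTRS_PER_PGD shrinks by the same amount, and get_pgd_slow() now copies the reserved low entries from the kernel's master page directory instead of treating them as user slots. The standalone program below is an illustration of the resulting layout, not code from the tree; it assumes i386 non-PAE values (1024 PGD entries of 4MB each, 3GB TASK_SIZE).

    #include <stdio.h>

    #define PGDIR_SHIFT        22
    #define PGDIR_SIZE         (1UL << PGDIR_SHIFT)   /* 4MB, non-PAE */
    #define PTRS_PER_PGD       1024
    #define TASK_SIZE          0xC0000000UL           /* assumed 3GB/1GB split */
    #define FIRST_USER_PGD_NR  1                      /* new value in this changeset */
    #define USER_PTRS_PER_PGD  ((TASK_SIZE / PGDIR_SIZE) - FIRST_USER_PGD_NR)

    int main(void)
    {
        /* Slot 0 is copied from the kernel's master PGD in get_pgd_slow(),
         * so it is never handed out as an ordinary user slot. */
        printf("reserved (copied from kernel): pgd[0..%d]\n",
               FIRST_USER_PGD_NR - 1);
        printf("user slots                   : pgd[%d..%lu]  (%lu entries)\n",
               FIRST_USER_PGD_NR,
               FIRST_USER_PGD_NR + USER_PTRS_PER_PGD - 1,
               USER_PTRS_PER_PGD);
        printf("kernel slots                 : pgd[%lu..%d]\n",
               FIRST_USER_PGD_NR + USER_PTRS_PER_PGD, PTRS_PER_PGD - 1);
        return 0;
    }
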
    11.1 --- a/linux-2.4.26-xen-sparse/mkbuildtree	Fri Aug 13 15:46:15 2004 +0000
    11.2 +++ b/linux-2.4.26-xen-sparse/mkbuildtree	Fri Aug 13 16:17:42 2004 +0000
    11.3 @@ -222,6 +222,7 @@ ln -sf ../../i386/kernel/semaphore.c
    11.4  ln -sf ../../i386/kernel/sys_i386.c 
    11.5  ln -sf ../../../${LINUX_26}/arch/xen/kernel/ctrl_if.c
    11.6  ln -sf ../../../${LINUX_26}/arch/xen/kernel/evtchn.c
    11.7 +ln -sf ../../../${LINUX_26}/arch/xen/kernel/fixup.c
    11.8  ln -sf ../../../${LINUX_26}/arch/xen/kernel/reboot.c
    11.9  ln -sf ../../../${LINUX_26}/arch/xen/i386/kernel/ioport.c
   11.10  ln -sf ../../../${LINUX_26}/arch/xen/i386/kernel/pci-dma.c
   11.11 @@ -242,6 +243,7 @@ cd ${AD}/arch/xen/mm
   11.12  ln -sf ../../i386/mm/extable.c 
   11.13  ln -sf ../../i386/mm/pageattr.c 
   11.14  ln -sf ../../../${LINUX_26}/arch/xen/i386/mm/hypervisor.c
   11.15 +ln -sf ../../../${LINUX_26}/arch/xen/i386/mm/mmap.c
   11.16  
   11.17  cd ${AD}/arch/xen/drivers/console
   11.18  ln -sf ../../../../${LINUX_26}/drivers/xen/console/console.c 
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/linux-2.4.26-xen-sparse/mm/mmap.c	Fri Aug 13 16:17:42 2004 +0000
    12.3 @@ -0,0 +1,1219 @@
    12.4 +/*
    12.5 + *	linux/mm/mmap.c
    12.6 + *
    12.7 + * Written by obz.
    12.8 + */
    12.9 +#include <linux/slab.h>
   12.10 +#include <linux/shm.h>
   12.11 +#include <linux/mman.h>
   12.12 +#include <linux/pagemap.h>
   12.13 +#include <linux/swap.h>
   12.14 +#include <linux/swapctl.h>
   12.15 +#include <linux/smp_lock.h>
   12.16 +#include <linux/init.h>
   12.17 +#include <linux/file.h>
   12.18 +#include <linux/fs.h>
   12.19 +#include <linux/personality.h>
   12.20 +#include <linux/mount.h>
   12.21 +
   12.22 +#include <asm/uaccess.h>
   12.23 +#include <asm/pgalloc.h>
   12.24 +
   12.25 +/*
   12.26 + * WARNING: the debugging will use recursive algorithms so never enable this
   12.27 + * unless you know what you are doing.
   12.28 + */
   12.29 +#undef DEBUG_MM_RB
   12.30 +
   12.31 +/* description of effects of mapping type and prot in current implementation.
   12.32 + * this is due to the limited x86 page protection hardware.  The expected
   12.33 + * behavior is in parens:
   12.34 + *
   12.35 + * map_type	prot
   12.36 + *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
   12.37 + * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
   12.38 + *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
   12.39 + *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
   12.40 + *		
   12.41 + * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
   12.42 + *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
   12.43 + *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
   12.44 + *
   12.45 + */
   12.46 +pgprot_t protection_map[16] = {
   12.47 +	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
   12.48 +	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
   12.49 +};
   12.50 +
   12.51 +int sysctl_overcommit_memory;
   12.52 +int max_map_count = DEFAULT_MAX_MAP_COUNT;
   12.53 +
   12.54 +/* Check that a process has enough memory to allocate a
   12.55 + * new virtual mapping.
   12.56 + */
   12.57 +int vm_enough_memory(long pages)
   12.58 +{
   12.59 +	/* Stupid algorithm to decide if we have enough memory: while
   12.60 +	 * simple, it hopefully works in most obvious cases.. Easy to
   12.61 +	 * fool it, but this should catch most mistakes.
   12.62 +	 */
   12.63 +	/* 23/11/98 NJC: Somewhat less stupid version of algorithm,
   12.64 +	 * which tries to do "TheRightThing".  Instead of using half of
   12.65 +	 * (buffers+cache), use the minimum values.  Allow an extra 2%
   12.66 +	 * of num_physpages for safety margin.
   12.67 +	 */
   12.68 +
   12.69 +	unsigned long free;
   12.70 +	
   12.71 +        /* Sometimes we want to use more memory than we have. */
   12.72 +	if (sysctl_overcommit_memory)
   12.73 +	    return 1;
   12.74 +
   12.75 +	/* The page cache contains buffer pages these days.. */
   12.76 +	free = page_cache_size;
   12.77 +	free += nr_free_pages();
   12.78 +	free += nr_swap_pages;
   12.79 +
   12.80 +	/*
   12.81 +	 * This double-counts: the nrpages are both in the page-cache
   12.82 +	 * and in the swapper space. At the same time, this compensates
   12.83 +	 * for the swap-space over-allocation (ie "nr_swap_pages" being
   12.84 +	 * too small.
   12.85 +	 */
   12.86 +	free += swapper_space.nrpages;
   12.87 +
   12.88 +	/*
   12.89 +	 * The code below doesn't account for free space in the inode
   12.90 +	 * and dentry slab cache, slab cache fragmentation, inodes and
   12.91 +	 * dentries which will become freeable under VM load, etc.
   12.92 +	 * Lets just hope all these (complex) factors balance out...
   12.93 +	 */
   12.94 +	free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT;
   12.95 +	free += (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT;
   12.96 +
   12.97 +	return free > pages;
   12.98 +}
   12.99 +
  12.100 +/* Remove one vm structure from the inode's i_mapping address space. */
  12.101 +static inline void __remove_shared_vm_struct(struct vm_area_struct *vma)
  12.102 +{
  12.103 +	struct file * file = vma->vm_file;
  12.104 +
  12.105 +	if (file) {
  12.106 +		struct inode *inode = file->f_dentry->d_inode;
  12.107 +		if (vma->vm_flags & VM_DENYWRITE)
  12.108 +			atomic_inc(&inode->i_writecount);
  12.109 +		if(vma->vm_next_share)
  12.110 +			vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share;
  12.111 +		*vma->vm_pprev_share = vma->vm_next_share;
  12.112 +	}
  12.113 +}
  12.114 +
  12.115 +static inline void remove_shared_vm_struct(struct vm_area_struct *vma)
  12.116 +{
  12.117 +	lock_vma_mappings(vma);
  12.118 +	__remove_shared_vm_struct(vma);
  12.119 +	unlock_vma_mappings(vma);
  12.120 +}
  12.121 +
  12.122 +void lock_vma_mappings(struct vm_area_struct *vma)
  12.123 +{
  12.124 +	struct address_space *mapping;
  12.125 +
  12.126 +	mapping = NULL;
  12.127 +	if (vma->vm_file)
  12.128 +		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
  12.129 +	if (mapping)
  12.130 +		spin_lock(&mapping->i_shared_lock);
  12.131 +}
  12.132 +
  12.133 +void unlock_vma_mappings(struct vm_area_struct *vma)
  12.134 +{
  12.135 +	struct address_space *mapping;
  12.136 +
  12.137 +	mapping = NULL;
  12.138 +	if (vma->vm_file)
  12.139 +		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
  12.140 +	if (mapping)
  12.141 +		spin_unlock(&mapping->i_shared_lock);
  12.142 +}
  12.143 +
  12.144 +/*
  12.145 + *  sys_brk() for the most part doesn't need the global kernel
  12.146 + *  lock, except when an application is doing something nasty
  12.147 + *  like trying to un-brk an area that has already been mapped
  12.148 + *  to a regular file.  in this case, the unmapping will need
  12.149 + *  to invoke file system routines that need the global lock.
  12.150 + */
  12.151 +asmlinkage unsigned long sys_brk(unsigned long brk)
  12.152 +{
  12.153 +	unsigned long rlim, retval;
  12.154 +	unsigned long newbrk, oldbrk;
  12.155 +	struct mm_struct *mm = current->mm;
  12.156 +
  12.157 +	down_write(&mm->mmap_sem);
  12.158 +
  12.159 +	if (brk < mm->end_code)
  12.160 +		goto out;
  12.161 +	newbrk = PAGE_ALIGN(brk);
  12.162 +	oldbrk = PAGE_ALIGN(mm->brk);
  12.163 +	if (oldbrk == newbrk)
  12.164 +		goto set_brk;
  12.165 +
  12.166 +	/* Always allow shrinking brk. */
  12.167 +	if (brk <= mm->brk) {
  12.168 +		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
  12.169 +			goto set_brk;
  12.170 +		goto out;
  12.171 +	}
  12.172 +
  12.173 +	/* Check against rlimit.. */
  12.174 +	rlim = current->rlim[RLIMIT_DATA].rlim_cur;
  12.175 +	if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
  12.176 +		goto out;
  12.177 +
  12.178 +	/* Check against existing mmap mappings. */
  12.179 +	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
  12.180 +		goto out;
  12.181 +
  12.182 +	/* Check if we have enough memory.. */
  12.183 +	if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT))
  12.184 +		goto out;
  12.185 +
  12.186 +	/* Ok, looks good - let it rip. */
  12.187 +	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
  12.188 +		goto out;
  12.189 +set_brk:
  12.190 +	mm->brk = brk;
  12.191 +out:
  12.192 +	retval = mm->brk;
  12.193 +	up_write(&mm->mmap_sem);
  12.194 +	return retval;
  12.195 +}
  12.196 +
  12.197 +/* Combine the mmap "prot" and "flags" argument into one "vm_flags" used
  12.198 + * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
  12.199 + * into "VM_xxx".
  12.200 + */
  12.201 +static inline unsigned long calc_vm_flags(unsigned long prot, unsigned long flags)
  12.202 +{
  12.203 +#define _trans(x,bit1,bit2) \
  12.204 +((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)
  12.205 +
  12.206 +	unsigned long prot_bits, flag_bits;
  12.207 +	prot_bits =
  12.208 +		_trans(prot, PROT_READ, VM_READ) |
  12.209 +		_trans(prot, PROT_WRITE, VM_WRITE) |
  12.210 +		_trans(prot, PROT_EXEC, VM_EXEC);
  12.211 +	flag_bits =
  12.212 +		_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) |
  12.213 +		_trans(flags, MAP_DENYWRITE, VM_DENYWRITE) |
  12.214 +		_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE);
  12.215 +	return prot_bits | flag_bits;
  12.216 +#undef _trans
  12.217 +}
  12.218 +
  12.219 +#ifdef DEBUG_MM_RB
  12.220 +static int browse_rb(rb_node_t * rb_node) {
  12.221 +	int i = 0;
  12.222 +	if (rb_node) {
  12.223 +		i++;
  12.224 +		i += browse_rb(rb_node->rb_left);
  12.225 +		i += browse_rb(rb_node->rb_right);
  12.226 +	}
  12.227 +	return i;
  12.228 +}
  12.229 +
  12.230 +static void validate_mm(struct mm_struct * mm) {
  12.231 +	int bug = 0;
  12.232 +	int i = 0;
  12.233 +	struct vm_area_struct * tmp = mm->mmap;
  12.234 +	while (tmp) {
  12.235 +		tmp = tmp->vm_next;
  12.236 +		i++;
  12.237 +	}
  12.238 +	if (i != mm->map_count)
  12.239 +		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
  12.240 +	i = browse_rb(mm->mm_rb.rb_node);
  12.241 +	if (i != mm->map_count)
  12.242 +		printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
  12.243 +	if (bug)
  12.244 +		BUG();
  12.245 +}
  12.246 +#else
  12.247 +#define validate_mm(mm) do { } while (0)
  12.248 +#endif
  12.249 +
  12.250 +static struct vm_area_struct * find_vma_prepare(struct mm_struct * mm, unsigned long addr,
  12.251 +						struct vm_area_struct ** pprev,
  12.252 +						rb_node_t *** rb_link, rb_node_t ** rb_parent)
  12.253 +{
  12.254 +	struct vm_area_struct * vma;
  12.255 +	rb_node_t ** __rb_link, * __rb_parent, * rb_prev;
  12.256 +
  12.257 +	__rb_link = &mm->mm_rb.rb_node;
  12.258 +	rb_prev = __rb_parent = NULL;
  12.259 +	vma = NULL;
  12.260 +
  12.261 +	while (*__rb_link) {
  12.262 +		struct vm_area_struct *vma_tmp;
  12.263 +
  12.264 +		__rb_parent = *__rb_link;
  12.265 +		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
  12.266 +
  12.267 +		if (vma_tmp->vm_end > addr) {
  12.268 +			vma = vma_tmp;
  12.269 +			if (vma_tmp->vm_start <= addr)
  12.270 +				return vma;
  12.271 +			__rb_link = &__rb_parent->rb_left;
  12.272 +		} else {
  12.273 +			rb_prev = __rb_parent;
  12.274 +			__rb_link = &__rb_parent->rb_right;
  12.275 +		}
  12.276 +	}
  12.277 +
  12.278 +	*pprev = NULL;
  12.279 +	if (rb_prev)
  12.280 +		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
  12.281 +	*rb_link = __rb_link;
  12.282 +	*rb_parent = __rb_parent;
  12.283 +	return vma;
  12.284 +}
  12.285 +
  12.286 +static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
  12.287 +				   rb_node_t * rb_parent)
  12.288 +{
  12.289 +	if (prev) {
  12.290 +		vma->vm_next = prev->vm_next;
  12.291 +		prev->vm_next = vma;
  12.292 +	} else {
  12.293 +		mm->mmap = vma;
  12.294 +		if (rb_parent)
  12.295 +			vma->vm_next = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
  12.296 +		else
  12.297 +			vma->vm_next = NULL;
  12.298 +	}
  12.299 +}
  12.300 +
  12.301 +static inline void __vma_link_rb(struct mm_struct * mm, struct vm_area_struct * vma,
  12.302 +				 rb_node_t ** rb_link, rb_node_t * rb_parent)
  12.303 +{
  12.304 +	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
  12.305 +	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
  12.306 +}
  12.307 +
  12.308 +static inline void __vma_link_file(struct vm_area_struct * vma)
  12.309 +{
  12.310 +	struct file * file;
  12.311 +
  12.312 +	file = vma->vm_file;
  12.313 +	if (file) {
  12.314 +		struct inode * inode = file->f_dentry->d_inode;
  12.315 +		struct address_space *mapping = inode->i_mapping;
  12.316 +		struct vm_area_struct **head;
  12.317 +
  12.318 +		if (vma->vm_flags & VM_DENYWRITE)
  12.319 +			atomic_dec(&inode->i_writecount);
  12.320 +
  12.321 +		head = &mapping->i_mmap;
  12.322 +		if (vma->vm_flags & VM_SHARED)
  12.323 +			head = &mapping->i_mmap_shared;
  12.324 +      
  12.325 +		/* insert vma into inode's share list */
  12.326 +		if((vma->vm_next_share = *head) != NULL)
  12.327 +			(*head)->vm_pprev_share = &vma->vm_next_share;
  12.328 +		*head = vma;
  12.329 +		vma->vm_pprev_share = head;
  12.330 +	}
  12.331 +}
  12.332 +
  12.333 +static void __vma_link(struct mm_struct * mm, struct vm_area_struct * vma,  struct vm_area_struct * prev,
  12.334 +		       rb_node_t ** rb_link, rb_node_t * rb_parent)
  12.335 +{
  12.336 +	__vma_link_list(mm, vma, prev, rb_parent);
  12.337 +	__vma_link_rb(mm, vma, rb_link, rb_parent);
  12.338 +	__vma_link_file(vma);
  12.339 +}
  12.340 +
  12.341 +static inline void vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
  12.342 +			    rb_node_t ** rb_link, rb_node_t * rb_parent)
  12.343 +{
  12.344 +	lock_vma_mappings(vma);
  12.345 +	spin_lock(&mm->page_table_lock);
  12.346 +	__vma_link(mm, vma, prev, rb_link, rb_parent);
  12.347 +	spin_unlock(&mm->page_table_lock);
  12.348 +	unlock_vma_mappings(vma);
  12.349 +
  12.350 +	mm->map_count++;
  12.351 +	validate_mm(mm);
  12.352 +}
  12.353 +
  12.354 +static int vma_merge(struct mm_struct * mm, struct vm_area_struct * prev,
  12.355 +		     rb_node_t * rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags)
  12.356 +{
  12.357 +	spinlock_t * lock = &mm->page_table_lock;
  12.358 +	if (!prev) {
  12.359 +		prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
  12.360 +		goto merge_next;
  12.361 +	}
  12.362 +	if (prev->vm_end == addr && can_vma_merge(prev, vm_flags)) {
  12.363 +		struct vm_area_struct * next;
  12.364 +
  12.365 +		spin_lock(lock);
  12.366 +		prev->vm_end = end;
  12.367 +		next = prev->vm_next;
  12.368 +		if (next && prev->vm_end == next->vm_start && can_vma_merge(next, vm_flags)) {
  12.369 +			prev->vm_end = next->vm_end;
  12.370 +			__vma_unlink(mm, next, prev);
  12.371 +			spin_unlock(lock);
  12.372 +
  12.373 +			mm->map_count--;
  12.374 +			kmem_cache_free(vm_area_cachep, next);
  12.375 +			return 1;
  12.376 +		}
  12.377 +		spin_unlock(lock);
  12.378 +		return 1;
  12.379 +	}
  12.380 +
  12.381 +	prev = prev->vm_next;
  12.382 +	if (prev) {
  12.383 + merge_next:
  12.384 +		if (!can_vma_merge(prev, vm_flags))
  12.385 +			return 0;
  12.386 +		if (end == prev->vm_start) {
  12.387 +			spin_lock(lock);
  12.388 +			prev->vm_start = addr;
  12.389 +			spin_unlock(lock);
  12.390 +			return 1;
  12.391 +		}
  12.392 +	}
  12.393 +
  12.394 +	return 0;
  12.395 +}
  12.396 +
  12.397 +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
  12.398 +	unsigned long prot, unsigned long flags, unsigned long pgoff)
  12.399 +{
  12.400 +	struct mm_struct * mm = current->mm;
  12.401 +	struct vm_area_struct * vma, * prev;
  12.402 +	unsigned int vm_flags;
  12.403 +	int correct_wcount = 0;
  12.404 +	int error;
  12.405 +	rb_node_t ** rb_link, * rb_parent;
  12.406 +
  12.407 +	if (file) {
  12.408 +		if (!file->f_op || !file->f_op->mmap)
  12.409 +			return -ENODEV;
  12.410 +
  12.411 +		if ((prot & PROT_EXEC) && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
  12.412 +			return -EPERM;
  12.413 +	}
  12.414 +
  12.415 +	if (!len)
  12.416 +		return addr;
  12.417 +
  12.418 +	len = PAGE_ALIGN(len);
  12.419 +
  12.420 +	if (len > TASK_SIZE || len == 0)
  12.421 +		return -EINVAL;
  12.422 +
  12.423 +	/* offset overflow? */
  12.424 +	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
  12.425 +		return -EINVAL;
  12.426 +
  12.427 +	/* Too many mappings? */
  12.428 +	if (mm->map_count > max_map_count)
  12.429 +		return -ENOMEM;
  12.430 +
  12.431 +	/* Obtain the address to map to. we verify (or select) it and ensure
  12.432 +	 * that it represents a valid section of the address space.
  12.433 +	 */
  12.434 +	addr = get_unmapped_area(file, addr, len, pgoff, flags);
  12.435 +	if (addr & ~PAGE_MASK)
  12.436 +		return addr;
  12.437 +
  12.438 +	/* Do simple checking here so the lower-level routines won't have
  12.439 +	 * to. we assume access permissions have been handled by the open
  12.440 +	 * of the memory object, so we don't do any here.
  12.441 +	 */
  12.442 +	vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
  12.443 +
  12.444 +	/* mlock MCL_FUTURE? */
  12.445 +	if (vm_flags & VM_LOCKED) {
  12.446 +		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
  12.447 +		locked += len;
  12.448 +		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
  12.449 +			return -EAGAIN;
  12.450 +	}
  12.451 +
  12.452 +	if (file) {
  12.453 +		switch (flags & MAP_TYPE) {
  12.454 +		case MAP_SHARED:
  12.455 +			if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE))
  12.456 +				return -EACCES;
  12.457 +
  12.458 +			/* Make sure we don't allow writing to an append-only file.. */
  12.459 +			if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & FMODE_WRITE))
  12.460 +				return -EACCES;
  12.461 +
  12.462 +			/* make sure there are no mandatory locks on the file. */
  12.463 +			if (locks_verify_locked(file->f_dentry->d_inode))
  12.464 +				return -EAGAIN;
  12.465 +
  12.466 +			vm_flags |= VM_SHARED | VM_MAYSHARE;
  12.467 +			if (!(file->f_mode & FMODE_WRITE))
  12.468 +				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
  12.469 +
  12.470 +			/* fall through */
  12.471 +		case MAP_PRIVATE:
  12.472 +			if (!(file->f_mode & FMODE_READ))
  12.473 +				return -EACCES;
  12.474 +			break;
  12.475 +
  12.476 +		default:
  12.477 +			return -EINVAL;
  12.478 +		}
  12.479 +	} else {
  12.480 +		vm_flags |= VM_SHARED | VM_MAYSHARE;
  12.481 +		switch (flags & MAP_TYPE) {
  12.482 +		default:
  12.483 +			return -EINVAL;
  12.484 +		case MAP_PRIVATE:
  12.485 +			vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
  12.486 +			/* fall through */
  12.487 +		case MAP_SHARED:
  12.488 +			break;
  12.489 +		}
  12.490 +	}
  12.491 +
  12.492 +	/* Clear old maps */
  12.493 +munmap_back:
  12.494 +	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
  12.495 +	if (vma && vma->vm_start < addr + len) {
  12.496 +		if (do_munmap(mm, addr, len))
  12.497 +			return -ENOMEM;
  12.498 +		goto munmap_back;
  12.499 +	}
  12.500 +
  12.501 +	/* Check against address space limit. */
  12.502 +	if ((mm->total_vm << PAGE_SHIFT) + len
  12.503 +	    > current->rlim[RLIMIT_AS].rlim_cur)
  12.504 +		return -ENOMEM;
  12.505 +
  12.506 +	/* Private writable mapping? Check memory availability.. */
  12.507 +	if ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
  12.508 +	    !(flags & MAP_NORESERVE)				 &&
  12.509 +	    !vm_enough_memory(len >> PAGE_SHIFT))
  12.510 +		return -ENOMEM;
  12.511 +
  12.512 +	/* Can we just expand an old anonymous mapping? */
  12.513 +	if (!file && !(vm_flags & VM_SHARED) && rb_parent)
  12.514 +		if (vma_merge(mm, prev, rb_parent, addr, addr + len, vm_flags))
  12.515 +			goto out;
  12.516 +
  12.517 +	/* Determine the object being mapped and call the appropriate
  12.518 +	 * specific mapper. the address has already been validated, but
  12.519 +	 * not unmapped, but the maps are removed from the list.
  12.520 +	 */
  12.521 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
  12.522 +	if (!vma)
  12.523 +		return -ENOMEM;
  12.524 +
  12.525 +	vma->vm_mm = mm;
  12.526 +	vma->vm_start = addr;
  12.527 +	vma->vm_end = addr + len;
  12.528 +	vma->vm_flags = vm_flags;
  12.529 +	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
  12.530 +	vma->vm_ops = NULL;
  12.531 +	vma->vm_pgoff = pgoff;
  12.532 +	vma->vm_file = NULL;
  12.533 +	vma->vm_private_data = NULL;
  12.534 +	vma->vm_raend = 0;
  12.535 +
  12.536 +	if (file) {
  12.537 +		error = -EINVAL;
  12.538 +		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
  12.539 +			goto free_vma;
  12.540 +		if (vm_flags & VM_DENYWRITE) {
  12.541 +			error = deny_write_access(file);
  12.542 +			if (error)
  12.543 +				goto free_vma;
  12.544 +			correct_wcount = 1;
  12.545 +		}
  12.546 +		vma->vm_file = file;
  12.547 +		get_file(file);
  12.548 +		error = file->f_op->mmap(file, vma);
  12.549 +		if (error)
  12.550 +			goto unmap_and_free_vma;
  12.551 +	} else if (flags & MAP_SHARED) {
  12.552 +		error = shmem_zero_setup(vma);
  12.553 +		if (error)
  12.554 +			goto free_vma;
  12.555 +	}
  12.556 +
  12.557 +	/* Can addr have changed??
  12.558 +	 *
  12.559 +	 * Answer: Yes, several device drivers can do it in their
  12.560 +	 *         f_op->mmap method. -DaveM
  12.561 +	 */
  12.562 +	if (addr != vma->vm_start) {
  12.563 +		/*
  12.564 +		 * It is a bit too late to pretend changing the virtual
  12.565 +		 * area of the mapping, we just corrupted userspace
  12.566 +		 * in the do_munmap, so FIXME (not in 2.4 to avoid breaking
  12.567 +		 * the driver API).
  12.568 +		 */
  12.569 +		struct vm_area_struct * stale_vma;
  12.570 +		/* Since addr changed, we rely on the mmap op to prevent 
  12.571 +		 * collisions with existing vmas and just use find_vma_prepare 
  12.572 +		 * to update the tree pointers.
  12.573 +		 */
  12.574 +		addr = vma->vm_start;
  12.575 +		stale_vma = find_vma_prepare(mm, addr, &prev,
  12.576 +						&rb_link, &rb_parent);
  12.577 +		/*
  12.578 +		 * Make sure the lowlevel driver did its job right.
  12.579 +		 */
  12.580 +		if (unlikely(stale_vma && stale_vma->vm_start < vma->vm_end)) {
  12.581 +			printk(KERN_ERR "buggy mmap operation: [<%p>]\n",
  12.582 +				file ? file->f_op->mmap : NULL);
  12.583 +			BUG();
  12.584 +		}
  12.585 +	}
  12.586 +
  12.587 +	vma_link(mm, vma, prev, rb_link, rb_parent);
  12.588 +	if (correct_wcount)
  12.589 +		atomic_inc(&file->f_dentry->d_inode->i_writecount);
  12.590 +
  12.591 +out:	
  12.592 +	mm->total_vm += len >> PAGE_SHIFT;
  12.593 +	if (vm_flags & VM_LOCKED) {
  12.594 +		mm->locked_vm += len >> PAGE_SHIFT;
  12.595 +		make_pages_present(addr, addr + len);
  12.596 +	}
  12.597 +	return addr;
  12.598 +
  12.599 +unmap_and_free_vma:
  12.600 +	if (correct_wcount)
  12.601 +		atomic_inc(&file->f_dentry->d_inode->i_writecount);
  12.602 +	vma->vm_file = NULL;
  12.603 +	fput(file);
  12.604 +
  12.605 +	/* Undo any partial mapping done by a device driver. */
  12.606 +	zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
  12.607 +free_vma:
  12.608 +	kmem_cache_free(vm_area_cachep, vma);
  12.609 +	return error;
  12.610 +}
  12.611 +
  12.612 +/* Get an address range which is currently unmapped.
  12.613 + * For shmat() with addr=0.
  12.614 + *
  12.615 + * Ugly calling convention alert:
  12.616 + * Return value with the low bits set means error value,
  12.617 + * ie
  12.618 + *	if (ret & ~PAGE_MASK)
  12.619 + *		error = ret;
  12.620 + *
  12.621 + * This function "knows" that -ENOMEM has the bits set.
  12.622 + */
  12.623 +#ifndef HAVE_ARCH_UNMAPPED_AREA
  12.624 +static inline unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
  12.625 +{
  12.626 +	struct vm_area_struct *vma;
  12.627 +
  12.628 +	if (len > TASK_SIZE)
  12.629 +		return -ENOMEM;
  12.630 +
  12.631 +	if (addr) {
  12.632 +		addr = PAGE_ALIGN(addr);
  12.633 +		vma = find_vma(current->mm, addr);
  12.634 +		if (TASK_SIZE - len >= addr &&
  12.635 +		    (!vma || addr + len <= vma->vm_start))
  12.636 +			return addr;
  12.637 +	}
  12.638 +	addr = PAGE_ALIGN(TASK_UNMAPPED_BASE);
  12.639 +
  12.640 +	for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
  12.641 +		/* At this point:  (!vma || addr < vma->vm_end). */
  12.642 +		if (TASK_SIZE - len < addr)
  12.643 +			return -ENOMEM;
  12.644 +		if (!vma || addr + len <= vma->vm_start)
  12.645 +			return addr;
  12.646 +		addr = vma->vm_end;
  12.647 +	}
  12.648 +}
  12.649 +#else
  12.650 +extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
  12.651 +#endif	
  12.652 +
  12.653 +#ifndef HAVE_ARCH_CHECK_FIXED_MAPPING
  12.654 +#define arch_check_fixed_mapping(_file,_addr,_len,_pgoff,_flags) 0
  12.655 +#else
  12.656 +extern unsigned long
  12.657 +arch_check_fixed_mapping(struct file *, unsigned long, unsigned long,
  12.658 +			unsigned long, unsigned long);
  12.659 +#endif
  12.660 +
  12.661 +unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
  12.662 +{
  12.663 +	unsigned long ret;
  12.664 +
  12.665 +	if (flags & MAP_FIXED) {
  12.666 +		if (addr > TASK_SIZE - len)
  12.667 +			return -ENOMEM;
  12.668 +		if (addr & ~PAGE_MASK)
  12.669 +			return -EINVAL;
  12.670 +		ret = arch_check_fixed_mapping(file, addr, len, pgoff, flags);
  12.671 +		if (ret != 0)
  12.672 +			return ret;
  12.673 +		return addr;
  12.674 +	}
  12.675 +
  12.676 +	if (file && file->f_op && file->f_op->get_unmapped_area)
  12.677 +		return file->f_op->get_unmapped_area(file, addr, len, pgoff, flags);
  12.678 +
  12.679 +	return arch_get_unmapped_area(file, addr, len, pgoff, flags);
  12.680 +}
  12.681 +
  12.682 +/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
  12.683 +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
  12.684 +{
  12.685 +	struct vm_area_struct *vma = NULL;
  12.686 +
  12.687 +	if (mm) {
  12.688 +		/* Check the cache first. */
  12.689 +		/* (Cache hit rate is typically around 35%.) */
  12.690 +		vma = mm->mmap_cache;
  12.691 +		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
  12.692 +			rb_node_t * rb_node;
  12.693 +
  12.694 +			rb_node = mm->mm_rb.rb_node;
  12.695 +			vma = NULL;
  12.696 +
  12.697 +			while (rb_node) {
  12.698 +				struct vm_area_struct * vma_tmp;
  12.699 +
  12.700 +				vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
  12.701 +
  12.702 +				if (vma_tmp->vm_end > addr) {
  12.703 +					vma = vma_tmp;
  12.704 +					if (vma_tmp->vm_start <= addr)
  12.705 +						break;
  12.706 +					rb_node = rb_node->rb_left;
  12.707 +				} else
  12.708 +					rb_node = rb_node->rb_right;
  12.709 +			}
  12.710 +			if (vma)
  12.711 +				mm->mmap_cache = vma;
  12.712 +		}
  12.713 +	}
  12.714 +	return vma;
  12.715 +}
  12.716 +
  12.717 +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
  12.718 +struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
  12.719 +				      struct vm_area_struct **pprev)
  12.720 +{
  12.721 +	if (mm) {
  12.722 +		/* Go through the RB tree quickly. */
  12.723 +		struct vm_area_struct * vma;
  12.724 +		rb_node_t * rb_node, * rb_last_right, * rb_prev;
  12.725 +		
  12.726 +		rb_node = mm->mm_rb.rb_node;
  12.727 +		rb_last_right = rb_prev = NULL;
  12.728 +		vma = NULL;
  12.729 +
  12.730 +		while (rb_node) {
  12.731 +			struct vm_area_struct * vma_tmp;
  12.732 +
  12.733 +			vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
  12.734 +
  12.735 +			if (vma_tmp->vm_end > addr) {
  12.736 +				vma = vma_tmp;
  12.737 +				rb_prev = rb_last_right;
  12.738 +				if (vma_tmp->vm_start <= addr)
  12.739 +					break;
  12.740 +				rb_node = rb_node->rb_left;
  12.741 +			} else {
  12.742 +				rb_last_right = rb_node;
  12.743 +				rb_node = rb_node->rb_right;
  12.744 +			}
  12.745 +		}
  12.746 +		if (vma) {
  12.747 +			if (vma->vm_rb.rb_left) {
  12.748 +				rb_prev = vma->vm_rb.rb_left;
  12.749 +				while (rb_prev->rb_right)
  12.750 +					rb_prev = rb_prev->rb_right;
  12.751 +			}
  12.752 +			*pprev = NULL;
  12.753 +			if (rb_prev)
  12.754 +				*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
  12.755 +			if ((rb_prev ? (*pprev)->vm_next : mm->mmap) != vma)
  12.756 +				BUG();
  12.757 +			return vma;
  12.758 +		}
  12.759 +	}
  12.760 +	*pprev = NULL;
  12.761 +	return NULL;
  12.762 +}
  12.763 +
  12.764 +struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr)
  12.765 +{
  12.766 +	struct vm_area_struct * vma;
  12.767 +	unsigned long start;
  12.768 +
  12.769 +	addr &= PAGE_MASK;
  12.770 +	vma = find_vma(mm,addr);
  12.771 +	if (!vma)
  12.772 +		return NULL;
  12.773 +	if (vma->vm_start <= addr)
  12.774 +		return vma;
  12.775 +	if (!(vma->vm_flags & VM_GROWSDOWN))
  12.776 +		return NULL;
  12.777 +	start = vma->vm_start;
  12.778 +	if (expand_stack(vma, addr))
  12.779 +		return NULL;
  12.780 +	if (vma->vm_flags & VM_LOCKED) {
  12.781 +		make_pages_present(addr, start);
  12.782 +	}
  12.783 +	return vma;
  12.784 +}
  12.785 +
  12.786 +/* Normal function to fix up a mapping
  12.787 + * This function is the default for when an area has no specific
  12.788 + * function.  This may be used as part of a more specific routine.
  12.789 + * This function works out what part of an area is affected and
  12.790 + * adjusts the mapping information.  Since the actual page
  12.791 + * manipulation is done in do_mmap(), none need be done here,
  12.792 + * though it would probably be more appropriate.
  12.793 + *
  12.794 + * By the time this function is called, the area struct has been
  12.795 + * removed from the process mapping list, so it needs to be
  12.796 + * reinserted if necessary.
  12.797 + *
  12.798 + * The 4 main cases are:
  12.799 + *    Unmapping the whole area
  12.800 + *    Unmapping from the start of the segment to a point in it
  12.801 + *    Unmapping from an intermediate point to the end
  12.802 + *    Unmapping between to intermediate points, making a hole.
  12.803 + *
  12.804 + * Case 4 involves the creation of 2 new areas, for each side of
  12.805 + * the hole.  If possible, we reuse the existing area rather than
  12.806 + * allocate a new one, and the return indicates whether the old
  12.807 + * area was reused.
  12.808 + */
  12.809 +static struct vm_area_struct * unmap_fixup(struct mm_struct *mm, 
  12.810 +	struct vm_area_struct *area, unsigned long addr, size_t len, 
  12.811 +	struct vm_area_struct *extra)
  12.812 +{
  12.813 +	struct vm_area_struct *mpnt;
  12.814 +	unsigned long end = addr + len;
  12.815 +
  12.816 +	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
  12.817 +	if (area->vm_flags & VM_LOCKED)
  12.818 +		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
  12.819 +
  12.820 +	/* Unmapping the whole area. */
  12.821 +	if (addr == area->vm_start && end == area->vm_end) {
  12.822 +		if (area->vm_ops && area->vm_ops->close)
  12.823 +			area->vm_ops->close(area);
  12.824 +		if (area->vm_file)
  12.825 +			fput(area->vm_file);
  12.826 +		kmem_cache_free(vm_area_cachep, area);
  12.827 +		return extra;
  12.828 +	}
  12.829 +
  12.830 +	/* Work out to one of the ends. */
  12.831 +	if (end == area->vm_end) {
  12.832 +		/*
  12.833 +		 * here area isn't visible to the semaphore-less readers
  12.834 +		 * so we don't need to update it under the spinlock.
  12.835 +		 */
  12.836 +		area->vm_end = addr;
  12.837 +		lock_vma_mappings(area);
  12.838 +		spin_lock(&mm->page_table_lock);
  12.839 +	} else if (addr == area->vm_start) {
  12.840 +		area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT;
  12.841 +		/* same locking considerations of the above case */
  12.842 +		area->vm_start = end;
  12.843 +		lock_vma_mappings(area);
  12.844 +		spin_lock(&mm->page_table_lock);
  12.845 +	} else {
  12.846 +	/* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */
  12.847 +		/* Add end mapping -- leave beginning for below */
  12.848 +		mpnt = extra;
  12.849 +		extra = NULL;
  12.850 +
  12.851 +		mpnt->vm_mm = area->vm_mm;
  12.852 +		mpnt->vm_start = end;
  12.853 +		mpnt->vm_end = area->vm_end;
  12.854 +		mpnt->vm_page_prot = area->vm_page_prot;
  12.855 +		mpnt->vm_flags = area->vm_flags;
  12.856 +		mpnt->vm_raend = 0;
  12.857 +		mpnt->vm_ops = area->vm_ops;
  12.858 +		mpnt->vm_pgoff = area->vm_pgoff + ((end - area->vm_start) >> PAGE_SHIFT);
  12.859 +		mpnt->vm_file = area->vm_file;
  12.860 +		mpnt->vm_private_data = area->vm_private_data;
  12.861 +		if (mpnt->vm_file)
  12.862 +			get_file(mpnt->vm_file);
  12.863 +		if (mpnt->vm_ops && mpnt->vm_ops->open)
  12.864 +			mpnt->vm_ops->open(mpnt);
  12.865 +		area->vm_end = addr;	/* Truncate area */
  12.866 +
  12.867 +		/* Because mpnt->vm_file == area->vm_file this locks
  12.868 +		 * things correctly.
  12.869 +		 */
  12.870 +		lock_vma_mappings(area);
  12.871 +		spin_lock(&mm->page_table_lock);
  12.872 +		__insert_vm_struct(mm, mpnt);
  12.873 +	}
  12.874 +
  12.875 +	__insert_vm_struct(mm, area);
  12.876 +	spin_unlock(&mm->page_table_lock);
  12.877 +	unlock_vma_mappings(area);
  12.878 +	return extra;
  12.879 +}
  12.880 +
  12.881 +/*
  12.882 + * Try to free as many page directory entries as we can,
  12.883 + * without having to work very hard at actually scanning
  12.884 + * the page tables themselves.
  12.885 + *
  12.886 + * Right now we try to free page tables if we have a nice
  12.887 + * PGDIR-aligned area that got free'd up. We could be more
  12.888 + * granular if we want to, but this is fast and simple,
  12.889 + * and covers the bad cases.
  12.890 + *
  12.891 + * "prev", if it exists, points to a vma before the one
  12.892 + * we just free'd - but there's no telling how much before.
  12.893 + */
  12.894 +static void free_pgtables(struct mm_struct * mm, struct vm_area_struct *prev,
  12.895 +	unsigned long start, unsigned long end)
  12.896 +{
  12.897 +	unsigned long first = start & PGDIR_MASK;
  12.898 +	unsigned long last = end + PGDIR_SIZE - 1;
  12.899 +	unsigned long start_index, end_index;
  12.900 +
  12.901 +	if (!prev) {
  12.902 +		prev = mm->mmap;
  12.903 +		if (!prev)
  12.904 +			goto no_mmaps;
  12.905 +		if (prev->vm_end > start) {
  12.906 +			if (last > prev->vm_start)
  12.907 +				last = prev->vm_start;
  12.908 +			goto no_mmaps;
  12.909 +		}
  12.910 +	}
  12.911 +	for (;;) {
  12.912 +		struct vm_area_struct *next = prev->vm_next;
  12.913 +
  12.914 +		if (next) {
  12.915 +			if (next->vm_start < start) {
  12.916 +				prev = next;
  12.917 +				continue;
  12.918 +			}
  12.919 +			if (last > next->vm_start)
  12.920 +				last = next->vm_start;
  12.921 +		}
  12.922 +		if (prev->vm_end > first)
  12.923 +			first = prev->vm_end + PGDIR_SIZE - 1;
  12.924 +		break;
  12.925 +	}
  12.926 +no_mmaps:
  12.927 +	if (last < first)
  12.928 +		return;
  12.929 +	/*
  12.930 +	 * If the PGD bits are not consecutive in the virtual address, the
  12.931 +	 * old method of shifting the VA >> by PGDIR_SHIFT doesn't work.
  12.932 +	 */
  12.933 +	start_index = pgd_index(first);
  12.934 +	end_index = pgd_index(last);
  12.935 +	if (end_index > start_index) {
  12.936 +		clear_page_tables(mm, start_index, end_index - start_index);
  12.937 +		flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK);
  12.938 +	}
  12.939 +}
  12.940 +
  12.941 +/* Munmap is split into 2 main parts -- this part which finds
  12.942 + * what needs doing, and the areas themselves, which do the
  12.943 + * work.  This now handles partial unmappings.
  12.944 + * Jeremy Fitzhardine <jeremy@sw.oz.au>
  12.945 + */
  12.946 +int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
  12.947 +{
  12.948 +	struct vm_area_struct *mpnt, *prev, **npp, *free, *extra;
  12.949 +
  12.950 +	if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr)
  12.951 +		return -EINVAL;
  12.952 +
  12.953 +	if ((len = PAGE_ALIGN(len)) == 0)
  12.954 +		return -EINVAL;
  12.955 +
  12.956 +	/* Check if this memory area is ok - put it on the temporary
  12.957 +	 * list if so..  The checks here are pretty simple --
  12.958 +	 * every area affected in some way (by any overlap) is put
  12.959 +	 * on the list.  If nothing is put on, nothing is affected.
  12.960 +	 */
  12.961 +	mpnt = find_vma_prev(mm, addr, &prev);
  12.962 +	if (!mpnt)
  12.963 +		return 0;
  12.964 +	/* we have  addr < mpnt->vm_end  */
  12.965 +
  12.966 +	if (mpnt->vm_start >= addr+len)
  12.967 +		return 0;
  12.968 +
  12.969 +	/* If we'll make "hole", check the vm areas limit */
  12.970 +	if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len)
  12.971 +	    && mm->map_count >= max_map_count)
  12.972 +		return -ENOMEM;
  12.973 +
  12.974 +	/*
  12.975 +	 * We may need one additional vma to fix up the mappings ... 
  12.976 +	 * and this is the last chance for an easy error exit.
  12.977 +	 */
  12.978 +	extra = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
  12.979 +	if (!extra)
  12.980 +		return -ENOMEM;
  12.981 +
  12.982 +	npp = (prev ? &prev->vm_next : &mm->mmap);
  12.983 +	free = NULL;
  12.984 +	spin_lock(&mm->page_table_lock);
  12.985 +	for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) {
  12.986 +		*npp = mpnt->vm_next;
  12.987 +		mpnt->vm_next = free;
  12.988 +		free = mpnt;
  12.989 +		rb_erase(&mpnt->vm_rb, &mm->mm_rb);
  12.990 +	}
  12.991 +	mm->mmap_cache = NULL;	/* Kill the cache. */
  12.992 +	spin_unlock(&mm->page_table_lock);
  12.993 +
  12.994 +	/* Ok - we have the memory areas we should free on the 'free' list,
  12.995 +	 * so release them, and unmap the page range..
  12.996 +	 * If the one of the segments is only being partially unmapped,
  12.997 +	 * it will put new vm_area_struct(s) into the address space.
  12.998 +	 * In that case we have to be careful with VM_DENYWRITE.
  12.999 +	 */
 12.1000 +	while ((mpnt = free) != NULL) {
 12.1001 +		unsigned long st, end, size;
 12.1002 +		struct file *file = NULL;
 12.1003 +
 12.1004 +		free = free->vm_next;
 12.1005 +
 12.1006 +		st = addr < mpnt->vm_start ? mpnt->vm_start : addr;
 12.1007 +		end = addr+len;
 12.1008 +		end = end > mpnt->vm_end ? mpnt->vm_end : end;
 12.1009 +		size = end - st;
 12.1010 +
 12.1011 +		if (mpnt->vm_flags & VM_DENYWRITE &&
 12.1012 +		    (st != mpnt->vm_start || end != mpnt->vm_end) &&
 12.1013 +		    (file = mpnt->vm_file) != NULL) {
 12.1014 +			atomic_dec(&file->f_dentry->d_inode->i_writecount);
 12.1015 +		}
 12.1016 +		remove_shared_vm_struct(mpnt);
 12.1017 +		mm->map_count--;
 12.1018 +
 12.1019 +		zap_page_range(mm, st, size);
 12.1020 +
 12.1021 +		/*
 12.1022 +		 * Fix the mapping, and free the old area if it wasn't reused.
 12.1023 +		 */
 12.1024 +		extra = unmap_fixup(mm, mpnt, st, size, extra);
 12.1025 +		if (file)
 12.1026 +			atomic_inc(&file->f_dentry->d_inode->i_writecount);
 12.1027 +	}
 12.1028 +	validate_mm(mm);
 12.1029 +
 12.1030 +	/* Release the extra vma struct if it wasn't used */
 12.1031 +	if (extra)
 12.1032 +		kmem_cache_free(vm_area_cachep, extra);
 12.1033 +
 12.1034 +	free_pgtables(mm, prev, addr, addr+len);
 12.1035 +
 12.1036 +	return 0;
 12.1037 +}
 12.1038 +
 12.1039 +asmlinkage long sys_munmap(unsigned long addr, size_t len)
 12.1040 +{
 12.1041 +	int ret;
 12.1042 +	struct mm_struct *mm = current->mm;
 12.1043 +
 12.1044 +	down_write(&mm->mmap_sem);
 12.1045 +	ret = do_munmap(mm, addr, len);
 12.1046 +	up_write(&mm->mmap_sem);
 12.1047 +	return ret;
 12.1048 +}
 12.1049 +
 12.1050 +/*
 12.1051 + *  this is really a simplified "do_mmap".  it only handles
 12.1052 + *  anonymous maps.  eventually we may be able to do some
 12.1053 + *  brk-specific accounting here.
 12.1054 + */
 12.1055 +unsigned long do_brk(unsigned long addr, unsigned long len)
 12.1056 +{
 12.1057 +	struct mm_struct * mm = current->mm;
 12.1058 +	struct vm_area_struct * vma, * prev;
 12.1059 +	unsigned long flags;
 12.1060 +	rb_node_t ** rb_link, * rb_parent;
 12.1061 +
 12.1062 +	len = PAGE_ALIGN(len);
 12.1063 +	if (!len)
 12.1064 +		return addr;
 12.1065 +
 12.1066 +	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
 12.1067 +		return -EINVAL;
 12.1068 +
 12.1069 +	/*
 12.1070 +	 * mlock MCL_FUTURE?
 12.1071 +	 */
 12.1072 +	if (mm->def_flags & VM_LOCKED) {
 12.1073 +		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
 12.1074 +		locked += len;
 12.1075 +		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
 12.1076 +			return -EAGAIN;
 12.1077 +	}
 12.1078 +
 12.1079 +	/*
 12.1080 +	 * Clear old maps.  this also does some error checking for us
 12.1081 +	 */
 12.1082 + munmap_back:
 12.1083 +	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 12.1084 +	if (vma && vma->vm_start < addr + len) {
 12.1085 +		if (do_munmap(mm, addr, len))
 12.1086 +			return -ENOMEM;
 12.1087 +		goto munmap_back;
 12.1088 +	}
 12.1089 +
 12.1090 +	/* Check against address space limits *after* clearing old maps... */
 12.1091 +	if ((mm->total_vm << PAGE_SHIFT) + len
 12.1092 +	    > current->rlim[RLIMIT_AS].rlim_cur)
 12.1093 +		return -ENOMEM;
 12.1094 +
 12.1095 +	if (mm->map_count > max_map_count)
 12.1096 +		return -ENOMEM;
 12.1097 +
 12.1098 +	if (!vm_enough_memory(len >> PAGE_SHIFT))
 12.1099 +		return -ENOMEM;
 12.1100 +
 12.1101 +	flags = VM_DATA_DEFAULT_FLAGS | mm->def_flags;
 12.1102 +
 12.1103 +	/* Can we just expand an old anonymous mapping? */
 12.1104 +	if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, flags))
 12.1105 +		goto out;
 12.1106 +
 12.1107 +	/*
 12.1108 +	 * create a vma struct for an anonymous mapping
 12.1109 +	 */
 12.1110 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 12.1111 +	if (!vma)
 12.1112 +		return -ENOMEM;
 12.1113 +
 12.1114 +	vma->vm_mm = mm;
 12.1115 +	vma->vm_start = addr;
 12.1116 +	vma->vm_end = addr + len;
 12.1117 +	vma->vm_flags = flags;
 12.1118 +	vma->vm_page_prot = protection_map[flags & 0x0f];
 12.1119 +	vma->vm_ops = NULL;
 12.1120 +	vma->vm_pgoff = 0;
 12.1121 +	vma->vm_file = NULL;
 12.1122 +	vma->vm_private_data = NULL;
 12.1123 +
 12.1124 +	vma_link(mm, vma, prev, rb_link, rb_parent);
 12.1125 +
 12.1126 +out:
 12.1127 +	mm->total_vm += len >> PAGE_SHIFT;
 12.1128 +	if (flags & VM_LOCKED) {
 12.1129 +		mm->locked_vm += len >> PAGE_SHIFT;
 12.1130 +		make_pages_present(addr, addr + len);
 12.1131 +	}
 12.1132 +	return addr;
 12.1133 +}
 12.1134 +
 12.1135 +/* Build the RB tree corresponding to the VMA list. */
 12.1136 +void build_mmap_rb(struct mm_struct * mm)
 12.1137 +{
 12.1138 +	struct vm_area_struct * vma;
 12.1139 +	rb_node_t ** rb_link, * rb_parent;
 12.1140 +
 12.1141 +	mm->mm_rb = RB_ROOT;
 12.1142 +	rb_link = &mm->mm_rb.rb_node;
 12.1143 +	rb_parent = NULL;
 12.1144 +	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 12.1145 +		__vma_link_rb(mm, vma, rb_link, rb_parent);
 12.1146 +		rb_parent = &vma->vm_rb;
 12.1147 +		rb_link = &rb_parent->rb_right;
 12.1148 +	}
 12.1149 +}
 12.1150 +
 12.1151 +/* Release all mmaps. */
 12.1152 +void exit_mmap(struct mm_struct * mm)
 12.1153 +{
 12.1154 +	struct vm_area_struct * mpnt;
 12.1155 +
 12.1156 +	release_segments(mm);
 12.1157 +	spin_lock(&mm->page_table_lock);
 12.1158 +	mpnt = mm->mmap;
 12.1159 +	mm->mmap = mm->mmap_cache = NULL;
 12.1160 +	mm->mm_rb = RB_ROOT;
 12.1161 +	mm->rss = 0;
 12.1162 +	spin_unlock(&mm->page_table_lock);
 12.1163 +	mm->total_vm = 0;
 12.1164 +	mm->locked_vm = 0;
 12.1165 +
 12.1166 +	flush_cache_mm(mm);
 12.1167 +	while (mpnt) {
 12.1168 +		struct vm_area_struct * next = mpnt->vm_next;
 12.1169 +		unsigned long start = mpnt->vm_start;
 12.1170 +		unsigned long end = mpnt->vm_end;
 12.1171 +		unsigned long size = end - start;
 12.1172 +
 12.1173 +		if (mpnt->vm_ops) {
 12.1174 +			if (mpnt->vm_ops->close)
 12.1175 +				mpnt->vm_ops->close(mpnt);
 12.1176 +		}
 12.1177 +		mm->map_count--;
 12.1178 +		remove_shared_vm_struct(mpnt);
 12.1179 +		zap_page_range(mm, start, size);
 12.1180 +		if (mpnt->vm_file)
 12.1181 +			fput(mpnt->vm_file);
 12.1182 +		kmem_cache_free(vm_area_cachep, mpnt);
 12.1183 +		mpnt = next;
 12.1184 +	}
 12.1185 +
 12.1186 +	/* This is just debugging */
 12.1187 +	if (mm->map_count)
 12.1188 +		BUG();
 12.1189 +
 12.1190 +	clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
 12.1191 +
 12.1192 +	flush_tlb_mm(mm);
 12.1193 +}
 12.1194 +
 12.1195 +/* Insert vm structure into process list sorted by address
 12.1196 + * and into the inode's i_mmap ring.  If vm_file is non-NULL
 12.1197 + * then the i_shared_lock must be held here.
 12.1198 + */
 12.1199 +void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 12.1200 +{
 12.1201 +	struct vm_area_struct * __vma, * prev;
 12.1202 +	rb_node_t ** rb_link, * rb_parent;
 12.1203 +
 12.1204 +	__vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
 12.1205 +	if (__vma && __vma->vm_start < vma->vm_end)
 12.1206 +		BUG();
 12.1207 +	__vma_link(mm, vma, prev, rb_link, rb_parent);
 12.1208 +	mm->map_count++;
 12.1209 +	validate_mm(mm);
 12.1210 +}
 12.1211 +
 12.1212 +void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 12.1213 +{
 12.1214 +	struct vm_area_struct * __vma, * prev;
 12.1215 +	rb_node_t ** rb_link, * rb_parent;
 12.1216 +
 12.1217 +	__vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
 12.1218 +	if (__vma && __vma->vm_start < vma->vm_end)
 12.1219 +		BUG();
 12.1220 +	vma_link(mm, vma, prev, rb_link, rb_parent);
 12.1221 +	validate_mm(mm);
 12.1222 +}
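The two variants differ only in locking: __insert_vm_struct() goes through __vma_link() and expects the caller to already hold the relevant locks (including i_shared_lock when vm_file is set, per the comment above), while insert_vm_struct() uses vma_link(), which takes them itself. A hedged sketch of the usual calling convention, with illustrative names:

/* Hedged sketch: typical insert_vm_struct() caller, mmap_sem held for write. */
static int sketch_install_vma(struct mm_struct *mm, unsigned long addr,
			      unsigned long len, pgprot_t prot)
{
	struct vm_area_struct *vma;

	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!vma)
		return -ENOMEM;

	memset(vma, 0, sizeof(*vma));
	vma->vm_mm        = mm;
	vma->vm_start     = addr;
	vma->vm_end       = addr + len;
	vma->vm_page_prot = prot;

	down_write(&mm->mmap_sem);
	insert_vm_struct(mm, vma);
	up_write(&mm->mmap_sem);
	return 0;
}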
    13.1 --- a/linux-2.4.26-xen-sparse/mm/vmalloc.c	Fri Aug 13 15:46:15 2004 +0000
    13.2 +++ b/linux-2.4.26-xen-sparse/mm/vmalloc.c	Fri Aug 13 16:17:42 2004 +0000
    13.3 @@ -152,7 +152,7 @@ static inline int alloc_area_pmd(pmd_t *
    13.4  	return 0;
    13.5  }
    13.6  
    13.7 -static inline int __vmalloc_area_pages (unsigned long address,
    13.8 +/*static inline*/ int __vmalloc_area_pages (unsigned long address,
    13.9  					unsigned long size,
   13.10  					int gfp_mask,
   13.11  					pgprot_t prot,
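Dropping static (and inline) gives __vmalloc_area_pages() external linkage so that the 2.4 build of arch/xen/kernel/fixup.c can call it directly; the matching prototype is declared in that file later in this changeset:

/* Prototype as declared by the 2.4 branch of fixup.c (see below). */
extern int __vmalloc_area_pages(unsigned long address,
                                unsigned long size,
                                int gfp_mask,
                                pgprot_t prot,
                                struct page ***pages);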
    14.1 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c	Fri Aug 13 15:46:15 2004 +0000
    14.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c	Fri Aug 13 16:17:42 2004 +0000
    14.3 @@ -1,5 +1,6 @@
    14.4  
    14.5  #include <linux/slab.h>
    14.6 +#include <linux/version.h>
    14.7  #include <linux/mman.h>
    14.8  #include <linux/init.h>
    14.9  #include <asm/pgalloc.h>
   14.10 @@ -23,7 +24,11 @@ arch_get_unmapped_area(struct file *filp
   14.11  		    (!vma || ((addr + len) <= vma->vm_start)))
   14.12  			return addr;
   14.13  	}
   14.14 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
   14.15  	start_addr = addr = mm->free_area_cache;
   14.16 +#else
   14.17 +	addr = PAGE_ALIGN(TASK_UNMAPPED_BASE);
   14.18 +#endif
   14.19  
   14.20  full_search:
   14.21  	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
   14.22 @@ -43,7 +48,9 @@ full_search:
   14.23  			/*
   14.24  			 * Remember the place where we stopped the search:
   14.25  			 */
   14.26 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
   14.27  			mm->free_area_cache = addr + len;
   14.28 +#endif
   14.29  			return addr;
   14.30  		}
   14.31  		addr = vma->vm_end;
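mm->free_area_cache exists only on 2.6, so the 2.4 build falls back to starting the search at TASK_UNMAPPED_BASE every time. The guard relies on the standard macro from <linux/version.h>, which packs the version triple into one integer so a numeric comparison works:

/* As defined in <linux/version.h>. */
#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))
/* e.g. KERNEL_VERSION(2,6,0) == 0x020600, so any 2.4.x compares below it. */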
    15.1 --- a/linux-2.6.7-xen-sparse/arch/xen/kernel/fixup.c	Fri Aug 13 15:46:15 2004 +0000
    15.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/kernel/fixup.c	Fri Aug 13 16:17:42 2004 +0000
    15.3 @@ -25,9 +25,11 @@
    15.4  #include <linux/config.h>
    15.5  #include <linux/init.h>
    15.6  #include <linux/sched.h>
    15.7 +#include <linux/slab.h>
    15.8  #include <linux/kernel.h>
    15.9 +#include <linux/pagemap.h>
   15.10 +#include <linux/vmalloc.h>
   15.11  #include <linux/highmem.h>
   15.12 -#include <linux/vmalloc.h>
   15.13  #include <asm/fixmap.h>
   15.14  #include <asm/pgtable.h>
   15.15  #include <asm/uaccess.h>
   15.16 @@ -44,6 +46,29 @@
   15.17  #define DPRINTK(_f, _a...) ((void)0)
   15.18  #endif
   15.19  
   15.20 +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
   15.21 +#define TestSetPageLocked(_p) TryLockPage(_p)
   15.22 +#define PageAnon(_p)          0 /* no equivalent in 2.4 */
   15.23 +#define pte_offset_kernel     pte_offset
   15.24 +extern int __vmalloc_area_pages(unsigned long address,
   15.25 +                                unsigned long size,
   15.26 +                                int gfp_mask,
   15.27 +                                pgprot_t prot,
   15.28 +                                struct page ***pages);
   15.29 +#else
   15.30 +static inline int __vmalloc_area_pages(unsigned long address,
   15.31 +                                unsigned long size,
   15.32 +                                int gfp_mask,
   15.33 +                                pgprot_t prot,
   15.34 +                                struct page ***pages)
   15.35 +{
   15.36 +    struct vm_struct vma;
   15.37 +    vma.addr = (void *)address;
    15.38 +    vma.size = size + PAGE_SIZE; /* map_vm_area() expects size to include a guard page */
   15.39 +    return map_vm_area(&vma, prot, pages);
   15.40 +}
   15.41 +#endif
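On 2.4 this picks up the real __vmalloc_area_pages(), now exported from mm/vmalloc.c above; on 2.6 it is emulated with a throwaway vm_struct handed to map_vm_area(), whose convention is that the size field includes a trailing guard page (hence the extra PAGE_SIZE). A hedged usage sketch mirroring what fixup_init() does further down, names illustrative:

/* Map n pre-allocated pages read-only at a fixed user-visible address. */
static int sketch_map_fixup_pages(unsigned long uaddr, struct page **pg, int n)
{
    struct page **cursor = pg;   /* the callee advances this cursor */
    return __vmalloc_area_pages(uaddr, n << PAGE_SHIFT, 0,
                                PAGE_READONLY, &cursor);
}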
   15.42 +
   15.43  static unsigned char *fixup_buf;
   15.44  #define FIXUP_BUF_USER  PAGE_SIZE
   15.45  #define FIXUP_BUF_ORDER 1
   15.46 @@ -214,35 +239,41 @@ static unsigned int parse_insn(unsigned 
   15.47   * Mainly this function checks that our patches can't erroneously get flushed
   15.48   * to a file on disc, which would screw us after reboot!
   15.49   */
   15.50 -static int safe_to_patch(unsigned long addr)
   15.51 +#define SUCCESS 1
   15.52 +#define FAIL    0
   15.53 +static int safe_to_patch(struct mm_struct *mm, unsigned long addr)
   15.54  {
   15.55 -    struct mm_struct      *mm = current->mm;
   15.56      struct vm_area_struct *vma;
   15.57      struct file           *file;
   15.58      unsigned char          _name[30], *name;
   15.59  
   15.60      /* Always safe to patch the fixup buffer. */
   15.61      if ( addr <= (FIXUP_BUF_USER + FIXUP_BUF_SIZE) )
   15.62 -        return 1;
   15.63 -
   15.64 -    down_read(&mm->mmap_sem);
   15.65 +        return SUCCESS;
   15.66  
   15.67      if ( (vma = find_vma(current->mm, addr)) == NULL )
   15.68      {
   15.69          DPRINTK("No VMA contains fault address.");
   15.70 -        goto fail;
   15.71 +        return FAIL;
   15.72      }
   15.73  
   15.74 -    /* No backing file, so safe to patch. */
   15.75 +    /* Only patch shared libraries. */
   15.76      if ( (file = vma->vm_file) == NULL )
   15.77 -        goto success;
   15.78 +    {
   15.79 +        DPRINTK("VMA is anonymous!");
   15.80 +        return FAIL;
   15.81 +    }
   15.82  
   15.83      /* No shared mappings => nobody can dirty the file. */
    15.84      /* XXX Note the assumption that no one will dirty the file in future! */
   15.85 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
   15.86      if ( file->f_mapping->i_mmap_writable != 0 )
   15.87 +#else
   15.88 +    if ( file->f_dentry->d_inode->i_mapping->i_mmap_shared != NULL )
   15.89 +#endif
   15.90      {
   15.91          DPRINTK("Shared mappings exist.");
   15.92 -        goto fail;
   15.93 +        return FAIL;
   15.94      }
   15.95  
   15.96      /*
   15.97 @@ -251,24 +282,19 @@ static int safe_to_patch(unsigned long a
   15.98       * unlinking the old files and installing completely fresh ones. :-)
   15.99       */
  15.100      name = d_path(file->f_dentry, file->f_vfsmnt, _name, sizeof(_name));
  15.101 -    if ( strncmp("/lib/tls", name, 8) != 0 )
  15.102 +    if ( IS_ERR(name) || (strncmp("/lib/tls", name, 8) != 0) )
  15.103      {
  15.104          DPRINTK("Backing file is not in /lib/tls");
  15.105 -        goto fail;
  15.106 +        return FAIL;
  15.107      }
  15.108  
  15.109 - success:
  15.110 -    up_read(&mm->mmap_sem);
  15.111 -    return 1;
  15.112 -
  15.113 - fail:
  15.114 -    up_read(&mm->mmap_sem);
  15.115 -    return 0;
  15.116 +    return SUCCESS;
  15.117  }
  15.118  
  15.119  asmlinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
  15.120  {
  15.121      static unsigned int fixup_idx = 0;
  15.122 +    struct mm_struct *mm = current->mm;
  15.123      unsigned int fi;
  15.124      int save_indirect_reg, hash, i;
  15.125      unsigned int insn_len = (unsigned int)error_code, new_insn_len;
  15.126 @@ -288,13 +314,16 @@ asmlinkage void do_fixup_4gb_segment(str
  15.127          return;
  15.128      }
  15.129  
  15.130 -    if ( unlikely(!safe_to_patch(eip)) )
  15.131 -        return;
  15.132 +    /* Hold the mmap_sem to prevent the mapping from disappearing under us. */
  15.133 +    down_read(&mm->mmap_sem);
  15.134 +
  15.135 +    if ( unlikely(!safe_to_patch(mm, eip)) )
  15.136 +        goto out;
  15.137  
  15.138      if ( unlikely(copy_from_user(b, (void *)eip, sizeof(b)) != 0) )
  15.139      {
  15.140          DPRINTK("Could not read instruction bytes from user space.");
  15.141 -        return;
  15.142 +        goto out;
  15.143      }
  15.144  
  15.145      /* Already created a fixup for this code sequence? */
  15.146 @@ -312,7 +341,7 @@ asmlinkage void do_fixup_4gb_segment(str
  15.147          if ( !printed )
  15.148              printk(KERN_ALERT "WARNING: Out of room in segment-fixup page.\n");
  15.149          printed = 1;
  15.150 -        return;
  15.151 +        goto out;
  15.152      }
  15.153  
  15.154      /* Must be a handleable opcode with GS override. */
  15.155 @@ -320,7 +349,7 @@ asmlinkage void do_fixup_4gb_segment(str
  15.156           !test_bit((unsigned int)b[1], (unsigned long *)handleable_code) )
  15.157      {
  15.158          DPRINTK("No GS override, or not a MOV (%02x %02x).", b[0], b[1]);
  15.159 -        return;
  15.160 +        goto out;
  15.161      }
  15.162  
  15.163      modrm = b[2];
  15.164 @@ -335,7 +364,7 @@ asmlinkage void do_fixup_4gb_segment(str
  15.165      if ( rm == 4 )
  15.166      {
  15.167          DPRINTK("We don't grok SIB bytes.");
  15.168 -        return;
  15.169 +        goto out;
  15.170      }
  15.171  
  15.172      /* Ensure Mod/RM specifies (r32) or disp8(r32). */
  15.173 @@ -345,14 +374,14 @@ asmlinkage void do_fixup_4gb_segment(str
  15.174          if ( rm == 5 )
  15.175          {
  15.176              DPRINTK("Unhandleable disp32 EA %d.", rm);
  15.177 -            return;
  15.178 +            goto out;
  15.179          }
  15.180          break;            /* m32 == (r32) */
  15.181      case 1:
  15.182          break;            /* m32 == disp8(r32) */
  15.183      default:
  15.184          DPRINTK("Unhandleable Mod value %d.", mod);
  15.185 -        return;
  15.186 +        goto out;
  15.187      }
  15.188  
  15.189      /* Indirect jump pointer. */
  15.190 @@ -398,7 +427,7 @@ asmlinkage void do_fixup_4gb_segment(str
  15.191                         parse_insn(&b[insn_len], &opcode, &decode)) == 0) )
  15.192          {
  15.193              DPRINTK("Could not decode following instruction.");
  15.194 -            return;
  15.195 +            goto out;
  15.196          }
  15.197  
  15.198          if ( (decode & CODE_MASK) == JMP )
  15.199 @@ -520,7 +549,7 @@ asmlinkage void do_fixup_4gb_segment(str
  15.200                       test_bit(opcode, (unsigned long *)opcode_uses_reg) )
  15.201                  {
  15.202                      DPRINTK("Data movement to ESP unsupported.");
  15.203 -                    return;
  15.204 +                    goto out;
  15.205                  }
  15.206  
  15.207                  if ( rm == 4 )
  15.208 @@ -528,7 +557,7 @@ asmlinkage void do_fixup_4gb_segment(str
  15.209                      if ( mod == 3 )
  15.210                      {
  15.211                          DPRINTK("Data movement to ESP is unsupported.");
  15.212 -                        return;
  15.213 +                        goto out;
  15.214                      }
  15.215  
  15.216                      sib = fixup_buf[fi++] = b[insn_len++];
  15.217 @@ -585,14 +614,14 @@ asmlinkage void do_fixup_4gb_segment(str
  15.218          if ( (insn_len += new_insn_len) > 20 )
  15.219          {
  15.220              DPRINTK("Code to patch is too long!");
  15.221 -            return;
  15.222 +            goto out;
  15.223          }
  15.224  
  15.225          /* Can't have a RET in the middle of a patch sequence. */
  15.226          if ( (opcode == 0xc3) && (insn_len < PATCH_LEN) )
  15.227          {
  15.228              DPRINTK("RET in middle of patch seq!\n");
  15.229 -            return;
  15.230 +            goto out;
  15.231          }
  15.232      }
  15.233  
  15.234 @@ -601,7 +630,7 @@ asmlinkage void do_fixup_4gb_segment(str
  15.235      if ( unlikely(fe == NULL) )
  15.236      {
  15.237          DPRINTK("Not enough memory to allocate a fixup_entry.");
  15.238 -        return;
  15.239 +        goto out;
  15.240      }
  15.241      fe->patched_code_len = insn_len;
  15.242      memcpy(fe->patched_code, b, insn_len);
  15.243 @@ -619,7 +648,13 @@ asmlinkage void do_fixup_4gb_segment(str
  15.244      if ( unlikely(((eip ^ (eip + fe->patched_code_len)) & PAGE_MASK) != 0) )
  15.245      {
  15.246          DPRINTK("Patch instruction would straddle a page boundary.");
  15.247 -        return;
  15.248 +        goto out;
  15.249 +    }
  15.250 +
  15.251 +    if ( put_user(eip + PATCH_LEN, (unsigned long *)regs->esp - 1) != 0 )
  15.252 +    {
  15.253 +        DPRINTK("Failed to place return address on user stack.");
  15.254 +        goto out;
  15.255      }
  15.256  
  15.257      /* Create the patching instructions in a temporary buffer. */
  15.258 @@ -630,40 +665,56 @@ asmlinkage void do_fixup_4gb_segment(str
  15.259      for ( i = 5; i < fe->patched_code_len; i++ )
  15.260          patch[i] = 0x90; /* nop */
  15.261  
  15.262 -    /* Find the physical page that is to be patched. Check it isn't dirty. */
  15.263 +    spin_lock(&mm->page_table_lock);
  15.264 +
  15.265 +    /* Find the physical page that is to be patched. */
  15.266      pgd = pgd_offset(current->mm, eip);
  15.267      pmd = pmd_offset(pgd, eip);
  15.268      pte = pte_offset_kernel(pmd, eip);
  15.269      page = pte_page(*pte);
  15.270 -    if ( unlikely(PageDirty(page)) )
  15.271 +
  15.272 +    /*
   15.273 +     * We take the page lock to prevent the page from going AWOL under us.
   15.274 +     * A locked page might also be in the middle of being flushed to disc!
  15.275 +     */
  15.276 +    if ( unlikely(TestSetPageLocked(page)) )
  15.277      {
  15.278 -        DPRINTK("Page is already dirty.");
  15.279 -        return;
  15.280 +        DPRINTK("Page is locked.");
  15.281 +        spin_unlock(&mm->page_table_lock);
  15.282 +        goto out;
  15.283      }
  15.284  
  15.285 -    if ( put_user(eip + PATCH_LEN, (unsigned long *)regs->esp - 1) != 0 )
  15.286 +    /*
   15.287 +     * If the page is dirty it will get flushed back to disc - bad news! An
   15.288 +     * anonymous page may be recycled under our feet by another thread.
  15.289 +     */
  15.290 +    if ( unlikely(PageDirty(page)) || unlikely(PageAnon(page)) )
  15.291      {
  15.292 -        DPRINTK("Failed to place return address on user stack.");
  15.293 -        return;
  15.294 +        DPRINTK("Page is dirty or anonymous.");
  15.295 +        unlock_page(page);
  15.296 +        spin_unlock(&mm->page_table_lock);
  15.297 +        goto out;
  15.298      }
  15.299  
  15.300 +    veip = kmap(page);
  15.301 +    memcpy((char *)veip + (eip & ~PAGE_MASK), patch, fe->patched_code_len);
  15.302 +    kunmap(page);
  15.303 +
  15.304 +    unlock_page(page);
  15.305 +    spin_unlock(&mm->page_table_lock);
  15.306 +
  15.307      /* Success! Return to user land to execute 2nd insn of the pair. */
  15.308      regs->esp -= 4;
  15.309      regs->eip = FIXUP_BUF_USER + fe->return_idx;
  15.310  
  15.311 -    /* [SMP] Need to pause other threads while patching. */
  15.312 -    veip = kmap(page);
  15.313 -    memcpy((char *)veip + (eip & ~PAGE_MASK), patch, fe->patched_code_len);
  15.314 -    kunmap(page);
  15.315 -
  15.316 -    return;
  15.317 + out:
  15.318 +    up_read(&mm->mmap_sem);
  15.319  }
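The restructuring above turns every early return into a goto out, so mmap_sem, taken once near the top of the handler, is always dropped on a single exit path. The shape of that pattern as a standalone, hedged sketch (the precondition helper is hypothetical):

/* Hedged sketch of the single-exit locking pattern used above. */
static int sketch_safe_to_touch(struct mm_struct *mm, unsigned long addr)
{
    return find_vma(mm, addr) != NULL;   /* hypothetical precondition */
}

static void sketch_locked_handler(struct mm_struct *mm, unsigned long addr)
{
    down_read(&mm->mmap_sem);            /* mapping can't vanish while held */

    if ( !sketch_safe_to_touch(mm, addr) )
        goto out;

    /* ... work that relies on the mapping staying put ... */

 out:
    up_read(&mm->mmap_sem);
}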
  15.320  
  15.321  static int nosegfixup = 0;
  15.322  
  15.323  static int __init fixup_init(void)
  15.324  {
  15.325 -    struct vm_struct vma;
  15.326      struct page *_pages[1<<FIXUP_BUF_ORDER], **pages=_pages;
  15.327      int i;
  15.328  
  15.329 @@ -677,9 +728,8 @@ static int __init fixup_init(void)
  15.330      for ( i = 0; i < (1<<FIXUP_BUF_ORDER); i++ )
  15.331          _pages[i] = virt_to_page(fixup_buf) + i;
  15.332  
  15.333 -    vma.addr = (void *)FIXUP_BUF_USER;
  15.334 -    vma.size = FIXUP_BUF_SIZE + PAGE_SIZE; /* fucking stupid interface */
  15.335 -    if ( map_vm_area(&vma, PAGE_READONLY, &pages) != 0 )
  15.336 +    if ( __vmalloc_area_pages(FIXUP_BUF_USER, FIXUP_BUF_SIZE, 
  15.337 +                              0, PAGE_READONLY, &pages) != 0 )
  15.338          BUG();
  15.339  
  15.340      memset(fixup_hash, 0, sizeof(fixup_hash));