ia64/xen-unstable

changeset 2211:0667ac4c62f5

bitkeeper revision 1.1159.1.45 (411ba27bakvhuObM31GE8j9h7Ma0FA)

Merge labyrinth.cl.cam.ac.uk:/auto/groups/xeno/BK/xeno.bk
into labyrinth.cl.cam.ac.uk:/auto/anfs/scratch/labyrinth/iap10/xeno-clone/xeno.bk
author iap10@labyrinth.cl.cam.ac.uk
date Thu Aug 12 17:01:47 2004 +0000 (2004-08-12)
parents 0213aef0e364 b8884dc7fd28
children 36edd9229334
files .rootkeys linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/main.c linux-2.6.7-xen-sparse/arch/xen/Kconfig linux-2.6.7-xen-sparse/arch/xen/i386/kernel/process.c linux-2.6.7-xen-sparse/arch/xen/i386/kernel/setup.c linux-2.6.7-xen-sparse/arch/xen/i386/mm/Makefile linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c linux-2.6.7-xen-sparse/arch/xen/kernel/ctrl_if.c linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/page.h linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgalloc.h linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgtable.h linux-2.6.7-xen-sparse/mm/mmap.c tools/python/xen/lowlevel/xu/xu.c
line diff
     1.1 --- a/.rootkeys	Thu Aug 12 14:52:11 2004 +0000
     1.2 +++ b/.rootkeys	Thu Aug 12 17:01:47 2004 +0000
     1.3 @@ -159,6 +159,7 @@ 4118cc35CbY8rfGVspF5O-7EkXBEAA linux-2.6
     1.4  40f562383SKvDStdtrvzr5fyCbW4rw linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c
     1.5  40f56239xcNylAxuGsQHwi1AyMLV8w linux-2.6.7-xen-sparse/arch/xen/i386/mm/init.c
     1.6  41062ab7CjxC1UBaFhOMWWdhHkIUyg linux-2.6.7-xen-sparse/arch/xen/i386/mm/ioremap.c
     1.7 +411b9db3oFpYQc4C-_mO2lRTcSz8UQ linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c
     1.8  40f5623906UYHv1rsVUeRc0tFT0dWw linux-2.6.7-xen-sparse/arch/xen/i386/mm/pgtable.c
     1.9  4107adf12ndy94MidCaivDibJ3pPAg linux-2.6.7-xen-sparse/arch/xen/i386/pci/Makefile
    1.10  4107adf1WcCgkhsdLTRGX52cOG1vJg linux-2.6.7-xen-sparse/arch/xen/i386/pci/direct.c
    1.11 @@ -241,6 +242,7 @@ 3f108af1ylCIm82H052FVTfXACBHrw linux-2.6
    1.12  3fa8e3f0kBLeE4To2vpdi3cpJbIkbQ linux-2.6.7-xen-sparse/include/asm-xen/suspend.h
    1.13  3f689063BoW-HWV3auUJ-OqXfcGArw linux-2.6.7-xen-sparse/include/asm-xen/xen_proc.h
    1.14  40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.7-xen-sparse/mkbuildtree
    1.15 +411b9db3dpQAK-pcP8WwcRHZGn2eKg linux-2.6.7-xen-sparse/mm/mmap.c
    1.16  410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.7-xen-sparse/mm/page_alloc.c
    1.17  40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs
    1.18  3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
     2.1 --- a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/main.c	Thu Aug 12 14:52:11 2004 +0000
     2.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/main.c	Thu Aug 12 17:01:47 2004 +0000
     2.3 @@ -16,8 +16,6 @@
     2.4  #include <scsi/scsi.h>
     2.5  #include <asm/ctrl_if.h>
     2.6  
     2.7 -
     2.8 -
     2.9  typedef unsigned char byte; /* from linux/ide.h */
    2.10  
    2.11  #define BLKIF_STATE_CLOSED       0
    2.12 @@ -95,6 +93,7 @@ static inline void translate_req_to_mfn(
    2.13  static inline void flush_requests(void)
    2.14  {
    2.15      DISABLE_SCATTERGATHER();
     2.16 +    wmb(); /* Ensure that the backend can see the requests. */
    2.17      blk_ring->req_prod = req_prod;
    2.18      notify_via_evtchn(blkif_evtchn);
    2.19  }
    2.20 @@ -533,7 +532,7 @@ static void kick_pending_request_queues(
    2.21  
    2.22  static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
    2.23  {
    2.24 -    BLKIF_RING_IDX i; 
    2.25 +    BLKIF_RING_IDX i, rp; 
    2.26      unsigned long flags; 
    2.27      struct buffer_head *bh, *next_bh;
    2.28      
    2.29 @@ -541,13 +540,14 @@ static void blkif_int(int irq, void *dev
    2.30  
    2.31      if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) )
    2.32      {
    2.33 -        printk("Bailed out\n");
    2.34 -        
    2.35          spin_unlock_irqrestore(&io_request_lock, flags);
    2.36          return;
    2.37      }
    2.38  
    2.39 -    for ( i = resp_cons; i != blk_ring->resp_prod; i++ )
    2.40 +    rp = blk_ring->resp_prod;
    2.41 +    rmb(); /* Ensure we see queued responses up to 'rp'. */
    2.42 +
    2.43 +    for ( i = resp_cons; i != rp; i++ )
    2.44      {
    2.45          blkif_response_t *bret = &blk_ring->ring[MASK_BLKIF_IDX(i)].resp;
    2.46          switch ( bret->operation )
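
The hunks above show the ring discipline this changeset applies throughout (the same change recurs in ctrl_if.c, blkback.c, blkfront.c, netback.c and netfront.c below): snapshot the remote producer index once, issue rmb() before consuming entries up to that snapshot, and issue wmb() before publishing a new producer index. The following is a minimal, self-contained sketch of that pattern only; the demo_ring type and helper names are invented for illustration and are not the blkif/ctrl_if structures touched in this diff.

/*
 * Illustrative sketch: a generic single-producer/single-consumer ring
 * using the index-snapshot discipline added by this changeset.  All
 * names here are hypothetical.
 */
#include <asm/system.h>              /* rmb(), wmb() on 2.6.7-era i386 */

#define DEMO_RING_SIZE 8
#define DEMO_MASK(i)   ((i) & (DEMO_RING_SIZE - 1))

struct demo_ring {
    unsigned int req_prod;           /* written by the requester  */
    unsigned int resp_prod;          /* written by the responder  */
    int          entry[DEMO_RING_SIZE];
};

/* Responder side: consume requests only up to a snapshot of req_prod. */
static void demo_consume(struct demo_ring *r, unsigned int *req_cons)
{
    unsigned int rp = r->req_prod;   /* read the producer index once     */
    rmb();                           /* entries up to 'rp' now visible   */
    while (*req_cons != rp) {
        int req = r->entry[DEMO_MASK((*req_cons)++)];
        (void)req;                   /* ...process the request here...   */
    }
}

/* Requester side: make the entries visible before moving the index. */
static void demo_produce(struct demo_ring *r, unsigned int new_prod)
{
    wmb();                           /* entry writes before index write  */
    r->req_prod = new_prod;
}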
     3.1 --- a/linux-2.6.7-xen-sparse/arch/xen/Kconfig	Thu Aug 12 14:52:11 2004 +0000
     3.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/Kconfig	Thu Aug 12 17:01:47 2004 +0000
     3.3 @@ -36,6 +36,12 @@ config XEN_PHYSDEV_ACCESS
     3.4  	help
     3.5  	  Device-driver domain (physical device access)
     3.6  
     3.7 +config XEN_WRITABLE_PAGETABLES
     3.8 +	bool "Use writable pagetables"
     3.9 +	default n
    3.10 +	help
    3.11 +	  Use writable L1 pagetables
    3.12 +
    3.13  endmenu
    3.14  
    3.15  # Xen's block device backend driver needs 2^12 pages
     4.1 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/process.c	Thu Aug 12 14:52:11 2004 +0000
     4.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/process.c	Thu Aug 12 17:01:47 2004 +0000
     4.3 @@ -46,6 +46,8 @@
     4.4  #include <asm/i387.h>
     4.5  #include <asm/irq.h>
     4.6  #include <asm/desc.h>
     4.7 +#include <asm-xen/multicall.h>
     4.8 +#include <asm/hypervisor-ifs/dom0_ops.h>
     4.9  #ifdef CONFIG_MATH_EMULATION
    4.10  #include <asm/math_emu.h>
    4.11  #endif
    4.12 @@ -501,6 +503,7 @@ struct task_struct fastcall * __switch_t
    4.13  	int cpu = smp_processor_id();
    4.14  	struct tss_struct *tss = init_tss + cpu;
    4.15  	unsigned long flags;
    4.16 +	dom0_op_t op;
    4.17  
    4.18  	local_irq_save(flags);
    4.19  
    4.20 @@ -522,30 +525,51 @@ struct task_struct fastcall * __switch_t
    4.21  		"xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs" : : :
    4.22  		"eax" );
    4.23  
    4.24 -	flush_page_update_queue();
    4.25 +	MULTICALL_flush_page_update_queue();
    4.26  
    4.27  	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
    4.28  
    4.29 -	__unlazy_fpu(prev_p);
    4.30 +	/*
    4.31 +	 * This is basically '__unlazy_fpu', except that we queue a
    4.32 +	 * multicall to indicate FPU task switch, rather than
    4.33 +	 * synchronously trapping to Xen.
    4.34 +	 */
    4.35 +	if (prev_p->thread_info->status & TS_USEDFPU) {
    4.36 +		save_init_fpu(prev_p);
    4.37 +		queue_multicall0(__HYPERVISOR_fpu_taskswitch);
    4.38 +	}
    4.39  
    4.40  	/*
    4.41  	 * Reload esp0, LDT and the page table pointer:
    4.42 +	 * This is load_esp0(tss, next) with a multicall.
    4.43  	 */
    4.44 -	load_esp0(tss, next);
    4.45 +	tss->esp0 = next->esp0;
    4.46 +	/* This can only happen when SEP is enabled, no need to test
    4.47 +	 * "SEP"arately */
    4.48 +	if (unlikely(tss->ss1 != next->sysenter_cs)) {
    4.49 +		tss->ss1 = next->sysenter_cs;
    4.50 +		wrmsr(MSR_IA32_SYSENTER_CS, next->sysenter_cs, 0);
    4.51 +	}
    4.52 +	queue_multicall2(__HYPERVISOR_stack_switch, tss->ss0, tss->esp0);
    4.53  
    4.54  	/*
    4.55  	 * Load the per-thread Thread-Local Storage descriptor.
    4.56 +	 * This is load_TLS(next, cpu) with multicalls.
    4.57  	 */
    4.58 -	load_TLS(next, cpu);
    4.59 +#define C(i) queue_multicall3(__HYPERVISOR_update_descriptor, virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), ((u32 *)&next->tls_array[i])[0], ((u32 *)&next->tls_array[i])[1])
    4.60 +	C(0); C(1); C(2);
    4.61 +#undef C
    4.62  
    4.63  	if (start_info.flags & SIF_PRIVILEGED) {
    4.64 -		dom0_op_t op;
    4.65  		op.cmd           = DOM0_IOPL;
    4.66  		op.u.iopl.domain = DOMID_SELF;
    4.67  		op.u.iopl.iopl   = next->io_pl;
    4.68 -		HYPERVISOR_dom0_op(&op);
    4.69 +		queue_multicall1(__HYPERVISOR_dom0_op, (unsigned long)&op);
    4.70  	}
    4.71  
    4.72 +	/* EXECUTE ALL TASK SWITCH XEN SYSCALLS AT THIS POINT. */
    4.73 +	execute_multicall_list();
    4.74 +
    4.75  	local_irq_restore(flags);
    4.76  
    4.77  	/*
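
The __switch_to() changes above batch the FPU task-switch, stack-switch, TLS-descriptor and IOPL hypercalls into queued multicalls that are issued together by execute_multicall_list(), so the context switch traps into Xen once rather than once per operation. The sketch below shows the queue-then-flush idea in generic form; the structure and function names are made up for illustration and are not the interface declared in asm-xen/multicall.h.

/*
 * Illustrative sketch of a "queue now, issue once" call batcher.
 * All names and the issue() callback are hypothetical.
 */
#define BATCH_MAX 8

struct batched_call {
    unsigned long op;            /* which operation to perform          */
    unsigned long args[3];       /* up to three arguments for the call  */
};

static struct batched_call batch[BATCH_MAX];
static unsigned int batch_len;

/* Record a call instead of issuing it immediately. */
static void queue_call(unsigned long op, unsigned long a0,
                       unsigned long a1, unsigned long a2)
{
    struct batched_call *c;

    if (batch_len == BATCH_MAX)
        return;                  /* sketch only: real code would flush here */
    c = &batch[batch_len++];
    c->op      = op;
    c->args[0] = a0;
    c->args[1] = a1;
    c->args[2] = a2;
}

/* Issue everything queued so far as one batch, then reset the queue. */
static void execute_batch(void (*issue)(struct batched_call *, unsigned int))
{
    if (batch_len != 0) {
        issue(batch, batch_len);
        batch_len = 0;
    }
}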
     5.1 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/setup.c	Thu Aug 12 14:52:11 2004 +0000
     5.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/kernel/setup.c	Thu Aug 12 17:01:47 2004 +0000
     5.3 @@ -1104,7 +1104,7 @@ void __init setup_arch(char **cmdline_p)
     5.4  
     5.5  	HYPERVISOR_vm_assist(VMASST_CMD_enable,
     5.6  			     VMASST_TYPE_4gb_segments);
     5.7 -#if 0
     5.8 +#ifdef CONFIG_XEN_WRITABLE_PAGETABLES
     5.9  	HYPERVISOR_vm_assist(VMASST_CMD_enable,
    5.10  			     VMASST_TYPE_writeable_pagetables);
    5.11  #endif
     6.1 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/Makefile	Thu Aug 12 14:52:11 2004 +0000
     6.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/Makefile	Thu Aug 12 17:01:47 2004 +0000
     6.3 @@ -6,7 +6,7 @@ XENARCH	:= $(subst ",,$(CONFIG_XENARCH))
     6.4  
     6.5  CFLAGS	+= -Iarch/$(XENARCH)/mm
     6.6  
     6.7 -obj-y	:= init.o fault.o ioremap.o pgtable.o hypervisor.o
     6.8 +obj-y	:= init.o fault.o ioremap.o pgtable.o hypervisor.o mmap.o
     6.9  c-obj-y	:= extable.o pageattr.o 
    6.10  
    6.11  c-obj-$(CONFIG_DISCONTIGMEM)	+= discontig.o
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c	Thu Aug 12 17:01:47 2004 +0000
     7.3 @@ -0,0 +1,60 @@
     7.4 +
     7.5 +#include <linux/slab.h>
     7.6 +#include <linux/mman.h>
     7.7 +#include <linux/init.h>
     7.8 +#include <asm/pgalloc.h>
     7.9 +
    7.10 +unsigned long
    7.11 +arch_get_unmapped_area(struct file *filp, unsigned long addr,
    7.12 +		unsigned long len, unsigned long pgoff, unsigned long flags)
    7.13 +{
    7.14 +	struct mm_struct *mm = current->mm;
    7.15 +	struct vm_area_struct *vma;
    7.16 +	unsigned long start_addr;
    7.17 +
    7.18 +	if (len > TASK_SIZE)
    7.19 +		return -ENOMEM;
    7.20 +
    7.21 +	if (addr) {
    7.22 +		addr = PAGE_ALIGN(addr);
    7.23 +		vma = find_vma(mm, addr);
    7.24 +		if (((TASK_SIZE - len) >= addr) &&
    7.25 +		    (addr >= (FIRST_USER_PGD_NR<<PGDIR_SHIFT)) &&
    7.26 +		    (!vma || ((addr + len) <= vma->vm_start)))
    7.27 +			return addr;
    7.28 +	}
    7.29 +	start_addr = addr = mm->free_area_cache;
    7.30 +
    7.31 +full_search:
    7.32 +	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
    7.33 +		/* At this point:  (!vma || addr < vma->vm_end). */
    7.34 +		if (TASK_SIZE - len < addr) {
    7.35 +			/*
    7.36 +			 * Start a new search - just in case we missed
    7.37 +			 * some holes.
    7.38 +			 */
    7.39 +			if (start_addr != TASK_UNMAPPED_BASE) {
    7.40 +				start_addr = addr = TASK_UNMAPPED_BASE;
    7.41 +				goto full_search;
    7.42 +			}
    7.43 +			return -ENOMEM;
    7.44 +		}
    7.45 +		if (!vma || addr + len <= vma->vm_start) {
    7.46 +			/*
    7.47 +			 * Remember the place where we stopped the search:
    7.48 +			 */
    7.49 +			mm->free_area_cache = addr + len;
    7.50 +			return addr;
    7.51 +		}
    7.52 +		addr = vma->vm_end;
    7.53 +	}
    7.54 +}
    7.55 +
    7.56 +unsigned long
    7.57 +arch_check_fixed_mapping(struct file *filp, unsigned long addr,
    7.58 +		unsigned long len, unsigned long pgoff, unsigned long flags)
    7.59 +{
    7.60 +	if ( addr < (FIRST_USER_PGD_NR<<PGDIR_SHIFT) )
    7.61 +		return -EINVAL;
    7.62 +	return 0;
    7.63 +}
     8.1 --- a/linux-2.6.7-xen-sparse/arch/xen/kernel/ctrl_if.c	Thu Aug 12 14:52:11 2004 +0000
     8.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/kernel/ctrl_if.c	Thu Aug 12 17:01:47 2004 +0000
     8.3 @@ -93,8 +93,12 @@ static void __ctrl_if_tx_tasklet(unsigne
     8.4      control_if_t *ctrl_if = get_ctrl_if();
     8.5      ctrl_msg_t   *msg;
     8.6      int           was_full = TX_FULL(ctrl_if);
     8.7 +    CONTROL_RING_IDX rp;
     8.8  
     8.9 -    while ( ctrl_if_tx_resp_cons != ctrl_if->tx_resp_prod )
    8.10 +    rp = ctrl_if->tx_resp_prod;
     8.11 +    rmb(); /* Ensure we see all responses up to 'rp'. */
    8.12 +
    8.13 +    while ( ctrl_if_tx_resp_cons != rp )
    8.14      {
    8.15          msg = &ctrl_if->tx_ring[MASK_CONTROL_IDX(ctrl_if_tx_resp_cons)];
    8.16  
    8.17 @@ -132,8 +136,12 @@ static void __ctrl_if_tx_tasklet(unsigne
    8.18  static void __ctrl_if_rxmsg_deferred(void *unused)
    8.19  {
    8.20      ctrl_msg_t *msg;
    8.21 +    CONTROL_RING_IDX dp;
    8.22  
    8.23 -    while ( ctrl_if_rxmsg_deferred_cons != ctrl_if_rxmsg_deferred_prod )
    8.24 +    dp = ctrl_if_rxmsg_deferred_prod;
    8.25 +    rmb(); /* Ensure we see all deferred requests up to 'dp'. */
    8.26 +
    8.27 +    while ( ctrl_if_rxmsg_deferred_cons != dp )
    8.28      {
    8.29          msg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
    8.30              ctrl_if_rxmsg_deferred_cons++)];
    8.31 @@ -145,8 +153,13 @@ static void __ctrl_if_rx_tasklet(unsigne
    8.32  {
    8.33      control_if_t *ctrl_if = get_ctrl_if();
    8.34      ctrl_msg_t    msg, *pmsg;
    8.35 +    CONTROL_RING_IDX rp, dp;
    8.36  
    8.37 -    while ( ctrl_if_rx_req_cons != ctrl_if->rx_req_prod )
    8.38 +    dp = ctrl_if_rxmsg_deferred_prod;
    8.39 +    rp = ctrl_if->rx_req_prod;
    8.40 +    rmb(); /* Ensure we see all requests up to 'rp'. */
    8.41 +
    8.42 +    while ( ctrl_if_rx_req_cons != rp )
    8.43      {
    8.44          pmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)];
    8.45          memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg));
    8.46 @@ -161,20 +174,21 @@ static void __ctrl_if_rx_tasklet(unsigne
    8.47  
    8.48          if ( test_bit(msg.type, 
    8.49                        (unsigned long *)&ctrl_if_rxmsg_blocking_context) )
    8.50 -        {
    8.51 -            pmsg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
    8.52 -                ctrl_if_rxmsg_deferred_prod++)];
    8.53 -            memcpy(pmsg, &msg, offsetof(ctrl_msg_t, msg) + msg.length);
    8.54 +            memcpy(&ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(dp++)],
    8.55 +                   &msg, offsetof(ctrl_msg_t, msg) + msg.length);
    8.56 +        else
    8.57 +            (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0);
    8.58 +    }
    8.59 +
    8.60 +    if ( dp != ctrl_if_rxmsg_deferred_prod )
    8.61 +    {
    8.62 +        wmb();
    8.63 +        ctrl_if_rxmsg_deferred_prod = dp;
    8.64  #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
    8.65 -            schedule_task(&ctrl_if_rxmsg_deferred_tq);
    8.66 +        schedule_task(&ctrl_if_rxmsg_deferred_tq);
    8.67  #else
    8.68 -            schedule_work(&ctrl_if_rxmsg_deferred_work);
    8.69 +        schedule_work(&ctrl_if_rxmsg_deferred_work);
    8.70  #endif
    8.71 -        }
    8.72 -        else
    8.73 -        {
    8.74 -            (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0);
    8.75 -        }
    8.76      }
    8.77  }
    8.78  
     9.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c	Thu Aug 12 14:52:11 2004 +0000
     9.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c	Thu Aug 12 17:01:47 2004 +0000
     9.3 @@ -268,13 +268,15 @@ static int do_block_io_op(blkif_t *blkif
     9.4  {
     9.5      blkif_ring_t *blk_ring = blkif->blk_ring_base;
     9.6      blkif_request_t *req;
     9.7 -    BLKIF_RING_IDX i;
     9.8 +    BLKIF_RING_IDX i, rp;
     9.9      int more_to_do = 0;
    9.10  
    9.11 +    rp = blk_ring->req_prod;
    9.12 +    rmb(); /* Ensure we see queued requests up to 'rp'. */
    9.13 +
    9.14      /* Take items off the comms ring, taking care not to overflow. */
    9.15      for ( i = blkif->blk_req_cons; 
    9.16 -          (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) != 
    9.17 -                                        BLKIF_RING_SIZE);
    9.18 +          (i != rp) && ((i-blkif->blk_resp_prod) != BLKIF_RING_SIZE);
    9.19            i++ )
    9.20      {
    9.21          if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
    9.22 @@ -533,7 +535,7 @@ static void make_response(blkif_t *blkif
    9.23      resp->id        = id;
    9.24      resp->operation = op;
    9.25      resp->status    = st;
    9.26 -    wmb();
    9.27 +    wmb(); /* Ensure other side can see the response fields. */
    9.28      blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod;
    9.29      spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
    9.30  
    10.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c	Thu Aug 12 14:52:11 2004 +0000
    10.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c	Thu Aug 12 17:01:47 2004 +0000
    10.3 @@ -82,6 +82,7 @@ static inline void translate_req_to_mfn(
    10.4  
    10.5  static inline void flush_requests(void)
    10.6  {
     10.7 +    wmb(); /* Ensure that the backend can see the requests. */
    10.8      blk_ring->req_prod = req_prod;
    10.9      notify_via_evtchn(blkif_evtchn);
   10.10  }
   10.11 @@ -363,34 +364,39 @@ static irqreturn_t blkif_int(int irq, vo
   10.12  {
   10.13      struct request *req;
   10.14      blkif_response_t *bret;
   10.15 -    BLKIF_RING_IDX i; 
   10.16 +    BLKIF_RING_IDX i, rp;
   10.17      unsigned long flags; 
   10.18  
   10.19      spin_lock_irqsave(&blkif_io_lock, flags);     
   10.20  
   10.21 -    if (unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery)) {
   10.22 -        printk("Bailed out\n");
   10.23 -        
   10.24 +    if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) || 
   10.25 +         unlikely(recovery) )
   10.26 +    {
   10.27          spin_unlock_irqrestore(&blkif_io_lock, flags);
   10.28          return IRQ_HANDLED;
   10.29      }
   10.30  
   10.31 -    for (i = resp_cons; i != blk_ring->resp_prod; i++) {
   10.32 +    rp = blk_ring->resp_prod;
   10.33 +    rmb(); /* Ensure we see queued responses up to 'rp'. */
   10.34 +
   10.35 +    for ( i = resp_cons; i != rp; i++ )
   10.36 +    {
   10.37          bret = &blk_ring->ring[MASK_BLKIF_IDX(i)].resp;
   10.38 -        switch (bret->operation) {
   10.39 +        switch ( bret->operation )
   10.40 +        {
   10.41          case BLKIF_OP_READ:
   10.42          case BLKIF_OP_WRITE:
   10.43 -            if (unlikely(bret->status != BLKIF_RSP_OKAY))
   10.44 +            if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
   10.45                  DPRINTK("Bad return from blkdev data request: %lx\n",
   10.46                          bret->status);
   10.47              req = (struct request *)bret->id;
   10.48 -            /* XXXcl pass up status */
   10.49 -            if (unlikely(end_that_request_first(req, 1,
   10.50 -                                                req->hard_nr_sectors)))
   10.51 +            if ( unlikely(end_that_request_first
   10.52 +                          (req, 
   10.53 +                           (bret->status != BLKIF_RSP_OKAY),
   10.54 +                           req->hard_nr_sectors)) )
   10.55                  BUG();
   10.56 -
   10.57              end_that_request_last(req);
   10.58 -            blkif_completion( bret, req );
   10.59 +            blkif_completion(bret, req);
   10.60              break;
   10.61          case BLKIF_OP_PROBE:
   10.62              memcpy(&blkif_control_rsp, bret, sizeof(*bret));
   10.63 @@ -404,8 +410,9 @@ static irqreturn_t blkif_int(int irq, vo
   10.64      resp_cons = i;
   10.65      resp_cons_rec = i;
   10.66  
   10.67 -    if (xlbd_blk_queue &&
   10.68 -        test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags)) {
   10.69 +    if ( (xlbd_blk_queue != NULL) &&
   10.70 +         test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags) )
   10.71 +    {
   10.72          blk_start_queue(xlbd_blk_queue);
   10.73          /* XXXcl call to request_fn should not be needed but
   10.74           * we get stuck without...  needs investigating
    11.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c	Thu Aug 12 14:52:11 2004 +0000
    11.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c	Thu Aug 12 17:01:47 2004 +0000
    11.3 @@ -446,6 +446,7 @@ static void net_tx_action(unsigned long 
    11.4              netif_put(netif);
    11.5              continue;
    11.6          }
    11.7 +        rmb(); /* Ensure that we see the request. */
    11.8          memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, 
    11.9                 sizeof(txreq));
   11.10          netif->tx_req_cons++;
    12.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c	Thu Aug 12 14:52:11 2004 +0000
    12.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c	Thu Aug 12 17:01:47 2004 +0000
    12.3 @@ -118,10 +118,8 @@ static void netctrl_init(void)
    12.4   */
    12.5  static int netctrl_err(int err)
    12.6  {
    12.7 -    if(err < 0 && !netctrl.err){
    12.8 +    if ( (err < 0) && !netctrl.err )
    12.9          netctrl.err = err;
   12.10 -        printk(KERN_WARNING "%s> err=%d\n", __FUNCTION__, err);
   12.11 -    }
   12.12      return netctrl.err;
   12.13  }
   12.14  
   12.15 @@ -177,7 +175,6 @@ static int network_open(struct net_devic
   12.16      return 0;
   12.17  }
   12.18  
   12.19 -
   12.20  static void network_tx_buf_gc(struct net_device *dev)
   12.21  {
   12.22      NETIF_RING_IDX i, prod;
   12.23 @@ -190,6 +187,7 @@ static void network_tx_buf_gc(struct net
   12.24  
   12.25      do {
   12.26          prod = np->tx->resp_prod;
    12.27 +        rmb(); /* Ensure we see responses up to 'prod'. */
   12.28  
   12.29          for ( i = np->tx_resp_cons; i != prod; i++ )
   12.30          {
   12.31 @@ -295,6 +293,7 @@ static void network_alloc_rx_buffers(str
   12.32      if ( rx_mcl[nr_pfns].args[5] != nr_pfns )
   12.33          panic("Unable to reduce memory reservation\n");
   12.34  
   12.35 +    /* Above is a suitable barrier to ensure backend will see requests. */
   12.36      np->rx->req_prod = i;
   12.37  }
   12.38  
   12.39 @@ -344,7 +343,7 @@ static int network_start_xmit(struct sk_
   12.40      tx->addr = virt_to_machine(skb->data);
   12.41      tx->size = skb->len;
   12.42  
   12.43 -    wmb();
   12.44 +    wmb(); /* Ensure that backend will see the request. */
   12.45      np->tx->req_prod = i + 1;
   12.46  
   12.47      network_tx_buf_gc(dev);
   12.48 @@ -392,7 +391,7 @@ static int netif_poll(struct net_device 
   12.49      struct net_private *np = dev->priv;
   12.50      struct sk_buff *skb;
   12.51      netif_rx_response_t *rx;
   12.52 -    NETIF_RING_IDX i;
   12.53 +    NETIF_RING_IDX i, rp;
   12.54      mmu_update_t *mmu = rx_mmu;
   12.55      multicall_entry_t *mcl = rx_mcl;
   12.56      int work_done, budget, more_to_do = 1;
   12.57 @@ -412,8 +411,11 @@ static int netif_poll(struct net_device 
   12.58      if ( (budget = *pbudget) > dev->quota )
   12.59          budget = dev->quota;
   12.60  
   12.61 +    rp = np->rx->resp_prod;
   12.62 +    rmb(); /* Ensure we see queued responses up to 'rp'. */
   12.63 +
   12.64      for ( i = np->rx_resp_cons, work_done = 0; 
   12.65 -          (i != np->rx->resp_prod) && (work_done < budget); 
   12.66 +          (i != rp) && (work_done < budget); 
   12.67            i++, work_done++ )
   12.68      {
   12.69          rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
   12.70 @@ -904,9 +906,8 @@ void netif_suspend(void)
   12.71  
   12.72  void netif_resume(void)
   12.73  {
   12.74 -    ctrl_msg_t                       cmsg;
   12.75 -    netif_fe_interface_connect_t     up;
   12.76 -//    netif_fe_driver_status_changed_t   st;
   12.77 +    ctrl_msg_t                   cmsg;
   12.78 +    netif_fe_interface_connect_t up;
   12.79      struct net_device *dev = NULL;
   12.80      struct net_private *np = NULL;
   12.81      int i;
    13.1 --- a/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/page.h	Thu Aug 12 14:52:11 2004 +0000
    13.2 +++ b/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/page.h	Thu Aug 12 17:01:47 2004 +0000
    13.3 @@ -95,7 +95,7 @@ typedef struct { unsigned long pgprot; }
    13.4  static inline unsigned long pmd_val(pmd_t x)
    13.5  {
    13.6      unsigned long ret = x.pmd;
    13.7 -    if ( (ret & 1) ) ret = machine_to_phys(ret);
    13.8 +    if ( (ret) ) ret = machine_to_phys(ret);
    13.9      return ret;
   13.10  }
   13.11  #define pgd_val(x)	({ BUG(); (unsigned long)0; })
    14.1 --- a/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgalloc.h	Thu Aug 12 14:52:11 2004 +0000
    14.2 +++ b/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgalloc.h	Thu Aug 12 17:01:47 2004 +0000
    14.3 @@ -6,6 +6,7 @@
    14.4  #include <asm/fixmap.h>
    14.5  #include <linux/threads.h>
    14.6  #include <linux/mm.h>		/* for struct page */
    14.7 +#include <asm/io.h>		/* for phys_to_virt and page_to_pseudophys */
    14.8  
    14.9  #define pmd_populate_kernel(mm, pmd, pte) \
   14.10  		set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
   14.11 @@ -15,7 +16,8 @@ static inline void pmd_populate(struct m
   14.12  	set_pmd(pmd, __pmd(_PAGE_TABLE +
   14.13  		((unsigned long long)page_to_pfn(pte) <<
   14.14  			(unsigned long long) PAGE_SHIFT)));
   14.15 -	flush_page_update_queue(); /* XXXcl flush */
   14.16 +	flush_page_update_queue();
   14.17 +	/* XXXcl queue */
   14.18  }
   14.19  /*
   14.20   * Allocate and free page tables.
   14.21 @@ -30,17 +32,25 @@ extern struct page *pte_alloc_one(struct
   14.22  static inline void pte_free_kernel(pte_t *pte)
   14.23  {
   14.24  	free_page((unsigned long)pte);
   14.25 +	__make_page_writeable(pte);
   14.26  }
   14.27  
   14.28  static inline void pte_free(struct page *pte)
   14.29  {
   14.30 -	__free_page(pte);
   14.31 +#ifdef CONFIG_HIGHPTE
   14.32 +	if (pte < highmem_start_page)
   14.33 +#endif
   14.34 +	{
   14.35 +		__make_page_writeable(phys_to_virt(page_to_pseudophys(pte)));
   14.36 +		__free_page(pte);
   14.37 +	}
   14.38  }
   14.39  
   14.40  
   14.41  #define __pte_free_tlb(tlb,pte) do {			\
   14.42  	tlb_remove_page((tlb),(pte));			\
   14.43 -	flush_page_update_queue(); /* XXXcl flush */	\
   14.44 +	flush_page_update_queue();			\
   14.45 +	/* XXXcl queue */ \
   14.46  } while (0)
   14.47  
   14.48  /*
    15.1 --- a/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h	Thu Aug 12 14:52:11 2004 +0000
    15.2 +++ b/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h	Thu Aug 12 17:01:47 2004 +0000
    15.3 @@ -40,8 +40,13 @@ static inline int pgd_present(pgd_t pgd)
    15.4   * within a page table are directly modified.  Thus, the following
    15.5   * hook is made available.
    15.6   */
    15.7 +#ifdef CONFIG_XEN_WRITABLE_PAGETABLES
    15.8 +#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
    15.9 +#define set_pte_atomic(pteptr, pteval) (*(pteptr) = pteval)
   15.10 +#else
   15.11  #define set_pte(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
   15.12  #define set_pte_atomic(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
   15.13 +#endif
   15.14  /*
   15.15   * (pmds are folded into pgds so this doesn't get actually called,
   15.16   * but the define is needed for a generic inline function.)
   15.17 @@ -70,7 +75,7 @@ static inline pte_t ptep_get_and_clear(p
   15.18  {
   15.19  	pte_t pte = *xp;
   15.20  	if (pte.pte_low)
   15.21 -		queue_l1_entry_update(xp, 0);
   15.22 +		set_pte(xp, __pte_ma(0));
   15.23  	return pte;
   15.24  }
   15.25  
    16.1 --- a/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Thu Aug 12 14:52:11 2004 +0000
    16.2 +++ b/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Thu Aug 12 17:01:47 2004 +0000
    16.3 @@ -191,7 +191,7 @@ extern unsigned long pg0[];
    16.4  #define pmd_none(x)	(!pmd_val(x))
    16.5  #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
    16.6  /* pmd_clear below */
    16.7 -#define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
    16.8 +#define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
    16.9  
   16.10  
   16.11  #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
   16.12 @@ -241,9 +241,9 @@ static inline  int ptep_test_and_clear_y
   16.13  }
   16.14  static inline void ptep_set_wrprotect(pte_t *ptep)
   16.15  {
   16.16 -	unsigned long pteval = *(unsigned long *)ptep;
   16.17 -	if ((pteval & _PAGE_RW))
   16.18 -		queue_l1_entry_update(ptep, pteval & ~_PAGE_RW);
   16.19 +	pte_t pte = *ptep;
   16.20 +	if (pte_write(pte))
   16.21 +		set_pte(ptep, pte_wrprotect(pte));
   16.22  }
   16.23  static inline void ptep_mkdirty(pte_t *ptep)
   16.24  {
   16.25 @@ -283,6 +283,7 @@ static inline pte_t pte_modify(pte_t pte
   16.26  	pmd_t p = *(xp);					\
   16.27  	set_pmd(xp, __pmd(0));					\
   16.28  	__make_page_writeable((void *)pmd_page_kernel(p));	\
   16.29 +	/* XXXcl queue */ \
   16.30  } while (0)
   16.31  
   16.32  #ifndef CONFIG_DISCONTIGMEM
   16.33 @@ -401,6 +402,7 @@ static inline void make_page_readonly(vo
   16.34  	if ( (unsigned long)va >= VMALLOC_START )
   16.35  		__make_page_readonly(machine_to_virt(
   16.36  			*(unsigned long *)pte&PAGE_MASK));
   16.37 +	/* XXXcl queue */
   16.38  }
   16.39  
   16.40  static inline void make_page_writeable(void *va)
   16.41 @@ -412,6 +414,7 @@ static inline void make_page_writeable(v
   16.42  	if ( (unsigned long)va >= VMALLOC_START )
   16.43  		__make_page_writeable(machine_to_virt(
   16.44  			*(unsigned long *)pte&PAGE_MASK));
   16.45 +	/* XXXcl queue */
   16.46  }
   16.47  
   16.48  static inline void make_pages_readonly(void *va, unsigned int nr)
   16.49 @@ -421,6 +424,7 @@ static inline void make_pages_readonly(v
   16.50  		make_page_readonly(va);
   16.51  		va = (void *)((unsigned long)va + PAGE_SIZE);
   16.52  	}
   16.53 +	/* XXXcl queue */
   16.54  }
   16.55  
   16.56  static inline void make_pages_writeable(void *va, unsigned int nr)
   16.57 @@ -430,6 +434,7 @@ static inline void make_pages_writeable(
   16.58  		make_page_writeable(va);
   16.59  		va = (void *)((unsigned long)va + PAGE_SIZE);
   16.60  	}
   16.61 +	/* XXXcl queue */
   16.62  }
   16.63  
   16.64  static inline unsigned long arbitrary_virt_to_phys(void *va)
   16.65 @@ -457,4 +462,7 @@ static inline unsigned long arbitrary_vi
   16.66  #define __HAVE_ARCH_PTE_SAME
   16.67  #include <asm-generic/pgtable.h>
   16.68  
   16.69 +#define HAVE_ARCH_UNMAPPED_AREA
   16.70 +#define HAVE_ARCH_CHECK_FIXED_MAPPING
   16.71 +
   16.72  #endif /* _I386_PGTABLE_H */
    17.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.2 +++ b/linux-2.6.7-xen-sparse/mm/mmap.c	Thu Aug 12 17:01:47 2004 +0000
    17.3 @@ -0,0 +1,1816 @@
    17.4 +/*
    17.5 + * mm/mmap.c
    17.6 + *
    17.7 + * Written by obz.
    17.8 + *
    17.9 + * Address space accounting code	<alan@redhat.com>
   17.10 + */
   17.11 +
   17.12 +#include <linux/slab.h>
   17.13 +#include <linux/shm.h>
   17.14 +#include <linux/mman.h>
   17.15 +#include <linux/pagemap.h>
   17.16 +#include <linux/swap.h>
   17.17 +#include <linux/syscalls.h>
   17.18 +#include <linux/init.h>
   17.19 +#include <linux/file.h>
   17.20 +#include <linux/fs.h>
   17.21 +#include <linux/personality.h>
   17.22 +#include <linux/security.h>
   17.23 +#include <linux/hugetlb.h>
   17.24 +#include <linux/profile.h>
   17.25 +#include <linux/module.h>
   17.26 +#include <linux/mount.h>
   17.27 +#include <linux/mempolicy.h>
   17.28 +#include <linux/rmap.h>
   17.29 +
   17.30 +#include <asm/uaccess.h>
   17.31 +#include <asm/pgalloc.h>
   17.32 +#include <asm/cacheflush.h>
   17.33 +#include <asm/tlb.h>
   17.34 +
   17.35 +/*
   17.36 + * WARNING: the debugging will use recursive algorithms so never enable this
   17.37 + * unless you know what you are doing.
   17.38 + */
   17.39 +#undef DEBUG_MM_RB
   17.40 +
   17.41 +/* description of effects of mapping type and prot in current implementation.
   17.42 + * this is due to the limited x86 page protection hardware.  The expected
   17.43 + * behavior is in parens:
   17.44 + *
   17.45 + * map_type	prot
   17.46 + *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
   17.47 + * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
   17.48 + *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
   17.49 + *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
   17.50 + *		
   17.51 + * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
   17.52 + *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
   17.53 + *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
   17.54 + *
   17.55 + */
   17.56 +pgprot_t protection_map[16] = {
   17.57 +	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
   17.58 +	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
   17.59 +};
   17.60 +
   17.61 +int sysctl_overcommit_memory = 0;	/* default is heuristic overcommit */
   17.62 +int sysctl_overcommit_ratio = 50;	/* default is 50% */
   17.63 +int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
   17.64 +atomic_t vm_committed_space = ATOMIC_INIT(0);
   17.65 +
   17.66 +EXPORT_SYMBOL(sysctl_overcommit_memory);
   17.67 +EXPORT_SYMBOL(sysctl_overcommit_ratio);
   17.68 +EXPORT_SYMBOL(sysctl_max_map_count);
   17.69 +EXPORT_SYMBOL(vm_committed_space);
   17.70 +
   17.71 +/*
   17.72 + * Requires inode->i_mapping->i_mmap_lock
   17.73 + */
   17.74 +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
   17.75 +		struct file *file, struct address_space *mapping)
   17.76 +{
   17.77 +	if (vma->vm_flags & VM_DENYWRITE)
   17.78 +		atomic_inc(&file->f_dentry->d_inode->i_writecount);
   17.79 +	if (vma->vm_flags & VM_SHARED)
   17.80 +		mapping->i_mmap_writable--;
   17.81 +
   17.82 +	flush_dcache_mmap_lock(mapping);
   17.83 +	if (unlikely(vma->vm_flags & VM_NONLINEAR))
   17.84 +		list_del_init(&vma->shared.vm_set.list);
   17.85 +	else
   17.86 +		vma_prio_tree_remove(vma, &mapping->i_mmap);
   17.87 +	flush_dcache_mmap_unlock(mapping);
   17.88 +}
   17.89 +
   17.90 +/*
   17.91 + * Remove one vm structure and free it.
   17.92 + */
   17.93 +static void remove_vm_struct(struct vm_area_struct *vma)
   17.94 +{
   17.95 +	struct file *file = vma->vm_file;
   17.96 +
   17.97 +	if (file) {
   17.98 +		struct address_space *mapping = file->f_mapping;
   17.99 +		spin_lock(&mapping->i_mmap_lock);
  17.100 +		__remove_shared_vm_struct(vma, file, mapping);
  17.101 +		spin_unlock(&mapping->i_mmap_lock);
  17.102 +	}
  17.103 +	if (vma->vm_ops && vma->vm_ops->close)
  17.104 +		vma->vm_ops->close(vma);
  17.105 +	if (file)
  17.106 +		fput(file);
  17.107 +	anon_vma_unlink(vma);
  17.108 +	mpol_free(vma_policy(vma));
  17.109 +	kmem_cache_free(vm_area_cachep, vma);
  17.110 +}
  17.111 +
  17.112 +/*
  17.113 + *  sys_brk() for the most part doesn't need the global kernel
  17.114 + *  lock, except when an application is doing something nasty
  17.115 + *  like trying to un-brk an area that has already been mapped
  17.116 + *  to a regular file.  in this case, the unmapping will need
  17.117 + *  to invoke file system routines that need the global lock.
  17.118 + */
  17.119 +asmlinkage unsigned long sys_brk(unsigned long brk)
  17.120 +{
  17.121 +	unsigned long rlim, retval;
  17.122 +	unsigned long newbrk, oldbrk;
  17.123 +	struct mm_struct *mm = current->mm;
  17.124 +
  17.125 +	down_write(&mm->mmap_sem);
  17.126 +
  17.127 +	if (brk < mm->end_code)
  17.128 +		goto out;
  17.129 +	newbrk = PAGE_ALIGN(brk);
  17.130 +	oldbrk = PAGE_ALIGN(mm->brk);
  17.131 +	if (oldbrk == newbrk)
  17.132 +		goto set_brk;
  17.133 +
  17.134 +	/* Always allow shrinking brk. */
  17.135 +	if (brk <= mm->brk) {
  17.136 +		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
  17.137 +			goto set_brk;
  17.138 +		goto out;
  17.139 +	}
  17.140 +
  17.141 +	/* Check against rlimit.. */
  17.142 +	rlim = current->rlim[RLIMIT_DATA].rlim_cur;
  17.143 +	if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
  17.144 +		goto out;
  17.145 +
  17.146 +	/* Check against existing mmap mappings. */
  17.147 +	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
  17.148 +		goto out;
  17.149 +
  17.150 +	/* Ok, looks good - let it rip. */
  17.151 +	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
  17.152 +		goto out;
  17.153 +set_brk:
  17.154 +	mm->brk = brk;
  17.155 +out:
  17.156 +	retval = mm->brk;
  17.157 +	up_write(&mm->mmap_sem);
  17.158 +	return retval;
  17.159 +}
  17.160 +
  17.161 +#ifdef DEBUG_MM_RB
  17.162 +static int browse_rb(struct rb_root *root)
  17.163 +{
  17.164 +	int i = 0, j;
  17.165 +	struct rb_node *nd, *pn = NULL;
  17.166 +	unsigned long prev = 0, pend = 0;
  17.167 +
  17.168 +	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
  17.169 +		struct vm_area_struct *vma;
  17.170 +		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
  17.171 +		if (vma->vm_start < prev)
  17.172 +			printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
  17.173 +		if (vma->vm_start < pend)
  17.174 +			printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
  17.175 +		if (vma->vm_start > vma->vm_end)
  17.176 +			printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
  17.177 +		i++;
  17.178 +		pn = nd;
  17.179 +	}
  17.180 +	j = 0;
  17.181 +	for (nd = pn; nd; nd = rb_prev(nd)) {
  17.182 +		j++;
  17.183 +	}
  17.184 +	if (i != j)
  17.185 +		printk("backwards %d, forwards %d\n", j, i), i = 0;
  17.186 +	return i;
  17.187 +}
  17.188 +
  17.189 +void validate_mm(struct mm_struct *mm)
  17.190 +{
  17.191 +	int bug = 0;
  17.192 +	int i = 0;
  17.193 +	struct vm_area_struct *tmp = mm->mmap;
  17.194 +	while (tmp) {
  17.195 +		tmp = tmp->vm_next;
  17.196 +		i++;
  17.197 +	}
  17.198 +	if (i != mm->map_count)
  17.199 +		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
  17.200 +	i = browse_rb(&mm->mm_rb);
  17.201 +	if (i != mm->map_count)
  17.202 +		printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
  17.203 +	if (bug)
  17.204 +		BUG();
  17.205 +}
  17.206 +#else
  17.207 +#define validate_mm(mm) do { } while (0)
  17.208 +#endif
  17.209 +
  17.210 +static struct vm_area_struct *
  17.211 +find_vma_prepare(struct mm_struct *mm, unsigned long addr,
  17.212 +		struct vm_area_struct **pprev, struct rb_node ***rb_link,
  17.213 +		struct rb_node ** rb_parent)
  17.214 +{
  17.215 +	struct vm_area_struct * vma;
  17.216 +	struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
  17.217 +
  17.218 +	__rb_link = &mm->mm_rb.rb_node;
  17.219 +	rb_prev = __rb_parent = NULL;
  17.220 +	vma = NULL;
  17.221 +
  17.222 +	while (*__rb_link) {
  17.223 +		struct vm_area_struct *vma_tmp;
  17.224 +
  17.225 +		__rb_parent = *__rb_link;
  17.226 +		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
  17.227 +
  17.228 +		if (vma_tmp->vm_end > addr) {
  17.229 +			vma = vma_tmp;
  17.230 +			if (vma_tmp->vm_start <= addr)
  17.231 +				return vma;
  17.232 +			__rb_link = &__rb_parent->rb_left;
  17.233 +		} else {
  17.234 +			rb_prev = __rb_parent;
  17.235 +			__rb_link = &__rb_parent->rb_right;
  17.236 +		}
  17.237 +	}
  17.238 +
  17.239 +	*pprev = NULL;
  17.240 +	if (rb_prev)
  17.241 +		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
  17.242 +	*rb_link = __rb_link;
  17.243 +	*rb_parent = __rb_parent;
  17.244 +	return vma;
  17.245 +}
  17.246 +
  17.247 +static inline void
  17.248 +__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
  17.249 +		struct vm_area_struct *prev, struct rb_node *rb_parent)
  17.250 +{
  17.251 +	if (prev) {
  17.252 +		vma->vm_next = prev->vm_next;
  17.253 +		prev->vm_next = vma;
  17.254 +	} else {
  17.255 +		mm->mmap = vma;
  17.256 +		if (rb_parent)
  17.257 +			vma->vm_next = rb_entry(rb_parent,
  17.258 +					struct vm_area_struct, vm_rb);
  17.259 +		else
  17.260 +			vma->vm_next = NULL;
  17.261 +	}
  17.262 +}
  17.263 +
  17.264 +void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
  17.265 +		struct rb_node **rb_link, struct rb_node *rb_parent)
  17.266 +{
  17.267 +	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
  17.268 +	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
  17.269 +}
  17.270 +
  17.271 +static inline void __vma_link_file(struct vm_area_struct *vma)
  17.272 +{
  17.273 +	struct file * file;
  17.274 +
  17.275 +	file = vma->vm_file;
  17.276 +	if (file) {
  17.277 +		struct address_space *mapping = file->f_mapping;
  17.278 +
  17.279 +		if (vma->vm_flags & VM_DENYWRITE)
  17.280 +			atomic_dec(&file->f_dentry->d_inode->i_writecount);
  17.281 +		if (vma->vm_flags & VM_SHARED)
  17.282 +			mapping->i_mmap_writable++;
  17.283 +
  17.284 +		flush_dcache_mmap_lock(mapping);
  17.285 +		if (unlikely(vma->vm_flags & VM_NONLINEAR))
  17.286 +			list_add_tail(&vma->shared.vm_set.list,
  17.287 +					&mapping->i_mmap_nonlinear);
  17.288 +		else
  17.289 +			vma_prio_tree_insert(vma, &mapping->i_mmap);
  17.290 +		flush_dcache_mmap_unlock(mapping);
  17.291 +	}
  17.292 +}
  17.293 +
  17.294 +static void
  17.295 +__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  17.296 +	struct vm_area_struct *prev, struct rb_node **rb_link,
  17.297 +	struct rb_node *rb_parent)
  17.298 +{
  17.299 +	__vma_link_list(mm, vma, prev, rb_parent);
  17.300 +	__vma_link_rb(mm, vma, rb_link, rb_parent);
  17.301 +	__anon_vma_link(vma);
  17.302 +}
  17.303 +
  17.304 +static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  17.305 +			struct vm_area_struct *prev, struct rb_node **rb_link,
  17.306 +			struct rb_node *rb_parent)
  17.307 +{
  17.308 +	struct address_space *mapping = NULL;
  17.309 +
  17.310 +	if (vma->vm_file)
  17.311 +		mapping = vma->vm_file->f_mapping;
  17.312 +
  17.313 +	if (mapping)
  17.314 +		spin_lock(&mapping->i_mmap_lock);
  17.315 +	anon_vma_lock(vma);
  17.316 +
  17.317 +	__vma_link(mm, vma, prev, rb_link, rb_parent);
  17.318 +	__vma_link_file(vma);
  17.319 +
  17.320 +	anon_vma_unlock(vma);
  17.321 +	if (mapping)
  17.322 +		spin_unlock(&mapping->i_mmap_lock);
  17.323 +
  17.324 +	mark_mm_hugetlb(mm, vma);
  17.325 +	mm->map_count++;
  17.326 +	validate_mm(mm);
  17.327 +}
  17.328 +
  17.329 +/*
  17.330 + * Helper for vma_adjust in the split_vma insert case:
  17.331 + * insert vm structure into list and rbtree and anon_vma,
  17.332 + * but it has already been inserted into prio_tree earlier.
  17.333 + */
  17.334 +static void
  17.335 +__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
  17.336 +{
  17.337 +	struct vm_area_struct * __vma, * prev;
  17.338 +	struct rb_node ** rb_link, * rb_parent;
  17.339 +
  17.340 +	__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
  17.341 +	if (__vma && __vma->vm_start < vma->vm_end)
  17.342 +		BUG();
  17.343 +	__vma_link(mm, vma, prev, rb_link, rb_parent);
  17.344 +	mm->map_count++;
  17.345 +}
  17.346 +
  17.347 +static inline void
  17.348 +__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
  17.349 +		struct vm_area_struct *prev)
  17.350 +{
  17.351 +	prev->vm_next = vma->vm_next;
  17.352 +	rb_erase(&vma->vm_rb, &mm->mm_rb);
  17.353 +	if (mm->mmap_cache == vma)
  17.354 +		mm->mmap_cache = prev;
  17.355 +}
  17.356 +
  17.357 +/*
  17.358 + * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
  17.359 + * is already present in an i_mmap tree without adjusting the tree.
  17.360 + * The following helper function should be used when such adjustments
  17.361 + * are necessary.  The "insert" vma (if any) is to be inserted
  17.362 + * before we drop the necessary locks.
  17.363 + */
  17.364 +void vma_adjust(struct vm_area_struct *vma, unsigned long start,
  17.365 +	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
  17.366 +{
  17.367 +	struct mm_struct *mm = vma->vm_mm;
  17.368 +	struct vm_area_struct *next = vma->vm_next;
  17.369 +	struct address_space *mapping = NULL;
  17.370 +	struct prio_tree_root *root = NULL;
  17.371 +	struct file *file = vma->vm_file;
  17.372 +	struct anon_vma *anon_vma = NULL;
  17.373 +	long adjust_next = 0;
  17.374 +	int remove_next = 0;
  17.375 +
  17.376 +	if (next && !insert) {
  17.377 +		if (end >= next->vm_end) {
  17.378 +			/*
  17.379 +			 * vma expands, overlapping all the next, and
  17.380 +			 * perhaps the one after too (mprotect case 6).
  17.381 +			 */
  17.382 +again:			remove_next = 1 + (end > next->vm_end);
  17.383 +			end = next->vm_end;
  17.384 +			anon_vma = next->anon_vma;
  17.385 +		} else if (end > next->vm_start) {
  17.386 +			/*
  17.387 +			 * vma expands, overlapping part of the next:
  17.388 +			 * mprotect case 5 shifting the boundary up.
  17.389 +			 */
  17.390 +			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
  17.391 +			anon_vma = next->anon_vma;
  17.392 +		} else if (end < vma->vm_end) {
  17.393 +			/*
  17.394 +			 * vma shrinks, and !insert tells it's not
  17.395 +			 * split_vma inserting another: so it must be
  17.396 +			 * mprotect case 4 shifting the boundary down.
  17.397 +			 */
  17.398 +			adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
  17.399 +			anon_vma = next->anon_vma;
  17.400 +		}
  17.401 +	}
  17.402 +
  17.403 +	if (file) {
  17.404 +		mapping = file->f_mapping;
  17.405 +		if (!(vma->vm_flags & VM_NONLINEAR))
  17.406 +			root = &mapping->i_mmap;
  17.407 +		spin_lock(&mapping->i_mmap_lock);
  17.408 +		if (insert) {
  17.409 +			/*
  17.410 +			 * Put into prio_tree now, so instantiated pages
  17.411 +			 * are visible to arm/parisc __flush_dcache_page
  17.412 +			 * throughout; but we cannot insert into address
  17.413 +			 * space until vma start or end is updated.
  17.414 +			 */
  17.415 +			__vma_link_file(insert);
  17.416 +		}
  17.417 +	}
  17.418 +
  17.419 +	/*
  17.420 +	 * When changing only vma->vm_end, we don't really need
  17.421 +	 * anon_vma lock: but is that case worth optimizing out?
  17.422 +	 */
  17.423 +	if (vma->anon_vma)
  17.424 +		anon_vma = vma->anon_vma;
  17.425 +	if (anon_vma)
  17.426 +		spin_lock(&anon_vma->lock);
  17.427 +
  17.428 +	if (root) {
  17.429 +		flush_dcache_mmap_lock(mapping);
  17.430 +		vma_prio_tree_remove(vma, root);
  17.431 +		if (adjust_next)
  17.432 +			vma_prio_tree_remove(next, root);
  17.433 +	}
  17.434 +
  17.435 +	vma->vm_start = start;
  17.436 +	vma->vm_end = end;
  17.437 +	vma->vm_pgoff = pgoff;
  17.438 +	if (adjust_next) {
  17.439 +		next->vm_start += adjust_next << PAGE_SHIFT;
  17.440 +		next->vm_pgoff += adjust_next;
  17.441 +	}
  17.442 +
  17.443 +	if (root) {
  17.444 +		if (adjust_next) {
  17.445 +			vma_prio_tree_init(next);
  17.446 +			vma_prio_tree_insert(next, root);
  17.447 +		}
  17.448 +		vma_prio_tree_init(vma);
  17.449 +		vma_prio_tree_insert(vma, root);
  17.450 +		flush_dcache_mmap_unlock(mapping);
  17.451 +	}
  17.452 +
  17.453 +	if (remove_next) {
  17.454 +		/*
  17.455 +		 * vma_merge has merged next into vma, and needs
  17.456 +		 * us to remove next before dropping the locks.
  17.457 +		 */
  17.458 +		__vma_unlink(mm, next, vma);
  17.459 +		if (file)
  17.460 +			__remove_shared_vm_struct(next, file, mapping);
  17.461 +		if (next->anon_vma)
  17.462 +			__anon_vma_merge(vma, next);
  17.463 +	} else if (insert) {
  17.464 +		/*
  17.465 +		 * split_vma has split insert from vma, and needs
  17.466 +		 * us to insert it before dropping the locks
  17.467 +		 * (it may either follow vma or precede it).
  17.468 +		 */
  17.469 +		__insert_vm_struct(mm, insert);
  17.470 +	}
  17.471 +
  17.472 +	if (anon_vma)
  17.473 +		spin_unlock(&anon_vma->lock);
  17.474 +	if (mapping)
  17.475 +		spin_unlock(&mapping->i_mmap_lock);
  17.476 +
  17.477 +	if (remove_next) {
  17.478 +		if (file)
  17.479 +			fput(file);
  17.480 +		mm->map_count--;
  17.481 +		mpol_free(vma_policy(next));
  17.482 +		kmem_cache_free(vm_area_cachep, next);
  17.483 +		/*
  17.484 +		 * In mprotect's case 6 (see comments on vma_merge),
  17.485 +		 * we must remove another next too. It would clutter
  17.486 +		 * up the code too much to do both in one go.
  17.487 +		 */
  17.488 +		if (remove_next == 2) {
  17.489 +			next = vma->vm_next;
  17.490 +			goto again;
  17.491 +		}
  17.492 +	}
  17.493 +
  17.494 +	validate_mm(mm);
  17.495 +}
  17.496 +
  17.497 +/*
  17.498 + * If the vma has a ->close operation then the driver probably needs to release
  17.499 + * per-vma resources, so we don't attempt to merge those.
  17.500 + */
  17.501 +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
  17.502 +
  17.503 +static inline int is_mergeable_vma(struct vm_area_struct *vma,
  17.504 +			struct file *file, unsigned long vm_flags)
  17.505 +{
  17.506 +	if (vma->vm_flags != vm_flags)
  17.507 +		return 0;
  17.508 +	if (vma->vm_file != file)
  17.509 +		return 0;
  17.510 +	if (vma->vm_ops && vma->vm_ops->close)
  17.511 +		return 0;
  17.512 +	return 1;
  17.513 +}
  17.514 +
  17.515 +static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  17.516 +					struct anon_vma *anon_vma2)
  17.517 +{
  17.518 +	return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
  17.519 +}
  17.520 +
  17.521 +/*
  17.522 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  17.523 + * in front of (at a lower virtual address and file offset than) the vma.
  17.524 + *
  17.525 + * We cannot merge two vmas if they have differently assigned (non-NULL)
  17.526 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  17.527 + *
  17.528 + * We don't check here for the merged mmap wrapping around the end of pagecache
  17.529 + * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
  17.530 + * wrap, nor mmaps which cover the final page at index -1UL.
  17.531 + */
  17.532 +static int
  17.533 +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
  17.534 +	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
  17.535 +{
  17.536 +	if (is_mergeable_vma(vma, file, vm_flags) &&
  17.537 +	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
  17.538 +		if (vma->vm_pgoff == vm_pgoff)
  17.539 +			return 1;
  17.540 +	}
  17.541 +	return 0;
  17.542 +}
  17.543 +
  17.544 +/*
  17.545 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  17.546 + * beyond (at a higher virtual address and file offset than) the vma.
  17.547 + *
  17.548 + * We cannot merge two vmas if they have differently assigned (non-NULL)
  17.549 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  17.550 + */
  17.551 +static int
  17.552 +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  17.553 +	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
  17.554 +{
  17.555 +	if (is_mergeable_vma(vma, file, vm_flags) &&
  17.556 +	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
  17.557 +		pgoff_t vm_pglen;
  17.558 +		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
  17.559 +		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
  17.560 +			return 1;
  17.561 +	}
  17.562 +	return 0;
  17.563 +}
  17.564 +
  17.565 +/*
  17.566 + * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
  17.567 + * whether that can be merged with its predecessor or its successor.
  17.568 + * Or both (it neatly fills a hole).
  17.569 + *
  17.570 + * In most cases - when called for mmap, brk or mremap - [addr,end) is
  17.571 + * certain not to be mapped by the time vma_merge is called; but when
  17.572 + * called for mprotect, it is certain to be already mapped (either at
  17.573 + * an offset within prev, or at the start of next), and the flags of
  17.574 + * this area are about to be changed to vm_flags - and the no-change
  17.575 + * case has already been eliminated.
  17.576 + *
  17.577 + * The following mprotect cases have to be considered, where AAAA is
  17.578 + * the area passed down from mprotect_fixup, never extending beyond one
  17.579 + * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
  17.580 + *
  17.581 + *     AAAA             AAAA                AAAA          AAAA
  17.582 + *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
  17.583 + *    cannot merge    might become    might become    might become
  17.584 + *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
  17.585 + *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
  17.586 + *    mremap move:                                    PPPPNNNNNNNN 8
  17.587 + *        AAAA
  17.588 + *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
  17.589 + *    might become    case 1 below    case 2 below    case 3 below
  17.590 + *
  17.591 + * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
  17.592 + * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
  17.593 + */
  17.594 +struct vm_area_struct *vma_merge(struct mm_struct *mm,
  17.595 +			struct vm_area_struct *prev, unsigned long addr,
  17.596 +			unsigned long end, unsigned long vm_flags,
  17.597 +		     	struct anon_vma *anon_vma, struct file *file,
  17.598 +			pgoff_t pgoff, struct mempolicy *policy)
  17.599 +{
  17.600 +	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
  17.601 +	struct vm_area_struct *area, *next;
  17.602 +
  17.603 +	/*
  17.604 +	 * We later require that vma->vm_flags == vm_flags,
  17.605 +	 * so this tests vma->vm_flags & VM_SPECIAL, too.
  17.606 +	 */
  17.607 +	if (vm_flags & VM_SPECIAL)
  17.608 +		return NULL;
  17.609 +
  17.610 +	if (prev)
  17.611 +		next = prev->vm_next;
  17.612 +	else
  17.613 +		next = mm->mmap;
  17.614 +	area = next;
  17.615 +	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
  17.616 +		next = next->vm_next;
  17.617 +
  17.618 +	/*
  17.619 +	 * Can it merge with the predecessor?
  17.620 +	 */
  17.621 +	if (prev && prev->vm_end == addr &&
  17.622 +  			mpol_equal(vma_policy(prev), policy) &&
  17.623 +			can_vma_merge_after(prev, vm_flags,
  17.624 +						anon_vma, file, pgoff)) {
  17.625 +		/*
  17.626 +		 * OK, it can.  Can we now merge in the successor as well?
  17.627 +		 */
  17.628 +		if (next && end == next->vm_start &&
  17.629 +				mpol_equal(policy, vma_policy(next)) &&
  17.630 +				can_vma_merge_before(next, vm_flags,
  17.631 +					anon_vma, file, pgoff+pglen) &&
  17.632 +				is_mergeable_anon_vma(prev->anon_vma,
  17.633 +						      next->anon_vma)) {
  17.634 +							/* cases 1, 6 */
  17.635 +			vma_adjust(prev, prev->vm_start,
  17.636 +				next->vm_end, prev->vm_pgoff, NULL);
  17.637 +		} else					/* cases 2, 5, 7 */
  17.638 +			vma_adjust(prev, prev->vm_start,
  17.639 +				end, prev->vm_pgoff, NULL);
  17.640 +		return prev;
  17.641 +	}
  17.642 +
  17.643 +	/*
  17.644 +	 * Can this new request be merged in front of next?
  17.645 +	 */
  17.646 +	if (next && end == next->vm_start &&
  17.647 + 			mpol_equal(policy, vma_policy(next)) &&
  17.648 +			can_vma_merge_before(next, vm_flags,
  17.649 +					anon_vma, file, pgoff+pglen)) {
  17.650 +		if (prev && addr < prev->vm_end)	/* case 4 */
  17.651 +			vma_adjust(prev, prev->vm_start,
  17.652 +				addr, prev->vm_pgoff, NULL);
  17.653 +		else					/* cases 3, 8 */
  17.654 +			vma_adjust(area, addr, next->vm_end,
  17.655 +				next->vm_pgoff - pglen, NULL);
  17.656 +		return area;
  17.657 +	}
  17.658 +
  17.659 +	return NULL;
  17.660 +}
  17.661 +
  17.662 +/*
  17.663 + * find_mergeable_anon_vma is used by anon_vma_prepare, to check
  17.664 + * neighbouring vmas for a suitable anon_vma, before it goes off
  17.665 + * to allocate a new anon_vma.  It checks because a repetitive
  17.666 + * sequence of mprotects and faults may otherwise lead to distinct
  17.667 + * anon_vmas being allocated, preventing vma merge in subsequent
  17.668 + * mprotect.
  17.669 + */
  17.670 +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
  17.671 +{
  17.672 +	struct vm_area_struct *near;
  17.673 +	unsigned long vm_flags;
  17.674 +
  17.675 +	near = vma->vm_next;
  17.676 +	if (!near)
  17.677 +		goto try_prev;
  17.678 +
  17.679 +	/*
  17.680 +	 * Since only mprotect tries to remerge vmas, match flags
  17.681 +	 * which might be mprotected into each other later on.
  17.682 +	 * Neither mlock nor madvise tries to remerge at present,
  17.683 +	 * so leave their flags as obstructing a merge.
  17.684 +	 */
  17.685 +	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
  17.686 +	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
  17.687 +
  17.688 +	if (near->anon_vma && vma->vm_end == near->vm_start &&
  17.689 + 			mpol_equal(vma_policy(vma), vma_policy(near)) &&
  17.690 +			can_vma_merge_before(near, vm_flags,
  17.691 +				NULL, vma->vm_file, vma->vm_pgoff +
  17.692 +				((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
  17.693 +		return near->anon_vma;
  17.694 +try_prev:
  17.695 +	/*
  17.696 +	 * It is potentially slow to have to call find_vma_prev here.
  17.697 +	 * But it's only on the first write fault on the vma, not
  17.698 +	 * every time, and we could devise a way to avoid it later
  17.699 +	 * (e.g. stash info in next's anon_vma_node when assigning
  17.700 +	 * an anon_vma, or when trying vma_merge).  Another time.
  17.701 +	 */
  17.702 +	if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma)
  17.703 +		BUG();
  17.704 +	if (!near)
  17.705 +		goto none;
  17.706 +
  17.707 +	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
  17.708 +	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
  17.709 +
  17.710 +	if (near->anon_vma && near->vm_end == vma->vm_start &&
  17.711 +  			mpol_equal(vma_policy(near), vma_policy(vma)) &&
  17.712 +			can_vma_merge_after(near, vm_flags,
  17.713 +				NULL, vma->vm_file, vma->vm_pgoff))
  17.714 +		return near->anon_vma;
  17.715 +none:
  17.716 +	/*
  17.717 +	 * There's no absolute need to look only at touching neighbours:
  17.718 +	 * we could search further afield for "compatible" anon_vmas.
  17.719 +	 * But it would probably just be a waste of time searching,
  17.720 +	 * or lead to too many vmas hanging off the same anon_vma.
  17.721 +	 * We're trying to allow mprotect remerging later on,
  17.722 +	 * not trying to minimize memory used for anon_vmas.
  17.723 +	 */
  17.724 +	return NULL;
  17.725 +}
  17.726 +
  17.727 +/*
  17.728 + * The caller must hold down_write(current->mm->mmap_sem).
  17.729 + */
  17.730 +
  17.731 +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
  17.732 +			unsigned long len, unsigned long prot,
  17.733 +			unsigned long flags, unsigned long pgoff)
  17.734 +{
  17.735 +	struct mm_struct * mm = current->mm;
  17.736 +	struct vm_area_struct * vma, * prev;
  17.737 +	struct inode *inode;
  17.738 +	unsigned int vm_flags;
  17.739 +	int correct_wcount = 0;
  17.740 +	int error;
  17.741 +	struct rb_node ** rb_link, * rb_parent;
  17.742 +	int accountable = 1;
  17.743 +	unsigned long charged = 0;
  17.744 +
  17.745 +	if (file) {
  17.746 +		if (is_file_hugepages(file))
  17.747 +			accountable = 0;
  17.748 +
  17.749 +		if (!file->f_op || !file->f_op->mmap)
  17.750 +			return -ENODEV;
  17.751 +
  17.752 +		if ((prot & PROT_EXEC) &&
  17.753 +		    (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
  17.754 +			return -EPERM;
  17.755 +	}
  17.756 +
  17.757 +	if (!len)
  17.758 +		return addr;
  17.759 +
  17.760 +	/* Careful about overflows.. */
  17.761 +	len = PAGE_ALIGN(len);
  17.762 +	if (!len || len > TASK_SIZE)
  17.763 +		return -EINVAL;
  17.764 +
  17.765 +	/* offset overflow? */
  17.766 +	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
  17.767 +		return -EINVAL;
  17.768 +
  17.769 +	/* Too many mappings? */
  17.770 +	if (mm->map_count > sysctl_max_map_count)
  17.771 +		return -ENOMEM;
  17.772 +
   17.773 +	/* Obtain the address to map to. We verify (or select) it and ensure
  17.774 +	 * that it represents a valid section of the address space.
  17.775 +	 */
  17.776 +	addr = get_unmapped_area(file, addr, len, pgoff, flags);
  17.777 +	if (addr & ~PAGE_MASK)
  17.778 +		return addr;
  17.779 +
  17.780 +	/* Do simple checking here so the lower-level routines won't have
   17.781 +	 * to. We assume access permissions have been handled by the open
  17.782 +	 * of the memory object, so we don't do any here.
  17.783 +	 */
  17.784 +	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
  17.785 +			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
  17.786 +
  17.787 +	if (flags & MAP_LOCKED) {
  17.788 +		if (!capable(CAP_IPC_LOCK))
  17.789 +			return -EPERM;
  17.790 +		vm_flags |= VM_LOCKED;
  17.791 +	}
  17.792 +	/* mlock MCL_FUTURE? */
  17.793 +	if (vm_flags & VM_LOCKED) {
  17.794 +		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
  17.795 +		locked += len;
  17.796 +		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
  17.797 +			return -EAGAIN;
  17.798 +	}
  17.799 +
  17.800 +	inode = file ? file->f_dentry->d_inode : NULL;
  17.801 +
  17.802 +	if (file) {
  17.803 +		switch (flags & MAP_TYPE) {
  17.804 +		case MAP_SHARED:
  17.805 +			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
  17.806 +				return -EACCES;
  17.807 +
  17.808 +			/*
  17.809 +			 * Make sure we don't allow writing to an append-only
  17.810 +			 * file..
  17.811 +			 */
  17.812 +			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
  17.813 +				return -EACCES;
  17.814 +
  17.815 +			/*
  17.816 +			 * Make sure there are no mandatory locks on the file.
  17.817 +			 */
  17.818 +			if (locks_verify_locked(inode))
  17.819 +				return -EAGAIN;
  17.820 +
  17.821 +			vm_flags |= VM_SHARED | VM_MAYSHARE;
  17.822 +			if (!(file->f_mode & FMODE_WRITE))
  17.823 +				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
  17.824 +
  17.825 +			/* fall through */
  17.826 +		case MAP_PRIVATE:
  17.827 +			if (!(file->f_mode & FMODE_READ))
  17.828 +				return -EACCES;
  17.829 +			break;
  17.830 +
  17.831 +		default:
  17.832 +			return -EINVAL;
  17.833 +		}
  17.834 +	} else {
  17.835 +		switch (flags & MAP_TYPE) {
  17.836 +		case MAP_SHARED:
  17.837 +			vm_flags |= VM_SHARED | VM_MAYSHARE;
  17.838 +			break;
  17.839 +		case MAP_PRIVATE:
  17.840 +			/*
  17.841 +			 * Set pgoff according to addr for anon_vma.
  17.842 +			 */
  17.843 +			pgoff = addr >> PAGE_SHIFT;
  17.844 +			break;
  17.845 +		default:
  17.846 +			return -EINVAL;
  17.847 +		}
  17.848 +	}
  17.849 +
  17.850 +	error = security_file_mmap(file, prot, flags);
  17.851 +	if (error)
  17.852 +		return error;
  17.853 +		
  17.854 +	/* Clear old maps */
  17.855 +	error = -ENOMEM;
  17.856 +munmap_back:
  17.857 +	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
  17.858 +	if (vma && vma->vm_start < addr + len) {
  17.859 +		if (do_munmap(mm, addr, len))
  17.860 +			return -ENOMEM;
  17.861 +		goto munmap_back;
  17.862 +	}
  17.863 +
  17.864 +	/* Check against address space limit. */
  17.865 +	if ((mm->total_vm << PAGE_SHIFT) + len
  17.866 +	    > current->rlim[RLIMIT_AS].rlim_cur)
  17.867 +		return -ENOMEM;
  17.868 +
  17.869 +	if (accountable && (!(flags & MAP_NORESERVE) ||
  17.870 +			sysctl_overcommit_memory > 1)) {
  17.871 +		if (vm_flags & VM_SHARED) {
  17.872 +			/* Check memory availability in shmem_file_setup? */
  17.873 +			vm_flags |= VM_ACCOUNT;
  17.874 +		} else if (vm_flags & VM_WRITE) {
  17.875 +			/*
  17.876 +			 * Private writable mapping: check memory availability
  17.877 +			 */
  17.878 +			charged = len >> PAGE_SHIFT;
  17.879 +			if (security_vm_enough_memory(charged))
  17.880 +				return -ENOMEM;
  17.881 +			vm_flags |= VM_ACCOUNT;
  17.882 +		}
  17.883 +	}
  17.884 +
  17.885 +	/*
  17.886 +	 * Can we just expand an old private anonymous mapping?
  17.887 +	 * The VM_SHARED test is necessary because shmem_zero_setup
  17.888 +	 * will create the file object for a shared anonymous map below.
  17.889 +	 */
  17.890 +	if (!file && !(vm_flags & VM_SHARED) &&
  17.891 +	    vma_merge(mm, prev, addr, addr + len, vm_flags,
  17.892 +					NULL, NULL, pgoff, NULL))
  17.893 +		goto out;
  17.894 +
  17.895 +	/*
  17.896 +	 * Determine the object being mapped and call the appropriate
   17.897 +	 * specific mapper. The address has already been validated; any old
   17.898 +	 * mappings in the range have already been removed from the list.
  17.899 +	 */
  17.900 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
  17.901 +	if (!vma) {
  17.902 +		error = -ENOMEM;
  17.903 +		goto unacct_error;
  17.904 +	}
  17.905 +	memset(vma, 0, sizeof(*vma));
  17.906 +
  17.907 +	vma->vm_mm = mm;
  17.908 +	vma->vm_start = addr;
  17.909 +	vma->vm_end = addr + len;
  17.910 +	vma->vm_flags = vm_flags;
  17.911 +	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
  17.912 +	vma->vm_pgoff = pgoff;
  17.913 +
  17.914 +	if (file) {
  17.915 +		error = -EINVAL;
  17.916 +		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
  17.917 +			goto free_vma;
  17.918 +		if (vm_flags & VM_DENYWRITE) {
  17.919 +			error = deny_write_access(file);
  17.920 +			if (error)
  17.921 +				goto free_vma;
  17.922 +			correct_wcount = 1;
  17.923 +		}
  17.924 +		vma->vm_file = file;
  17.925 +		get_file(file);
  17.926 +		error = file->f_op->mmap(file, vma);
  17.927 +		if (error)
  17.928 +			goto unmap_and_free_vma;
  17.929 +	} else if (vm_flags & VM_SHARED) {
  17.930 +		error = shmem_zero_setup(vma);
  17.931 +		if (error)
  17.932 +			goto free_vma;
  17.933 +	}
  17.934 +
  17.935 +	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
  17.936 +	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
  17.937 +	 * that memory reservation must be checked; but that reservation
   17.938 +	 * belongs to the shared memory object, not to the vma: so now clear it.
  17.939 +	 */
  17.940 +	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
  17.941 +		vma->vm_flags &= ~VM_ACCOUNT;
  17.942 +
  17.943 +	/* Can addr have changed??
  17.944 +	 *
  17.945 +	 * Answer: Yes, several device drivers can do it in their
  17.946 +	 *         f_op->mmap method. -DaveM
  17.947 +	 */
  17.948 +	addr = vma->vm_start;
  17.949 +
  17.950 +	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
  17.951 +			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
  17.952 +		vma_link(mm, vma, prev, rb_link, rb_parent);
  17.953 +		if (correct_wcount)
  17.954 +			atomic_inc(&inode->i_writecount);
  17.955 +	} else {
  17.956 +		if (file) {
  17.957 +			if (correct_wcount)
  17.958 +				atomic_inc(&inode->i_writecount);
  17.959 +			fput(file);
  17.960 +		}
  17.961 +		mpol_free(vma_policy(vma));
  17.962 +		kmem_cache_free(vm_area_cachep, vma);
  17.963 +	}
  17.964 +out:	
  17.965 +	mm->total_vm += len >> PAGE_SHIFT;
  17.966 +	if (vm_flags & VM_LOCKED) {
  17.967 +		mm->locked_vm += len >> PAGE_SHIFT;
  17.968 +		make_pages_present(addr, addr + len);
  17.969 +	}
  17.970 +	if (flags & MAP_POPULATE) {
  17.971 +		up_write(&mm->mmap_sem);
  17.972 +		sys_remap_file_pages(addr, len, 0,
  17.973 +					pgoff, flags & MAP_NONBLOCK);
  17.974 +		down_write(&mm->mmap_sem);
  17.975 +	}
  17.976 +	return addr;
  17.977 +
  17.978 +unmap_and_free_vma:
  17.979 +	if (correct_wcount)
  17.980 +		atomic_inc(&inode->i_writecount);
  17.981 +	vma->vm_file = NULL;
  17.982 +	fput(file);
  17.983 +
  17.984 +	/* Undo any partial mapping done by a device driver. */
  17.985 +	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
  17.986 +free_vma:
  17.987 +	kmem_cache_free(vm_area_cachep, vma);
  17.988 +unacct_error:
  17.989 +	if (charged)
  17.990 +		vm_unacct_memory(charged);
  17.991 +	return error;
  17.992 +}
  17.993 +
  17.994 +EXPORT_SYMBOL(do_mmap_pgoff);
  17.995 +
  17.996 +/* Get an address range which is currently unmapped.
  17.997 + * For shmat() with addr=0.
  17.998 + *
  17.999 + * Ugly calling convention alert:
 17.1000 + * Return value with the low bits set means error value,
 17.1001 + * ie
 17.1002 + *	if (ret & ~PAGE_MASK)
 17.1003 + *		error = ret;
 17.1004 + *
 17.1005 + * This function "knows" that -ENOMEM has the bits set.
 17.1006 + */
 17.1007 +#ifndef HAVE_ARCH_UNMAPPED_AREA
 17.1008 +static inline unsigned long
 17.1009 +arch_get_unmapped_area(struct file *filp, unsigned long addr,
 17.1010 +		unsigned long len, unsigned long pgoff, unsigned long flags)
 17.1011 +{
 17.1012 +	struct mm_struct *mm = current->mm;
 17.1013 +	struct vm_area_struct *vma;
 17.1014 +	unsigned long start_addr;
 17.1015 +
 17.1016 +	if (len > TASK_SIZE)
 17.1017 +		return -ENOMEM;
 17.1018 +
 17.1019 +	if (addr) {
 17.1020 +		addr = PAGE_ALIGN(addr);
 17.1021 +		vma = find_vma(mm, addr);
 17.1022 +		if (TASK_SIZE - len >= addr &&
 17.1023 +		    (!vma || addr + len <= vma->vm_start))
 17.1024 +			return addr;
 17.1025 +	}
 17.1026 +	start_addr = addr = mm->free_area_cache;
 17.1027 +
 17.1028 +full_search:
 17.1029 +	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
 17.1030 +		/* At this point:  (!vma || addr < vma->vm_end). */
 17.1031 +		if (TASK_SIZE - len < addr) {
 17.1032 +			/*
 17.1033 +			 * Start a new search - just in case we missed
 17.1034 +			 * some holes.
 17.1035 +			 */
 17.1036 +			if (start_addr != TASK_UNMAPPED_BASE) {
 17.1037 +				start_addr = addr = TASK_UNMAPPED_BASE;
 17.1038 +				goto full_search;
 17.1039 +			}
 17.1040 +			return -ENOMEM;
 17.1041 +		}
 17.1042 +		if (!vma || addr + len <= vma->vm_start) {
 17.1043 +			/*
 17.1044 +			 * Remember the place where we stopped the search:
 17.1045 +			 */
 17.1046 +			mm->free_area_cache = addr + len;
 17.1047 +			return addr;
 17.1048 +		}
 17.1049 +		addr = vma->vm_end;
 17.1050 +	}
 17.1051 +}
 17.1052 +#else
 17.1053 +extern unsigned long
 17.1054 +arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
 17.1055 +			unsigned long, unsigned long);
 17.1056 +#endif	
 17.1057 +
 17.1058 +#ifndef HAVE_ARCH_CHECK_FIXED_MAPPING
 17.1059 +#define arch_check_fixed_mapping(_file,_addr,_len,_pgoff,_flags) 0
 17.1060 +#else
 17.1061 +extern unsigned long
 17.1062 +arch_check_fixed_mapping(struct file *, unsigned long, unsigned long,
 17.1063 +			unsigned long, unsigned long);
 17.1064 +#endif
 17.1065 +
 17.1066 +unsigned long
 17.1067 +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 17.1068 +		unsigned long pgoff, unsigned long flags)
 17.1069 +{
 17.1070 +	if (flags & MAP_FIXED) {
 17.1071 +		unsigned long ret;
 17.1072 +
 17.1073 +		if (addr > TASK_SIZE - len)
 17.1074 +			return -ENOMEM;
 17.1075 +		if (addr & ~PAGE_MASK)
 17.1076 +			return -EINVAL;
 17.1077 +		ret = arch_check_fixed_mapping(file, addr, len, pgoff, flags);
 17.1078 +		if (ret != 0)
 17.1079 +			return ret;
 17.1080 +		if (file && is_file_hugepages(file))  {
 17.1081 +			/*
 17.1082 +			 * Check if the given range is hugepage aligned, and
 17.1083 +			 * can be made suitable for hugepages.
 17.1084 +			 */
 17.1085 +			ret = prepare_hugepage_range(addr, len);
 17.1086 +		} else {
 17.1087 +			/*
 17.1088 +			 * Ensure that a normal request is not falling in a
 17.1089 +			 * reserved hugepage range.  For some archs like IA-64,
 17.1090 +			 * there is a separate region for hugepages.
 17.1091 +			 */
 17.1092 +			ret = is_hugepage_only_range(addr, len);
 17.1093 +		}
 17.1094 +		if (ret)
 17.1095 +			return -EINVAL;
 17.1096 +		return addr;
 17.1097 +	}
 17.1098 +
 17.1099 +	if (file && file->f_op && file->f_op->get_unmapped_area)
 17.1100 +		return file->f_op->get_unmapped_area(file, addr, len,
 17.1101 +						pgoff, flags);
 17.1102 +
 17.1103 +	return arch_get_unmapped_area(file, addr, len, pgoff, flags);
 17.1104 +}
 17.1105 +
 17.1106 +EXPORT_SYMBOL(get_unmapped_area);
 17.1107 +
 17.1108 +/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 17.1109 +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
 17.1110 +{
 17.1111 +	struct vm_area_struct *vma = NULL;
 17.1112 +
 17.1113 +	if (mm) {
 17.1114 +		/* Check the cache first. */
 17.1115 +		/* (Cache hit rate is typically around 35%.) */
 17.1116 +		vma = mm->mmap_cache;
 17.1117 +		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
 17.1118 +			struct rb_node * rb_node;
 17.1119 +
 17.1120 +			rb_node = mm->mm_rb.rb_node;
 17.1121 +			vma = NULL;
 17.1122 +
 17.1123 +			while (rb_node) {
 17.1124 +				struct vm_area_struct * vma_tmp;
 17.1125 +
 17.1126 +				vma_tmp = rb_entry(rb_node,
 17.1127 +						struct vm_area_struct, vm_rb);
 17.1128 +
 17.1129 +				if (vma_tmp->vm_end > addr) {
 17.1130 +					vma = vma_tmp;
 17.1131 +					if (vma_tmp->vm_start <= addr)
 17.1132 +						break;
 17.1133 +					rb_node = rb_node->rb_left;
 17.1134 +				} else
 17.1135 +					rb_node = rb_node->rb_right;
 17.1136 +			}
 17.1137 +			if (vma)
 17.1138 +				mm->mmap_cache = vma;
 17.1139 +		}
 17.1140 +	}
 17.1141 +	return vma;
 17.1142 +}
 17.1143 +
 17.1144 +EXPORT_SYMBOL(find_vma);
 17.1145 +
 17.1146 +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
 17.1147 +struct vm_area_struct *
 17.1148 +find_vma_prev(struct mm_struct *mm, unsigned long addr,
 17.1149 +			struct vm_area_struct **pprev)
 17.1150 +{
 17.1151 +	struct vm_area_struct *vma = NULL, *prev = NULL;
 17.1152 +	struct rb_node * rb_node;
 17.1153 +	if (!mm)
 17.1154 +		goto out;
 17.1155 +
 17.1156 +	/* Guard against addr being lower than the first VMA */
 17.1157 +	vma = mm->mmap;
 17.1158 +
 17.1159 +	/* Go through the RB tree quickly. */
 17.1160 +	rb_node = mm->mm_rb.rb_node;
 17.1161 +
 17.1162 +	while (rb_node) {
 17.1163 +		struct vm_area_struct *vma_tmp;
 17.1164 +		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
 17.1165 +
 17.1166 +		if (addr < vma_tmp->vm_end) {
 17.1167 +			rb_node = rb_node->rb_left;
 17.1168 +		} else {
 17.1169 +			prev = vma_tmp;
 17.1170 +			if (!prev->vm_next || (addr < prev->vm_next->vm_end))
 17.1171 +				break;
 17.1172 +			rb_node = rb_node->rb_right;
 17.1173 +		}
 17.1174 +	}
 17.1175 +
 17.1176 +out:
 17.1177 +	*pprev = prev;
 17.1178 +	return prev ? prev->vm_next : vma;
 17.1179 +}
 17.1180 +
 17.1181 +#ifdef CONFIG_STACK_GROWSUP
 17.1182 +/*
 17.1183 + * vma is the first one with address > vma->vm_end.  Have to extend vma.
 17.1184 + */
 17.1185 +int expand_stack(struct vm_area_struct * vma, unsigned long address)
 17.1186 +{
 17.1187 +	unsigned long grow;
 17.1188 +
 17.1189 +	if (!(vma->vm_flags & VM_GROWSUP))
 17.1190 +		return -EFAULT;
 17.1191 +
 17.1192 +	/*
 17.1193 +	 * We must make sure the anon_vma is allocated
 17.1194 +	 * so that the anon_vma locking is not a noop.
 17.1195 +	 */
 17.1196 +	if (unlikely(anon_vma_prepare(vma)))
 17.1197 +		return -ENOMEM;
 17.1198 +	anon_vma_lock(vma);
 17.1199 +
 17.1200 +	/*
 17.1201 +	 * vma->vm_start/vm_end cannot change under us because the caller
 17.1202 +	 * is required to hold the mmap_sem in read mode.  We need the
 17.1203 +	 * anon_vma lock to serialize against concurrent expand_stacks.
 17.1204 +	 */
 17.1205 +	address += 4 + PAGE_SIZE - 1;
 17.1206 +	address &= PAGE_MASK;
 17.1207 +	grow = (address - vma->vm_end) >> PAGE_SHIFT;
 17.1208 +
 17.1209 +	/* Overcommit.. */
 17.1210 +	if (security_vm_enough_memory(grow)) {
 17.1211 +		anon_vma_unlock(vma);
 17.1212 +		return -ENOMEM;
 17.1213 +	}
 17.1214 +	
 17.1215 +	if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur ||
 17.1216 +			((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
 17.1217 +			current->rlim[RLIMIT_AS].rlim_cur) {
 17.1218 +		anon_vma_unlock(vma);
 17.1219 +		vm_unacct_memory(grow);
 17.1220 +		return -ENOMEM;
 17.1221 +	}
 17.1222 +	vma->vm_end = address;
 17.1223 +	vma->vm_mm->total_vm += grow;
 17.1224 +	if (vma->vm_flags & VM_LOCKED)
 17.1225 +		vma->vm_mm->locked_vm += grow;
 17.1226 +	anon_vma_unlock(vma);
 17.1227 +	return 0;
 17.1228 +}
 17.1229 +
 17.1230 +struct vm_area_struct *
 17.1231 +find_extend_vma(struct mm_struct *mm, unsigned long addr)
 17.1232 +{
 17.1233 +	struct vm_area_struct *vma, *prev;
 17.1234 +
 17.1235 +	addr &= PAGE_MASK;
 17.1236 +	vma = find_vma_prev(mm, addr, &prev);
 17.1237 +	if (vma && (vma->vm_start <= addr))
 17.1238 +		return vma;
 17.1239 +	if (!prev || expand_stack(prev, addr))
 17.1240 +		return NULL;
 17.1241 +	if (prev->vm_flags & VM_LOCKED) {
 17.1242 +		make_pages_present(addr, prev->vm_end);
 17.1243 +	}
 17.1244 +	return prev;
 17.1245 +}
 17.1246 +#else
 17.1247 +/*
 17.1248 + * vma is the first one with address < vma->vm_start.  Have to extend vma.
 17.1249 + */
 17.1250 +int expand_stack(struct vm_area_struct *vma, unsigned long address)
 17.1251 +{
 17.1252 +	unsigned long grow;
 17.1253 +
 17.1254 +	/*
 17.1255 +	 * We must make sure the anon_vma is allocated
 17.1256 +	 * so that the anon_vma locking is not a noop.
 17.1257 +	 */
 17.1258 +	if (unlikely(anon_vma_prepare(vma)))
 17.1259 +		return -ENOMEM;
 17.1260 +	anon_vma_lock(vma);
 17.1261 +
 17.1262 +	/*
 17.1263 +	 * vma->vm_start/vm_end cannot change under us because the caller
 17.1264 +	 * is required to hold the mmap_sem in read mode.  We need the
 17.1265 +	 * anon_vma lock to serialize against concurrent expand_stacks.
 17.1266 +	 */
 17.1267 +	address &= PAGE_MASK;
 17.1268 +	grow = (vma->vm_start - address) >> PAGE_SHIFT;
 17.1269 +
 17.1270 +	/* Overcommit.. */
 17.1271 +	if (security_vm_enough_memory(grow)) {
 17.1272 +		anon_vma_unlock(vma);
 17.1273 +		return -ENOMEM;
 17.1274 +	}
 17.1275 +	
 17.1276 +	if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur ||
 17.1277 +			((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
 17.1278 +			current->rlim[RLIMIT_AS].rlim_cur) {
 17.1279 +		anon_vma_unlock(vma);
 17.1280 +		vm_unacct_memory(grow);
 17.1281 +		return -ENOMEM;
 17.1282 +	}
 17.1283 +	vma->vm_start = address;
 17.1284 +	vma->vm_pgoff -= grow;
 17.1285 +	vma->vm_mm->total_vm += grow;
 17.1286 +	if (vma->vm_flags & VM_LOCKED)
 17.1287 +		vma->vm_mm->locked_vm += grow;
 17.1288 +	anon_vma_unlock(vma);
 17.1289 +	return 0;
 17.1290 +}
 17.1291 +
 17.1292 +struct vm_area_struct *
 17.1293 +find_extend_vma(struct mm_struct * mm, unsigned long addr)
 17.1294 +{
 17.1295 +	struct vm_area_struct * vma;
 17.1296 +	unsigned long start;
 17.1297 +
 17.1298 +	addr &= PAGE_MASK;
 17.1299 +	vma = find_vma(mm,addr);
 17.1300 +	if (!vma)
 17.1301 +		return NULL;
 17.1302 +	if (vma->vm_start <= addr)
 17.1303 +		return vma;
 17.1304 +	if (!(vma->vm_flags & VM_GROWSDOWN))
 17.1305 +		return NULL;
 17.1306 +	start = vma->vm_start;
 17.1307 +	if (expand_stack(vma, addr))
 17.1308 +		return NULL;
 17.1309 +	if (vma->vm_flags & VM_LOCKED) {
 17.1310 +		make_pages_present(addr, start);
 17.1311 +	}
 17.1312 +	return vma;
 17.1313 +}
 17.1314 +#endif
 17.1315 +
 17.1316 +/*
 17.1317 + * Try to free as many page directory entries as we can,
 17.1318 + * without having to work very hard at actually scanning
 17.1319 + * the page tables themselves.
 17.1320 + *
 17.1321 + * Right now we try to free page tables if we have a nice
 17.1322 + * PGDIR-aligned area that got free'd up. We could be more
 17.1323 + * granular if we want to, but this is fast and simple,
 17.1324 + * and covers the bad cases.
 17.1325 + *
 17.1326 + * "prev", if it exists, points to a vma before the one
 17.1327 + * we just free'd - but there's no telling how much before.
 17.1328 + */
 17.1329 +static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
 17.1330 +	unsigned long start, unsigned long end)
 17.1331 +{
 17.1332 +	unsigned long first = start & PGDIR_MASK;
 17.1333 +	unsigned long last = end + PGDIR_SIZE - 1;
 17.1334 +	unsigned long start_index, end_index;
 17.1335 +	struct mm_struct *mm = tlb->mm;
 17.1336 +
 17.1337 +	if (!prev) {
 17.1338 +		prev = mm->mmap;
 17.1339 +		if (!prev)
 17.1340 +			goto no_mmaps;
 17.1341 +		if (prev->vm_end > start) {
 17.1342 +			if (last > prev->vm_start)
 17.1343 +				last = prev->vm_start;
 17.1344 +			goto no_mmaps;
 17.1345 +		}
 17.1346 +	}
 17.1347 +	for (;;) {
 17.1348 +		struct vm_area_struct *next = prev->vm_next;
 17.1349 +
 17.1350 +		if (next) {
 17.1351 +			if (next->vm_start < start) {
 17.1352 +				prev = next;
 17.1353 +				continue;
 17.1354 +			}
 17.1355 +			if (last > next->vm_start)
 17.1356 +				last = next->vm_start;
 17.1357 +		}
 17.1358 +		if (prev->vm_end > first)
 17.1359 +			first = prev->vm_end + PGDIR_SIZE - 1;
 17.1360 +		break;
 17.1361 +	}
 17.1362 +no_mmaps:
 17.1363 +	if (last < first)	/* for arches with discontiguous pgd indices */
 17.1364 +		return;
 17.1365 +	/*
 17.1366 +	 * If the PGD bits are not consecutive in the virtual address, the
  17.1367 +	 * old method of shifting the VA right by PGDIR_SHIFT doesn't work.
 17.1368 +	 */
 17.1369 +	start_index = pgd_index(first);
 17.1370 +	if (start_index < FIRST_USER_PGD_NR)
 17.1371 +		start_index = FIRST_USER_PGD_NR;
 17.1372 +	end_index = pgd_index(last);
 17.1373 +	if (end_index > start_index) {
 17.1374 +		clear_page_tables(tlb, start_index, end_index - start_index);
 17.1375 +		flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK);
 17.1376 +	}
 17.1377 +}
 17.1378 +
 17.1379 +/* Normal function to fix up a mapping
 17.1380 + * This function is the default for when an area has no specific
 17.1381 + * function.  This may be used as part of a more specific routine.
 17.1382 + *
 17.1383 + * By the time this function is called, the area struct has been
 17.1384 + * removed from the process mapping list.
 17.1385 + */
 17.1386 +static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
 17.1387 +{
 17.1388 +	size_t len = area->vm_end - area->vm_start;
 17.1389 +
 17.1390 +	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
 17.1391 +	if (area->vm_flags & VM_LOCKED)
 17.1392 +		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
 17.1393 +	/*
 17.1394 +	 * Is this a new hole at the lowest possible address?
 17.1395 +	 */
 17.1396 +	if (area->vm_start >= TASK_UNMAPPED_BASE &&
 17.1397 +				area->vm_start < area->vm_mm->free_area_cache)
 17.1398 +	      area->vm_mm->free_area_cache = area->vm_start;
 17.1399 +
 17.1400 +	remove_vm_struct(area);
 17.1401 +}
 17.1402 +
 17.1403 +/*
 17.1404 + * Update the VMA and inode share lists.
 17.1405 + *
 17.1406 + * Ok - we have the memory areas we should free on the 'free' list,
 17.1407 + * so release them, and do the vma updates.
 17.1408 + */
 17.1409 +static void unmap_vma_list(struct mm_struct *mm,
 17.1410 +	struct vm_area_struct *mpnt)
 17.1411 +{
 17.1412 +	do {
 17.1413 +		struct vm_area_struct *next = mpnt->vm_next;
 17.1414 +		unmap_vma(mm, mpnt);
 17.1415 +		mpnt = next;
 17.1416 +	} while (mpnt != NULL);
 17.1417 +	validate_mm(mm);
 17.1418 +}
 17.1419 +
 17.1420 +/*
 17.1421 + * Get rid of page table information in the indicated region.
 17.1422 + *
 17.1423 + * Called with the page table lock held.
 17.1424 + */
 17.1425 +static void unmap_region(struct mm_struct *mm,
 17.1426 +	struct vm_area_struct *vma,
 17.1427 +	struct vm_area_struct *prev,
 17.1428 +	unsigned long start,
 17.1429 +	unsigned long end)
 17.1430 +{
 17.1431 +	struct mmu_gather *tlb;
 17.1432 +	unsigned long nr_accounted = 0;
 17.1433 +
 17.1434 +	lru_add_drain();
 17.1435 +	tlb = tlb_gather_mmu(mm, 0);
 17.1436 +	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
 17.1437 +	vm_unacct_memory(nr_accounted);
 17.1438 +
 17.1439 +	if (is_hugepage_only_range(start, end - start))
 17.1440 +		hugetlb_free_pgtables(tlb, prev, start, end);
 17.1441 +	else
 17.1442 +		free_pgtables(tlb, prev, start, end);
 17.1443 +	tlb_finish_mmu(tlb, start, end);
 17.1444 +}
 17.1445 +
 17.1446 +/*
 17.1447 + * Create a list of vma's touched by the unmap, removing them from the mm's
 17.1448 + * vma list as we go..
 17.1449 + */
 17.1450 +static void
 17.1451 +detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 17.1452 +	struct vm_area_struct *prev, unsigned long end)
 17.1453 +{
 17.1454 +	struct vm_area_struct **insertion_point;
 17.1455 +	struct vm_area_struct *tail_vma = NULL;
 17.1456 +
 17.1457 +	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 17.1458 +	do {
 17.1459 +		rb_erase(&vma->vm_rb, &mm->mm_rb);
 17.1460 +		mm->map_count--;
 17.1461 +		tail_vma = vma;
 17.1462 +		vma = vma->vm_next;
 17.1463 +	} while (vma && vma->vm_start < end);
 17.1464 +	*insertion_point = vma;
 17.1465 +	tail_vma->vm_next = NULL;
 17.1466 +	mm->mmap_cache = NULL;		/* Kill the cache. */
 17.1467 +}
 17.1468 +
 17.1469 +/*
 17.1470 + * Split a vma into two pieces at address 'addr', a new vma is allocated
  17.1471 + * Split a vma into two pieces at address 'addr'; a new vma is allocated
  17.1472 + * either for the first part or for the tail.
 17.1473 +int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 17.1474 +	      unsigned long addr, int new_below)
 17.1475 +{
 17.1476 +	struct mempolicy *pol;
 17.1477 +	struct vm_area_struct *new;
 17.1478 +
 17.1479 +	if (mm->map_count >= sysctl_max_map_count)
 17.1480 +		return -ENOMEM;
 17.1481 +
 17.1482 +	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 17.1483 +	if (!new)
 17.1484 +		return -ENOMEM;
 17.1485 +
 17.1486 +	/* most fields are the same, copy all, and then fixup */
 17.1487 +	*new = *vma;
 17.1488 +	vma_prio_tree_init(new);
 17.1489 +
 17.1490 +	if (new_below)
 17.1491 +		new->vm_end = addr;
 17.1492 +	else {
 17.1493 +		new->vm_start = addr;
 17.1494 +		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
 17.1495 +	}
 17.1496 +
 17.1497 +	pol = mpol_copy(vma_policy(vma));
 17.1498 +	if (IS_ERR(pol)) {
 17.1499 +		kmem_cache_free(vm_area_cachep, new);
 17.1500 +		return PTR_ERR(pol);
 17.1501 +	}
 17.1502 +	vma_set_policy(new, pol);
 17.1503 +
 17.1504 +	if (new->vm_file)
 17.1505 +		get_file(new->vm_file);
 17.1506 +
 17.1507 +	if (new->vm_ops && new->vm_ops->open)
 17.1508 +		new->vm_ops->open(new);
 17.1509 +
 17.1510 +	if (new_below)
 17.1511 +		vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
 17.1512 +			((addr - new->vm_start) >> PAGE_SHIFT), new);
 17.1513 +	else
 17.1514 +		vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 17.1515 +
 17.1516 +	return 0;
 17.1517 +}
 17.1518 +
 17.1519 +/* Munmap is split into 2 main parts -- this part which finds
 17.1520 + * what needs doing, and the areas themselves, which do the
 17.1521 + * work.  This now handles partial unmappings.
 17.1522 + * Jeremy Fitzhardinge <jeremy@goop.org>
 17.1523 + */
 17.1524 +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 17.1525 +{
 17.1526 +	unsigned long end;
 17.1527 +	struct vm_area_struct *mpnt, *prev, *last;
 17.1528 +
 17.1529 +	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
 17.1530 +		return -EINVAL;
 17.1531 +
 17.1532 +	if ((len = PAGE_ALIGN(len)) == 0)
 17.1533 +		return -EINVAL;
 17.1534 +
 17.1535 +	/* Find the first overlapping VMA */
 17.1536 +	mpnt = find_vma_prev(mm, start, &prev);
 17.1537 +	if (!mpnt)
 17.1538 +		return 0;
 17.1539 +	/* we have  start < mpnt->vm_end  */
 17.1540 +
 17.1541 +	if (is_vm_hugetlb_page(mpnt)) {
 17.1542 +		int ret = is_aligned_hugepage_range(start, len);
 17.1543 +
 17.1544 +		if (ret)
 17.1545 +			return ret;
 17.1546 +	}
 17.1547 +
 17.1548 +	/* if it doesn't overlap, we have nothing.. */
 17.1549 +	end = start + len;
 17.1550 +	if (mpnt->vm_start >= end)
 17.1551 +		return 0;
 17.1552 +
 17.1553 +	/* Something will probably happen, so notify. */
 17.1554 +	if (mpnt->vm_file && (mpnt->vm_flags & VM_EXEC))
 17.1555 +		profile_exec_unmap(mm);
 17.1556 + 
 17.1557 +	/*
 17.1558 +	 * If we need to split any vma, do it now to save pain later.
 17.1559 +	 *
 17.1560 +	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
 17.1561 +	 * unmapped vm_area_struct will remain in use: so lower split_vma
 17.1562 +	 * places tmp vma above, and higher split_vma places tmp vma below.
 17.1563 +	 */
 17.1564 +	if (start > mpnt->vm_start) {
 17.1565 +		if (split_vma(mm, mpnt, start, 0))
 17.1566 +			return -ENOMEM;
 17.1567 +		prev = mpnt;
 17.1568 +	}
 17.1569 +
 17.1570 +	/* Does it split the last one? */
 17.1571 +	last = find_vma(mm, end);
 17.1572 +	if (last && end > last->vm_start) {
 17.1573 +		if (split_vma(mm, last, end, 1))
 17.1574 +			return -ENOMEM;
 17.1575 +	}
 17.1576 +	mpnt = prev? prev->vm_next: mm->mmap;
 17.1577 +
 17.1578 +	/*
 17.1579 +	 * Remove the vma's, and unmap the actual pages
 17.1580 +	 */
 17.1581 +	detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
 17.1582 +	spin_lock(&mm->page_table_lock);
 17.1583 +	unmap_region(mm, mpnt, prev, start, end);
 17.1584 +	spin_unlock(&mm->page_table_lock);
 17.1585 +
 17.1586 +	/* Fix up all other VM information */
 17.1587 +	unmap_vma_list(mm, mpnt);
 17.1588 +
 17.1589 +	return 0;
 17.1590 +}
 17.1591 +
 17.1592 +EXPORT_SYMBOL(do_munmap);
 17.1593 +
 17.1594 +asmlinkage long sys_munmap(unsigned long addr, size_t len)
 17.1595 +{
 17.1596 +	int ret;
 17.1597 +	struct mm_struct *mm = current->mm;
 17.1598 +
 17.1599 +	down_write(&mm->mmap_sem);
 17.1600 +	ret = do_munmap(mm, addr, len);
 17.1601 +	up_write(&mm->mmap_sem);
 17.1602 +	return ret;
 17.1603 +}
 17.1604 +
 17.1605 +/*
  17.1606 + *  This is really a simplified "do_mmap".  It only handles
  17.1607 + *  anonymous maps.  Eventually we may be able to do some
  17.1608 + *  brk-specific accounting here.
 17.1609 + */
 17.1610 +unsigned long do_brk(unsigned long addr, unsigned long len)
 17.1611 +{
 17.1612 +	struct mm_struct * mm = current->mm;
 17.1613 +	struct vm_area_struct * vma, * prev;
 17.1614 +	unsigned long flags;
 17.1615 +	struct rb_node ** rb_link, * rb_parent;
 17.1616 +	pgoff_t pgoff = addr >> PAGE_SHIFT;
 17.1617 +
 17.1618 +	len = PAGE_ALIGN(len);
 17.1619 +	if (!len)
 17.1620 +		return addr;
 17.1621 +
 17.1622 +	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
 17.1623 +		return -EINVAL;
 17.1624 +
 17.1625 +	/*
 17.1626 +	 * mlock MCL_FUTURE?
 17.1627 +	 */
 17.1628 +	if (mm->def_flags & VM_LOCKED) {
 17.1629 +		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
 17.1630 +		locked += len;
 17.1631 +		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
 17.1632 +			return -EAGAIN;
 17.1633 +	}
 17.1634 +
 17.1635 +	/*
  17.1636 +	 * Clear old maps.  This also does some error checking for us.
 17.1637 +	 */
 17.1638 + munmap_back:
 17.1639 +	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 17.1640 +	if (vma && vma->vm_start < addr + len) {
 17.1641 +		if (do_munmap(mm, addr, len))
 17.1642 +			return -ENOMEM;
 17.1643 +		goto munmap_back;
 17.1644 +	}
 17.1645 +
 17.1646 +	/* Check against address space limits *after* clearing old maps... */
 17.1647 +	if ((mm->total_vm << PAGE_SHIFT) + len
 17.1648 +	    > current->rlim[RLIMIT_AS].rlim_cur)
 17.1649 +		return -ENOMEM;
 17.1650 +
 17.1651 +	if (mm->map_count > sysctl_max_map_count)
 17.1652 +		return -ENOMEM;
 17.1653 +
 17.1654 +	if (security_vm_enough_memory(len >> PAGE_SHIFT))
 17.1655 +		return -ENOMEM;
 17.1656 +
 17.1657 +	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 17.1658 +
 17.1659 +	/* Can we just expand an old private anonymous mapping? */
 17.1660 +	if (vma_merge(mm, prev, addr, addr + len, flags,
 17.1661 +					NULL, NULL, pgoff, NULL))
 17.1662 +		goto out;
 17.1663 +
 17.1664 +	/*
 17.1665 +	 * create a vma struct for an anonymous mapping
 17.1666 +	 */
 17.1667 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 17.1668 +	if (!vma) {
 17.1669 +		vm_unacct_memory(len >> PAGE_SHIFT);
 17.1670 +		return -ENOMEM;
 17.1671 +	}
 17.1672 +	memset(vma, 0, sizeof(*vma));
 17.1673 +
 17.1674 +	vma->vm_mm = mm;
 17.1675 +	vma->vm_start = addr;
 17.1676 +	vma->vm_end = addr + len;
 17.1677 +	vma->vm_pgoff = pgoff;
 17.1678 +	vma->vm_flags = flags;
 17.1679 +	vma->vm_page_prot = protection_map[flags & 0x0f];
 17.1680 +	vma_link(mm, vma, prev, rb_link, rb_parent);
 17.1681 +out:
 17.1682 +	mm->total_vm += len >> PAGE_SHIFT;
 17.1683 +	if (flags & VM_LOCKED) {
 17.1684 +		mm->locked_vm += len >> PAGE_SHIFT;
 17.1685 +		make_pages_present(addr, addr + len);
 17.1686 +	}
 17.1687 +	return addr;
 17.1688 +}
 17.1689 +
 17.1690 +EXPORT_SYMBOL(do_brk);
 17.1691 +
 17.1692 +/* Release all mmaps. */
 17.1693 +void exit_mmap(struct mm_struct *mm)
 17.1694 +{
 17.1695 +	struct mmu_gather *tlb;
 17.1696 +	struct vm_area_struct *vma;
 17.1697 +	unsigned long nr_accounted = 0;
 17.1698 +
 17.1699 +	profile_exit_mmap(mm);
 17.1700 + 
 17.1701 +	lru_add_drain();
 17.1702 +
 17.1703 +	spin_lock(&mm->page_table_lock);
 17.1704 +
 17.1705 +	tlb = tlb_gather_mmu(mm, 1);
 17.1706 +	flush_cache_mm(mm);
 17.1707 +	/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
 17.1708 +	mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
 17.1709 +					~0UL, &nr_accounted, NULL);
 17.1710 +	vm_unacct_memory(nr_accounted);
 17.1711 +	BUG_ON(mm->map_count);	/* This is just debugging */
 17.1712 +	clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
 17.1713 +	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
 17.1714 +
 17.1715 +	vma = mm->mmap;
 17.1716 +	mm->mmap = mm->mmap_cache = NULL;
 17.1717 +	mm->mm_rb = RB_ROOT;
 17.1718 +	mm->rss = 0;
 17.1719 +	mm->total_vm = 0;
 17.1720 +	mm->locked_vm = 0;
 17.1721 +
 17.1722 +	spin_unlock(&mm->page_table_lock);
 17.1723 +
 17.1724 +	/*
 17.1725 +	 * Walk the list again, actually closing and freeing it
 17.1726 +	 * without holding any MM locks.
 17.1727 +	 */
 17.1728 +	while (vma) {
 17.1729 +		struct vm_area_struct *next = vma->vm_next;
 17.1730 +		remove_vm_struct(vma);
 17.1731 +		vma = next;
 17.1732 +	}
 17.1733 +}
 17.1734 +
 17.1735 +/* Insert vm structure into process list sorted by address
 17.1736 + * and into the inode's i_mmap tree.  If vm_file is non-NULL
 17.1737 + * then i_mmap_lock is taken here.
 17.1738 + */
 17.1739 +void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 17.1740 +{
 17.1741 +	struct vm_area_struct * __vma, * prev;
 17.1742 +	struct rb_node ** rb_link, * rb_parent;
 17.1743 +
 17.1744 +	/*
 17.1745 +	 * The vm_pgoff of a purely anonymous vma should be irrelevant
 17.1746 +	 * until its first write fault, when page's anon_vma and index
 17.1747 +	 * are set.  But now set the vm_pgoff it will almost certainly
 17.1748 +	 * end up with (unless mremap moves it elsewhere before that
 17.1749 +	 * first wfault), so /proc/pid/maps tells a consistent story.
 17.1750 +	 *
 17.1751 +	 * By setting it to reflect the virtual start address of the
 17.1752 +	 * vma, merges and splits can happen in a seamless way, just
 17.1753 +	 * using the existing file pgoff checks and manipulations.
 17.1754 +	 * Similarly in do_mmap_pgoff and in do_brk.
 17.1755 +	 */
 17.1756 +	if (!vma->vm_file) {
 17.1757 +		BUG_ON(vma->anon_vma);
 17.1758 +		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
 17.1759 +	}
 17.1760 +	__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
 17.1761 +	if (__vma && __vma->vm_start < vma->vm_end)
 17.1762 +		BUG();
 17.1763 +	vma_link(mm, vma, prev, rb_link, rb_parent);
 17.1764 +}
 17.1765 +
 17.1766 +/*
 17.1767 + * Copy the vma structure to a new location in the same mm,
 17.1768 + * prior to moving page table entries, to effect an mremap move.
 17.1769 + */
 17.1770 +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 17.1771 +	unsigned long addr, unsigned long len, pgoff_t pgoff)
 17.1772 +{
 17.1773 +	struct vm_area_struct *vma = *vmap;
 17.1774 +	unsigned long vma_start = vma->vm_start;
 17.1775 +	struct mm_struct *mm = vma->vm_mm;
 17.1776 +	struct vm_area_struct *new_vma, *prev;
 17.1777 +	struct rb_node **rb_link, *rb_parent;
 17.1778 +	struct mempolicy *pol;
 17.1779 +
 17.1780 +	/*
 17.1781 +	 * If anonymous vma has not yet been faulted, update new pgoff
 17.1782 +	 * to match new location, to increase its chance of merging.
 17.1783 +	 */
 17.1784 +	if (!vma->vm_file && !vma->anon_vma)
 17.1785 +		pgoff = addr >> PAGE_SHIFT;
 17.1786 +
 17.1787 +	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 17.1788 +	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
 17.1789 +			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
 17.1790 +	if (new_vma) {
 17.1791 +		/*
 17.1792 +		 * Source vma may have been merged into new_vma
 17.1793 +		 */
 17.1794 +		if (vma_start >= new_vma->vm_start &&
 17.1795 +		    vma_start < new_vma->vm_end)
 17.1796 +			*vmap = new_vma;
 17.1797 +	} else {
 17.1798 +		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 17.1799 +		if (new_vma) {
 17.1800 +			*new_vma = *vma;
 17.1801 +			vma_prio_tree_init(new_vma);
 17.1802 +			pol = mpol_copy(vma_policy(vma));
 17.1803 +			if (IS_ERR(pol)) {
 17.1804 +				kmem_cache_free(vm_area_cachep, new_vma);
 17.1805 +				return NULL;
 17.1806 +			}
 17.1807 +			vma_set_policy(new_vma, pol);
 17.1808 +			new_vma->vm_start = addr;
 17.1809 +			new_vma->vm_end = addr + len;
 17.1810 +			new_vma->vm_pgoff = pgoff;
 17.1811 +			if (new_vma->vm_file)
 17.1812 +				get_file(new_vma->vm_file);
 17.1813 +			if (new_vma->vm_ops && new_vma->vm_ops->open)
 17.1814 +				new_vma->vm_ops->open(new_vma);
 17.1815 +			vma_link(mm, new_vma, prev, rb_link, rb_parent);
 17.1816 +		}
 17.1817 +	}
 17.1818 +	return new_vma;
 17.1819 +}
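
The mm/mmap.c copy above largely tracks the stock 2.6.7 file; the visible Xen-specific addition is the arch_check_fixed_mapping() hook, which get_unmapped_area() now calls on the MAP_FIXED path and which defaults to 0 ("no objection") when HAVE_ARCH_CHECK_FIXED_MAPPING is not defined. The following is only an illustrative sketch of the kind of override an architecture could supply, not the actual implementation in arch/xen/i386/mm/mmap.c from this changeset; RESERVED_VA_START is an assumed placeholder for whatever virtual range the port must keep user mappings out of.

/*
 * Illustrative only: reject MAP_FIXED requests that would overlap a
 * reserved portion of the virtual address space.  A nonzero return is
 * handed straight back by get_unmapped_area(), so it must follow the
 * same "low bits set means error" convention used there.
 * HAVE_ARCH_CHECK_FIXED_MAPPING would be defined in an arch header.
 */
#define HAVE_ARCH_CHECK_FIXED_MAPPING

unsigned long
arch_check_fixed_mapping(struct file *filp, unsigned long addr,
			 unsigned long len, unsigned long pgoff,
			 unsigned long flags)
{
	if (addr + len > RESERVED_VA_START)	/* assumed constant */
		return -EINVAL;
	return 0;				/* no objection */
}
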
    18.1 --- a/tools/python/xen/lowlevel/xu/xu.c	Thu Aug 12 14:52:11 2004 +0000
    18.2 +++ b/tools/python/xen/lowlevel/xu/xu.c	Thu Aug 12 17:01:47 2004 +0000
    18.3 @@ -49,6 +49,13 @@
    18.4  /* Size of a machine page frame. */
    18.5  #define PAGE_SIZE 4096
    18.6  
    18.7 +#if defined(__i386__)
    18.8 +#define rmb() __asm__ __volatile__ ( "lock; addl $0,0(%%esp)" : : : "memory" )
    18.9 +#define wmb() __asm__ __volatile__ ( "" : : : "memory" )
   18.10 +#else
   18.11 +#error "Define barriers"
   18.12 +#endif
   18.13 +
   18.14  
   18.15  /*
   18.16   * *********************** NOTIFIER ***********************
   18.17 @@ -710,6 +717,9 @@ static PyObject *xu_port_read_request(Py
   18.18          return NULL;
   18.19      }
   18.20  
   18.21 +    /* Need to ensure we see the request, despite seeing the index update.*/
   18.22 +    rmb();
   18.23 +
   18.24      cmsg = &cif->tx_ring[MASK_CONTROL_IDX(c)];
   18.25      xum = PyObject_New(xu_message_object, &xu_message_type);
   18.26      memcpy(&xum->msg, cmsg, sizeof(*cmsg));
   18.27 @@ -745,6 +755,7 @@ static PyObject *xu_port_write_request(P
   18.28      cmsg = &cif->rx_ring[MASK_CONTROL_IDX(p)];
   18.29      memcpy(cmsg, &xum->msg, sizeof(*cmsg));
   18.30  
   18.31 +    wmb();
   18.32      xup->rx_req_prod = cif->rx_req_prod = p + 1;
   18.33  
   18.34      Py_INCREF(Py_None);
   18.35 @@ -768,6 +779,9 @@ static PyObject *xu_port_read_response(P
   18.36          return NULL;
   18.37      }
   18.38  
   18.39 +    /* Need to ensure we see the response, despite seeing the index update.*/
   18.40 +    rmb();
   18.41 +
   18.42      cmsg = &cif->rx_ring[MASK_CONTROL_IDX(c)];
   18.43      xum = PyObject_New(xu_message_object, &xu_message_type);
   18.44      memcpy(&xum->msg, cmsg, sizeof(*cmsg));
   18.45 @@ -803,6 +817,7 @@ static PyObject *xu_port_write_response(
   18.46      cmsg = &cif->tx_ring[MASK_CONTROL_IDX(p)];
   18.47      memcpy(cmsg, &xum->msg, sizeof(*cmsg));
   18.48  
   18.49 +    wmb();
   18.50      xup->tx_resp_prod = cif->tx_resp_prod = p + 1;
   18.51  
   18.52      Py_INCREF(Py_None);
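
The barriers added to xu.c pair up across the shared control ring: the producer fills a message slot and only then issues wmb() before publishing the new producer index, while the consumer samples the index, issues rmb(), and only then trusts the slot contents. A stand-alone sketch of that discipline is below; the ring layout and field names are illustrative rather than the real control-interface structures, and it relies on the rmb()/wmb() macros defined in the hunk above.

/*
 * Illustrative sketch only: not the real control-interface structures.
 * Uses the rmb()/wmb() macros defined in the xu.c hunk above.
 */
#define RING_SIZE 8			/* must be a power of two */
#define MASK_IDX(i) ((i) & (RING_SIZE - 1))

struct ring {
	struct { int payload; } msg[RING_SIZE];
	volatile unsigned int prod;	/* advanced by the producer only */
	volatile unsigned int cons;	/* advanced by the consumer only */
};

/* Producer: fill the slot first, then publish it by advancing prod. */
static int produce(struct ring *r, int payload)
{
	unsigned int p = r->prod;

	if (p - r->cons == RING_SIZE)
		return 0;		/* ring full */
	r->msg[MASK_IDX(p)].payload = payload;
	wmb();				/* slot contents visible before the index */
	r->prod = p + 1;
	return 1;
}

/* Consumer: sample prod, then rmb() before reading the slot it covers. */
static int consume(struct ring *r, int *payload)
{
	unsigned int c = r->cons;

	if (c == r->prod)
		return 0;		/* ring empty */
	rmb();				/* see the index update before the slot */
	*payload = r->msg[MASK_IDX(c)].payload;
	r->cons = c + 1;
	return 1;
}
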