ia64/xen-unstable

changeset 2213:8481b2eee50a

bitkeeper revision 1.1159.17.22 (411baa52nqHnDQFA6udVVgOtViVfeQ)

Merge freefall.cl.cam.ac.uk:/auto/groups/xeno/BK/xeno.bk
into freefall.cl.cam.ac.uk:/auto/groups/xeno/users/cl349/BK/xeno.bk-26dom0
author cl349@freefall.cl.cam.ac.uk
date Thu Aug 12 17:35:14 2004 +0000 (2004-08-12)
parents 9840fcd30668 b8884dc7fd28
children 249ef8d5db7d
files .rootkeys linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/main.c linux-2.6.7-xen-sparse/arch/xen/i386/mm/Makefile linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c linux-2.6.7-xen-sparse/arch/xen/kernel/ctrl_if.c linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgtable.h linux-2.6.7-xen-sparse/mm/mmap.c tools/python/xen/lowlevel/xu/xu.c
line diff
     1.1 --- a/.rootkeys	Thu Aug 12 17:34:21 2004 +0000
     1.2 +++ b/.rootkeys	Thu Aug 12 17:35:14 2004 +0000
     1.3 @@ -159,6 +159,7 @@ 4118cc35CbY8rfGVspF5O-7EkXBEAA linux-2.6
     1.4  40f562383SKvDStdtrvzr5fyCbW4rw linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c
     1.5  40f56239xcNylAxuGsQHwi1AyMLV8w linux-2.6.7-xen-sparse/arch/xen/i386/mm/init.c
     1.6  41062ab7CjxC1UBaFhOMWWdhHkIUyg linux-2.6.7-xen-sparse/arch/xen/i386/mm/ioremap.c
     1.7 +411b9db3oFpYQc4C-_mO2lRTcSz8UQ linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c
     1.8  40f5623906UYHv1rsVUeRc0tFT0dWw linux-2.6.7-xen-sparse/arch/xen/i386/mm/pgtable.c
     1.9  4107adf12ndy94MidCaivDibJ3pPAg linux-2.6.7-xen-sparse/arch/xen/i386/pci/Makefile
    1.10  4107adf1WcCgkhsdLTRGX52cOG1vJg linux-2.6.7-xen-sparse/arch/xen/i386/pci/direct.c
    1.11 @@ -241,6 +242,7 @@ 3f108af1ylCIm82H052FVTfXACBHrw linux-2.6
    1.12  3fa8e3f0kBLeE4To2vpdi3cpJbIkbQ linux-2.6.7-xen-sparse/include/asm-xen/suspend.h
    1.13  3f689063BoW-HWV3auUJ-OqXfcGArw linux-2.6.7-xen-sparse/include/asm-xen/xen_proc.h
    1.14  40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.7-xen-sparse/mkbuildtree
    1.15 +411b9db3dpQAK-pcP8WwcRHZGn2eKg linux-2.6.7-xen-sparse/mm/mmap.c
    1.16  410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.7-xen-sparse/mm/page_alloc.c
    1.17  40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs
    1.18  3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
     2.1 --- a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/main.c	Thu Aug 12 17:34:21 2004 +0000
     2.2 +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/main.c	Thu Aug 12 17:35:14 2004 +0000
     2.3 @@ -16,8 +16,6 @@
     2.4  #include <scsi/scsi.h>
     2.5  #include <asm/ctrl_if.h>
     2.6  
     2.7 -
     2.8 -
     2.9  typedef unsigned char byte; /* from linux/ide.h */
    2.10  
    2.11  #define BLKIF_STATE_CLOSED       0
    2.12 @@ -95,6 +93,7 @@ static inline void translate_req_to_mfn(
    2.13  static inline void flush_requests(void)
    2.14  {
    2.15      DISABLE_SCATTERGATHER();
    2.16 +    wmb(); /* Ensure that the frontend can see the requests. */
    2.17      blk_ring->req_prod = req_prod;
    2.18      notify_via_evtchn(blkif_evtchn);
    2.19  }
    2.20 @@ -533,7 +532,7 @@ static void kick_pending_request_queues(
    2.21  
    2.22  static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
    2.23  {
    2.24 -    BLKIF_RING_IDX i; 
    2.25 +    BLKIF_RING_IDX i, rp; 
    2.26      unsigned long flags; 
    2.27      struct buffer_head *bh, *next_bh;
    2.28      
    2.29 @@ -541,13 +540,14 @@ static void blkif_int(int irq, void *dev
    2.30  
    2.31      if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) )
    2.32      {
    2.33 -        printk("Bailed out\n");
    2.34 -        
    2.35          spin_unlock_irqrestore(&io_request_lock, flags);
    2.36          return;
    2.37      }
    2.38  
    2.39 -    for ( i = resp_cons; i != blk_ring->resp_prod; i++ )
    2.40 +    rp = blk_ring->resp_prod;
    2.41 +    rmb(); /* Ensure we see queued responses up to 'rp'. */
    2.42 +
    2.43 +    for ( i = resp_cons; i != rp; i++ )
    2.44      {
    2.45          blkif_response_t *bret = &blk_ring->ring[MASK_BLKIF_IDX(i)].resp;
    2.46          switch ( bret->operation )
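
The hunks above show the pattern this changeset applies to every split-driver shared ring in the tree: the producer issues wmb() before publishing its index, and the consumer snapshots the producer index once and issues rmb() before reading the entries that index covers, instead of re-reading the shared index on every loop iteration. The sketch below is only a reassembly of that discipline using the 2.4 blkfront names from this hunk; it assumes the surrounding kernel context (blk_ring, req_prod, resp_cons, the MASK/notify helpers, rmb/wmb from <asm/system.h>) and is not standalone code, and process_response() is a placeholder for the operation switch in the hunk.

    /* Producer: make the queued requests visible before the index
     * that advertises them, then kick the other end. */
    static inline void flush_requests(void)
    {
        DISABLE_SCATTERGATHER();
        wmb();                        /* requests before req_prod */
        blk_ring->req_prod = req_prod;
        notify_via_evtchn(blkif_evtchn);
    }

    /* Consumer: latch resp_prod once, then rmb() so every response
     * up to that index is seen fully written before it is used. */
    static void blkif_int_sketch(void)
    {
        BLKIF_RING_IDX i, rp;

        rp = blk_ring->resp_prod;
        rmb();                        /* responses up to 'rp' visible */

        for ( i = resp_cons; i != rp; i++ )
            process_response(&blk_ring->ring[MASK_BLKIF_IDX(i)].resp);

        resp_cons = i;
    }
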
     3.1 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/Makefile	Thu Aug 12 17:34:21 2004 +0000
     3.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/Makefile	Thu Aug 12 17:35:14 2004 +0000
     3.3 @@ -6,7 +6,7 @@ XENARCH	:= $(subst ",,$(CONFIG_XENARCH))
     3.4  
     3.5  CFLAGS	+= -Iarch/$(XENARCH)/mm
     3.6  
     3.7 -obj-y	:= init.o fault.o ioremap.o pgtable.o hypervisor.o
     3.8 +obj-y	:= init.o fault.o ioremap.o pgtable.o hypervisor.o mmap.o
     3.9  c-obj-y	:= extable.o pageattr.o 
    3.10  
    3.11  c-obj-$(CONFIG_DISCONTIGMEM)	+= discontig.o
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/mmap.c	Thu Aug 12 17:35:14 2004 +0000
     4.3 @@ -0,0 +1,60 @@
     4.4 +
     4.5 +#include <linux/slab.h>
     4.6 +#include <linux/mman.h>
     4.7 +#include <linux/init.h>
     4.8 +#include <asm/pgalloc.h>
     4.9 +
    4.10 +unsigned long
    4.11 +arch_get_unmapped_area(struct file *filp, unsigned long addr,
    4.12 +		unsigned long len, unsigned long pgoff, unsigned long flags)
    4.13 +{
    4.14 +	struct mm_struct *mm = current->mm;
    4.15 +	struct vm_area_struct *vma;
    4.16 +	unsigned long start_addr;
    4.17 +
    4.18 +	if (len > TASK_SIZE)
    4.19 +		return -ENOMEM;
    4.20 +
    4.21 +	if (addr) {
    4.22 +		addr = PAGE_ALIGN(addr);
    4.23 +		vma = find_vma(mm, addr);
    4.24 +		if (((TASK_SIZE - len) >= addr) &&
    4.25 +		    (addr >= (FIRST_USER_PGD_NR<<PGDIR_SHIFT)) &&
    4.26 +		    (!vma || ((addr + len) <= vma->vm_start)))
    4.27 +			return addr;
    4.28 +	}
    4.29 +	start_addr = addr = mm->free_area_cache;
    4.30 +
    4.31 +full_search:
    4.32 +	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
    4.33 +		/* At this point:  (!vma || addr < vma->vm_end). */
    4.34 +		if (TASK_SIZE - len < addr) {
    4.35 +			/*
    4.36 +			 * Start a new search - just in case we missed
    4.37 +			 * some holes.
    4.38 +			 */
    4.39 +			if (start_addr != TASK_UNMAPPED_BASE) {
    4.40 +				start_addr = addr = TASK_UNMAPPED_BASE;
    4.41 +				goto full_search;
    4.42 +			}
    4.43 +			return -ENOMEM;
    4.44 +		}
    4.45 +		if (!vma || addr + len <= vma->vm_start) {
    4.46 +			/*
    4.47 +			 * Remember the place where we stopped the search:
    4.48 +			 */
    4.49 +			mm->free_area_cache = addr + len;
    4.50 +			return addr;
    4.51 +		}
    4.52 +		addr = vma->vm_end;
    4.53 +	}
    4.54 +}
    4.55 +
    4.56 +unsigned long
    4.57 +arch_check_fixed_mapping(struct file *filp, unsigned long addr,
    4.58 +		unsigned long len, unsigned long pgoff, unsigned long flags)
    4.59 +{
    4.60 +	if ( addr < (FIRST_USER_PGD_NR<<PGDIR_SHIFT) )
    4.61 +		return -EINVAL;
    4.62 +	return 0;
    4.63 +}
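
This Xen-specific arch_get_unmapped_area() is the stock i386 first-fit search with one extra constraint: a hinted address is honoured only if it lies at or above FIRST_USER_PGD_NR << PGDIR_SHIFT, and arch_check_fixed_mapping() rejects MAP_FIXED requests below the same floor (the mm/mmap.c hunk further down wires both hooks into get_unmapped_area()). As a worked example of where that floor would land, using hypothetical constants rather than anything asserted by this tree:

    /* Illustration only. On a non-PAE i386 build PGDIR_SHIFT is 22
     * (one page-directory entry maps 4 MB). If FIRST_USER_PGD_NR
     * were 1, the floor would be
     *
     *     1UL << 22  ==  0x00400000  ==  4 MB
     *
     * so a MAP_FIXED mapping below 4 MB would fail with -EINVAL, and a
     * non-fixed hint below 4 MB would be ignored in favour of the
     * normal free_area_cache search. */
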
     5.1 --- a/linux-2.6.7-xen-sparse/arch/xen/kernel/ctrl_if.c	Thu Aug 12 17:34:21 2004 +0000
     5.2 +++ b/linux-2.6.7-xen-sparse/arch/xen/kernel/ctrl_if.c	Thu Aug 12 17:35:14 2004 +0000
     5.3 @@ -93,8 +93,12 @@ static void __ctrl_if_tx_tasklet(unsigne
     5.4      control_if_t *ctrl_if = get_ctrl_if();
     5.5      ctrl_msg_t   *msg;
     5.6      int           was_full = TX_FULL(ctrl_if);
     5.7 +    CONTROL_RING_IDX rp;
     5.8  
     5.9 -    while ( ctrl_if_tx_resp_cons != ctrl_if->tx_resp_prod )
    5.10 +    rp = ctrl_if->tx_resp_prod;
    5.11 +    rmb(); /* Ensure we see all requests up to 'rp'. */
    5.12 +
    5.13 +    while ( ctrl_if_tx_resp_cons != rp )
    5.14      {
    5.15          msg = &ctrl_if->tx_ring[MASK_CONTROL_IDX(ctrl_if_tx_resp_cons)];
    5.16  
    5.17 @@ -132,8 +136,12 @@ static void __ctrl_if_tx_tasklet(unsigne
    5.18  static void __ctrl_if_rxmsg_deferred(void *unused)
    5.19  {
    5.20      ctrl_msg_t *msg;
    5.21 +    CONTROL_RING_IDX dp;
    5.22  
    5.23 -    while ( ctrl_if_rxmsg_deferred_cons != ctrl_if_rxmsg_deferred_prod )
    5.24 +    dp = ctrl_if_rxmsg_deferred_prod;
    5.25 +    rmb(); /* Ensure we see all deferred requests up to 'dp'. */
    5.26 +
    5.27 +    while ( ctrl_if_rxmsg_deferred_cons != dp )
    5.28      {
    5.29          msg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
    5.30              ctrl_if_rxmsg_deferred_cons++)];
    5.31 @@ -145,8 +153,13 @@ static void __ctrl_if_rx_tasklet(unsigne
    5.32  {
    5.33      control_if_t *ctrl_if = get_ctrl_if();
    5.34      ctrl_msg_t    msg, *pmsg;
    5.35 +    CONTROL_RING_IDX rp, dp;
    5.36  
    5.37 -    while ( ctrl_if_rx_req_cons != ctrl_if->rx_req_prod )
    5.38 +    dp = ctrl_if_rxmsg_deferred_prod;
    5.39 +    rp = ctrl_if->rx_req_prod;
    5.40 +    rmb(); /* Ensure we see all requests up to 'rp'. */
    5.41 +
    5.42 +    while ( ctrl_if_rx_req_cons != rp )
    5.43      {
    5.44          pmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)];
    5.45          memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg));
    5.46 @@ -161,20 +174,21 @@ static void __ctrl_if_rx_tasklet(unsigne
    5.47  
    5.48          if ( test_bit(msg.type, 
    5.49                        (unsigned long *)&ctrl_if_rxmsg_blocking_context) )
    5.50 -        {
    5.51 -            pmsg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
    5.52 -                ctrl_if_rxmsg_deferred_prod++)];
    5.53 -            memcpy(pmsg, &msg, offsetof(ctrl_msg_t, msg) + msg.length);
    5.54 +            memcpy(&ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(dp++)],
    5.55 +                   &msg, offsetof(ctrl_msg_t, msg) + msg.length);
    5.56 +        else
    5.57 +            (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0);
    5.58 +    }
    5.59 +
    5.60 +    if ( dp != ctrl_if_rxmsg_deferred_prod )
    5.61 +    {
    5.62 +        wmb();
    5.63 +        ctrl_if_rxmsg_deferred_prod = dp;
    5.64  #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
    5.65 -            schedule_task(&ctrl_if_rxmsg_deferred_tq);
    5.66 +        schedule_task(&ctrl_if_rxmsg_deferred_tq);
    5.67  #else
    5.68 -            schedule_work(&ctrl_if_rxmsg_deferred_work);
    5.69 +        schedule_work(&ctrl_if_rxmsg_deferred_work);
    5.70  #endif
    5.71 -        }
    5.72 -        else
    5.73 -        {
    5.74 -            (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0);
    5.75 -        }
    5.76      }
    5.77  }
    5.78  
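
The ctrl_if.c hunks above change more than barrier placement: the rx tasklet used to bump the shared ctrl_if_rxmsg_deferred_prod and schedule the deferred work once per blocking-context message, inside the loop; it now accumulates into a local index and publishes once, with a single wmb() between the copied messages and the index, so __ctrl_if_rxmsg_deferred() (which snapshots the index and issues rmb()) never sees an index covering a partially copied message, and the work item is scheduled at most once per tasklet run. Reassembled as one sketch, using the names from the hunk, omitting the payload-copy context the diff does not show, and assuming the surrounding ctrl_if.c definitions (2.6 path shown; the 2.4 path uses schedule_task()):

    static void rx_tasklet_sketch(control_if_t *ctrl_if)
    {
        CONTROL_RING_IDX rp, dp = ctrl_if_rxmsg_deferred_prod;
        ctrl_msg_t msg, *pmsg;

        rp = ctrl_if->rx_req_prod;
        rmb();                        /* see requests up to 'rp' */

        while ( ctrl_if_rx_req_cons != rp )
        {
            pmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)];
            memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg));
            /* ...payload copy elided (hidden diff context)... */
            if ( test_bit(msg.type,
                          (unsigned long *)&ctrl_if_rxmsg_blocking_context) )
                memcpy(&ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(dp++)],
                       &msg, offsetof(ctrl_msg_t, msg) + msg.length);
            else
                (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0);
        }

        if ( dp != ctrl_if_rxmsg_deferred_prod )
        {
            wmb();                    /* deferred copies before the index */
            ctrl_if_rxmsg_deferred_prod = dp;
            schedule_work(&ctrl_if_rxmsg_deferred_work);  /* one wakeup */
        }
    }
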
     6.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c	Thu Aug 12 17:34:21 2004 +0000
     6.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/blkback/blkback.c	Thu Aug 12 17:35:14 2004 +0000
     6.3 @@ -268,13 +268,15 @@ static int do_block_io_op(blkif_t *blkif
     6.4  {
     6.5      blkif_ring_t *blk_ring = blkif->blk_ring_base;
     6.6      blkif_request_t *req;
     6.7 -    BLKIF_RING_IDX i;
     6.8 +    BLKIF_RING_IDX i, rp;
     6.9      int more_to_do = 0;
    6.10  
    6.11 +    rp = blk_ring->req_prod;
    6.12 +    rmb(); /* Ensure we see queued requests up to 'rp'. */
    6.13 +
    6.14      /* Take items off the comms ring, taking care not to overflow. */
    6.15      for ( i = blkif->blk_req_cons; 
    6.16 -          (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) != 
    6.17 -                                        BLKIF_RING_SIZE);
    6.18 +          (i != rp) && ((i-blkif->blk_resp_prod) != BLKIF_RING_SIZE);
    6.19            i++ )
    6.20      {
    6.21          if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
    6.22 @@ -533,7 +535,7 @@ static void make_response(blkif_t *blkif
    6.23      resp->id        = id;
    6.24      resp->operation = op;
    6.25      resp->status    = st;
    6.26 -    wmb();
    6.27 +    wmb(); /* Ensure other side can see the response fields. */
    6.28      blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod;
    6.29      spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
    6.30  
     7.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c	Thu Aug 12 17:34:21 2004 +0000
     7.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/blkfront/blkfront.c	Thu Aug 12 17:35:14 2004 +0000
     7.3 @@ -82,6 +82,7 @@ static inline void translate_req_to_mfn(
     7.4  
     7.5  static inline void flush_requests(void)
     7.6  {
     7.7 +    wmb(); /* Ensure that the frontend can see the requests. */
     7.8      blk_ring->req_prod = req_prod;
     7.9      notify_via_evtchn(blkif_evtchn);
    7.10  }
    7.11 @@ -363,34 +364,39 @@ static irqreturn_t blkif_int(int irq, vo
    7.12  {
    7.13      struct request *req;
    7.14      blkif_response_t *bret;
    7.15 -    BLKIF_RING_IDX i; 
    7.16 +    BLKIF_RING_IDX i, rp;
    7.17      unsigned long flags; 
    7.18  
    7.19      spin_lock_irqsave(&blkif_io_lock, flags);     
    7.20  
    7.21 -    if (unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery)) {
    7.22 -        printk("Bailed out\n");
    7.23 -        
    7.24 +    if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) || 
    7.25 +         unlikely(recovery) )
    7.26 +    {
    7.27          spin_unlock_irqrestore(&blkif_io_lock, flags);
    7.28          return IRQ_HANDLED;
    7.29      }
    7.30  
    7.31 -    for (i = resp_cons; i != blk_ring->resp_prod; i++) {
    7.32 +    rp = blk_ring->resp_prod;
    7.33 +    rmb(); /* Ensure we see queued responses up to 'rp'. */
    7.34 +
    7.35 +    for ( i = resp_cons; i != rp; i++ )
    7.36 +    {
    7.37          bret = &blk_ring->ring[MASK_BLKIF_IDX(i)].resp;
    7.38 -        switch (bret->operation) {
    7.39 +        switch ( bret->operation )
    7.40 +        {
    7.41          case BLKIF_OP_READ:
    7.42          case BLKIF_OP_WRITE:
    7.43 -            if (unlikely(bret->status != BLKIF_RSP_OKAY))
    7.44 +            if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
    7.45                  DPRINTK("Bad return from blkdev data request: %lx\n",
    7.46                          bret->status);
    7.47              req = (struct request *)bret->id;
    7.48 -            /* XXXcl pass up status */
    7.49 -            if (unlikely(end_that_request_first(req, 1,
    7.50 -                                                req->hard_nr_sectors)))
    7.51 +            if ( unlikely(end_that_request_first
    7.52 +                          (req, 
    7.53 +                           (bret->status != BLKIF_RSP_OKAY),
    7.54 +                           req->hard_nr_sectors)) )
    7.55                  BUG();
    7.56 -
    7.57              end_that_request_last(req);
    7.58 -            blkif_completion( bret, req );
    7.59 +            blkif_completion(bret, req);
    7.60              break;
    7.61          case BLKIF_OP_PROBE:
    7.62              memcpy(&blkif_control_rsp, bret, sizeof(*bret));
    7.63 @@ -404,8 +410,9 @@ static irqreturn_t blkif_int(int irq, vo
    7.64      resp_cons = i;
    7.65      resp_cons_rec = i;
    7.66  
    7.67 -    if (xlbd_blk_queue &&
    7.68 -        test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags)) {
    7.69 +    if ( (xlbd_blk_queue != NULL) &&
    7.70 +         test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags) )
    7.71 +    {
    7.72          blk_start_queue(xlbd_blk_queue);
    7.73          /* XXXcl call to request_fn should not be needed but
    7.74           * we get stuck without...  needs investigating
     8.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c	Thu Aug 12 17:34:21 2004 +0000
     8.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c	Thu Aug 12 17:35:14 2004 +0000
     8.3 @@ -446,6 +446,7 @@ static void net_tx_action(unsigned long 
     8.4              netif_put(netif);
     8.5              continue;
     8.6          }
     8.7 +        rmb(); /* Ensure that we see the request. */
     8.8          memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, 
     8.9                 sizeof(txreq));
    8.10          netif->tx_req_cons++;
     9.1 --- a/linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c	Thu Aug 12 17:34:21 2004 +0000
     9.2 +++ b/linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c	Thu Aug 12 17:35:14 2004 +0000
     9.3 @@ -118,10 +118,8 @@ static void netctrl_init(void)
     9.4   */
     9.5  static int netctrl_err(int err)
     9.6  {
     9.7 -    if(err < 0 && !netctrl.err){
     9.8 +    if ( (err < 0) && !netctrl.err )
     9.9          netctrl.err = err;
    9.10 -        printk(KERN_WARNING "%s> err=%d\n", __FUNCTION__, err);
    9.11 -    }
    9.12      return netctrl.err;
    9.13  }
    9.14  
    9.15 @@ -177,7 +175,6 @@ static int network_open(struct net_devic
    9.16      return 0;
    9.17  }
    9.18  
    9.19 -
    9.20  static void network_tx_buf_gc(struct net_device *dev)
    9.21  {
    9.22      NETIF_RING_IDX i, prod;
    9.23 @@ -190,6 +187,7 @@ static void network_tx_buf_gc(struct net
    9.24  
    9.25      do {
    9.26          prod = np->tx->resp_prod;
    9.27 +        rmb(); /* Ensure we see responses up to 'rp'. */
    9.28  
    9.29          for ( i = np->tx_resp_cons; i != prod; i++ )
    9.30          {
    9.31 @@ -295,6 +293,7 @@ static void network_alloc_rx_buffers(str
    9.32      if ( rx_mcl[nr_pfns].args[5] != nr_pfns )
    9.33          panic("Unable to reduce memory reservation\n");
    9.34  
    9.35 +    /* Above is a suitable barrier to ensure backend will see requests. */
    9.36      np->rx->req_prod = i;
    9.37  }
    9.38  
    9.39 @@ -344,7 +343,7 @@ static int network_start_xmit(struct sk_
    9.40      tx->addr = virt_to_machine(skb->data);
    9.41      tx->size = skb->len;
    9.42  
    9.43 -    wmb();
    9.44 +    wmb(); /* Ensure that backend will see the request. */
    9.45      np->tx->req_prod = i + 1;
    9.46  
    9.47      network_tx_buf_gc(dev);
    9.48 @@ -392,7 +391,7 @@ static int netif_poll(struct net_device 
    9.49      struct net_private *np = dev->priv;
    9.50      struct sk_buff *skb;
    9.51      netif_rx_response_t *rx;
    9.52 -    NETIF_RING_IDX i;
    9.53 +    NETIF_RING_IDX i, rp;
    9.54      mmu_update_t *mmu = rx_mmu;
    9.55      multicall_entry_t *mcl = rx_mcl;
    9.56      int work_done, budget, more_to_do = 1;
    9.57 @@ -412,8 +411,11 @@ static int netif_poll(struct net_device 
    9.58      if ( (budget = *pbudget) > dev->quota )
    9.59          budget = dev->quota;
    9.60  
    9.61 +    rp = np->rx->resp_prod;
    9.62 +    rmb(); /* Ensure we see queued responses up to 'rp'. */
    9.63 +
    9.64      for ( i = np->rx_resp_cons, work_done = 0; 
    9.65 -          (i != np->rx->resp_prod) && (work_done < budget); 
    9.66 +          (i != rp) && (work_done < budget); 
    9.67            i++, work_done++ )
    9.68      {
    9.69          rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
    9.70 @@ -904,9 +906,8 @@ void netif_suspend(void)
    9.71  
    9.72  void netif_resume(void)
    9.73  {
    9.74 -    ctrl_msg_t                       cmsg;
    9.75 -    netif_fe_interface_connect_t     up;
    9.76 -//    netif_fe_driver_status_changed_t   st;
    9.77 +    ctrl_msg_t                   cmsg;
    9.78 +    netif_fe_interface_connect_t up;
    9.79      struct net_device *dev = NULL;
    9.80      struct net_private *np = NULL;
    9.81      int i;
    10.1 --- a/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Thu Aug 12 17:34:21 2004 +0000
    10.2 +++ b/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Thu Aug 12 17:35:14 2004 +0000
    10.3 @@ -462,4 +462,7 @@ static inline unsigned long arbitrary_vi
    10.4  #define __HAVE_ARCH_PTE_SAME
    10.5  #include <asm-generic/pgtable.h>
    10.6  
    10.7 +#define HAVE_ARCH_UNMAPPED_AREA
    10.8 +#define HAVE_ARCH_CHECK_FIXED_MAPPING
    10.9 +
   10.10  #endif /* _I386_PGTABLE_H */
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/linux-2.6.7-xen-sparse/mm/mmap.c	Thu Aug 12 17:35:14 2004 +0000
    11.3 @@ -0,0 +1,1816 @@
    11.4 +/*
    11.5 + * mm/mmap.c
    11.6 + *
    11.7 + * Written by obz.
    11.8 + *
    11.9 + * Address space accounting code	<alan@redhat.com>
   11.10 + */
   11.11 +
   11.12 +#include <linux/slab.h>
   11.13 +#include <linux/shm.h>
   11.14 +#include <linux/mman.h>
   11.15 +#include <linux/pagemap.h>
   11.16 +#include <linux/swap.h>
   11.17 +#include <linux/syscalls.h>
   11.18 +#include <linux/init.h>
   11.19 +#include <linux/file.h>
   11.20 +#include <linux/fs.h>
   11.21 +#include <linux/personality.h>
   11.22 +#include <linux/security.h>
   11.23 +#include <linux/hugetlb.h>
   11.24 +#include <linux/profile.h>
   11.25 +#include <linux/module.h>
   11.26 +#include <linux/mount.h>
   11.27 +#include <linux/mempolicy.h>
   11.28 +#include <linux/rmap.h>
   11.29 +
   11.30 +#include <asm/uaccess.h>
   11.31 +#include <asm/pgalloc.h>
   11.32 +#include <asm/cacheflush.h>
   11.33 +#include <asm/tlb.h>
   11.34 +
   11.35 +/*
   11.36 + * WARNING: the debugging will use recursive algorithms so never enable this
   11.37 + * unless you know what you are doing.
   11.38 + */
   11.39 +#undef DEBUG_MM_RB
   11.40 +
   11.41 +/* description of effects of mapping type and prot in current implementation.
   11.42 + * this is due to the limited x86 page protection hardware.  The expected
   11.43 + * behavior is in parens:
   11.44 + *
   11.45 + * map_type	prot
   11.46 + *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
   11.47 + * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
   11.48 + *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
   11.49 + *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
   11.50 + *		
   11.51 + * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
   11.52 + *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
   11.53 + *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
   11.54 + *
   11.55 + */
   11.56 +pgprot_t protection_map[16] = {
   11.57 +	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
   11.58 +	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
   11.59 +};
   11.60 +
   11.61 +int sysctl_overcommit_memory = 0;	/* default is heuristic overcommit */
   11.62 +int sysctl_overcommit_ratio = 50;	/* default is 50% */
   11.63 +int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
   11.64 +atomic_t vm_committed_space = ATOMIC_INIT(0);
   11.65 +
   11.66 +EXPORT_SYMBOL(sysctl_overcommit_memory);
   11.67 +EXPORT_SYMBOL(sysctl_overcommit_ratio);
   11.68 +EXPORT_SYMBOL(sysctl_max_map_count);
   11.69 +EXPORT_SYMBOL(vm_committed_space);
   11.70 +
   11.71 +/*
   11.72 + * Requires inode->i_mapping->i_mmap_lock
   11.73 + */
   11.74 +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
   11.75 +		struct file *file, struct address_space *mapping)
   11.76 +{
   11.77 +	if (vma->vm_flags & VM_DENYWRITE)
   11.78 +		atomic_inc(&file->f_dentry->d_inode->i_writecount);
   11.79 +	if (vma->vm_flags & VM_SHARED)
   11.80 +		mapping->i_mmap_writable--;
   11.81 +
   11.82 +	flush_dcache_mmap_lock(mapping);
   11.83 +	if (unlikely(vma->vm_flags & VM_NONLINEAR))
   11.84 +		list_del_init(&vma->shared.vm_set.list);
   11.85 +	else
   11.86 +		vma_prio_tree_remove(vma, &mapping->i_mmap);
   11.87 +	flush_dcache_mmap_unlock(mapping);
   11.88 +}
   11.89 +
   11.90 +/*
   11.91 + * Remove one vm structure and free it.
   11.92 + */
   11.93 +static void remove_vm_struct(struct vm_area_struct *vma)
   11.94 +{
   11.95 +	struct file *file = vma->vm_file;
   11.96 +
   11.97 +	if (file) {
   11.98 +		struct address_space *mapping = file->f_mapping;
   11.99 +		spin_lock(&mapping->i_mmap_lock);
  11.100 +		__remove_shared_vm_struct(vma, file, mapping);
  11.101 +		spin_unlock(&mapping->i_mmap_lock);
  11.102 +	}
  11.103 +	if (vma->vm_ops && vma->vm_ops->close)
  11.104 +		vma->vm_ops->close(vma);
  11.105 +	if (file)
  11.106 +		fput(file);
  11.107 +	anon_vma_unlink(vma);
  11.108 +	mpol_free(vma_policy(vma));
  11.109 +	kmem_cache_free(vm_area_cachep, vma);
  11.110 +}
  11.111 +
  11.112 +/*
  11.113 + *  sys_brk() for the most part doesn't need the global kernel
  11.114 + *  lock, except when an application is doing something nasty
  11.115 + *  like trying to un-brk an area that has already been mapped
  11.116 + *  to a regular file.  in this case, the unmapping will need
  11.117 + *  to invoke file system routines that need the global lock.
  11.118 + */
  11.119 +asmlinkage unsigned long sys_brk(unsigned long brk)
  11.120 +{
  11.121 +	unsigned long rlim, retval;
  11.122 +	unsigned long newbrk, oldbrk;
  11.123 +	struct mm_struct *mm = current->mm;
  11.124 +
  11.125 +	down_write(&mm->mmap_sem);
  11.126 +
  11.127 +	if (brk < mm->end_code)
  11.128 +		goto out;
  11.129 +	newbrk = PAGE_ALIGN(brk);
  11.130 +	oldbrk = PAGE_ALIGN(mm->brk);
  11.131 +	if (oldbrk == newbrk)
  11.132 +		goto set_brk;
  11.133 +
  11.134 +	/* Always allow shrinking brk. */
  11.135 +	if (brk <= mm->brk) {
  11.136 +		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
  11.137 +			goto set_brk;
  11.138 +		goto out;
  11.139 +	}
  11.140 +
  11.141 +	/* Check against rlimit.. */
  11.142 +	rlim = current->rlim[RLIMIT_DATA].rlim_cur;
  11.143 +	if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
  11.144 +		goto out;
  11.145 +
  11.146 +	/* Check against existing mmap mappings. */
  11.147 +	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
  11.148 +		goto out;
  11.149 +
  11.150 +	/* Ok, looks good - let it rip. */
  11.151 +	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
  11.152 +		goto out;
  11.153 +set_brk:
  11.154 +	mm->brk = brk;
  11.155 +out:
  11.156 +	retval = mm->brk;
  11.157 +	up_write(&mm->mmap_sem);
  11.158 +	return retval;
  11.159 +}
  11.160 +
  11.161 +#ifdef DEBUG_MM_RB
  11.162 +static int browse_rb(struct rb_root *root)
  11.163 +{
  11.164 +	int i = 0, j;
  11.165 +	struct rb_node *nd, *pn = NULL;
  11.166 +	unsigned long prev = 0, pend = 0;
  11.167 +
  11.168 +	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
  11.169 +		struct vm_area_struct *vma;
  11.170 +		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
  11.171 +		if (vma->vm_start < prev)
  11.172 +			printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
  11.173 +		if (vma->vm_start < pend)
  11.174 +			printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
  11.175 +		if (vma->vm_start > vma->vm_end)
  11.176 +			printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
  11.177 +		i++;
  11.178 +		pn = nd;
  11.179 +	}
  11.180 +	j = 0;
  11.181 +	for (nd = pn; nd; nd = rb_prev(nd)) {
  11.182 +		j++;
  11.183 +	}
  11.184 +	if (i != j)
  11.185 +		printk("backwards %d, forwards %d\n", j, i), i = 0;
  11.186 +	return i;
  11.187 +}
  11.188 +
  11.189 +void validate_mm(struct mm_struct *mm)
  11.190 +{
  11.191 +	int bug = 0;
  11.192 +	int i = 0;
  11.193 +	struct vm_area_struct *tmp = mm->mmap;
  11.194 +	while (tmp) {
  11.195 +		tmp = tmp->vm_next;
  11.196 +		i++;
  11.197 +	}
  11.198 +	if (i != mm->map_count)
  11.199 +		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
  11.200 +	i = browse_rb(&mm->mm_rb);
  11.201 +	if (i != mm->map_count)
  11.202 +		printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
  11.203 +	if (bug)
  11.204 +		BUG();
  11.205 +}
  11.206 +#else
  11.207 +#define validate_mm(mm) do { } while (0)
  11.208 +#endif
  11.209 +
  11.210 +static struct vm_area_struct *
  11.211 +find_vma_prepare(struct mm_struct *mm, unsigned long addr,
  11.212 +		struct vm_area_struct **pprev, struct rb_node ***rb_link,
  11.213 +		struct rb_node ** rb_parent)
  11.214 +{
  11.215 +	struct vm_area_struct * vma;
  11.216 +	struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
  11.217 +
  11.218 +	__rb_link = &mm->mm_rb.rb_node;
  11.219 +	rb_prev = __rb_parent = NULL;
  11.220 +	vma = NULL;
  11.221 +
  11.222 +	while (*__rb_link) {
  11.223 +		struct vm_area_struct *vma_tmp;
  11.224 +
  11.225 +		__rb_parent = *__rb_link;
  11.226 +		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
  11.227 +
  11.228 +		if (vma_tmp->vm_end > addr) {
  11.229 +			vma = vma_tmp;
  11.230 +			if (vma_tmp->vm_start <= addr)
  11.231 +				return vma;
  11.232 +			__rb_link = &__rb_parent->rb_left;
  11.233 +		} else {
  11.234 +			rb_prev = __rb_parent;
  11.235 +			__rb_link = &__rb_parent->rb_right;
  11.236 +		}
  11.237 +	}
  11.238 +
  11.239 +	*pprev = NULL;
  11.240 +	if (rb_prev)
  11.241 +		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
  11.242 +	*rb_link = __rb_link;
  11.243 +	*rb_parent = __rb_parent;
  11.244 +	return vma;
  11.245 +}
  11.246 +
  11.247 +static inline void
  11.248 +__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
  11.249 +		struct vm_area_struct *prev, struct rb_node *rb_parent)
  11.250 +{
  11.251 +	if (prev) {
  11.252 +		vma->vm_next = prev->vm_next;
  11.253 +		prev->vm_next = vma;
  11.254 +	} else {
  11.255 +		mm->mmap = vma;
  11.256 +		if (rb_parent)
  11.257 +			vma->vm_next = rb_entry(rb_parent,
  11.258 +					struct vm_area_struct, vm_rb);
  11.259 +		else
  11.260 +			vma->vm_next = NULL;
  11.261 +	}
  11.262 +}
  11.263 +
  11.264 +void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
  11.265 +		struct rb_node **rb_link, struct rb_node *rb_parent)
  11.266 +{
  11.267 +	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
  11.268 +	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
  11.269 +}
  11.270 +
  11.271 +static inline void __vma_link_file(struct vm_area_struct *vma)
  11.272 +{
  11.273 +	struct file * file;
  11.274 +
  11.275 +	file = vma->vm_file;
  11.276 +	if (file) {
  11.277 +		struct address_space *mapping = file->f_mapping;
  11.278 +
  11.279 +		if (vma->vm_flags & VM_DENYWRITE)
  11.280 +			atomic_dec(&file->f_dentry->d_inode->i_writecount);
  11.281 +		if (vma->vm_flags & VM_SHARED)
  11.282 +			mapping->i_mmap_writable++;
  11.283 +
  11.284 +		flush_dcache_mmap_lock(mapping);
  11.285 +		if (unlikely(vma->vm_flags & VM_NONLINEAR))
  11.286 +			list_add_tail(&vma->shared.vm_set.list,
  11.287 +					&mapping->i_mmap_nonlinear);
  11.288 +		else
  11.289 +			vma_prio_tree_insert(vma, &mapping->i_mmap);
  11.290 +		flush_dcache_mmap_unlock(mapping);
  11.291 +	}
  11.292 +}
  11.293 +
  11.294 +static void
  11.295 +__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  11.296 +	struct vm_area_struct *prev, struct rb_node **rb_link,
  11.297 +	struct rb_node *rb_parent)
  11.298 +{
  11.299 +	__vma_link_list(mm, vma, prev, rb_parent);
  11.300 +	__vma_link_rb(mm, vma, rb_link, rb_parent);
  11.301 +	__anon_vma_link(vma);
  11.302 +}
  11.303 +
  11.304 +static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  11.305 +			struct vm_area_struct *prev, struct rb_node **rb_link,
  11.306 +			struct rb_node *rb_parent)
  11.307 +{
  11.308 +	struct address_space *mapping = NULL;
  11.309 +
  11.310 +	if (vma->vm_file)
  11.311 +		mapping = vma->vm_file->f_mapping;
  11.312 +
  11.313 +	if (mapping)
  11.314 +		spin_lock(&mapping->i_mmap_lock);
  11.315 +	anon_vma_lock(vma);
  11.316 +
  11.317 +	__vma_link(mm, vma, prev, rb_link, rb_parent);
  11.318 +	__vma_link_file(vma);
  11.319 +
  11.320 +	anon_vma_unlock(vma);
  11.321 +	if (mapping)
  11.322 +		spin_unlock(&mapping->i_mmap_lock);
  11.323 +
  11.324 +	mark_mm_hugetlb(mm, vma);
  11.325 +	mm->map_count++;
  11.326 +	validate_mm(mm);
  11.327 +}
  11.328 +
  11.329 +/*
  11.330 + * Helper for vma_adjust in the split_vma insert case:
  11.331 + * insert vm structure into list and rbtree and anon_vma,
  11.332 + * but it has already been inserted into prio_tree earlier.
  11.333 + */
  11.334 +static void
  11.335 +__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
  11.336 +{
  11.337 +	struct vm_area_struct * __vma, * prev;
  11.338 +	struct rb_node ** rb_link, * rb_parent;
  11.339 +
  11.340 +	__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
  11.341 +	if (__vma && __vma->vm_start < vma->vm_end)
  11.342 +		BUG();
  11.343 +	__vma_link(mm, vma, prev, rb_link, rb_parent);
  11.344 +	mm->map_count++;
  11.345 +}
  11.346 +
  11.347 +static inline void
  11.348 +__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
  11.349 +		struct vm_area_struct *prev)
  11.350 +{
  11.351 +	prev->vm_next = vma->vm_next;
  11.352 +	rb_erase(&vma->vm_rb, &mm->mm_rb);
  11.353 +	if (mm->mmap_cache == vma)
  11.354 +		mm->mmap_cache = prev;
  11.355 +}
  11.356 +
  11.357 +/*
  11.358 + * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
  11.359 + * is already present in an i_mmap tree without adjusting the tree.
  11.360 + * The following helper function should be used when such adjustments
  11.361 + * are necessary.  The "insert" vma (if any) is to be inserted
  11.362 + * before we drop the necessary locks.
  11.363 + */
  11.364 +void vma_adjust(struct vm_area_struct *vma, unsigned long start,
  11.365 +	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
  11.366 +{
  11.367 +	struct mm_struct *mm = vma->vm_mm;
  11.368 +	struct vm_area_struct *next = vma->vm_next;
  11.369 +	struct address_space *mapping = NULL;
  11.370 +	struct prio_tree_root *root = NULL;
  11.371 +	struct file *file = vma->vm_file;
  11.372 +	struct anon_vma *anon_vma = NULL;
  11.373 +	long adjust_next = 0;
  11.374 +	int remove_next = 0;
  11.375 +
  11.376 +	if (next && !insert) {
  11.377 +		if (end >= next->vm_end) {
  11.378 +			/*
  11.379 +			 * vma expands, overlapping all the next, and
  11.380 +			 * perhaps the one after too (mprotect case 6).
  11.381 +			 */
  11.382 +again:			remove_next = 1 + (end > next->vm_end);
  11.383 +			end = next->vm_end;
  11.384 +			anon_vma = next->anon_vma;
  11.385 +		} else if (end > next->vm_start) {
  11.386 +			/*
  11.387 +			 * vma expands, overlapping part of the next:
  11.388 +			 * mprotect case 5 shifting the boundary up.
  11.389 +			 */
  11.390 +			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
  11.391 +			anon_vma = next->anon_vma;
  11.392 +		} else if (end < vma->vm_end) {
  11.393 +			/*
  11.394 +			 * vma shrinks, and !insert tells it's not
  11.395 +			 * split_vma inserting another: so it must be
  11.396 +			 * mprotect case 4 shifting the boundary down.
  11.397 +			 */
  11.398 +			adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
  11.399 +			anon_vma = next->anon_vma;
  11.400 +		}
  11.401 +	}
  11.402 +
  11.403 +	if (file) {
  11.404 +		mapping = file->f_mapping;
  11.405 +		if (!(vma->vm_flags & VM_NONLINEAR))
  11.406 +			root = &mapping->i_mmap;
  11.407 +		spin_lock(&mapping->i_mmap_lock);
  11.408 +		if (insert) {
  11.409 +			/*
  11.410 +			 * Put into prio_tree now, so instantiated pages
  11.411 +			 * are visible to arm/parisc __flush_dcache_page
  11.412 +			 * throughout; but we cannot insert into address
  11.413 +			 * space until vma start or end is updated.
  11.414 +			 */
  11.415 +			__vma_link_file(insert);
  11.416 +		}
  11.417 +	}
  11.418 +
  11.419 +	/*
  11.420 +	 * When changing only vma->vm_end, we don't really need
  11.421 +	 * anon_vma lock: but is that case worth optimizing out?
  11.422 +	 */
  11.423 +	if (vma->anon_vma)
  11.424 +		anon_vma = vma->anon_vma;
  11.425 +	if (anon_vma)
  11.426 +		spin_lock(&anon_vma->lock);
  11.427 +
  11.428 +	if (root) {
  11.429 +		flush_dcache_mmap_lock(mapping);
  11.430 +		vma_prio_tree_remove(vma, root);
  11.431 +		if (adjust_next)
  11.432 +			vma_prio_tree_remove(next, root);
  11.433 +	}
  11.434 +
  11.435 +	vma->vm_start = start;
  11.436 +	vma->vm_end = end;
  11.437 +	vma->vm_pgoff = pgoff;
  11.438 +	if (adjust_next) {
  11.439 +		next->vm_start += adjust_next << PAGE_SHIFT;
  11.440 +		next->vm_pgoff += adjust_next;
  11.441 +	}
  11.442 +
  11.443 +	if (root) {
  11.444 +		if (adjust_next) {
  11.445 +			vma_prio_tree_init(next);
  11.446 +			vma_prio_tree_insert(next, root);
  11.447 +		}
  11.448 +		vma_prio_tree_init(vma);
  11.449 +		vma_prio_tree_insert(vma, root);
  11.450 +		flush_dcache_mmap_unlock(mapping);
  11.451 +	}
  11.452 +
  11.453 +	if (remove_next) {
  11.454 +		/*
  11.455 +		 * vma_merge has merged next into vma, and needs
  11.456 +		 * us to remove next before dropping the locks.
  11.457 +		 */
  11.458 +		__vma_unlink(mm, next, vma);
  11.459 +		if (file)
  11.460 +			__remove_shared_vm_struct(next, file, mapping);
  11.461 +		if (next->anon_vma)
  11.462 +			__anon_vma_merge(vma, next);
  11.463 +	} else if (insert) {
  11.464 +		/*
  11.465 +		 * split_vma has split insert from vma, and needs
  11.466 +		 * us to insert it before dropping the locks
  11.467 +		 * (it may either follow vma or precede it).
  11.468 +		 */
  11.469 +		__insert_vm_struct(mm, insert);
  11.470 +	}
  11.471 +
  11.472 +	if (anon_vma)
  11.473 +		spin_unlock(&anon_vma->lock);
  11.474 +	if (mapping)
  11.475 +		spin_unlock(&mapping->i_mmap_lock);
  11.476 +
  11.477 +	if (remove_next) {
  11.478 +		if (file)
  11.479 +			fput(file);
  11.480 +		mm->map_count--;
  11.481 +		mpol_free(vma_policy(next));
  11.482 +		kmem_cache_free(vm_area_cachep, next);
  11.483 +		/*
  11.484 +		 * In mprotect's case 6 (see comments on vma_merge),
  11.485 +		 * we must remove another next too. It would clutter
  11.486 +		 * up the code too much to do both in one go.
  11.487 +		 */
  11.488 +		if (remove_next == 2) {
  11.489 +			next = vma->vm_next;
  11.490 +			goto again;
  11.491 +		}
  11.492 +	}
  11.493 +
  11.494 +	validate_mm(mm);
  11.495 +}
  11.496 +
  11.497 +/*
  11.498 + * If the vma has a ->close operation then the driver probably needs to release
  11.499 + * per-vma resources, so we don't attempt to merge those.
  11.500 + */
  11.501 +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
  11.502 +
  11.503 +static inline int is_mergeable_vma(struct vm_area_struct *vma,
  11.504 +			struct file *file, unsigned long vm_flags)
  11.505 +{
  11.506 +	if (vma->vm_flags != vm_flags)
  11.507 +		return 0;
  11.508 +	if (vma->vm_file != file)
  11.509 +		return 0;
  11.510 +	if (vma->vm_ops && vma->vm_ops->close)
  11.511 +		return 0;
  11.512 +	return 1;
  11.513 +}
  11.514 +
  11.515 +static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  11.516 +					struct anon_vma *anon_vma2)
  11.517 +{
  11.518 +	return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
  11.519 +}
  11.520 +
  11.521 +/*
  11.522 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  11.523 + * in front of (at a lower virtual address and file offset than) the vma.
  11.524 + *
  11.525 + * We cannot merge two vmas if they have differently assigned (non-NULL)
  11.526 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  11.527 + *
  11.528 + * We don't check here for the merged mmap wrapping around the end of pagecache
  11.529 + * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
  11.530 + * wrap, nor mmaps which cover the final page at index -1UL.
  11.531 + */
  11.532 +static int
  11.533 +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
  11.534 +	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
  11.535 +{
  11.536 +	if (is_mergeable_vma(vma, file, vm_flags) &&
  11.537 +	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
  11.538 +		if (vma->vm_pgoff == vm_pgoff)
  11.539 +			return 1;
  11.540 +	}
  11.541 +	return 0;
  11.542 +}
  11.543 +
  11.544 +/*
  11.545 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
  11.546 + * beyond (at a higher virtual address and file offset than) the vma.
  11.547 + *
  11.548 + * We cannot merge two vmas if they have differently assigned (non-NULL)
  11.549 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
  11.550 + */
  11.551 +static int
  11.552 +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  11.553 +	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
  11.554 +{
  11.555 +	if (is_mergeable_vma(vma, file, vm_flags) &&
  11.556 +	    is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
  11.557 +		pgoff_t vm_pglen;
  11.558 +		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
  11.559 +		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
  11.560 +			return 1;
  11.561 +	}
  11.562 +	return 0;
  11.563 +}
  11.564 +
  11.565 +/*
  11.566 + * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
  11.567 + * whether that can be merged with its predecessor or its successor.
  11.568 + * Or both (it neatly fills a hole).
  11.569 + *
  11.570 + * In most cases - when called for mmap, brk or mremap - [addr,end) is
  11.571 + * certain not to be mapped by the time vma_merge is called; but when
  11.572 + * called for mprotect, it is certain to be already mapped (either at
  11.573 + * an offset within prev, or at the start of next), and the flags of
  11.574 + * this area are about to be changed to vm_flags - and the no-change
  11.575 + * case has already been eliminated.
  11.576 + *
  11.577 + * The following mprotect cases have to be considered, where AAAA is
  11.578 + * the area passed down from mprotect_fixup, never extending beyond one
  11.579 + * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
  11.580 + *
  11.581 + *     AAAA             AAAA                AAAA          AAAA
  11.582 + *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
  11.583 + *    cannot merge    might become    might become    might become
  11.584 + *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
  11.585 + *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
  11.586 + *    mremap move:                                    PPPPNNNNNNNN 8
  11.587 + *        AAAA
  11.588 + *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
  11.589 + *    might become    case 1 below    case 2 below    case 3 below
  11.590 + *
  11.591 + * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
  11.592 + * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
  11.593 + */
  11.594 +struct vm_area_struct *vma_merge(struct mm_struct *mm,
  11.595 +			struct vm_area_struct *prev, unsigned long addr,
  11.596 +			unsigned long end, unsigned long vm_flags,
  11.597 +		     	struct anon_vma *anon_vma, struct file *file,
  11.598 +			pgoff_t pgoff, struct mempolicy *policy)
  11.599 +{
  11.600 +	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
  11.601 +	struct vm_area_struct *area, *next;
  11.602 +
  11.603 +	/*
  11.604 +	 * We later require that vma->vm_flags == vm_flags,
  11.605 +	 * so this tests vma->vm_flags & VM_SPECIAL, too.
  11.606 +	 */
  11.607 +	if (vm_flags & VM_SPECIAL)
  11.608 +		return NULL;
  11.609 +
  11.610 +	if (prev)
  11.611 +		next = prev->vm_next;
  11.612 +	else
  11.613 +		next = mm->mmap;
  11.614 +	area = next;
  11.615 +	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
  11.616 +		next = next->vm_next;
  11.617 +
  11.618 +	/*
  11.619 +	 * Can it merge with the predecessor?
  11.620 +	 */
  11.621 +	if (prev && prev->vm_end == addr &&
  11.622 +  			mpol_equal(vma_policy(prev), policy) &&
  11.623 +			can_vma_merge_after(prev, vm_flags,
  11.624 +						anon_vma, file, pgoff)) {
  11.625 +		/*
  11.626 +		 * OK, it can.  Can we now merge in the successor as well?
  11.627 +		 */
  11.628 +		if (next && end == next->vm_start &&
  11.629 +				mpol_equal(policy, vma_policy(next)) &&
  11.630 +				can_vma_merge_before(next, vm_flags,
  11.631 +					anon_vma, file, pgoff+pglen) &&
  11.632 +				is_mergeable_anon_vma(prev->anon_vma,
  11.633 +						      next->anon_vma)) {
  11.634 +							/* cases 1, 6 */
  11.635 +			vma_adjust(prev, prev->vm_start,
  11.636 +				next->vm_end, prev->vm_pgoff, NULL);
  11.637 +		} else					/* cases 2, 5, 7 */
  11.638 +			vma_adjust(prev, prev->vm_start,
  11.639 +				end, prev->vm_pgoff, NULL);
  11.640 +		return prev;
  11.641 +	}
  11.642 +
  11.643 +	/*
  11.644 +	 * Can this new request be merged in front of next?
  11.645 +	 */
  11.646 +	if (next && end == next->vm_start &&
  11.647 + 			mpol_equal(policy, vma_policy(next)) &&
  11.648 +			can_vma_merge_before(next, vm_flags,
  11.649 +					anon_vma, file, pgoff+pglen)) {
  11.650 +		if (prev && addr < prev->vm_end)	/* case 4 */
  11.651 +			vma_adjust(prev, prev->vm_start,
  11.652 +				addr, prev->vm_pgoff, NULL);
  11.653 +		else					/* cases 3, 8 */
  11.654 +			vma_adjust(area, addr, next->vm_end,
  11.655 +				next->vm_pgoff - pglen, NULL);
  11.656 +		return area;
  11.657 +	}
  11.658 +
  11.659 +	return NULL;
  11.660 +}
  11.661 +
  11.662 +/*
  11.663 + * find_mergeable_anon_vma is used by anon_vma_prepare, to check
  11.664 + * neighbouring vmas for a suitable anon_vma, before it goes off
  11.665 + * to allocate a new anon_vma.  It checks because a repetitive
  11.666 + * sequence of mprotects and faults may otherwise lead to distinct
  11.667 + * anon_vmas being allocated, preventing vma merge in subsequent
  11.668 + * mprotect.
  11.669 + */
  11.670 +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
  11.671 +{
  11.672 +	struct vm_area_struct *near;
  11.673 +	unsigned long vm_flags;
  11.674 +
  11.675 +	near = vma->vm_next;
  11.676 +	if (!near)
  11.677 +		goto try_prev;
  11.678 +
  11.679 +	/*
  11.680 +	 * Since only mprotect tries to remerge vmas, match flags
  11.681 +	 * which might be mprotected into each other later on.
  11.682 +	 * Neither mlock nor madvise tries to remerge at present,
  11.683 +	 * so leave their flags as obstructing a merge.
  11.684 +	 */
  11.685 +	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
  11.686 +	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
  11.687 +
  11.688 +	if (near->anon_vma && vma->vm_end == near->vm_start &&
  11.689 + 			mpol_equal(vma_policy(vma), vma_policy(near)) &&
  11.690 +			can_vma_merge_before(near, vm_flags,
  11.691 +				NULL, vma->vm_file, vma->vm_pgoff +
  11.692 +				((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
  11.693 +		return near->anon_vma;
  11.694 +try_prev:
  11.695 +	/*
  11.696 +	 * It is potentially slow to have to call find_vma_prev here.
  11.697 +	 * But it's only on the first write fault on the vma, not
  11.698 +	 * every time, and we could devise a way to avoid it later
  11.699 +	 * (e.g. stash info in next's anon_vma_node when assigning
  11.700 +	 * an anon_vma, or when trying vma_merge).  Another time.
  11.701 +	 */
  11.702 +	if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma)
  11.703 +		BUG();
  11.704 +	if (!near)
  11.705 +		goto none;
  11.706 +
  11.707 +	vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
  11.708 +	vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
  11.709 +
  11.710 +	if (near->anon_vma && near->vm_end == vma->vm_start &&
  11.711 +  			mpol_equal(vma_policy(near), vma_policy(vma)) &&
  11.712 +			can_vma_merge_after(near, vm_flags,
  11.713 +				NULL, vma->vm_file, vma->vm_pgoff))
  11.714 +		return near->anon_vma;
  11.715 +none:
  11.716 +	/*
  11.717 +	 * There's no absolute need to look only at touching neighbours:
  11.718 +	 * we could search further afield for "compatible" anon_vmas.
  11.719 +	 * But it would probably just be a waste of time searching,
  11.720 +	 * or lead to too many vmas hanging off the same anon_vma.
  11.721 +	 * We're trying to allow mprotect remerging later on,
  11.722 +	 * not trying to minimize memory used for anon_vmas.
  11.723 +	 */
  11.724 +	return NULL;
  11.725 +}
  11.726 +
  11.727 +/*
  11.728 + * The caller must hold down_write(current->mm->mmap_sem).
  11.729 + */
  11.730 +
  11.731 +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
  11.732 +			unsigned long len, unsigned long prot,
  11.733 +			unsigned long flags, unsigned long pgoff)
  11.734 +{
  11.735 +	struct mm_struct * mm = current->mm;
  11.736 +	struct vm_area_struct * vma, * prev;
  11.737 +	struct inode *inode;
  11.738 +	unsigned int vm_flags;
  11.739 +	int correct_wcount = 0;
  11.740 +	int error;
  11.741 +	struct rb_node ** rb_link, * rb_parent;
  11.742 +	int accountable = 1;
  11.743 +	unsigned long charged = 0;
  11.744 +
  11.745 +	if (file) {
  11.746 +		if (is_file_hugepages(file))
  11.747 +			accountable = 0;
  11.748 +
  11.749 +		if (!file->f_op || !file->f_op->mmap)
  11.750 +			return -ENODEV;
  11.751 +
  11.752 +		if ((prot & PROT_EXEC) &&
  11.753 +		    (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
  11.754 +			return -EPERM;
  11.755 +	}
  11.756 +
  11.757 +	if (!len)
  11.758 +		return addr;
  11.759 +
  11.760 +	/* Careful about overflows.. */
  11.761 +	len = PAGE_ALIGN(len);
  11.762 +	if (!len || len > TASK_SIZE)
  11.763 +		return -EINVAL;
  11.764 +
  11.765 +	/* offset overflow? */
  11.766 +	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
  11.767 +		return -EINVAL;
  11.768 +
  11.769 +	/* Too many mappings? */
  11.770 +	if (mm->map_count > sysctl_max_map_count)
  11.771 +		return -ENOMEM;
  11.772 +
  11.773 +	/* Obtain the address to map to. we verify (or select) it and ensure
  11.774 +	 * that it represents a valid section of the address space.
  11.775 +	 */
  11.776 +	addr = get_unmapped_area(file, addr, len, pgoff, flags);
  11.777 +	if (addr & ~PAGE_MASK)
  11.778 +		return addr;
  11.779 +
  11.780 +	/* Do simple checking here so the lower-level routines won't have
  11.781 +	 * to. we assume access permissions have been handled by the open
  11.782 +	 * of the memory object, so we don't do any here.
  11.783 +	 */
  11.784 +	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
  11.785 +			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
  11.786 +
  11.787 +	if (flags & MAP_LOCKED) {
  11.788 +		if (!capable(CAP_IPC_LOCK))
  11.789 +			return -EPERM;
  11.790 +		vm_flags |= VM_LOCKED;
  11.791 +	}
  11.792 +	/* mlock MCL_FUTURE? */
  11.793 +	if (vm_flags & VM_LOCKED) {
  11.794 +		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
  11.795 +		locked += len;
  11.796 +		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
  11.797 +			return -EAGAIN;
  11.798 +	}
  11.799 +
  11.800 +	inode = file ? file->f_dentry->d_inode : NULL;
  11.801 +
  11.802 +	if (file) {
  11.803 +		switch (flags & MAP_TYPE) {
  11.804 +		case MAP_SHARED:
  11.805 +			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
  11.806 +				return -EACCES;
  11.807 +
  11.808 +			/*
  11.809 +			 * Make sure we don't allow writing to an append-only
  11.810 +			 * file..
  11.811 +			 */
  11.812 +			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
  11.813 +				return -EACCES;
  11.814 +
  11.815 +			/*
  11.816 +			 * Make sure there are no mandatory locks on the file.
  11.817 +			 */
  11.818 +			if (locks_verify_locked(inode))
  11.819 +				return -EAGAIN;
  11.820 +
  11.821 +			vm_flags |= VM_SHARED | VM_MAYSHARE;
  11.822 +			if (!(file->f_mode & FMODE_WRITE))
  11.823 +				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
  11.824 +
  11.825 +			/* fall through */
  11.826 +		case MAP_PRIVATE:
  11.827 +			if (!(file->f_mode & FMODE_READ))
  11.828 +				return -EACCES;
  11.829 +			break;
  11.830 +
  11.831 +		default:
  11.832 +			return -EINVAL;
  11.833 +		}
  11.834 +	} else {
  11.835 +		switch (flags & MAP_TYPE) {
  11.836 +		case MAP_SHARED:
  11.837 +			vm_flags |= VM_SHARED | VM_MAYSHARE;
  11.838 +			break;
  11.839 +		case MAP_PRIVATE:
  11.840 +			/*
  11.841 +			 * Set pgoff according to addr for anon_vma.
  11.842 +			 */
  11.843 +			pgoff = addr >> PAGE_SHIFT;
  11.844 +			break;
  11.845 +		default:
  11.846 +			return -EINVAL;
  11.847 +		}
  11.848 +	}
  11.849 +
  11.850 +	error = security_file_mmap(file, prot, flags);
  11.851 +	if (error)
  11.852 +		return error;
  11.853 +		
  11.854 +	/* Clear old maps */
  11.855 +	error = -ENOMEM;
  11.856 +munmap_back:
  11.857 +	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
  11.858 +	if (vma && vma->vm_start < addr + len) {
  11.859 +		if (do_munmap(mm, addr, len))
  11.860 +			return -ENOMEM;
  11.861 +		goto munmap_back;
  11.862 +	}
  11.863 +
  11.864 +	/* Check against address space limit. */
  11.865 +	if ((mm->total_vm << PAGE_SHIFT) + len
  11.866 +	    > current->rlim[RLIMIT_AS].rlim_cur)
  11.867 +		return -ENOMEM;
  11.868 +
  11.869 +	if (accountable && (!(flags & MAP_NORESERVE) ||
  11.870 +			sysctl_overcommit_memory > 1)) {
  11.871 +		if (vm_flags & VM_SHARED) {
  11.872 +			/* Check memory availability in shmem_file_setup? */
  11.873 +			vm_flags |= VM_ACCOUNT;
  11.874 +		} else if (vm_flags & VM_WRITE) {
  11.875 +			/*
  11.876 +			 * Private writable mapping: check memory availability
  11.877 +			 */
  11.878 +			charged = len >> PAGE_SHIFT;
  11.879 +			if (security_vm_enough_memory(charged))
  11.880 +				return -ENOMEM;
  11.881 +			vm_flags |= VM_ACCOUNT;
  11.882 +		}
  11.883 +	}
  11.884 +
  11.885 +	/*
  11.886 +	 * Can we just expand an old private anonymous mapping?
  11.887 +	 * The VM_SHARED test is necessary because shmem_zero_setup
  11.888 +	 * will create the file object for a shared anonymous map below.
  11.889 +	 */
  11.890 +	if (!file && !(vm_flags & VM_SHARED) &&
  11.891 +	    vma_merge(mm, prev, addr, addr + len, vm_flags,
  11.892 +					NULL, NULL, pgoff, NULL))
  11.893 +		goto out;
  11.894 +
  11.895 +	/*
  11.896 +	 * Determine the object being mapped and call the appropriate
  11.897 +	 * specific mapper. the address has already been validated, but
  11.898 +	 * not unmapped, but the maps are removed from the list.
  11.899 +	 */
  11.900 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
  11.901 +	if (!vma) {
  11.902 +		error = -ENOMEM;
  11.903 +		goto unacct_error;
  11.904 +	}
  11.905 +	memset(vma, 0, sizeof(*vma));
  11.906 +
  11.907 +	vma->vm_mm = mm;
  11.908 +	vma->vm_start = addr;
  11.909 +	vma->vm_end = addr + len;
  11.910 +	vma->vm_flags = vm_flags;
  11.911 +	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
  11.912 +	vma->vm_pgoff = pgoff;
  11.913 +
  11.914 +	if (file) {
  11.915 +		error = -EINVAL;
  11.916 +		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
  11.917 +			goto free_vma;
  11.918 +		if (vm_flags & VM_DENYWRITE) {
  11.919 +			error = deny_write_access(file);
  11.920 +			if (error)
  11.921 +				goto free_vma;
  11.922 +			correct_wcount = 1;
  11.923 +		}
  11.924 +		vma->vm_file = file;
  11.925 +		get_file(file);
  11.926 +		error = file->f_op->mmap(file, vma);
  11.927 +		if (error)
  11.928 +			goto unmap_and_free_vma;
  11.929 +	} else if (vm_flags & VM_SHARED) {
  11.930 +		error = shmem_zero_setup(vma);
  11.931 +		if (error)
  11.932 +			goto free_vma;
  11.933 +	}
  11.934 +
  11.935 +	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
  11.936 +	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
  11.937 +	 * that memory reservation must be checked; but that reservation
  11.938 +	 * belongs to shared memory object, not to vma: so now clear it.
  11.939 +	 */
  11.940 +	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
  11.941 +		vma->vm_flags &= ~VM_ACCOUNT;
  11.942 +
  11.943 +	/* Can addr have changed??
  11.944 +	 *
  11.945 +	 * Answer: Yes, several device drivers can do it in their
  11.946 +	 *         f_op->mmap method. -DaveM
  11.947 +	 */
  11.948 +	addr = vma->vm_start;
  11.949 +
  11.950 +	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
  11.951 +			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
  11.952 +		vma_link(mm, vma, prev, rb_link, rb_parent);
  11.953 +		if (correct_wcount)
  11.954 +			atomic_inc(&inode->i_writecount);
  11.955 +	} else {
  11.956 +		if (file) {
  11.957 +			if (correct_wcount)
  11.958 +				atomic_inc(&inode->i_writecount);
  11.959 +			fput(file);
  11.960 +		}
  11.961 +		mpol_free(vma_policy(vma));
  11.962 +		kmem_cache_free(vm_area_cachep, vma);
  11.963 +	}
  11.964 +out:	
  11.965 +	mm->total_vm += len >> PAGE_SHIFT;
  11.966 +	if (vm_flags & VM_LOCKED) {
  11.967 +		mm->locked_vm += len >> PAGE_SHIFT;
  11.968 +		make_pages_present(addr, addr + len);
  11.969 +	}
  11.970 +	if (flags & MAP_POPULATE) {
  11.971 +		up_write(&mm->mmap_sem);
  11.972 +		sys_remap_file_pages(addr, len, 0,
  11.973 +					pgoff, flags & MAP_NONBLOCK);
  11.974 +		down_write(&mm->mmap_sem);
  11.975 +	}
  11.976 +	return addr;
  11.977 +
  11.978 +unmap_and_free_vma:
  11.979 +	if (correct_wcount)
  11.980 +		atomic_inc(&inode->i_writecount);
  11.981 +	vma->vm_file = NULL;
  11.982 +	fput(file);
  11.983 +
  11.984 +	/* Undo any partial mapping done by a device driver. */
  11.985 +	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
  11.986 +free_vma:
  11.987 +	kmem_cache_free(vm_area_cachep, vma);
  11.988 +unacct_error:
  11.989 +	if (charged)
  11.990 +		vm_unacct_memory(charged);
  11.991 +	return error;
  11.992 +}
  11.993 +
  11.994 +EXPORT_SYMBOL(do_mmap_pgoff);
  11.995 +
  11.996 +/* Get an address range which is currently unmapped.
  11.997 + * For shmat() with addr=0.
  11.998 + *
  11.999 + * Ugly calling convention alert:
 11.1000 + * Return value with the low bits set means error value,
 11.1001 + * i.e.
 11.1002 + *	if (ret & ~PAGE_MASK)
 11.1003 + *		error = ret;
 11.1004 + *
 11.1005 + * This function "knows" that -ENOMEM has the bits set.
 11.1006 + */
 11.1007 +#ifndef HAVE_ARCH_UNMAPPED_AREA
 11.1008 +static inline unsigned long
 11.1009 +arch_get_unmapped_area(struct file *filp, unsigned long addr,
 11.1010 +		unsigned long len, unsigned long pgoff, unsigned long flags)
 11.1011 +{
 11.1012 +	struct mm_struct *mm = current->mm;
 11.1013 +	struct vm_area_struct *vma;
 11.1014 +	unsigned long start_addr;
 11.1015 +
 11.1016 +	if (len > TASK_SIZE)
 11.1017 +		return -ENOMEM;
 11.1018 +
 11.1019 +	if (addr) {
 11.1020 +		addr = PAGE_ALIGN(addr);
 11.1021 +		vma = find_vma(mm, addr);
 11.1022 +		if (TASK_SIZE - len >= addr &&
 11.1023 +		    (!vma || addr + len <= vma->vm_start))
 11.1024 +			return addr;
 11.1025 +	}
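         +	/* No usable hint: resume the search from where the previous one left off. */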
 11.1026 +	start_addr = addr = mm->free_area_cache;
 11.1027 +
 11.1028 +full_search:
 11.1029 +	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
 11.1030 +		/* At this point:  (!vma || addr < vma->vm_end). */
 11.1031 +		if (TASK_SIZE - len < addr) {
 11.1032 +			/*
 11.1033 +			 * Start a new search - just in case we missed
 11.1034 +			 * some holes.
 11.1035 +			 */
 11.1036 +			if (start_addr != TASK_UNMAPPED_BASE) {
 11.1037 +				start_addr = addr = TASK_UNMAPPED_BASE;
 11.1038 +				goto full_search;
 11.1039 +			}
 11.1040 +			return -ENOMEM;
 11.1041 +		}
 11.1042 +		if (!vma || addr + len <= vma->vm_start) {
 11.1043 +			/*
 11.1044 +			 * Remember the place where we stopped the search:
 11.1045 +			 */
 11.1046 +			mm->free_area_cache = addr + len;
 11.1047 +			return addr;
 11.1048 +		}
 11.1049 +		addr = vma->vm_end;
 11.1050 +	}
 11.1051 +}
 11.1052 +#else
 11.1053 +extern unsigned long
 11.1054 +arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
 11.1055 +			unsigned long, unsigned long);
 11.1056 +#endif	
 11.1057 +
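         +/* Optional arch hook: a nonzero return rejects a MAP_FIXED request for the given range. */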
 11.1058 +#ifndef HAVE_ARCH_CHECK_FIXED_MAPPING
 11.1059 +#define arch_check_fixed_mapping(_file,_addr,_len,_pgoff,_flags) 0
 11.1060 +#else
 11.1061 +extern unsigned long
 11.1062 +arch_check_fixed_mapping(struct file *, unsigned long, unsigned long,
 11.1063 +			unsigned long, unsigned long);
 11.1064 +#endif
 11.1065 +
 11.1066 +unsigned long
 11.1067 +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 11.1068 +		unsigned long pgoff, unsigned long flags)
 11.1069 +{
 11.1070 +	if (flags & MAP_FIXED) {
 11.1071 +		unsigned long ret;
 11.1072 +
 11.1073 +		if (addr > TASK_SIZE - len)
 11.1074 +			return -ENOMEM;
 11.1075 +		if (addr & ~PAGE_MASK)
 11.1076 +			return -EINVAL;
 11.1077 +		ret = arch_check_fixed_mapping(file, addr, len, pgoff, flags);
 11.1078 +		if (ret != 0)
 11.1079 +			return ret;
 11.1080 +		if (file && is_file_hugepages(file))  {
 11.1081 +			/*
 11.1082 +			 * Check if the given range is hugepage aligned, and
 11.1083 +			 * can be made suitable for hugepages.
 11.1084 +			 */
 11.1085 +			ret = prepare_hugepage_range(addr, len);
 11.1086 +		} else {
 11.1087 +			/*
 11.1088 +			 * Ensure that a normal request is not falling in a
 11.1089 +			 * reserved hugepage range.  For some archs like IA-64,
 11.1090 +			 * there is a separate region for hugepages.
 11.1091 +			 */
 11.1092 +			ret = is_hugepage_only_range(addr, len);
 11.1093 +		}
 11.1094 +		if (ret)
 11.1095 +			return -EINVAL;
 11.1096 +		return addr;
 11.1097 +	}
 11.1098 +
 11.1099 +	if (file && file->f_op && file->f_op->get_unmapped_area)
 11.1100 +		return file->f_op->get_unmapped_area(file, addr, len,
 11.1101 +						pgoff, flags);
 11.1102 +
 11.1103 +	return arch_get_unmapped_area(file, addr, len, pgoff, flags);
 11.1104 +}
 11.1105 +
 11.1106 +EXPORT_SYMBOL(get_unmapped_area);
 11.1107 +
 11.1108 +/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 11.1109 +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
 11.1110 +{
 11.1111 +	struct vm_area_struct *vma = NULL;
 11.1112 +
 11.1113 +	if (mm) {
 11.1114 +		/* Check the cache first. */
 11.1115 +		/* (Cache hit rate is typically around 35%.) */
 11.1116 +		vma = mm->mmap_cache;
 11.1117 +		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
 11.1118 +			struct rb_node * rb_node;
 11.1119 +
 11.1120 +			rb_node = mm->mm_rb.rb_node;
 11.1121 +			vma = NULL;
 11.1122 +
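         +			/* Walk the rbtree, tracking the lowest VMA that ends above addr. */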
 11.1123 +			while (rb_node) {
 11.1124 +				struct vm_area_struct * vma_tmp;
 11.1125 +
 11.1126 +				vma_tmp = rb_entry(rb_node,
 11.1127 +						struct vm_area_struct, vm_rb);
 11.1128 +
 11.1129 +				if (vma_tmp->vm_end > addr) {
 11.1130 +					vma = vma_tmp;
 11.1131 +					if (vma_tmp->vm_start <= addr)
 11.1132 +						break;
 11.1133 +					rb_node = rb_node->rb_left;
 11.1134 +				} else
 11.1135 +					rb_node = rb_node->rb_right;
 11.1136 +			}
 11.1137 +			if (vma)
 11.1138 +				mm->mmap_cache = vma;
 11.1139 +		}
 11.1140 +	}
 11.1141 +	return vma;
 11.1142 +}
 11.1143 +
 11.1144 +EXPORT_SYMBOL(find_vma);
 11.1145 +
 11.1146 +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
 11.1147 +struct vm_area_struct *
 11.1148 +find_vma_prev(struct mm_struct *mm, unsigned long addr,
 11.1149 +			struct vm_area_struct **pprev)
 11.1150 +{
 11.1151 +	struct vm_area_struct *vma = NULL, *prev = NULL;
 11.1152 +	struct rb_node * rb_node;
 11.1153 +	if (!mm)
 11.1154 +		goto out;
 11.1155 +
 11.1156 +	/* Guard against addr being lower than the first VMA */
 11.1157 +	vma = mm->mmap;
 11.1158 +
 11.1159 +	/* Go through the RB tree quickly. */
 11.1160 +	rb_node = mm->mm_rb.rb_node;
 11.1161 +
 11.1162 +	while (rb_node) {
 11.1163 +		struct vm_area_struct *vma_tmp;
 11.1164 +		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
 11.1165 +
 11.1166 +		if (addr < vma_tmp->vm_end) {
 11.1167 +			rb_node = rb_node->rb_left;
 11.1168 +		} else {
 11.1169 +			prev = vma_tmp;
 11.1170 +			if (!prev->vm_next || (addr < prev->vm_next->vm_end))
 11.1171 +				break;
 11.1172 +			rb_node = rb_node->rb_right;
 11.1173 +		}
 11.1174 +	}
 11.1175 +
 11.1176 +out:
 11.1177 +	*pprev = prev;
 11.1178 +	return prev ? prev->vm_next : vma;
 11.1179 +}
 11.1180 +
 11.1181 +#ifdef CONFIG_STACK_GROWSUP
 11.1182 +/*
 11.1183 + * vma is the first one with address > vma->vm_end.  Have to extend vma.
 11.1184 + */
 11.1185 +int expand_stack(struct vm_area_struct * vma, unsigned long address)
 11.1186 +{
 11.1187 +	unsigned long grow;
 11.1188 +
 11.1189 +	if (!(vma->vm_flags & VM_GROWSUP))
 11.1190 +		return -EFAULT;
 11.1191 +
 11.1192 +	/*
 11.1193 +	 * We must make sure the anon_vma is allocated
 11.1194 +	 * so that the anon_vma locking is not a noop.
 11.1195 +	 */
 11.1196 +	if (unlikely(anon_vma_prepare(vma)))
 11.1197 +		return -ENOMEM;
 11.1198 +	anon_vma_lock(vma);
 11.1199 +
 11.1200 +	/*
 11.1201 +	 * vma->vm_start/vm_end cannot change under us because the caller
 11.1202 +	 * is required to hold the mmap_sem in read mode.  We need the
 11.1203 +	 * anon_vma lock to serialize against concurrent expand_stacks.
 11.1204 +	 */
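         +	/* Round the new vm_end up to a page boundary strictly above the faulting address. */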
 11.1205 +	address += 4 + PAGE_SIZE - 1;
 11.1206 +	address &= PAGE_MASK;
 11.1207 +	grow = (address - vma->vm_end) >> PAGE_SHIFT;
 11.1208 +
 11.1209 +	/* Overcommit.. */
 11.1210 +	if (security_vm_enough_memory(grow)) {
 11.1211 +		anon_vma_unlock(vma);
 11.1212 +		return -ENOMEM;
 11.1213 +	}
 11.1214 +	
 11.1215 +	if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur ||
 11.1216 +			((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
 11.1217 +			current->rlim[RLIMIT_AS].rlim_cur) {
 11.1218 +		anon_vma_unlock(vma);
 11.1219 +		vm_unacct_memory(grow);
 11.1220 +		return -ENOMEM;
 11.1221 +	}
 11.1222 +	vma->vm_end = address;
 11.1223 +	vma->vm_mm->total_vm += grow;
 11.1224 +	if (vma->vm_flags & VM_LOCKED)
 11.1225 +		vma->vm_mm->locked_vm += grow;
 11.1226 +	anon_vma_unlock(vma);
 11.1227 +	return 0;
 11.1228 +}
 11.1229 +
 11.1230 +struct vm_area_struct *
 11.1231 +find_extend_vma(struct mm_struct *mm, unsigned long addr)
 11.1232 +{
 11.1233 +	struct vm_area_struct *vma, *prev;
 11.1234 +
 11.1235 +	addr &= PAGE_MASK;
 11.1236 +	vma = find_vma_prev(mm, addr, &prev);
 11.1237 +	if (vma && (vma->vm_start <= addr))
 11.1238 +		return vma;
 11.1239 +	if (!prev || expand_stack(prev, addr))
 11.1240 +		return NULL;
 11.1241 +	if (prev->vm_flags & VM_LOCKED) {
 11.1242 +		make_pages_present(addr, prev->vm_end);
 11.1243 +	}
 11.1244 +	return prev;
 11.1245 +}
 11.1246 +#else
 11.1247 +/*
 11.1248 + * vma is the first one with address < vma->vm_start.  Have to extend vma.
 11.1249 + */
 11.1250 +int expand_stack(struct vm_area_struct *vma, unsigned long address)
 11.1251 +{
 11.1252 +	unsigned long grow;
 11.1253 +
 11.1254 +	/*
 11.1255 +	 * We must make sure the anon_vma is allocated
 11.1256 +	 * so that the anon_vma locking is not a noop.
 11.1257 +	 */
 11.1258 +	if (unlikely(anon_vma_prepare(vma)))
 11.1259 +		return -ENOMEM;
 11.1260 +	anon_vma_lock(vma);
 11.1261 +
 11.1262 +	/*
 11.1263 +	 * vma->vm_start/vm_end cannot change under us because the caller
 11.1264 +	 * is required to hold the mmap_sem in read mode.  We need the
 11.1265 +	 * anon_vma lock to serialize against concurrent expand_stacks.
 11.1266 +	 */
 11.1267 +	address &= PAGE_MASK;
 11.1268 +	grow = (vma->vm_start - address) >> PAGE_SHIFT;
 11.1269 +
 11.1270 +	/* Overcommit.. */
 11.1271 +	if (security_vm_enough_memory(grow)) {
 11.1272 +		anon_vma_unlock(vma);
 11.1273 +		return -ENOMEM;
 11.1274 +	}
 11.1275 +	
 11.1276 +	if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur ||
 11.1277 +			((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
 11.1278 +			current->rlim[RLIMIT_AS].rlim_cur) {
 11.1279 +		anon_vma_unlock(vma);
 11.1280 +		vm_unacct_memory(grow);
 11.1281 +		return -ENOMEM;
 11.1282 +	}
 11.1283 +	vma->vm_start = address;
 11.1284 +	vma->vm_pgoff -= grow;
 11.1285 +	vma->vm_mm->total_vm += grow;
 11.1286 +	if (vma->vm_flags & VM_LOCKED)
 11.1287 +		vma->vm_mm->locked_vm += grow;
 11.1288 +	anon_vma_unlock(vma);
 11.1289 +	return 0;
 11.1290 +}
 11.1291 +
 11.1292 +struct vm_area_struct *
 11.1293 +find_extend_vma(struct mm_struct * mm, unsigned long addr)
 11.1294 +{
 11.1295 +	struct vm_area_struct * vma;
 11.1296 +	unsigned long start;
 11.1297 +
 11.1298 +	addr &= PAGE_MASK;
 11.1299 +	vma = find_vma(mm,addr);
 11.1300 +	if (!vma)
 11.1301 +		return NULL;
 11.1302 +	if (vma->vm_start <= addr)
 11.1303 +		return vma;
 11.1304 +	if (!(vma->vm_flags & VM_GROWSDOWN))
 11.1305 +		return NULL;
 11.1306 +	start = vma->vm_start;
 11.1307 +	if (expand_stack(vma, addr))
 11.1308 +		return NULL;
 11.1309 +	if (vma->vm_flags & VM_LOCKED) {
 11.1310 +		make_pages_present(addr, start);
 11.1311 +	}
 11.1312 +	return vma;
 11.1313 +}
 11.1314 +#endif
 11.1315 +
 11.1316 +/*
 11.1317 + * Try to free as many page directory entries as we can,
 11.1318 + * without having to work very hard at actually scanning
 11.1319 + * the page tables themselves.
 11.1320 + *
 11.1321 + * Right now we try to free page tables if we have a nice
 11.1322 + * PGDIR-aligned area that got free'd up. We could be more
 11.1323 + * granular if we want to, but this is fast and simple,
 11.1324 + * and covers the bad cases.
 11.1325 + *
 11.1326 + * "prev", if it exists, points to a vma before the one
 11.1327 + * we just free'd - but there's no telling how much before.
 11.1328 + */
 11.1329 +static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
 11.1330 +	unsigned long start, unsigned long end)
 11.1331 +{
 11.1332 +	unsigned long first = start & PGDIR_MASK;
 11.1333 +	unsigned long last = end + PGDIR_SIZE - 1;
 11.1334 +	unsigned long start_index, end_index;
 11.1335 +	struct mm_struct *mm = tlb->mm;
 11.1336 +
 11.1337 +	if (!prev) {
 11.1338 +		prev = mm->mmap;
 11.1339 +		if (!prev)
 11.1340 +			goto no_mmaps;
 11.1341 +		if (prev->vm_end > start) {
 11.1342 +			if (last > prev->vm_start)
 11.1343 +				last = prev->vm_start;
 11.1344 +			goto no_mmaps;
 11.1345 +		}
 11.1346 +	}
 11.1347 +	for (;;) {
 11.1348 +		struct vm_area_struct *next = prev->vm_next;
 11.1349 +
 11.1350 +		if (next) {
 11.1351 +			if (next->vm_start < start) {
 11.1352 +				prev = next;
 11.1353 +				continue;
 11.1354 +			}
 11.1355 +			if (last > next->vm_start)
 11.1356 +				last = next->vm_start;
 11.1357 +		}
 11.1358 +		if (prev->vm_end > first)
 11.1359 +			first = prev->vm_end + PGDIR_SIZE - 1;
 11.1360 +		break;
 11.1361 +	}
 11.1362 +no_mmaps:
 11.1363 +	if (last < first)	/* for arches with discontiguous pgd indices */
 11.1364 +		return;
 11.1365 +	/*
 11.1366 +	 * If the PGD bits are not consecutive in the virtual address, the
 11.1367 +	 * old method of shifting the VA >> by PGDIR_SHIFT doesn't work.
 11.1368 +	 */
 11.1369 +	start_index = pgd_index(first);
 11.1370 +	if (start_index < FIRST_USER_PGD_NR)
 11.1371 +		start_index = FIRST_USER_PGD_NR;
 11.1372 +	end_index = pgd_index(last);
 11.1373 +	if (end_index > start_index) {
 11.1374 +		clear_page_tables(tlb, start_index, end_index - start_index);
 11.1375 +		flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK);
 11.1376 +	}
 11.1377 +}
 11.1378 +
 11.1379 +/* Normal function to fix up a mapping.
 11.1380 + * This function is the default for when an area has no specific
 11.1381 + * function.  This may be used as part of a more specific routine.
 11.1382 + *
 11.1383 + * By the time this function is called, the area struct has been
 11.1384 + * removed from the process mapping list.
 11.1385 + */
 11.1386 +static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
 11.1387 +{
 11.1388 +	size_t len = area->vm_end - area->vm_start;
 11.1389 +
 11.1390 +	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
 11.1391 +	if (area->vm_flags & VM_LOCKED)
 11.1392 +		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
 11.1393 +	/*
 11.1394 +	 * Is this a new hole at the lowest possible address?
 11.1395 +	 */
 11.1396 +	if (area->vm_start >= TASK_UNMAPPED_BASE &&
 11.1397 +				area->vm_start < area->vm_mm->free_area_cache)
 11.1398 +	      area->vm_mm->free_area_cache = area->vm_start;
 11.1399 +
 11.1400 +	remove_vm_struct(area);
 11.1401 +}
 11.1402 +
 11.1403 +/*
 11.1404 + * Update the VMA and inode share lists.
 11.1405 + *
 11.1406 + * Ok - we have the memory areas we should free on the 'free' list,
 11.1407 + * so release them, and do the vma updates.
 11.1408 + */
 11.1409 +static void unmap_vma_list(struct mm_struct *mm,
 11.1410 +	struct vm_area_struct *mpnt)
 11.1411 +{
 11.1412 +	do {
 11.1413 +		struct vm_area_struct *next = mpnt->vm_next;
 11.1414 +		unmap_vma(mm, mpnt);
 11.1415 +		mpnt = next;
 11.1416 +	} while (mpnt != NULL);
 11.1417 +	validate_mm(mm);
 11.1418 +}
 11.1419 +
 11.1420 +/*
 11.1421 + * Get rid of page table information in the indicated region.
 11.1422 + *
 11.1423 + * Called with the page table lock held.
 11.1424 + */
 11.1425 +static void unmap_region(struct mm_struct *mm,
 11.1426 +	struct vm_area_struct *vma,
 11.1427 +	struct vm_area_struct *prev,
 11.1428 +	unsigned long start,
 11.1429 +	unsigned long end)
 11.1430 +{
 11.1431 +	struct mmu_gather *tlb;
 11.1432 +	unsigned long nr_accounted = 0;
 11.1433 +
 11.1434 +	lru_add_drain();
 11.1435 +	tlb = tlb_gather_mmu(mm, 0);
 11.1436 +	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
 11.1437 +	vm_unacct_memory(nr_accounted);
 11.1438 +
 11.1439 +	if (is_hugepage_only_range(start, end - start))
 11.1440 +		hugetlb_free_pgtables(tlb, prev, start, end);
 11.1441 +	else
 11.1442 +		free_pgtables(tlb, prev, start, end);
 11.1443 +	tlb_finish_mmu(tlb, start, end);
 11.1444 +}
 11.1445 +
 11.1446 +/*
 11.1447 + * Create a list of vma's touched by the unmap, removing them from the mm's
 11.1448 + * vma list as we go..
 11.1449 + */
 11.1450 +static void
 11.1451 +detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 11.1452 +	struct vm_area_struct *prev, unsigned long end)
 11.1453 +{
 11.1454 +	struct vm_area_struct **insertion_point;
 11.1455 +	struct vm_area_struct *tail_vma = NULL;
 11.1456 +
 11.1457 +	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
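         +	/* Unlink each VMA in the range from the rbtree; the list is respliced below. */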
 11.1458 +	do {
 11.1459 +		rb_erase(&vma->vm_rb, &mm->mm_rb);
 11.1460 +		mm->map_count--;
 11.1461 +		tail_vma = vma;
 11.1462 +		vma = vma->vm_next;
 11.1463 +	} while (vma && vma->vm_start < end);
 11.1464 +	*insertion_point = vma;
 11.1465 +	tail_vma->vm_next = NULL;
 11.1466 +	mm->mmap_cache = NULL;		/* Kill the cache. */
 11.1467 +}
 11.1468 +
 11.1469 +/*
 11.1470 + * Split a vma into two pieces at address 'addr'; a new vma is allocated
 11.1471 + * either for the first part or for the tail.
 11.1472 + */
 11.1473 +int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 11.1474 +	      unsigned long addr, int new_below)
 11.1475 +{
 11.1476 +	struct mempolicy *pol;
 11.1477 +	struct vm_area_struct *new;
 11.1478 +
 11.1479 +	if (mm->map_count >= sysctl_max_map_count)
 11.1480 +		return -ENOMEM;
 11.1481 +
 11.1482 +	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 11.1483 +	if (!new)
 11.1484 +		return -ENOMEM;
 11.1485 +
 11.1486 +	/* most fields are the same, copy all, and then fixup */
 11.1487 +	*new = *vma;
 11.1488 +	vma_prio_tree_init(new);
 11.1489 +
 11.1490 +	if (new_below)
 11.1491 +		new->vm_end = addr;
 11.1492 +	else {
 11.1493 +		new->vm_start = addr;
 11.1494 +		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
 11.1495 +	}
 11.1496 +
 11.1497 +	pol = mpol_copy(vma_policy(vma));
 11.1498 +	if (IS_ERR(pol)) {
 11.1499 +		kmem_cache_free(vm_area_cachep, new);
 11.1500 +		return PTR_ERR(pol);
 11.1501 +	}
 11.1502 +	vma_set_policy(new, pol);
 11.1503 +
 11.1504 +	if (new->vm_file)
 11.1505 +		get_file(new->vm_file);
 11.1506 +
 11.1507 +	if (new->vm_ops && new->vm_ops->open)
 11.1508 +		new->vm_ops->open(new);
 11.1509 +
 11.1510 +	if (new_below)
 11.1511 +		vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
 11.1512 +			((addr - new->vm_start) >> PAGE_SHIFT), new);
 11.1513 +	else
 11.1514 +		vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 11.1515 +
 11.1516 +	return 0;
 11.1517 +}
 11.1518 +
 11.1519 +/* Munmap is split into 2 main parts -- this part which finds
 11.1520 + * what needs doing, and the areas themselves, which do the
 11.1521 + * work.  This now handles partial unmappings.
 11.1522 + * Jeremy Fitzhardinge <jeremy@goop.org>
 11.1523 + */
 11.1524 +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 11.1525 +{
 11.1526 +	unsigned long end;
 11.1527 +	struct vm_area_struct *mpnt, *prev, *last;
 11.1528 +
 11.1529 +	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
 11.1530 +		return -EINVAL;
 11.1531 +
 11.1532 +	if ((len = PAGE_ALIGN(len)) == 0)
 11.1533 +		return -EINVAL;
 11.1534 +
 11.1535 +	/* Find the first overlapping VMA */
 11.1536 +	mpnt = find_vma_prev(mm, start, &prev);
 11.1537 +	if (!mpnt)
 11.1538 +		return 0;
 11.1539 +	/* we have  start < mpnt->vm_end  */
 11.1540 +
 11.1541 +	if (is_vm_hugetlb_page(mpnt)) {
 11.1542 +		int ret = is_aligned_hugepage_range(start, len);
 11.1543 +
 11.1544 +		if (ret)
 11.1545 +			return ret;
 11.1546 +	}
 11.1547 +
 11.1548 +	/* if it doesn't overlap, we have nothing.. */
 11.1549 +	end = start + len;
 11.1550 +	if (mpnt->vm_start >= end)
 11.1551 +		return 0;
 11.1552 +
 11.1553 +	/* Something will probably happen, so notify. */
 11.1554 +	if (mpnt->vm_file && (mpnt->vm_flags & VM_EXEC))
 11.1555 +		profile_exec_unmap(mm);
 11.1556 + 
 11.1557 +	/*
 11.1558 +	 * If we need to split any vma, do it now to save pain later.
 11.1559 +	 *
 11.1560 +	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
 11.1561 +	 * unmapped vm_area_struct will remain in use: so lower split_vma
 11.1562 +	 * places tmp vma above, and higher split_vma places tmp vma below.
 11.1563 +	 */
 11.1564 +	if (start > mpnt->vm_start) {
 11.1565 +		if (split_vma(mm, mpnt, start, 0))
 11.1566 +			return -ENOMEM;
 11.1567 +		prev = mpnt;
 11.1568 +	}
 11.1569 +
 11.1570 +	/* Does it split the last one? */
 11.1571 +	last = find_vma(mm, end);
 11.1572 +	if (last && end > last->vm_start) {
 11.1573 +		if (split_vma(mm, last, end, 1))
 11.1574 +			return -ENOMEM;
 11.1575 +	}
 11.1576 +	mpnt = prev? prev->vm_next: mm->mmap;
 11.1577 +
 11.1578 +	/*
 11.1579 +	 * Remove the vma's, and unmap the actual pages
 11.1580 +	 */
 11.1581 +	detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
 11.1582 +	spin_lock(&mm->page_table_lock);
 11.1583 +	unmap_region(mm, mpnt, prev, start, end);
 11.1584 +	spin_unlock(&mm->page_table_lock);
 11.1585 +
 11.1586 +	/* Fix up all other VM information */
 11.1587 +	unmap_vma_list(mm, mpnt);
 11.1588 +
 11.1589 +	return 0;
 11.1590 +}
 11.1591 +
 11.1592 +EXPORT_SYMBOL(do_munmap);
 11.1593 +
 11.1594 +asmlinkage long sys_munmap(unsigned long addr, size_t len)
 11.1595 +{
 11.1596 +	int ret;
 11.1597 +	struct mm_struct *mm = current->mm;
 11.1598 +
 11.1599 +	down_write(&mm->mmap_sem);
 11.1600 +	ret = do_munmap(mm, addr, len);
 11.1601 +	up_write(&mm->mmap_sem);
 11.1602 +	return ret;
 11.1603 +}
 11.1604 +
 11.1605 +/*
 11.1606 + *  this is really a simplified "do_mmap".  it only handles
 11.1607 + *  anonymous maps.  eventually we may be able to do some
 11.1608 + *  brk-specific accounting here.
 11.1609 + */
 11.1610 +unsigned long do_brk(unsigned long addr, unsigned long len)
 11.1611 +{
 11.1612 +	struct mm_struct * mm = current->mm;
 11.1613 +	struct vm_area_struct * vma, * prev;
 11.1614 +	unsigned long flags;
 11.1615 +	struct rb_node ** rb_link, * rb_parent;
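         +	/* Anonymous maps take their pgoff from the virtual start address (see insert_vm_struct). */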
 11.1616 +	pgoff_t pgoff = addr >> PAGE_SHIFT;
 11.1617 +
 11.1618 +	len = PAGE_ALIGN(len);
 11.1619 +	if (!len)
 11.1620 +		return addr;
 11.1621 +
 11.1622 +	if ((addr + len) > TASK_SIZE || (addr + len) < addr)
 11.1623 +		return -EINVAL;
 11.1624 +
 11.1625 +	/*
 11.1626 +	 * mlock MCL_FUTURE?
 11.1627 +	 */
 11.1628 +	if (mm->def_flags & VM_LOCKED) {
 11.1629 +		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
 11.1630 +		locked += len;
 11.1631 +		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
 11.1632 +			return -EAGAIN;
 11.1633 +	}
 11.1634 +
 11.1635 +	/*
 11.1636 +	 * Clear old maps.  this also does some error checking for us
 11.1637 +	 */
 11.1638 + munmap_back:
 11.1639 +	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 11.1640 +	if (vma && vma->vm_start < addr + len) {
 11.1641 +		if (do_munmap(mm, addr, len))
 11.1642 +			return -ENOMEM;
 11.1643 +		goto munmap_back;
 11.1644 +	}
 11.1645 +
 11.1646 +	/* Check against address space limits *after* clearing old maps... */
 11.1647 +	if ((mm->total_vm << PAGE_SHIFT) + len
 11.1648 +	    > current->rlim[RLIMIT_AS].rlim_cur)
 11.1649 +		return -ENOMEM;
 11.1650 +
 11.1651 +	if (mm->map_count > sysctl_max_map_count)
 11.1652 +		return -ENOMEM;
 11.1653 +
 11.1654 +	if (security_vm_enough_memory(len >> PAGE_SHIFT))
 11.1655 +		return -ENOMEM;
 11.1656 +
 11.1657 +	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 11.1658 +
 11.1659 +	/* Can we just expand an old private anonymous mapping? */
 11.1660 +	if (vma_merge(mm, prev, addr, addr + len, flags,
 11.1661 +					NULL, NULL, pgoff, NULL))
 11.1662 +		goto out;
 11.1663 +
 11.1664 +	/*
 11.1665 +	 * create a vma struct for an anonymous mapping
 11.1666 +	 */
 11.1667 +	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 11.1668 +	if (!vma) {
 11.1669 +		vm_unacct_memory(len >> PAGE_SHIFT);
 11.1670 +		return -ENOMEM;
 11.1671 +	}
 11.1672 +	memset(vma, 0, sizeof(*vma));
 11.1673 +
 11.1674 +	vma->vm_mm = mm;
 11.1675 +	vma->vm_start = addr;
 11.1676 +	vma->vm_end = addr + len;
 11.1677 +	vma->vm_pgoff = pgoff;
 11.1678 +	vma->vm_flags = flags;
 11.1679 +	vma->vm_page_prot = protection_map[flags & 0x0f];
 11.1680 +	vma_link(mm, vma, prev, rb_link, rb_parent);
 11.1681 +out:
 11.1682 +	mm->total_vm += len >> PAGE_SHIFT;
 11.1683 +	if (flags & VM_LOCKED) {
 11.1684 +		mm->locked_vm += len >> PAGE_SHIFT;
 11.1685 +		make_pages_present(addr, addr + len);
 11.1686 +	}
 11.1687 +	return addr;
 11.1688 +}
 11.1689 +
 11.1690 +EXPORT_SYMBOL(do_brk);
 11.1691 +
 11.1692 +/* Release all mmaps. */
 11.1693 +void exit_mmap(struct mm_struct *mm)
 11.1694 +{
 11.1695 +	struct mmu_gather *tlb;
 11.1696 +	struct vm_area_struct *vma;
 11.1697 +	unsigned long nr_accounted = 0;
 11.1698 +
 11.1699 +	profile_exit_mmap(mm);
 11.1700 + 
 11.1701 +	lru_add_drain();
 11.1702 +
 11.1703 +	spin_lock(&mm->page_table_lock);
 11.1704 +
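         +	/* The '1' requests a full-mm flush: the whole address space is going away. */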
 11.1705 +	tlb = tlb_gather_mmu(mm, 1);
 11.1706 +	flush_cache_mm(mm);
 11.1707 +	/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
 11.1708 +	mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
 11.1709 +					~0UL, &nr_accounted, NULL);
 11.1710 +	vm_unacct_memory(nr_accounted);
 11.1711 +	BUG_ON(mm->map_count);	/* This is just debugging */
 11.1712 +	clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
 11.1713 +	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
 11.1714 +
 11.1715 +	vma = mm->mmap;
 11.1716 +	mm->mmap = mm->mmap_cache = NULL;
 11.1717 +	mm->mm_rb = RB_ROOT;
 11.1718 +	mm->rss = 0;
 11.1719 +	mm->total_vm = 0;
 11.1720 +	mm->locked_vm = 0;
 11.1721 +
 11.1722 +	spin_unlock(&mm->page_table_lock);
 11.1723 +
 11.1724 +	/*
 11.1725 +	 * Walk the list again, actually closing and freeing it
 11.1726 +	 * without holding any MM locks.
 11.1727 +	 */
 11.1728 +	while (vma) {
 11.1729 +		struct vm_area_struct *next = vma->vm_next;
 11.1730 +		remove_vm_struct(vma);
 11.1731 +		vma = next;
 11.1732 +	}
 11.1733 +}
 11.1734 +
 11.1735 +/* Insert vm structure into process list sorted by address
 11.1736 + * and into the inode's i_mmap tree.  If vm_file is non-NULL
 11.1737 + * then i_mmap_lock is taken here.
 11.1738 + */
 11.1739 +void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 11.1740 +{
 11.1741 +	struct vm_area_struct * __vma, * prev;
 11.1742 +	struct rb_node ** rb_link, * rb_parent;
 11.1743 +
 11.1744 +	/*
 11.1745 +	 * The vm_pgoff of a purely anonymous vma should be irrelevant
 11.1746 +	 * until its first write fault, when page's anon_vma and index
 11.1747 +	 * are set.  But now set the vm_pgoff it will almost certainly
 11.1748 +	 * end up with (unless mremap moves it elsewhere before that
 11.1749 +	 * first wfault), so /proc/pid/maps tells a consistent story.
 11.1750 +	 *
 11.1751 +	 * By setting it to reflect the virtual start address of the
 11.1752 +	 * vma, merges and splits can happen in a seamless way, just
 11.1753 +	 * using the existing file pgoff checks and manipulations.
 11.1754 +	 * Similarly in do_mmap_pgoff and in do_brk.
 11.1755 +	 */
 11.1756 +	if (!vma->vm_file) {
 11.1757 +		BUG_ON(vma->anon_vma);
 11.1758 +		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
 11.1759 +	}
 11.1760 +	__vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
 11.1761 +	if (__vma && __vma->vm_start < vma->vm_end)
 11.1762 +		BUG();
 11.1763 +	vma_link(mm, vma, prev, rb_link, rb_parent);
 11.1764 +}
 11.1765 +
 11.1766 +/*
 11.1767 + * Copy the vma structure to a new location in the same mm,
 11.1768 + * prior to moving page table entries, to effect an mremap move.
 11.1769 + */
 11.1770 +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 11.1771 +	unsigned long addr, unsigned long len, pgoff_t pgoff)
 11.1772 +{
 11.1773 +	struct vm_area_struct *vma = *vmap;
 11.1774 +	unsigned long vma_start = vma->vm_start;
 11.1775 +	struct mm_struct *mm = vma->vm_mm;
 11.1776 +	struct vm_area_struct *new_vma, *prev;
 11.1777 +	struct rb_node **rb_link, *rb_parent;
 11.1778 +	struct mempolicy *pol;
 11.1779 +
 11.1780 +	/*
 11.1781 +	 * If anonymous vma has not yet been faulted, update new pgoff
 11.1782 +	 * to match new location, to increase its chance of merging.
 11.1783 +	 */
 11.1784 +	if (!vma->vm_file && !vma->anon_vma)
 11.1785 +		pgoff = addr >> PAGE_SHIFT;
 11.1786 +
 11.1787 +	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 11.1788 +	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
 11.1789 +			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
 11.1790 +	if (new_vma) {
 11.1791 +		/*
 11.1792 +		 * Source vma may have been merged into new_vma
 11.1793 +		 */
 11.1794 +		if (vma_start >= new_vma->vm_start &&
 11.1795 +		    vma_start < new_vma->vm_end)
 11.1796 +			*vmap = new_vma;
 11.1797 +	} else {
 11.1798 +		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 11.1799 +		if (new_vma) {
 11.1800 +			*new_vma = *vma;
 11.1801 +			vma_prio_tree_init(new_vma);
 11.1802 +			pol = mpol_copy(vma_policy(vma));
 11.1803 +			if (IS_ERR(pol)) {
 11.1804 +				kmem_cache_free(vm_area_cachep, new_vma);
 11.1805 +				return NULL;
 11.1806 +			}
 11.1807 +			vma_set_policy(new_vma, pol);
 11.1808 +			new_vma->vm_start = addr;
 11.1809 +			new_vma->vm_end = addr + len;
 11.1810 +			new_vma->vm_pgoff = pgoff;
 11.1811 +			if (new_vma->vm_file)
 11.1812 +				get_file(new_vma->vm_file);
 11.1813 +			if (new_vma->vm_ops && new_vma->vm_ops->open)
 11.1814 +				new_vma->vm_ops->open(new_vma);
 11.1815 +			vma_link(mm, new_vma, prev, rb_link, rb_parent);
 11.1816 +		}
 11.1817 +	}
 11.1818 +	return new_vma;
 11.1819 +}
    12.1 --- a/tools/python/xen/lowlevel/xu/xu.c	Thu Aug 12 17:34:21 2004 +0000
    12.2 +++ b/tools/python/xen/lowlevel/xu/xu.c	Thu Aug 12 17:35:14 2004 +0000
    12.3 @@ -49,6 +49,13 @@
    12.4  /* Size of a machine page frame. */
    12.5  #define PAGE_SIZE 4096
    12.6  
    12.7 +#if defined(__i386__)
    12.8 +#define rmb() __asm__ __volatile__ ( "lock; addl $0,0(%%esp)" : : : "memory" )
    12.9 +#define wmb() __asm__ __volatile__ ( "" : : : "memory" )
   12.10 +#else
   12.11 +#error "Define barriers"
   12.12 +#endif
   12.13 +
   12.14  
   12.15  /*
   12.16   * *********************** NOTIFIER ***********************
   12.17 @@ -710,6 +717,9 @@ static PyObject *xu_port_read_request(Py
   12.18          return NULL;
   12.19      }
   12.20  
   12.21 +    /* Need to ensure we see the request, despite seeing the index update. */
   12.22 +    rmb();
   12.23 +
   12.24      cmsg = &cif->tx_ring[MASK_CONTROL_IDX(c)];
   12.25      xum = PyObject_New(xu_message_object, &xu_message_type);
   12.26      memcpy(&xum->msg, cmsg, sizeof(*cmsg));
   12.27 @@ -745,6 +755,7 @@ static PyObject *xu_port_write_request(P
   12.28      cmsg = &cif->rx_ring[MASK_CONTROL_IDX(p)];
   12.29      memcpy(cmsg, &xum->msg, sizeof(*cmsg));
   12.30  
   12.31 +    wmb();
   12.32      xup->rx_req_prod = cif->rx_req_prod = p + 1;
   12.33  
   12.34      Py_INCREF(Py_None);
   12.35 @@ -768,6 +779,9 @@ static PyObject *xu_port_read_response(P
   12.36          return NULL;
   12.37      }
   12.38  
   12.39 +    /* Need to ensure we see the response, despite seeing the index update. */
   12.40 +    rmb();
   12.41 +
   12.42      cmsg = &cif->rx_ring[MASK_CONTROL_IDX(c)];
   12.43      xum = PyObject_New(xu_message_object, &xu_message_type);
   12.44      memcpy(&xum->msg, cmsg, sizeof(*cmsg));
   12.45 @@ -803,6 +817,7 @@ static PyObject *xu_port_write_response(
   12.46      cmsg = &cif->tx_ring[MASK_CONTROL_IDX(p)];
   12.47      memcpy(cmsg, &xum->msg, sizeof(*cmsg));
   12.48  
   12.49 +    wmb();
   12.50      xup->tx_resp_prod = cif->tx_resp_prod = p + 1;
   12.51  
   12.52      Py_INCREF(Py_None);