ia64/linux-2.6.18-xen.hg

changeset 867:978499ee4f39

linux/blktap: fix vma_close() for partial munmap.

vm_area_struct::vm_private_data is used
by get_user_pages(), so we can't override
it. So in order to make blktap work, set it
to an array of struct page*.

Without mm->mmap_sem, the virtual mapping can be changed,
so remembering the vma which was passed to the mmap callback
is bogus: the vma can later be freed or changed.
So don't remember the vma; instead, put the necessary information
into tap_blkif_t and use find_vma() to look up the needed vmas.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Apr 28 13:44:22 2009 +0100 (2009-04-28)
parents cb25b9d5a594
children 42dfb4e2bce0
files drivers/xen/blktap/blktap.c
line diff
     1.1 --- a/drivers/xen/blktap/blktap.c	Tue Apr 28 13:43:46 2009 +0100
     1.2 +++ b/drivers/xen/blktap/blktap.c	Tue Apr 28 13:44:22 2009 +0100
     1.3 @@ -99,7 +99,7 @@ typedef struct domid_translate_ext {
     1.4  
     1.5  /*Data struct associated with each of the tapdisk devices*/
     1.6  typedef struct tap_blkif {
     1.7 -	struct vm_area_struct *vma;   /*Shared memory area                   */
     1.8 +	struct mm_struct *mm;         /*User address space                   */
     1.9  	unsigned long rings_vstart;   /*Kernel memory mapping                */
    1.10  	unsigned long user_vstart;    /*User memory mapping                  */
    1.11  	unsigned long dev_inuse;      /*One process opens device at a time.  */
    1.12 @@ -116,6 +116,7 @@ typedef struct tap_blkif {
    1.13  					[req id, idx] tuple                  */
    1.14  	blkif_t *blkif;               /*Associate blkif with tapdev          */
    1.15  	struct domid_translate_ext trans; /*Translation from domid to bus.   */
    1.16 +	struct page **map;	      /*Mapping page */
    1.17  } tap_blkif_t;
    1.18  
    1.19  static struct tap_blkif *tapfds[MAX_TAP_DEV];
    1.20 @@ -293,10 +294,6 @@ static inline int OFFSET_TO_SEG(int offs
    1.21  /******************************************************************
    1.22   * BLKTAP VM OPS
    1.23   */
    1.24 -struct tap_vma_priv {
    1.25 -	tap_blkif_t *info;
    1.26 -	struct page *map[];
    1.27 -};
    1.28  
    1.29  static struct page *blktap_nopage(struct vm_area_struct *vma,
    1.30  				  unsigned long address,
    1.31 @@ -315,11 +312,10 @@ static pte_t blktap_clear_pte(struct vm_
    1.32  			      pte_t *ptep, int is_fullmm)
    1.33  {
    1.34  	pte_t copy;
    1.35 -	tap_blkif_t *info;
    1.36 +	tap_blkif_t *info = NULL;
    1.37  	int offset, seg, usr_idx, pending_idx, mmap_idx;
    1.38  	unsigned long uvstart;
    1.39  	unsigned long kvaddr;
    1.40 -	struct tap_vma_priv *priv;
    1.41  	struct page *pg;
    1.42  	struct grant_handle_pair *khandle;
    1.43  	struct gnttab_unmap_grant_ref unmap[2];
    1.44 @@ -338,12 +334,9 @@ static pte_t blktap_clear_pte(struct vm_
    1.45  		return ptep_get_and_clear_full(vma->vm_mm, uvaddr, 
    1.46  					       ptep, is_fullmm);
    1.47  
    1.48 -	priv = vma->vm_private_data;
    1.49 -
    1.50  	/* TODO Should these be changed to if statements? */
    1.51  	BUG_ON(!info);
    1.52  	BUG_ON(!info->idx_map);
    1.53 -	BUG_ON(!priv);
    1.54  
    1.55  	offset = (int) ((uvaddr - uvstart) >> PAGE_SHIFT);
    1.56  	usr_idx = OFFSET_TO_USR_IDX(offset);
    1.57 @@ -355,7 +348,7 @@ static pte_t blktap_clear_pte(struct vm_
    1.58  	kvaddr = idx_to_kaddr(mmap_idx, pending_idx, seg);
    1.59  	pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
    1.60  	ClearPageReserved(pg);
    1.61 -	priv->map[offset + RING_PAGES] = NULL;
    1.62 +	info->map[offset + RING_PAGES] = NULL;
    1.63  
    1.64  	khandle = &pending_handle(mmap_idx, pending_idx, seg);
    1.65  
    1.66 @@ -396,19 +389,43 @@ static pte_t blktap_clear_pte(struct vm_
    1.67  	return copy;
    1.68  }
    1.69  
    1.70 +static void blktap_vma_open(struct vm_area_struct *vma)
    1.71 +{
    1.72 +	tap_blkif_t *info;
    1.73 +	if (vma->vm_file == NULL)
    1.74 +		return;
    1.75 +
    1.76 +	info = vma->vm_file->private_data;
    1.77 +	vma->vm_private_data =
    1.78 +		&info->map[(vma->vm_start - info->rings_vstart) >> PAGE_SHIFT];
    1.79 +}
    1.80 +
    1.81 +/* tricky part
    1.82 + * When partial munmapping, ->open() is called only splitted vma which
    1.83 + * will be released soon. * See split_vma() and do_munmap() in mm/mmap.c
    1.84 + * So there is no chance to fix up vm_private_data of the end vma.
    1.85 + */
    1.86  static void blktap_vma_close(struct vm_area_struct *vma)
    1.87  {
    1.88 -	struct tap_vma_priv *priv = vma->vm_private_data;
    1.89 +	tap_blkif_t *info;
    1.90 +	struct vm_area_struct *next = vma->vm_next;
    1.91  
    1.92 -	if (priv) {
    1.93 -		priv->info->vma = NULL;
    1.94 -		kfree(priv);
    1.95 -	}
    1.96 +	if (next == NULL ||
    1.97 +	    vma->vm_ops != next->vm_ops ||
    1.98 +	    vma->vm_end != next->vm_start ||
    1.99 +	    vma->vm_file == NULL ||
   1.100 +	    vma->vm_file != next->vm_file)
   1.101 +		return;
   1.102 +
   1.103 +	info = vma->vm_file->private_data;
   1.104 +	next->vm_private_data =
   1.105 +		&info->map[(next->vm_start - info->rings_vstart) >> PAGE_SHIFT];
   1.106  }
   1.107  
   1.108 -struct vm_operations_struct blktap_vm_ops = {
   1.109 +static struct vm_operations_struct blktap_vm_ops = {
   1.110  	nopage:   blktap_nopage,
   1.111  	zap_pte:  blktap_clear_pte,
   1.112 +	open:     blktap_vma_open,
   1.113  	close:    blktap_vma_close,
   1.114  };
   1.115  
   1.116 @@ -455,7 +472,7 @@ static tap_blkif_t *get_next_free_dev(vo
   1.117  		info = tapfds[minor];
   1.118  		/* we could have failed a previous attempt. */
   1.119  		if (!info ||
   1.120 -		    ((info->dev_inuse == 0) &&
   1.121 +		    ((!test_bit(0, &info->dev_inuse)) &&
   1.122  		     (info->dev_pending == 0)) ) {
   1.123  			info->dev_pending = 1;
   1.124  			goto found;
   1.125 @@ -592,7 +609,7 @@ static int blktap_open(struct inode *ino
   1.126  	FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
   1.127  	
   1.128  	filp->private_data = info;
   1.129 -	info->vma = NULL;
   1.130 +	info->mm = NULL;
   1.131  
   1.132  	info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS, 
   1.133  				GFP_KERNEL);
   1.134 @@ -624,8 +641,10 @@ static int blktap_release(struct inode *
   1.135  	info->ring_ok = 0;
   1.136  	smp_wmb();
   1.137  
   1.138 -	info->dev_inuse = 0;
   1.139 -	DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
   1.140 +	mmput(info->mm);
   1.141 +	info->mm = NULL;
   1.142 +	kfree(info->map);
   1.143 +	info->map = NULL;
   1.144  
   1.145  	/* Free the ring page. */
   1.146  	ClearPageReserved(virt_to_page(info->ufe_ring.sring));
   1.147 @@ -644,6 +663,9 @@ static int blktap_release(struct inode *
   1.148  		info->status = CLEANSHUTDOWN;
   1.149  	}
   1.150  
   1.151 +	clear_bit(0, &info->dev_inuse);
   1.152 +	DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor);
   1.153 +
   1.154  	return 0;
   1.155  }
   1.156  
   1.157 @@ -669,7 +691,6 @@ static int blktap_release(struct inode *
   1.158  static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
   1.159  {
   1.160  	int size;
   1.161 -	struct tap_vma_priv *priv;
   1.162  	tap_blkif_t *info = filp->private_data;
   1.163  	int ret;
   1.164  
   1.165 @@ -706,16 +727,14 @@ static int blktap_mmap(struct file *filp
   1.166  	}
   1.167  
   1.168  	/* Mark this VM as containing foreign pages, and set up mappings. */
   1.169 -	priv = kzalloc(sizeof(*priv) + ((vma->vm_end - vma->vm_start)
   1.170 -					>> PAGE_SHIFT) * sizeof(*priv->map),
   1.171 -		       GFP_KERNEL);
   1.172 -	if (priv == NULL) {
   1.173 +	info->map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) *
   1.174 +			    sizeof(*info->map), GFP_KERNEL);
   1.175 +	if (info->map == NULL) {
   1.176  		WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
   1.177  		goto fail;
   1.178  	}
   1.179 -	priv->info = info;
   1.180  
   1.181 -	vma->vm_private_data = priv;
   1.182 +	vma->vm_private_data = info->map;
   1.183  	vma->vm_flags |= VM_FOREIGN;
   1.184  	vma->vm_flags |= VM_DONTCOPY;
   1.185  
   1.186 @@ -723,7 +742,7 @@ static int blktap_mmap(struct file *filp
   1.187  	vma->vm_mm->context.has_foreign_mappings = 1;
   1.188  #endif
   1.189  
   1.190 -	info->vma = vma;
   1.191 +	info->mm = get_task_mm(current);
   1.192  	smp_wmb();
   1.193  	info->ring_ok = 1;
   1.194  	return 0;
   1.195 @@ -997,6 +1016,24 @@ static void free_req(pending_req_t *req)
   1.196  		wake_up(&pending_free_wq);
   1.197  }
   1.198  
   1.199 +static void blktap_zap_page_range(struct mm_struct *mm,
   1.200 +				  unsigned long uvaddr, int nr_pages)
   1.201 +{
   1.202 +	unsigned long end = uvaddr + (nr_pages << PAGE_SHIFT);
   1.203 +	struct vm_area_struct *vma;
   1.204 +
   1.205 +	vma = find_vma(mm, uvaddr);
   1.206 +	while (vma && uvaddr < end) {
   1.207 +		unsigned long s = max(uvaddr, vma->vm_start);
   1.208 +		unsigned long e = min(end, vma->vm_end);
   1.209 +
   1.210 +		zap_page_range(vma, s, e - s, NULL);
   1.211 +
   1.212 +		uvaddr = e;
   1.213 +		vma = vma->vm_next;
   1.214 +	}
   1.215 +}
   1.216 +
   1.217  static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx,
   1.218  			    int tapidx)
   1.219  {
   1.220 @@ -1017,14 +1054,13 @@ static void fast_flush_area(pending_req_
   1.221  		return;
   1.222  	}
   1.223  
   1.224 -	mm = info->vma ? info->vma->vm_mm : NULL;
   1.225 +	mm = info->mm;
   1.226  
   1.227 -	if (info->vma != NULL &&
   1.228 -	    xen_feature(XENFEAT_auto_translated_physmap)) {
   1.229 +	if (mm != NULL && xen_feature(XENFEAT_auto_translated_physmap)) {
   1.230  		down_write(&mm->mmap_sem);
   1.231 -		zap_page_range(info->vma, 
   1.232 -			       MMAP_VADDR(info->user_vstart, u_idx, 0), 
   1.233 -			       req->nr_pages << PAGE_SHIFT, NULL);
   1.234 +		blktap_zap_page_range(mm,
   1.235 +				      MMAP_VADDR(info->user_vstart, u_idx, 0),
   1.236 +				      req->nr_pages);
   1.237  		up_write(&mm->mmap_sem);
   1.238  		return;
   1.239  	}
   1.240 @@ -1075,13 +1111,12 @@ static void fast_flush_area(pending_req_
   1.241  		GNTTABOP_unmap_grant_ref, unmap, invcount);
   1.242  	BUG_ON(ret);
   1.243  	
   1.244 -	if (info->vma != NULL &&
   1.245 -	    !xen_feature(XENFEAT_auto_translated_physmap)) {
   1.246 +	if (mm != NULL && !xen_feature(XENFEAT_auto_translated_physmap)) {
   1.247  		if (!locked++)
   1.248  			down_write(&mm->mmap_sem);
   1.249 -		zap_page_range(info->vma, 
   1.250 -			       MMAP_VADDR(info->user_vstart, u_idx, 0), 
   1.251 -			       req->nr_pages << PAGE_SHIFT, NULL);
   1.252 +		blktap_zap_page_range(mm, 
   1.253 +				      MMAP_VADDR(info->user_vstart, u_idx, 0), 
   1.254 +				      req->nr_pages);
   1.255  	}
   1.256  
   1.257  	if (locked)
   1.258 @@ -1195,7 +1230,6 @@ static int blktap_read_ufe_ring(tap_blki
   1.259  		for (j = 0; j < pending_req->nr_pages; j++) {
   1.260  
   1.261  			unsigned long kvaddr, uvaddr;
   1.262 -			struct tap_vma_priv *priv = info->vma->vm_private_data;
   1.263  			struct page *pg;
   1.264  			int offset;
   1.265  
   1.266 @@ -1205,7 +1239,7 @@ static int blktap_read_ufe_ring(tap_blki
   1.267  			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
   1.268  			ClearPageReserved(pg);
   1.269  			offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
   1.270 -			priv->map[offset] = NULL;
   1.271 +			info->map[offset] = NULL;
   1.272  		}
   1.273  		fast_flush_area(pending_req, pending_idx, usr_idx, info->minor);
   1.274  		info->idx_map[usr_idx] = INVALID_REQ;
   1.275 @@ -1267,7 +1301,8 @@ static int do_block_io_op(blkif_t *blkif
   1.276  
   1.277  	info = tapfds[blkif->dev_num];
   1.278  
   1.279 -	if (blkif->dev_num > MAX_TAP_DEV || !info || !info->dev_inuse) {
   1.280 +	if (blkif->dev_num > MAX_TAP_DEV || !info ||
   1.281 +	    !test_bit(0, &info->dev_inuse)) {
   1.282  		if (print_dbug) {
   1.283  			WPRINTK("Can't get UE info!\n");
   1.284  			print_dbug = 0;
   1.285 @@ -1363,12 +1398,12 @@ static void dispatch_rw_block_io(blkif_t
   1.286  	unsigned int nseg;
   1.287  	int ret, i, nr_sects = 0;
   1.288  	tap_blkif_t *info;
   1.289 -	struct tap_vma_priv *priv;
   1.290  	blkif_request_t *target;
   1.291  	int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx);
   1.292  	int usr_idx;
   1.293  	uint16_t mmap_idx = pending_req->mem_idx;
   1.294  	struct mm_struct *mm;
   1.295 +	struct vm_area_struct *vma = NULL;
   1.296  
   1.297  	if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV)
   1.298  		goto fail_response;
   1.299 @@ -1413,8 +1448,7 @@ static void dispatch_rw_block_io(blkif_t
   1.300  	pending_req->status    = BLKIF_RSP_OKAY;
   1.301  	pending_req->nr_pages  = nseg;
   1.302  	op = 0;
   1.303 -	priv = info->vma->vm_private_data;
   1.304 -	mm = info->vma->vm_mm;
   1.305 +	mm = info->mm;
   1.306  	if (!xen_feature(XENFEAT_auto_translated_physmap))
   1.307  		down_write(&mm->mmap_sem);
   1.308  	for (i = 0; i < nseg; i++) {
   1.309 @@ -1497,7 +1531,7 @@ static void dispatch_rw_block_io(blkif_t
   1.310  							  >> PAGE_SHIFT));
   1.311  			offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
   1.312  			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
   1.313 -			priv->map[offset] = pg;
   1.314 +			info->map[offset] = pg;
   1.315  		}
   1.316  	} else {
   1.317  		for (i = 0; i < nseg; i++) {
   1.318 @@ -1524,7 +1558,7 @@ static void dispatch_rw_block_io(blkif_t
   1.319  
   1.320  			offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
   1.321  			pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
   1.322 -			priv->map[offset] = pg;
   1.323 +			info->map[offset] = pg;
   1.324  		}
   1.325  	}
   1.326  
   1.327 @@ -1542,9 +1576,23 @@ static void dispatch_rw_block_io(blkif_t
   1.328  		pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
   1.329  		SetPageReserved(pg);
   1.330  		if (xen_feature(XENFEAT_auto_translated_physmap)) {
   1.331 -			ret = vm_insert_page(info->vma,
   1.332 -					     MMAP_VADDR(info->user_vstart,
   1.333 -							usr_idx, i), pg);
   1.334 +			unsigned long uvaddr = MMAP_VADDR(info->user_vstart,
   1.335 +							  usr_idx, i);
   1.336 +			if (vma && uvaddr >= vma->vm_end) {
   1.337 +				vma = vma->vm_next;
   1.338 +				if (vma &&
   1.339 +				    (uvaddr < vma->vm_start ||
   1.340 +				     uvaddr >= vma->vm_end))
   1.341 +					vma = NULL;
   1.342 +			}
   1.343 +			if (vma == NULL) {
   1.344 +				vma = find_vma(mm, uvaddr);
   1.345 +				/* this virtual area was already munmapped.
   1.346 +				   so skip to next page */
   1.347 +				if (!vma)
   1.348 +					continue;
   1.349 +			}
   1.350 +			ret = vm_insert_page(vma, uvaddr, pg);
   1.351  			if (ret) {
   1.352  				up_write(&mm->mmap_sem);
   1.353  				goto fail_flush;