ia64/xen-unstable

changeset 10308:50db8c95e65d

[NET] back: Add SG support

This patch adds scatter-gather (SG) support to the backend. It also
advertises this capability through xenbus so that the frontend can
detect it and send SG requests only when the backend supports them.

SG support is required to handle skbs larger than one page, which in
turn is needed for either jumbo MTU or TSO. One of these is required
to bring local networking performance up to an acceptable level.
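
As an illustration of the detection side, a frontend could read the
backend's "feature-sg" node before enabling SG transmits. The sketch
below is not part of this changeset; the helper name
netfront_backend_has_sg is hypothetical, and the use of xenbus_scanf(),
XBT_NULL and dev->otherend assumes the xenbus conventions of this
sparse tree.

    #include <xen/xenbus.h>

    /*
     * Illustrative sketch only (not part of this changeset): probe the
     * backend's feature-sg advertisement from the frontend side.  The
     * helper name is made up; xenbus_scanf(), XBT_NULL and dev->otherend
     * are assumed to match this tree's xenbus API.
     */
    static int netfront_backend_has_sg(struct xenbus_device *dev)
    {
            int sg = 0;

            /* xenbus_scanf() returns the number of values parsed. */
            if (xenbus_scanf(XBT_NULL, dev->otherend, "feature-sg",
                             "%d", &sg) != 1)
                    sg = 0;

            return sg;
    }

A frontend would then set NETTXF_more_data on chained requests only
when this returns nonzero.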

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
author kaf24@firebug.cl.cam.ac.uk
date Mon Jun 05 16:13:47 2006 +0100 (2006-06-05)
parents 1dab198509a9
children aecdb4c52fa7
files linux-2.6-xen-sparse/drivers/xen/netback/netback.c linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c xen/include/public/io/netif.h xen/include/public/io/ring.h
     1.1 --- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c	Mon Jun 05 15:18:13 2006 +0100
     1.2 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c	Mon Jun 05 16:13:47 2006 +0100
     1.3 @@ -490,6 +490,178 @@ inline static void net_tx_action_dealloc
     1.4  	}
     1.5  }
     1.6  
     1.7 +static void netbk_tx_err(netif_t *netif, RING_IDX end)
     1.8 +{
     1.9 +	RING_IDX cons = netif->tx.req_cons;
    1.10 +
    1.11 +	do {
    1.12 +		netif_tx_request_t *txp = RING_GET_REQUEST(&netif->tx, cons);
    1.13 +		make_tx_response(netif, txp->id, NETIF_RSP_ERROR);
    1.14 +	} while (++cons < end);
    1.15 +	netif->tx.req_cons = cons;
    1.16 +	netif_schedule_work(netif);
    1.17 +	netif_put(netif);
    1.18 +}
    1.19 +
    1.20 +static int netbk_count_requests(netif_t *netif, netif_tx_request_t *txp,
    1.21 +				int work_to_do)
    1.22 +{
    1.23 +	netif_tx_request_t *first = txp;
    1.24 +	RING_IDX cons = netif->tx.req_cons;
    1.25 +	int frags = 1;
    1.26 +
    1.27 +	while (txp->flags & NETTXF_more_data) {
    1.28 +		if (frags >= work_to_do) {
    1.29 +			DPRINTK("Need more frags\n");
    1.30 +			return -frags;
    1.31 +		}
    1.32 +
    1.33 +		txp = RING_GET_REQUEST(&netif->tx, cons + frags);
    1.34 +		if (txp->size > first->size) {
    1.35 +			DPRINTK("Frags galore\n");
    1.36 +			return -frags;
    1.37 +		}
    1.38 +
    1.39 +		first->size -= txp->size;
    1.40 +		frags++;
    1.41 +
    1.42 +		if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
    1.43 +			DPRINTK("txp->offset: %x, size: %u\n",
    1.44 +				txp->offset, txp->size);
    1.45 +			return -frags;
    1.46 +		}
    1.47 +	}
    1.48 +
    1.49 +	return frags;
    1.50 +}
    1.51 +
    1.52 +static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
    1.53 +						  struct sk_buff *skb,
    1.54 +						  gnttab_map_grant_ref_t *mop)
    1.55 +{
    1.56 +	struct skb_shared_info *shinfo = skb_shinfo(skb);
    1.57 +	skb_frag_t *frags = shinfo->frags;
    1.58 +	netif_tx_request_t *txp;
    1.59 +	unsigned long pending_idx = *((u16 *)skb->data);
    1.60 +	int nr_frags = shinfo->nr_frags;
    1.61 +	RING_IDX cons = netif->tx.req_cons + 1;
    1.62 +	int i;
    1.63 +
    1.64 +	if ((unsigned long)shinfo->frags[0].page == pending_idx) {
    1.65 +		frags++;
    1.66 +		nr_frags--;
    1.67 +	}
    1.68 +
    1.69 +	for (i = 0; i < nr_frags; i++) {
    1.70 +		txp = RING_GET_REQUEST(&netif->tx, cons + i);
    1.71 +		pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
    1.72 +
    1.73 +		gnttab_set_map_op(mop++, MMAP_VADDR(pending_idx),
    1.74 +				  GNTMAP_host_map | GNTMAP_readonly,
    1.75 +				  txp->gref, netif->domid);
    1.76 +
    1.77 +		memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
    1.78 +		netif_get(netif);
    1.79 +		pending_tx_info[pending_idx].netif = netif;
    1.80 +		frags[i].page = (void *)pending_idx;
    1.81 +	}
    1.82 +
    1.83 +	return mop;
    1.84 +}
    1.85 +
    1.86 +static int netbk_tx_check_mop(struct sk_buff *skb,
    1.87 +			       gnttab_map_grant_ref_t **mopp)
    1.88 +{
    1.89 +	gnttab_map_grant_ref_t *mop = *mopp;
    1.90 +	int pending_idx = *((u16 *)skb->data);
    1.91 +	netif_t *netif = pending_tx_info[pending_idx].netif;
    1.92 +	netif_tx_request_t *txp;
    1.93 +	struct skb_shared_info *shinfo = skb_shinfo(skb);
    1.94 +	int nr_frags = shinfo->nr_frags;
    1.95 +	int start;
    1.96 +	int err;
    1.97 +	int i;
    1.98 +
    1.99 +	err = mop->status;
   1.100 +	if (unlikely(err)) {
   1.101 +		txp = &pending_tx_info[pending_idx].req;
   1.102 +		make_tx_response(netif, txp->id, NETIF_RSP_ERROR);
   1.103 +		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
   1.104 +		netif_put(netif);
   1.105 +	} else {
   1.106 +		set_phys_to_machine(
   1.107 +			__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT,
   1.108 +			FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
   1.109 +		grant_tx_handle[pending_idx] = mop->handle;
   1.110 +	}
   1.111 +
   1.112 +	start = 0;
   1.113 +	if ((unsigned long)shinfo->frags[0].page == pending_idx)
   1.114 +		start++;
   1.115 +
   1.116 +	for (i = start; i < nr_frags; i++) {
   1.117 +		int newerr;
   1.118 +		int j;
   1.119 +
   1.120 +		pending_idx = (unsigned long)shinfo->frags[i].page;
   1.121 +
   1.122 +		newerr = (++mop)->status;
   1.123 +		if (likely(!newerr)) {
   1.124 +			set_phys_to_machine(
   1.125 +				__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT,
   1.126 +				FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
   1.127 +			grant_tx_handle[pending_idx] = mop->handle;
   1.128 +
   1.129 +			if (unlikely(err))
   1.130 +				netif_idx_release(pending_idx);
   1.131 +			continue;
   1.132 +		}
   1.133 +
   1.134 +		txp = &pending_tx_info[pending_idx].req;
   1.135 +		make_tx_response(netif, txp->id, NETIF_RSP_ERROR);
   1.136 +		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
   1.137 +		netif_put(netif);
   1.138 +
   1.139 +		if (err)
   1.140 +			continue;
   1.141 +
   1.142 +		pending_idx = *((u16 *)skb->data);
   1.143 +		netif_idx_release(pending_idx);
   1.144 +
   1.145 +		for (j = start; j < i; j++) {
   1.146 +			pending_idx = (unsigned long)shinfo->frags[i].page;
   1.147 +			netif_idx_release(pending_idx);
   1.148 +		}
   1.149 +		err |= newerr;
   1.150 +	}
   1.151 +
   1.152 +	*mopp = mop + 1;
   1.153 +	return err;
   1.154 +}
   1.155 +
   1.156 +static void netbk_fill_frags(struct sk_buff *skb)
   1.157 +{
   1.158 +	struct skb_shared_info *shinfo = skb_shinfo(skb);
   1.159 +	int nr_frags = shinfo->nr_frags;
   1.160 +	int i;
   1.161 +
   1.162 +	for (i = 0; i < nr_frags; i++) {
   1.163 +		skb_frag_t *frag = shinfo->frags + i;
   1.164 +		netif_tx_request_t *txp;
   1.165 +		unsigned long pending_idx;
   1.166 +
   1.167 +		pending_idx = (unsigned long)frag->page;
   1.168 +		txp = &pending_tx_info[pending_idx].req;
   1.169 +		frag->page = virt_to_page(MMAP_VADDR(pending_idx));
   1.170 +		frag->size = txp->size;
   1.171 +		frag->page_offset = txp->offset;
   1.172 +
   1.173 +		skb->len += txp->size;
   1.174 +		skb->data_len += txp->size;
   1.175 +		skb->truesize += txp->size;
   1.176 +	}
   1.177 +}
   1.178 +
   1.179  /* Called after netfront has transmitted */
   1.180  static void net_tx_action(unsigned long unused)
   1.181  {
   1.182 @@ -507,7 +679,7 @@ static void net_tx_action(unsigned long 
   1.183  		net_tx_action_dealloc();
   1.184  
   1.185  	mop = tx_map_ops;
   1.186 -	while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
   1.187 +	while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
   1.188  		!list_empty(&net_schedule_list)) {
   1.189  		/* Get a netif from the list with work to do. */
   1.190  		ent = net_schedule_list.next;
   1.191 @@ -555,38 +727,44 @@ static void net_tx_action(unsigned long 
   1.192  		}
   1.193  		netif->remaining_credit -= txreq.size;
   1.194  
   1.195 -		netif->tx.req_cons++;
   1.196 -
   1.197 -		netif_schedule_work(netif);
   1.198 +		ret = netbk_count_requests(netif, &txreq, work_to_do);
   1.199 +		if (unlikely(ret < 0)) {
   1.200 +			netbk_tx_err(netif, i - ret);
   1.201 +			continue;
   1.202 +		}
   1.203 +		i += ret;
   1.204  
   1.205 -		if (unlikely(txreq.size < ETH_HLEN) || 
   1.206 -		    unlikely(txreq.size > ETH_FRAME_LEN)) {
   1.207 +		if (unlikely(ret > MAX_SKB_FRAGS + 1)) {
   1.208 +			DPRINTK("Too many frags\n");
   1.209 +			netbk_tx_err(netif, i);
   1.210 +			continue;
   1.211 +		}
   1.212 +
   1.213 +		if (unlikely(txreq.size < ETH_HLEN)) {
   1.214  			DPRINTK("Bad packet size: %d\n", txreq.size);
   1.215 -			make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
   1.216 -			netif_put(netif);
   1.217 +			netbk_tx_err(netif, i);
   1.218  			continue; 
   1.219  		}
   1.220  
   1.221  		/* No crossing a page as the payload mustn't fragment. */
   1.222 -		if (unlikely((txreq.offset + txreq.size) >= PAGE_SIZE)) {
   1.223 +		if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
   1.224  			DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", 
   1.225  				txreq.offset, txreq.size, 
   1.226  				(txreq.offset &~PAGE_MASK) + txreq.size);
   1.227 -			make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
   1.228 -			netif_put(netif);
   1.229 +			netbk_tx_err(netif, i);
   1.230  			continue;
   1.231  		}
   1.232  
   1.233  		pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
   1.234  
   1.235 -		data_len = (txreq.size > PKT_PROT_LEN) ?
   1.236 +		data_len = (txreq.size > PKT_PROT_LEN &&
   1.237 +			    ret < MAX_SKB_FRAGS + 1) ?
   1.238  			PKT_PROT_LEN : txreq.size;
   1.239  
   1.240  		skb = alloc_skb(data_len+16, GFP_ATOMIC);
   1.241  		if (unlikely(skb == NULL)) {
   1.242  			DPRINTK("Can't allocate a skb in start_xmit.\n");
   1.243 -			make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
   1.244 -			netif_put(netif);
   1.245 +			netbk_tx_err(netif, i);
   1.246  			break;
   1.247  		}
   1.248  
   1.249 @@ -603,10 +781,24 @@ static void net_tx_action(unsigned long 
   1.250  		pending_tx_info[pending_idx].netif = netif;
   1.251  		*((u16 *)skb->data) = pending_idx;
   1.252  
   1.253 +		__skb_put(skb, data_len);
   1.254 +
   1.255 +		skb_shinfo(skb)->nr_frags = ret - 1;
   1.256 +		if (data_len < txreq.size) {
   1.257 +			skb_shinfo(skb)->nr_frags++;
   1.258 +			skb_shinfo(skb)->frags[0].page =
   1.259 +				(void *)(unsigned long)pending_idx;
   1.260 +		}
   1.261 +
   1.262  		__skb_queue_tail(&tx_queue, skb);
   1.263  
   1.264  		pending_cons++;
   1.265  
   1.266 +		mop = netbk_get_requests(netif, skb, mop);
   1.267 +
   1.268 +		netif->tx.req_cons = i;
   1.269 +		netif_schedule_work(netif);
   1.270 +
   1.271  		if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
   1.272  			break;
   1.273  	}
   1.274 @@ -620,75 +812,56 @@ static void net_tx_action(unsigned long 
   1.275  
   1.276  	mop = tx_map_ops;
   1.277  	while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
   1.278 +		netif_tx_request_t *txp;
   1.279 +
   1.280  		pending_idx = *((u16 *)skb->data);
   1.281  		netif       = pending_tx_info[pending_idx].netif;
   1.282 -		memcpy(&txreq, &pending_tx_info[pending_idx].req,
   1.283 -		       sizeof(txreq));
   1.284 +		txp         = &pending_tx_info[pending_idx].req;
   1.285  
   1.286  		/* Check the remap error code. */
   1.287 -		if (unlikely(mop->status)) {
   1.288 +		if (unlikely(netbk_tx_check_mop(skb, &mop))) {
   1.289  			printk(KERN_ALERT "#### netback grant fails\n");
   1.290 -			make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
   1.291 -			netif_put(netif);
   1.292 +			skb_shinfo(skb)->nr_frags = 0;
   1.293  			kfree_skb(skb);
   1.294 -			mop++;
   1.295 -			pending_ring[MASK_PEND_IDX(pending_prod++)] =
   1.296 -				pending_idx;
   1.297  			continue;
   1.298  		}
   1.299 -		set_phys_to_machine(
   1.300 -			__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT,
   1.301 -			FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
   1.302 -		grant_tx_handle[pending_idx] = mop->handle;
   1.303  
   1.304 -		data_len = (txreq.size > PKT_PROT_LEN) ?
   1.305 -			PKT_PROT_LEN : txreq.size;
   1.306 -
   1.307 -		__skb_put(skb, data_len);
   1.308 +		data_len = skb->len;
   1.309  		memcpy(skb->data, 
   1.310 -		       (void *)(MMAP_VADDR(pending_idx)|txreq.offset),
   1.311 +		       (void *)(MMAP_VADDR(pending_idx)|txp->offset),
   1.312  		       data_len);
   1.313 -		if (data_len < txreq.size) {
   1.314 +		if (data_len < txp->size) {
   1.315  			/* Append the packet payload as a fragment. */
   1.316 -			skb_shinfo(skb)->frags[0].page        = 
   1.317 -				virt_to_page(MMAP_VADDR(pending_idx));
   1.318 -			skb_shinfo(skb)->frags[0].size        =
   1.319 -				txreq.size - data_len;
   1.320 -			skb_shinfo(skb)->frags[0].page_offset = 
   1.321 -				txreq.offset + data_len;
   1.322 -			skb_shinfo(skb)->nr_frags = 1;
   1.323 +			txp->offset += data_len;
   1.324 +			txp->size -= data_len;
   1.325  		} else {
   1.326  			/* Schedule a response immediately. */
   1.327  			netif_idx_release(pending_idx);
   1.328  		}
   1.329  
   1.330 -		skb->data_len  = txreq.size - data_len;
   1.331 -		skb->len      += skb->data_len;
   1.332 -		skb->truesize += skb->data_len;
   1.333 -
   1.334 -		skb->dev      = netif->dev;
   1.335 -		skb->protocol = eth_type_trans(skb, skb->dev);
   1.336 -
   1.337  		/*
   1.338  		 * Old frontends do not assert data_validated but we
   1.339  		 * can infer it from csum_blank so test both flags.
   1.340  		 */
   1.341 -		if (txreq.flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
   1.342 +		if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
   1.343  			skb->ip_summed = CHECKSUM_UNNECESSARY;
   1.344  			skb->proto_data_valid = 1;
   1.345  		} else {
   1.346  			skb->ip_summed = CHECKSUM_NONE;
   1.347  			skb->proto_data_valid = 0;
   1.348  		}
   1.349 -		skb->proto_csum_blank = !!(txreq.flags & NETTXF_csum_blank);
   1.350 +		skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank);
   1.351  
   1.352 -		netif->stats.rx_bytes += txreq.size;
   1.353 +		netbk_fill_frags(skb);
   1.354 +
   1.355 +		skb->dev      = netif->dev;
   1.356 +		skb->protocol = eth_type_trans(skb, skb->dev);
   1.357 +
   1.358 +		netif->stats.rx_bytes += skb->len;
   1.359  		netif->stats.rx_packets++;
   1.360  
   1.361  		netif_rx(skb);
   1.362  		netif->dev->last_rx = jiffies;
   1.363 -
   1.364 -		mop++;
   1.365  	}
   1.366  }
   1.367  
     2.1 --- a/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c	Mon Jun 05 15:18:13 2006 +0100
     2.2 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c	Mon Jun 05 16:13:47 2006 +0100
     2.3 @@ -69,6 +69,8 @@ static int netback_remove(struct xenbus_
     2.4  static int netback_probe(struct xenbus_device *dev,
     2.5  			 const struct xenbus_device_id *id)
     2.6  {
     2.7 +	const char *message;
     2.8 +	xenbus_transaction_t xbt;
     2.9  	int err;
    2.10  	struct backend_info *be = kzalloc(sizeof(struct backend_info),
    2.11  					  GFP_KERNEL);
    2.12 @@ -86,6 +88,27 @@ static int netback_probe(struct xenbus_d
    2.13  	if (err)
    2.14  		goto fail;
    2.15  
    2.16 +	do {
    2.17 +		err = xenbus_transaction_start(&xbt);
    2.18 +		if (err) {
    2.19 +			xenbus_dev_fatal(dev, err, "starting transaction");
    2.20 +			goto fail;
    2.21 +		}
    2.22 +
    2.23 +		err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
    2.24 +		if (err) {
    2.25 +			message = "writing feature-sg";
    2.26 +			goto abort_transaction;
    2.27 +		}
    2.28 +
    2.29 +		err = xenbus_transaction_end(xbt, 0);
    2.30 +	} while (err == -EAGAIN);
    2.31 +
    2.32 +	if (err) {
    2.33 +		xenbus_dev_fatal(dev, err, "completing transaction");
    2.34 +		goto fail;
    2.35 +	}
    2.36 +
    2.37  	err = xenbus_switch_state(dev, XenbusStateInitWait);
    2.38  	if (err) {
    2.39  		goto fail;
    2.40 @@ -93,6 +116,9 @@ static int netback_probe(struct xenbus_d
    2.41  
    2.42  	return 0;
    2.43  
    2.44 +abort_transaction:
    2.45 +	xenbus_transaction_end(xbt, 1);
    2.46 +	xenbus_dev_fatal(dev, err, "%s", message);
    2.47  fail:
    2.48  	DPRINTK("failed");
    2.49  	netback_remove(dev);
     3.1 --- a/xen/include/public/io/netif.h	Mon Jun 05 15:18:13 2006 +0100
     3.2 +++ b/xen/include/public/io/netif.h	Mon Jun 05 16:13:47 2006 +0100
     3.3 @@ -27,6 +27,10 @@
     3.4  #define _NETTXF_data_validated (1)
     3.5  #define  NETTXF_data_validated (1U<<_NETTXF_data_validated)
     3.6  
     3.7 +/* Packet continues in the request. */
     3.8 +#define _NETTXF_more_data      (2)
     3.9 +#define  NETTXF_more_data      (1U<<_NETTXF_more_data)
    3.10 +
    3.11  struct netif_tx_request {
    3.12      grant_ref_t gref;      /* Reference to buffer page */
    3.13      uint16_t offset;       /* Offset within buffer page */
     4.1 --- a/xen/include/public/io/ring.h	Mon Jun 05 15:18:13 2006 +0100
     4.2 +++ b/xen/include/public/io/ring.h	Mon Jun 05 16:13:47 2006 +0100
     4.3 @@ -159,11 +159,15 @@ typedef struct __name##_back_ring __name
     4.4  
     4.5  /* Test if there are outstanding messages to be processed on a ring. */
     4.6  #define RING_HAS_UNCONSUMED_RESPONSES(_r)                               \
     4.7 -    ((_r)->rsp_cons != (_r)->sring->rsp_prod)
     4.8 +    ((_r)->sring->rsp_prod - (_r)->rsp_cons)
     4.9  
    4.10  #define RING_HAS_UNCONSUMED_REQUESTS(_r)                                \
    4.11 -    (((_r)->req_cons != (_r)->sring->req_prod) &&                       \
    4.12 -     (((_r)->req_cons - (_r)->rsp_prod_pvt) != RING_SIZE(_r)))
    4.13 +    ({									\
    4.14 +	unsigned int req = (_r)->sring->req_prod - (_r)->req_cons;	\
    4.15 +	unsigned int rsp = RING_SIZE(_r) -				\
    4.16 +			   ((_r)->req_cons - (_r)->rsp_prod_pvt);	\
    4.17 +	req < rsp ? req : rsp;						\
    4.18 +    })
    4.19  
    4.20  /* Direct access to individual ring elements, by index. */
    4.21  #define RING_GET_REQUEST(_r, _idx)                                      \