ia64/xen-unstable

changeset 5077:8a1faeb0d3c6

bitkeeper revision 1.1506 (428f8748oAPuAqxeI4b_UUMZQok4QQ)

Checksum offload for local virtual networking, and to/from a physical
interface that may be connected via a virtual bridge or router. This adds
a couple of new fields to skbuffs that are intended to survive across IP-
or MAC-level forwarding. I've tested basic connectivity with this patch,
but further stress testing and performance benchmarking are still required.
Signed-off-by: Keir Fraser <keir@xensource.com>
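
The intent of the new bits is easiest to see end to end. The sketch below is a
minimal, self-contained model (plain C with simplified stand-in types such as
model_skb, model_tx_req and model_rx_resp rather than the real sk_buff and
netif ring structures, and hypothetical helper names) of how the hunks in
netfront.c, netback.c and loopback.c hand checksum state across domains. It is
illustrative only, not kernel code.

    #include <stdio.h>
    #include <stdbool.h>

    /* Simplified stand-ins for the structures touched by this patch. */
    #define CHECKSUM_NONE        0
    #define CHECKSUM_HW          1
    #define CHECKSUM_UNNECESSARY 2

    struct model_skb {
        int  ip_summed;          /* CHECKSUM_* as in skbuff.h              */
        bool proto_csum_blank;   /* new: checksum must be filled in later  */
        bool proto_csum_valid;   /* new: checksum already verified locally */
    };

    struct model_tx_req  { bool csum_blank; };  /* models the tx request field */
    struct model_rx_resp { bool csum_valid; };  /* models the rx response field */

    /* Frontend transmit (cf. netfront.c): tell the backend the protocol
     * checksum has been left blank for offload. */
    static void frontend_xmit(const struct model_skb *skb, struct model_tx_req *tx)
    {
        tx->csum_blank = (skb->ip_summed == CHECKSUM_HW);
    }

    /* Backend, packet arriving from a local guest (cf. net_tx_action in
     * netback.c): no verification is needed on localhost, but remember that
     * the checksum field is still blank. */
    static void backend_rx_from_guest(struct model_skb *skb,
                                      const struct model_tx_req *tx)
    {
        skb->ip_summed        = CHECKSUM_UNNECESSARY;
        skb->proto_csum_valid = true;
        skb->proto_csum_blank = tx->csum_blank;
    }

    /* Backend, packet delivered to a guest (cf. make_rx_response in
     * netback.c): report validity so the frontend can set
     * CHECKSUM_UNNECESSARY on its own skb (cf. netif_poll). */
    static void backend_tx_to_guest(const struct model_skb *skb,
                                    struct model_rx_resp *rx)
    {
        rx->csum_valid = skb->proto_csum_valid;
    }

    int main(void)
    {
        struct model_skb     skb = { .ip_summed = CHECKSUM_HW };
        struct model_tx_req  tx;
        struct model_rx_resp rx;

        frontend_xmit(&skb, &tx);
        backend_rx_from_guest(&skb, &tx);
        backend_tx_to_guest(&skb, &rx);

        printf("csum_blank=%d proto_csum_valid=%d rx.csum_valid=%d\n",
               tx.csum_blank, skb.proto_csum_valid, rx.csum_valid);
        return 0;
    }

The loopback (dom0-internal) path in loopback.c follows the same idea: a
CHECKSUM_HW skb is marked proto_csum_blank, and proto_csum_valid decides
whether the receiving side sees CHECKSUM_UNNECESSARY or CHECKSUM_NONE.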
author kaf24@viper.(none)
date Sat May 21 19:08:56 2005 +0000 (2005-05-21)
parents d55569f2ac18
children c10a5789d09e
files .rootkeys linux-2.6.11-xen-sparse/drivers/xen/netback/interface.c linux-2.6.11-xen-sparse/drivers/xen/netback/loopback.c linux-2.6.11-xen-sparse/drivers/xen/netback/netback.c linux-2.6.11-xen-sparse/drivers/xen/netfront/netfront.c linux-2.6.11-xen-sparse/include/linux/skbuff.h linux-2.6.11-xen-sparse/net/core/dev.c linux-2.6.11-xen-sparse/net/core/skbuff.c xen/include/public/io/netif.h
line diff
     1.1 --- a/.rootkeys	Sat May 21 12:40:44 2005 +0000
     1.2 +++ b/.rootkeys	Sat May 21 19:08:56 2005 +0000
     1.3 @@ -419,12 +419,15 @@ 3f689063BoW-HWV3auUJ-OqXfcGArw linux-2.6
     1.4  419b4e93z2S0gR17XTy8wg09JEwAhg linux-2.6.11-xen-sparse/include/linux/gfp.h
     1.5  42305f545Vc5SLCUewZ2-n-P9JJhEQ linux-2.6.11-xen-sparse/include/linux/highmem.h
     1.6  419dfc609zbti8rqL60tL2dHXQ_rvQ linux-2.6.11-xen-sparse/include/linux/irq.h
     1.7 +428f8747dtEZ4CfC5tb6Loe9h0Ivpg linux-2.6.11-xen-sparse/include/linux/skbuff.h
     1.8  419dfc6awx7w88wk6cG9P3mPidX6LQ linux-2.6.11-xen-sparse/kernel/irq/manage.c
     1.9  40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.11-xen-sparse/mkbuildtree
    1.10  42305f54Q6xJ1bXcQJlCQq1m-e2C8g linux-2.6.11-xen-sparse/mm/highmem.c
    1.11  412f46c0LJuKAgSPGoC0Z1DEkLfuLA linux-2.6.11-xen-sparse/mm/memory.c
    1.12  426fa4d7ooLYmFcFjJMF_ut4GFVh2Q linux-2.6.11-xen-sparse/mm/mmap.c
    1.13  410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.11-xen-sparse/mm/page_alloc.c
    1.14 +428f8747Gp_X2UtgwcL0-YeYkCXxvQ linux-2.6.11-xen-sparse/net/core/dev.c
    1.15 +428f8747vBdkOrip6rhWK_eEvVc8dA linux-2.6.11-xen-sparse/net/core/skbuff.c
    1.16  413cb1e4zst25MDYjg63Y-NGC5_pLg netbsd-2.0-xen-sparse/Makefile
    1.17  413cb1e5c_Mkxf_X0zimEhTKI_l4DA netbsd-2.0-xen-sparse/mkbuildtree
    1.18  413cb1e5kY_Zil7-b0kI6hvCIxBEYg netbsd-2.0-xen-sparse/nbconfig-xen
     2.1 --- a/linux-2.6.11-xen-sparse/drivers/xen/netback/interface.c	Sat May 21 12:40:44 2005 +0000
     2.2 +++ b/linux-2.6.11-xen-sparse/drivers/xen/netback/interface.c	Sat May 21 19:08:56 2005 +0000
     2.3 @@ -159,6 +159,7 @@ void netif_create(netif_be_create_t *cre
     2.4      dev->get_stats       = netif_be_get_stats;
     2.5      dev->open            = net_open;
     2.6      dev->stop            = net_close;
     2.7 +    dev->features        = NETIF_F_NO_CSUM;
     2.8  
     2.9      /* Disable queuing. */
    2.10      dev->tx_queue_len = 0;
     3.1 --- a/linux-2.6.11-xen-sparse/drivers/xen/netback/loopback.c	Sat May 21 12:40:44 2005 +0000
     3.2 +++ b/linux-2.6.11-xen-sparse/drivers/xen/netback/loopback.c	Sat May 21 19:08:56 2005 +0000
     3.3 @@ -67,6 +67,11 @@ static int loopback_start_xmit(struct sk
     3.4      np->stats.rx_bytes += skb->len;
     3.5      np->stats.rx_packets++;
     3.6  
     3.7 +    if ( skb->ip_summed == CHECKSUM_HW )
     3.8 +        skb->proto_csum_blank = 1;
     3.9 +    skb->ip_summed = skb->proto_csum_valid ?
    3.10 +        CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
    3.11 +
    3.12      skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
    3.13      skb->protocol = eth_type_trans(skb, dev);
    3.14      skb->dev      = dev;
    3.15 @@ -95,6 +100,8 @@ static void loopback_construct(struct ne
    3.16  
    3.17      dev->tx_queue_len    = 0;
    3.18  
    3.19 +    dev->features        = NETIF_F_HIGHDMA | NETIF_F_LLTX;
    3.20 +
    3.21      /*
    3.22       * We do not set a jumbo MTU on the interface. Otherwise the network
    3.23       * stack will try to send large packets that will get dropped by the
    3.24 @@ -118,6 +125,9 @@ static int __init loopback_init(void)
    3.25      loopback_construct(dev1, dev2);
    3.26      loopback_construct(dev2, dev1);
    3.27  
    3.28 +    dev1->features |= NETIF_F_NO_CSUM;
    3.29 +    dev2->features |= NETIF_F_IP_CSUM;
    3.30 +
    3.31      /*
    3.32       * Initialise a dummy MAC address for the 'dummy backend' interface. We
    3.33       * choose the numerically largest non-broadcast address to prevent the
     4.1 --- a/linux-2.6.11-xen-sparse/drivers/xen/netback/netback.c	Sat May 21 12:40:44 2005 +0000
     4.2 +++ b/linux-2.6.11-xen-sparse/drivers/xen/netback/netback.c	Sat May 21 19:08:56 2005 +0000
     4.3 @@ -27,7 +27,8 @@ static int  make_rx_response(netif_t *ne
     4.4                               u16      id, 
     4.5                               s8       st,
     4.6                               memory_t addr,
     4.7 -                             u16      size);
     4.8 +                             u16      size,
     4.9 +                             u16      csum_valid);
    4.10  
    4.11  static void net_tx_action(unsigned long unused);
    4.12  static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
    4.13 @@ -154,6 +155,7 @@ int netif_be_start_xmit(struct sk_buff *
    4.14          __skb_put(nskb, skb->len);
    4.15          (void)skb_copy_bits(skb, -hlen, nskb->data - hlen, skb->len + hlen);
    4.16          nskb->dev = skb->dev;
    4.17 +        nskb->proto_csum_valid = skb->proto_csum_valid;
    4.18          dev_kfree_skb(skb);
    4.19          skb = nskb;
    4.20      }
    4.21 @@ -308,7 +310,8 @@ static void net_rx_action(unsigned long 
    4.22  
    4.23          evtchn = netif->evtchn;
    4.24          id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_resp_prod)].req.id;
    4.25 -        if ( make_rx_response(netif, id, status, mdata, size) &&
    4.26 +        if ( make_rx_response(netif, id, status, mdata,
    4.27 +                              size, skb->proto_csum_valid) &&
    4.28               (rx_notify[evtchn] == 0) )
    4.29          {
    4.30              rx_notify[evtchn] = 1;
    4.31 @@ -646,6 +649,11 @@ static void net_tx_action(unsigned long 
    4.32          skb->dev      = netif->dev;
    4.33          skb->protocol = eth_type_trans(skb, skb->dev);
    4.34  
    4.35 +        /* No checking needed on localhost, but remember the field is blank. */
    4.36 +        skb->ip_summed        = CHECKSUM_UNNECESSARY;
    4.37 +        skb->proto_csum_valid = 1;
    4.38 +        skb->proto_csum_blank = txreq.csum_blank;
    4.39 +
    4.40          netif->stats.rx_bytes += txreq.size;
    4.41          netif->stats.rx_packets++;
    4.42  
    4.43 @@ -711,15 +719,17 @@ static int make_rx_response(netif_t *net
    4.44                              u16      id, 
    4.45                              s8       st,
    4.46                              memory_t addr,
    4.47 -                            u16      size)
    4.48 +                            u16      size,
    4.49 +                            u16      csum_valid)
    4.50  {
    4.51      NETIF_RING_IDX i = netif->rx_resp_prod;
    4.52      netif_rx_response_t *resp;
    4.53  
    4.54      resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
    4.55 -    resp->addr   = addr;
    4.56 -    resp->id     = id;
    4.57 -    resp->status = (s16)size;
    4.58 +    resp->addr       = addr;
    4.59 +    resp->csum_valid = csum_valid;
    4.60 +    resp->id         = id;
    4.61 +    resp->status     = (s16)size;
    4.62      if ( st < 0 )
    4.63          resp->status = (s16)st;
    4.64      wmb();
     5.1 --- a/linux-2.6.11-xen-sparse/drivers/xen/netfront/netfront.c	Sat May 21 12:40:44 2005 +0000
     5.2 +++ b/linux-2.6.11-xen-sparse/drivers/xen/netfront/netfront.c	Sat May 21 19:08:56 2005 +0000
     5.3 @@ -473,6 +473,7 @@ static int network_start_xmit(struct sk_
     5.4      tx->id   = id;
     5.5      tx->addr = virt_to_machine(skb->data);
     5.6      tx->size = skb->len;
     5.7 +    tx->csum_blank = (skb->ip_summed == CHECKSUM_HW);
     5.8  
     5.9      wmb(); /* Ensure that backend will see the request. */
    5.10      np->tx->req_prod = i + 1;
    5.11 @@ -573,6 +574,9 @@ static int netif_poll(struct net_device 
    5.12          skb->len  = rx->status;
    5.13          skb->tail = skb->data + skb->len;
    5.14  
    5.15 +        if ( rx->csum_valid )
    5.16 +            skb->ip_summed = CHECKSUM_UNNECESSARY;
    5.17 +
    5.18          np->stats.rx_packets++;
    5.19          np->stats.rx_bytes += rx->status;
    5.20  
    5.21 @@ -967,7 +971,8 @@ static int create_netdev(int handle, str
    5.22      dev->get_stats       = network_get_stats;
    5.23      dev->poll            = netif_poll;
    5.24      dev->weight          = 64;
    5.25 -    
    5.26 +    dev->features        = NETIF_F_IP_CSUM;
    5.27 +
    5.28      if ((err = register_netdev(dev)) != 0) {
    5.29          printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err);
    5.30          goto exit;
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/linux-2.6.11-xen-sparse/include/linux/skbuff.h	Sat May 21 19:08:56 2005 +0000
     6.3 @@ -0,0 +1,1184 @@
     6.4 +/*
     6.5 + *	Definitions for the 'struct sk_buff' memory handlers.
     6.6 + *
     6.7 + *	Authors:
     6.8 + *		Alan Cox, <gw4pts@gw4pts.ampr.org>
     6.9 + *		Florian La Roche, <rzsfl@rz.uni-sb.de>
    6.10 + *
    6.11 + *	This program is free software; you can redistribute it and/or
    6.12 + *	modify it under the terms of the GNU General Public License
    6.13 + *	as published by the Free Software Foundation; either version
    6.14 + *	2 of the License, or (at your option) any later version.
    6.15 + */
    6.16 +
    6.17 +#ifndef _LINUX_SKBUFF_H
    6.18 +#define _LINUX_SKBUFF_H
    6.19 +
    6.20 +#include <linux/config.h>
    6.21 +#include <linux/kernel.h>
    6.22 +#include <linux/compiler.h>
    6.23 +#include <linux/time.h>
    6.24 +#include <linux/cache.h>
    6.25 +
    6.26 +#include <asm/atomic.h>
    6.27 +#include <asm/types.h>
    6.28 +#include <linux/spinlock.h>
    6.29 +#include <linux/mm.h>
    6.30 +#include <linux/highmem.h>
    6.31 +#include <linux/poll.h>
    6.32 +#include <linux/net.h>
    6.33 +#include <net/checksum.h>
    6.34 +
    6.35 +#define HAVE_ALLOC_SKB		/* For the drivers to know */
    6.36 +#define HAVE_ALIGNABLE_SKB	/* Ditto 8)		   */
    6.37 +#define SLAB_SKB 		/* Slabified skbuffs 	   */
    6.38 +
    6.39 +#define CHECKSUM_NONE 0
    6.40 +#define CHECKSUM_HW 1
    6.41 +#define CHECKSUM_UNNECESSARY 2
    6.42 +
    6.43 +#define SKB_DATA_ALIGN(X)	(((X) + (SMP_CACHE_BYTES - 1)) & \
    6.44 +				 ~(SMP_CACHE_BYTES - 1))
    6.45 +#define SKB_MAX_ORDER(X, ORDER)	(((PAGE_SIZE << (ORDER)) - (X) - \
    6.46 +				  sizeof(struct skb_shared_info)) & \
    6.47 +				  ~(SMP_CACHE_BYTES - 1))
    6.48 +#define SKB_MAX_HEAD(X)		(SKB_MAX_ORDER((X), 0))
    6.49 +#define SKB_MAX_ALLOC		(SKB_MAX_ORDER(0, 2))
    6.50 +
    6.51 +/* A. Checksumming of received packets by device.
    6.52 + *
    6.53 + *	NONE: device failed to checksum this packet.
    6.54 + *		skb->csum is undefined.
    6.55 + *
    6.56 + *	UNNECESSARY: device parsed packet and wouldbe verified checksum.
    6.57 + *		skb->csum is undefined.
    6.58 + *	      It is bad option, but, unfortunately, many of vendors do this.
    6.59 + *	      Apparently with secret goal to sell you new device, when you
    6.60 + *	      will add new protocol to your host. F.e. IPv6. 8)
    6.61 + *
    6.62 + *	HW: the most generic way. Device supplied checksum of _all_
    6.63 + *	    the packet as seen by netif_rx in skb->csum.
    6.64 + *	    NOTE: Even if device supports only some protocols, but
    6.65 + *	    is able to produce some skb->csum, it MUST use HW,
    6.66 + *	    not UNNECESSARY.
    6.67 + *
    6.68 + * B. Checksumming on output.
    6.69 + *
    6.70 + *	NONE: skb is checksummed by protocol or csum is not required.
    6.71 + *
    6.72 + *	HW: device is required to csum packet as seen by hard_start_xmit
    6.73 + *	from skb->h.raw to the end and to record the checksum
    6.74 + *	at skb->h.raw+skb->csum.
    6.75 + *
    6.76 + *	Device must show its capabilities in dev->features, set
    6.77 + *	at device setup time.
    6.78 + *	NETIF_F_HW_CSUM	- it is clever device, it is able to checksum
    6.79 + *			  everything.
    6.80 + *	NETIF_F_NO_CSUM - loopback or reliable single hop media.
    6.81 + *	NETIF_F_IP_CSUM - device is dumb. It is able to csum only
    6.82 + *			  TCP/UDP over IPv4. Sigh. Vendors like this
    6.83 + *			  way by an unknown reason. Though, see comment above
    6.84 + *			  about CHECKSUM_UNNECESSARY. 8)
    6.85 + *
    6.86 + *	Any questions? No questions, good. 		--ANK
    6.87 + */
    6.88 +
    6.89 +#ifdef __i386__
    6.90 +#define NET_CALLER(arg) (*(((void **)&arg) - 1))
    6.91 +#else
    6.92 +#define NET_CALLER(arg) __builtin_return_address(0)
    6.93 +#endif
    6.94 +
    6.95 +struct net_device;
    6.96 +
    6.97 +#ifdef CONFIG_NETFILTER
    6.98 +struct nf_conntrack {
    6.99 +	atomic_t use;
   6.100 +	void (*destroy)(struct nf_conntrack *);
   6.101 +};
   6.102 +
   6.103 +#ifdef CONFIG_BRIDGE_NETFILTER
   6.104 +struct nf_bridge_info {
   6.105 +	atomic_t use;
   6.106 +	struct net_device *physindev;
   6.107 +	struct net_device *physoutdev;
   6.108 +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
   6.109 +	struct net_device *netoutdev;
   6.110 +#endif
   6.111 +	unsigned int mask;
   6.112 +	unsigned long data[32 / sizeof(unsigned long)];
   6.113 +};
   6.114 +#endif
   6.115 +
   6.116 +#endif
   6.117 +
   6.118 +struct sk_buff_head {
   6.119 +	/* These two members must be first. */
   6.120 +	struct sk_buff	*next;
   6.121 +	struct sk_buff	*prev;
   6.122 +
   6.123 +	__u32		qlen;
   6.124 +	spinlock_t	lock;
   6.125 +};
   6.126 +
   6.127 +struct sk_buff;
   6.128 +
   6.129 +/* To allow 64K frame to be packed as single skb without frag_list */
   6.130 +#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 2)
   6.131 +
   6.132 +typedef struct skb_frag_struct skb_frag_t;
   6.133 +
   6.134 +struct skb_frag_struct {
   6.135 +	struct page *page;
   6.136 +	__u16 page_offset;
   6.137 +	__u16 size;
   6.138 +};
   6.139 +
   6.140 +/* This data is invariant across clones and lives at
   6.141 + * the end of the header data, ie. at skb->end.
   6.142 + */
   6.143 +struct skb_shared_info {
   6.144 +	atomic_t	dataref;
   6.145 +	unsigned int	nr_frags;
   6.146 +	unsigned short	tso_size;
   6.147 +	unsigned short	tso_segs;
   6.148 +	struct sk_buff	*frag_list;
   6.149 +	skb_frag_t	frags[MAX_SKB_FRAGS];
   6.150 +};
   6.151 +
   6.152 +/** 
   6.153 + *	struct sk_buff - socket buffer
   6.154 + *	@next: Next buffer in list
   6.155 + *	@prev: Previous buffer in list
   6.156 + *	@list: List we are on
   6.157 + *	@sk: Socket we are owned by
   6.158 + *	@stamp: Time we arrived
   6.159 + *	@dev: Device we arrived on/are leaving by
   6.160 + *	@input_dev: Device we arrived on
   6.161 + *      @real_dev: The real device we are using
   6.162 + *	@h: Transport layer header
   6.163 + *	@nh: Network layer header
   6.164 + *	@mac: Link layer header
   6.165 + *	@dst: FIXME: Describe this field
   6.166 + *	@cb: Control buffer. Free for use by every layer. Put private vars here
   6.167 + *	@len: Length of actual data
   6.168 + *	@data_len: Data length
   6.169 + *	@mac_len: Length of link layer header
   6.170 + *	@csum: Checksum
   6.171 + *	@__unused: Dead field, may be reused
   6.172 + *	@cloned: Head may be cloned (check refcnt to be sure)
   6.173 + *	@proto_csum_valid: Protocol csum validated since arriving at localhost
   6.174 + *	@proto_csum_blank: Protocol csum must be added before leaving localhost
   6.175 + *	@pkt_type: Packet class
   6.176 + *	@ip_summed: Driver fed us an IP checksum
   6.177 + *	@priority: Packet queueing priority
   6.178 + *	@users: User count - see {datagram,tcp}.c
   6.179 + *	@protocol: Packet protocol from driver
   6.180 + *	@security: Security level of packet
   6.181 + *	@truesize: Buffer size 
   6.182 + *	@head: Head of buffer
   6.183 + *	@data: Data head pointer
   6.184 + *	@tail: Tail pointer
   6.185 + *	@end: End pointer
   6.186 + *	@destructor: Destruct function
   6.187 + *	@nfmark: Can be used for communication between hooks
   6.188 + *	@nfcache: Cache info
   6.189 + *	@nfct: Associated connection, if any
   6.190 + *	@nfctinfo: Relationship of this skb to the connection
   6.191 + *	@nf_debug: Netfilter debugging
   6.192 + *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
   6.193 + *      @private: Data which is private to the HIPPI implementation
   6.194 + *	@tc_index: Traffic control index
   6.195 + */
   6.196 +
   6.197 +struct sk_buff {
   6.198 +	/* These two members must be first. */
   6.199 +	struct sk_buff		*next;
   6.200 +	struct sk_buff		*prev;
   6.201 +
   6.202 +	struct sk_buff_head	*list;
   6.203 +	struct sock		*sk;
   6.204 +	struct timeval		stamp;
   6.205 +	struct net_device	*dev;
   6.206 +	struct net_device	*input_dev;
   6.207 +	struct net_device	*real_dev;
   6.208 +
   6.209 +	union {
   6.210 +		struct tcphdr	*th;
   6.211 +		struct udphdr	*uh;
   6.212 +		struct icmphdr	*icmph;
   6.213 +		struct igmphdr	*igmph;
   6.214 +		struct iphdr	*ipiph;
   6.215 +		struct ipv6hdr	*ipv6h;
   6.216 +		unsigned char	*raw;
   6.217 +	} h;
   6.218 +
   6.219 +	union {
   6.220 +		struct iphdr	*iph;
   6.221 +		struct ipv6hdr	*ipv6h;
   6.222 +		struct arphdr	*arph;
   6.223 +		unsigned char	*raw;
   6.224 +	} nh;
   6.225 +
   6.226 +	union {
   6.227 +	  	unsigned char 	*raw;
   6.228 +	} mac;
   6.229 +
   6.230 +	struct  dst_entry	*dst;
   6.231 +	struct	sec_path	*sp;
   6.232 +
   6.233 +	/*
   6.234 +	 * This is the control buffer. It is free to use for every
   6.235 +	 * layer. Please put your private variables there. If you
   6.236 +	 * want to keep them across layers you have to do a skb_clone()
   6.237 +	 * first. This is owned by whoever has the skb queued ATM.
   6.238 +	 */
   6.239 +	char			cb[40];
   6.240 +
   6.241 +	unsigned int		len,
   6.242 +				data_len,
   6.243 +				mac_len,
   6.244 +				csum;
   6.245 +	unsigned char		local_df,
   6.246 +				cloned:1,
   6.247 +				proto_csum_valid:1,
   6.248 +				proto_csum_blank:1,
   6.249 +				pkt_type,
   6.250 +				ip_summed;
   6.251 +	__u32			priority;
   6.252 +	unsigned short		protocol,
   6.253 +				security;
   6.254 +
   6.255 +	void			(*destructor)(struct sk_buff *skb);
   6.256 +#ifdef CONFIG_NETFILTER
   6.257 +        unsigned long		nfmark;
   6.258 +	__u32			nfcache;
   6.259 +	__u32			nfctinfo;
   6.260 +	struct nf_conntrack	*nfct;
   6.261 +#ifdef CONFIG_NETFILTER_DEBUG
   6.262 +        unsigned int		nf_debug;
   6.263 +#endif
   6.264 +#ifdef CONFIG_BRIDGE_NETFILTER
   6.265 +	struct nf_bridge_info	*nf_bridge;
   6.266 +#endif
   6.267 +#endif /* CONFIG_NETFILTER */
   6.268 +#if defined(CONFIG_HIPPI)
   6.269 +	union {
   6.270 +		__u32		ifield;
   6.271 +	} private;
   6.272 +#endif
   6.273 +#ifdef CONFIG_NET_SCHED
   6.274 +       __u32			tc_index;        /* traffic control index */
   6.275 +#ifdef CONFIG_NET_CLS_ACT
   6.276 +	__u32           tc_verd;               /* traffic control verdict */
   6.277 +	__u32           tc_classid;            /* traffic control classid */
   6.278 +#endif
   6.279 +
   6.280 +#endif
   6.281 +
   6.282 +
   6.283 +	/* These elements must be at the end, see alloc_skb() for details.  */
   6.284 +	unsigned int		truesize;
   6.285 +	atomic_t		users;
   6.286 +	unsigned char		*head,
   6.287 +				*data,
   6.288 +				*tail,
   6.289 +				*end;
   6.290 +};
   6.291 +
   6.292 +#ifdef __KERNEL__
   6.293 +/*
   6.294 + *	Handling routines are only of interest to the kernel
   6.295 + */
   6.296 +#include <linux/slab.h>
   6.297 +
   6.298 +#include <asm/system.h>
   6.299 +
   6.300 +extern void	       __kfree_skb(struct sk_buff *skb);
   6.301 +extern struct sk_buff *alloc_skb(unsigned int size, int priority);
   6.302 +extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
   6.303 +					    unsigned int size, int priority);
   6.304 +extern void	       kfree_skbmem(struct sk_buff *skb);
   6.305 +extern struct sk_buff *skb_clone(struct sk_buff *skb, int priority);
   6.306 +extern struct sk_buff *skb_copy(const struct sk_buff *skb, int priority);
   6.307 +extern struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask);
   6.308 +extern int	       pskb_expand_head(struct sk_buff *skb,
   6.309 +					int nhead, int ntail, int gfp_mask);
   6.310 +extern struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
   6.311 +					    unsigned int headroom);
   6.312 +extern struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
   6.313 +				       int newheadroom, int newtailroom,
   6.314 +				       int priority);
   6.315 +extern struct sk_buff *		skb_pad(struct sk_buff *skb, int pad);
   6.316 +#define dev_kfree_skb(a)	kfree_skb(a)
   6.317 +extern void	      skb_over_panic(struct sk_buff *skb, int len,
   6.318 +				     void *here);
   6.319 +extern void	      skb_under_panic(struct sk_buff *skb, int len,
   6.320 +				      void *here);
   6.321 +
   6.322 +/* Internal */
   6.323 +#define skb_shinfo(SKB)		((struct skb_shared_info *)((SKB)->end))
   6.324 +
   6.325 +/**
   6.326 + *	skb_queue_empty - check if a queue is empty
   6.327 + *	@list: queue head
   6.328 + *
   6.329 + *	Returns true if the queue is empty, false otherwise.
   6.330 + */
   6.331 +static inline int skb_queue_empty(const struct sk_buff_head *list)
   6.332 +{
   6.333 +	return list->next == (struct sk_buff *)list;
   6.334 +}
   6.335 +
   6.336 +/**
   6.337 + *	skb_get - reference buffer
   6.338 + *	@skb: buffer to reference
   6.339 + *
   6.340 + *	Makes another reference to a socket buffer and returns a pointer
   6.341 + *	to the buffer.
   6.342 + */
   6.343 +static inline struct sk_buff *skb_get(struct sk_buff *skb)
   6.344 +{
   6.345 +	atomic_inc(&skb->users);
   6.346 +	return skb;
   6.347 +}
   6.348 +
   6.349 +/*
   6.350 + * If users == 1, we are the only owner and are can avoid redundant
   6.351 + * atomic change.
   6.352 + */
   6.353 +
   6.354 +/**
   6.355 + *	kfree_skb - free an sk_buff
   6.356 + *	@skb: buffer to free
   6.357 + *
   6.358 + *	Drop a reference to the buffer and free it if the usage count has
   6.359 + *	hit zero.
   6.360 + */
   6.361 +static inline void kfree_skb(struct sk_buff *skb)
   6.362 +{
   6.363 +	if (likely(atomic_read(&skb->users) == 1))
   6.364 +		smp_rmb();
   6.365 +	else if (likely(!atomic_dec_and_test(&skb->users)))
   6.366 +		return;
   6.367 +	__kfree_skb(skb);
   6.368 +}
   6.369 +
   6.370 +/**
   6.371 + *	skb_cloned - is the buffer a clone
   6.372 + *	@skb: buffer to check
   6.373 + *
   6.374 + *	Returns true if the buffer was generated with skb_clone() and is
   6.375 + *	one of multiple shared copies of the buffer. Cloned buffers are
   6.376 + *	shared data so must not be written to under normal circumstances.
   6.377 + */
   6.378 +static inline int skb_cloned(const struct sk_buff *skb)
   6.379 +{
   6.380 +	return skb->cloned && atomic_read(&skb_shinfo(skb)->dataref) != 1;
   6.381 +}
   6.382 +
   6.383 +/**
   6.384 + *	skb_shared - is the buffer shared
   6.385 + *	@skb: buffer to check
   6.386 + *
   6.387 + *	Returns true if more than one person has a reference to this
   6.388 + *	buffer.
   6.389 + */
   6.390 +static inline int skb_shared(const struct sk_buff *skb)
   6.391 +{
   6.392 +	return atomic_read(&skb->users) != 1;
   6.393 +}
   6.394 +
   6.395 +/**
   6.396 + *	skb_share_check - check if buffer is shared and if so clone it
   6.397 + *	@skb: buffer to check
   6.398 + *	@pri: priority for memory allocation
   6.399 + *
   6.400 + *	If the buffer is shared the buffer is cloned and the old copy
   6.401 + *	drops a reference. A new clone with a single reference is returned.
   6.402 + *	If the buffer is not shared the original buffer is returned. When
   6.403 + *	being called from interrupt status or with spinlocks held pri must
   6.404 + *	be GFP_ATOMIC.
   6.405 + *
   6.406 + *	NULL is returned on a memory allocation failure.
   6.407 + */
   6.408 +static inline struct sk_buff *skb_share_check(struct sk_buff *skb, int pri)
   6.409 +{
   6.410 +	might_sleep_if(pri & __GFP_WAIT);
   6.411 +	if (skb_shared(skb)) {
   6.412 +		struct sk_buff *nskb = skb_clone(skb, pri);
   6.413 +		kfree_skb(skb);
   6.414 +		skb = nskb;
   6.415 +	}
   6.416 +	return skb;
   6.417 +}
   6.418 +
   6.419 +/*
   6.420 + *	Copy shared buffers into a new sk_buff. We effectively do COW on
   6.421 + *	packets to handle cases where we have a local reader and forward
   6.422 + *	and a couple of other messy ones. The normal one is tcpdumping
   6.423 + *	a packet thats being forwarded.
   6.424 + */
   6.425 +
   6.426 +/**
   6.427 + *	skb_unshare - make a copy of a shared buffer
   6.428 + *	@skb: buffer to check
   6.429 + *	@pri: priority for memory allocation
   6.430 + *
   6.431 + *	If the socket buffer is a clone then this function creates a new
   6.432 + *	copy of the data, drops a reference count on the old copy and returns
   6.433 + *	the new copy with the reference count at 1. If the buffer is not a clone
   6.434 + *	the original buffer is returned. When called with a spinlock held or
   6.435 + *	from interrupt state @pri must be %GFP_ATOMIC
   6.436 + *
   6.437 + *	%NULL is returned on a memory allocation failure.
   6.438 + */
   6.439 +static inline struct sk_buff *skb_unshare(struct sk_buff *skb, int pri)
   6.440 +{
   6.441 +	might_sleep_if(pri & __GFP_WAIT);
   6.442 +	if (skb_cloned(skb)) {
   6.443 +		struct sk_buff *nskb = skb_copy(skb, pri);
   6.444 +		kfree_skb(skb);	/* Free our shared copy */
   6.445 +		skb = nskb;
   6.446 +	}
   6.447 +	return skb;
   6.448 +}
   6.449 +
   6.450 +/**
   6.451 + *	skb_peek
   6.452 + *	@list_: list to peek at
   6.453 + *
   6.454 + *	Peek an &sk_buff. Unlike most other operations you _MUST_
   6.455 + *	be careful with this one. A peek leaves the buffer on the
   6.456 + *	list and someone else may run off with it. You must hold
   6.457 + *	the appropriate locks or have a private queue to do this.
   6.458 + *
   6.459 + *	Returns %NULL for an empty list or a pointer to the head element.
   6.460 + *	The reference count is not incremented and the reference is therefore
   6.461 + *	volatile. Use with caution.
   6.462 + */
   6.463 +static inline struct sk_buff *skb_peek(struct sk_buff_head *list_)
   6.464 +{
   6.465 +	struct sk_buff *list = ((struct sk_buff *)list_)->next;
   6.466 +	if (list == (struct sk_buff *)list_)
   6.467 +		list = NULL;
   6.468 +	return list;
   6.469 +}
   6.470 +
   6.471 +/**
   6.472 + *	skb_peek_tail
   6.473 + *	@list_: list to peek at
   6.474 + *
   6.475 + *	Peek an &sk_buff. Unlike most other operations you _MUST_
   6.476 + *	be careful with this one. A peek leaves the buffer on the
   6.477 + *	list and someone else may run off with it. You must hold
   6.478 + *	the appropriate locks or have a private queue to do this.
   6.479 + *
   6.480 + *	Returns %NULL for an empty list or a pointer to the tail element.
   6.481 + *	The reference count is not incremented and the reference is therefore
   6.482 + *	volatile. Use with caution.
   6.483 + */
   6.484 +static inline struct sk_buff *skb_peek_tail(struct sk_buff_head *list_)
   6.485 +{
   6.486 +	struct sk_buff *list = ((struct sk_buff *)list_)->prev;
   6.487 +	if (list == (struct sk_buff *)list_)
   6.488 +		list = NULL;
   6.489 +	return list;
   6.490 +}
   6.491 +
   6.492 +/**
   6.493 + *	skb_queue_len	- get queue length
   6.494 + *	@list_: list to measure
   6.495 + *
   6.496 + *	Return the length of an &sk_buff queue.
   6.497 + */
   6.498 +static inline __u32 skb_queue_len(const struct sk_buff_head *list_)
   6.499 +{
   6.500 +	return list_->qlen;
   6.501 +}
   6.502 +
   6.503 +static inline void skb_queue_head_init(struct sk_buff_head *list)
   6.504 +{
   6.505 +	spin_lock_init(&list->lock);
   6.506 +	list->prev = list->next = (struct sk_buff *)list;
   6.507 +	list->qlen = 0;
   6.508 +}
   6.509 +
   6.510 +/*
   6.511 + *	Insert an sk_buff at the start of a list.
   6.512 + *
   6.513 + *	The "__skb_xxxx()" functions are the non-atomic ones that
   6.514 + *	can only be called with interrupts disabled.
   6.515 + */
   6.516 +
   6.517 +/**
   6.518 + *	__skb_queue_head - queue a buffer at the list head
   6.519 + *	@list: list to use
   6.520 + *	@newsk: buffer to queue
   6.521 + *
   6.522 + *	Queue a buffer at the start of a list. This function takes no locks
   6.523 + *	and you must therefore hold required locks before calling it.
   6.524 + *
   6.525 + *	A buffer cannot be placed on two lists at the same time.
   6.526 + */
   6.527 +extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);
   6.528 +static inline void __skb_queue_head(struct sk_buff_head *list,
   6.529 +				    struct sk_buff *newsk)
   6.530 +{
   6.531 +	struct sk_buff *prev, *next;
   6.532 +
   6.533 +	newsk->list = list;
   6.534 +	list->qlen++;
   6.535 +	prev = (struct sk_buff *)list;
   6.536 +	next = prev->next;
   6.537 +	newsk->next = next;
   6.538 +	newsk->prev = prev;
   6.539 +	next->prev  = prev->next = newsk;
   6.540 +}
   6.541 +
   6.542 +/**
   6.543 + *	__skb_queue_tail - queue a buffer at the list tail
   6.544 + *	@list: list to use
   6.545 + *	@newsk: buffer to queue
   6.546 + *
   6.547 + *	Queue a buffer at the end of a list. This function takes no locks
   6.548 + *	and you must therefore hold required locks before calling it.
   6.549 + *
   6.550 + *	A buffer cannot be placed on two lists at the same time.
   6.551 + */
   6.552 +extern void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk);
   6.553 +static inline void __skb_queue_tail(struct sk_buff_head *list,
   6.554 +				   struct sk_buff *newsk)
   6.555 +{
   6.556 +	struct sk_buff *prev, *next;
   6.557 +
   6.558 +	newsk->list = list;
   6.559 +	list->qlen++;
   6.560 +	next = (struct sk_buff *)list;
   6.561 +	prev = next->prev;
   6.562 +	newsk->next = next;
   6.563 +	newsk->prev = prev;
   6.564 +	next->prev  = prev->next = newsk;
   6.565 +}
   6.566 +
   6.567 +
   6.568 +/**
   6.569 + *	__skb_dequeue - remove from the head of the queue
   6.570 + *	@list: list to dequeue from
   6.571 + *
   6.572 + *	Remove the head of the list. This function does not take any locks
   6.573 + *	so must be used with appropriate locks held only. The head item is
   6.574 + *	returned or %NULL if the list is empty.
   6.575 + */
   6.576 +extern struct sk_buff *skb_dequeue(struct sk_buff_head *list);
   6.577 +static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
   6.578 +{
   6.579 +	struct sk_buff *next, *prev, *result;
   6.580 +
   6.581 +	prev = (struct sk_buff *) list;
   6.582 +	next = prev->next;
   6.583 +	result = NULL;
   6.584 +	if (next != prev) {
   6.585 +		result	     = next;
   6.586 +		next	     = next->next;
   6.587 +		list->qlen--;
   6.588 +		next->prev   = prev;
   6.589 +		prev->next   = next;
   6.590 +		result->next = result->prev = NULL;
   6.591 +		result->list = NULL;
   6.592 +	}
   6.593 +	return result;
   6.594 +}
   6.595 +
   6.596 +
   6.597 +/*
   6.598 + *	Insert a packet on a list.
   6.599 + */
   6.600 +extern void        skb_insert(struct sk_buff *old, struct sk_buff *newsk);
   6.601 +static inline void __skb_insert(struct sk_buff *newsk,
   6.602 +				struct sk_buff *prev, struct sk_buff *next,
   6.603 +				struct sk_buff_head *list)
   6.604 +{
   6.605 +	newsk->next = next;
   6.606 +	newsk->prev = prev;
   6.607 +	next->prev  = prev->next = newsk;
   6.608 +	newsk->list = list;
   6.609 +	list->qlen++;
   6.610 +}
   6.611 +
   6.612 +/*
   6.613 + *	Place a packet after a given packet in a list.
   6.614 + */
   6.615 +extern void	   skb_append(struct sk_buff *old, struct sk_buff *newsk);
   6.616 +static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk)
   6.617 +{
   6.618 +	__skb_insert(newsk, old, old->next, old->list);
   6.619 +}
   6.620 +
   6.621 +/*
   6.622 + * remove sk_buff from list. _Must_ be called atomically, and with
   6.623 + * the list known..
   6.624 + */
   6.625 +extern void	   skb_unlink(struct sk_buff *skb);
   6.626 +static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
   6.627 +{
   6.628 +	struct sk_buff *next, *prev;
   6.629 +
   6.630 +	list->qlen--;
   6.631 +	next	   = skb->next;
   6.632 +	prev	   = skb->prev;
   6.633 +	skb->next  = skb->prev = NULL;
   6.634 +	skb->list  = NULL;
   6.635 +	next->prev = prev;
   6.636 +	prev->next = next;
   6.637 +}
   6.638 +
   6.639 +
   6.640 +/* XXX: more streamlined implementation */
   6.641 +
   6.642 +/**
   6.643 + *	__skb_dequeue_tail - remove from the tail of the queue
   6.644 + *	@list: list to dequeue from
   6.645 + *
   6.646 + *	Remove the tail of the list. This function does not take any locks
   6.647 + *	so must be used with appropriate locks held only. The tail item is
   6.648 + *	returned or %NULL if the list is empty.
   6.649 + */
   6.650 +extern struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list);
   6.651 +static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list)
   6.652 +{
   6.653 +	struct sk_buff *skb = skb_peek_tail(list);
   6.654 +	if (skb)
   6.655 +		__skb_unlink(skb, list);
   6.656 +	return skb;
   6.657 +}
   6.658 +
   6.659 +
   6.660 +static inline int skb_is_nonlinear(const struct sk_buff *skb)
   6.661 +{
   6.662 +	return skb->data_len;
   6.663 +}
   6.664 +
   6.665 +static inline unsigned int skb_headlen(const struct sk_buff *skb)
   6.666 +{
   6.667 +	return skb->len - skb->data_len;
   6.668 +}
   6.669 +
   6.670 +static inline int skb_pagelen(const struct sk_buff *skb)
   6.671 +{
   6.672 +	int i, len = 0;
   6.673 +
   6.674 +	for (i = (int)skb_shinfo(skb)->nr_frags - 1; i >= 0; i--)
   6.675 +		len += skb_shinfo(skb)->frags[i].size;
   6.676 +	return len + skb_headlen(skb);
   6.677 +}
   6.678 +
   6.679 +static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
   6.680 +				      struct page *page, int off, int size)
   6.681 +{
   6.682 +	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
   6.683 +
   6.684 +	frag->page		  = page;
   6.685 +	frag->page_offset	  = off;
   6.686 +	frag->size		  = size;
   6.687 +	skb_shinfo(skb)->nr_frags = i + 1;
   6.688 +}
   6.689 +
   6.690 +#define SKB_PAGE_ASSERT(skb) 	BUG_ON(skb_shinfo(skb)->nr_frags)
   6.691 +#define SKB_FRAG_ASSERT(skb) 	BUG_ON(skb_shinfo(skb)->frag_list)
   6.692 +#define SKB_LINEAR_ASSERT(skb)  BUG_ON(skb_is_nonlinear(skb))
   6.693 +
   6.694 +/*
   6.695 + *	Add data to an sk_buff
   6.696 + */
   6.697 +static inline unsigned char *__skb_put(struct sk_buff *skb, unsigned int len)
   6.698 +{
   6.699 +	unsigned char *tmp = skb->tail;
   6.700 +	SKB_LINEAR_ASSERT(skb);
   6.701 +	skb->tail += len;
   6.702 +	skb->len  += len;
   6.703 +	return tmp;
   6.704 +}
   6.705 +
   6.706 +/**
   6.707 + *	skb_put - add data to a buffer
   6.708 + *	@skb: buffer to use
   6.709 + *	@len: amount of data to add
   6.710 + *
   6.711 + *	This function extends the used data area of the buffer. If this would
   6.712 + *	exceed the total buffer size the kernel will panic. A pointer to the
   6.713 + *	first byte of the extra data is returned.
   6.714 + */
   6.715 +static inline unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
   6.716 +{
   6.717 +	unsigned char *tmp = skb->tail;
   6.718 +	SKB_LINEAR_ASSERT(skb);
   6.719 +	skb->tail += len;
   6.720 +	skb->len  += len;
   6.721 +	if (unlikely(skb->tail>skb->end))
   6.722 +		skb_over_panic(skb, len, current_text_addr());
   6.723 +	return tmp;
   6.724 +}
   6.725 +
   6.726 +static inline unsigned char *__skb_push(struct sk_buff *skb, unsigned int len)
   6.727 +{
   6.728 +	skb->data -= len;
   6.729 +	skb->len  += len;
   6.730 +	return skb->data;
   6.731 +}
   6.732 +
   6.733 +/**
   6.734 + *	skb_push - add data to the start of a buffer
   6.735 + *	@skb: buffer to use
   6.736 + *	@len: amount of data to add
   6.737 + *
   6.738 + *	This function extends the used data area of the buffer at the buffer
   6.739 + *	start. If this would exceed the total buffer headroom the kernel will
   6.740 + *	panic. A pointer to the first byte of the extra data is returned.
   6.741 + */
   6.742 +static inline unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
   6.743 +{
   6.744 +	skb->data -= len;
   6.745 +	skb->len  += len;
   6.746 +	if (unlikely(skb->data<skb->head))
   6.747 +		skb_under_panic(skb, len, current_text_addr());
   6.748 +	return skb->data;
   6.749 +}
   6.750 +
   6.751 +static inline unsigned char *__skb_pull(struct sk_buff *skb, unsigned int len)
   6.752 +{
   6.753 +	skb->len -= len;
   6.754 +	BUG_ON(skb->len < skb->data_len);
   6.755 +	return skb->data += len;
   6.756 +}
   6.757 +
   6.758 +/**
   6.759 + *	skb_pull - remove data from the start of a buffer
   6.760 + *	@skb: buffer to use
   6.761 + *	@len: amount of data to remove
   6.762 + *
   6.763 + *	This function removes data from the start of a buffer, returning
   6.764 + *	the memory to the headroom. A pointer to the next data in the buffer
   6.765 + *	is returned. Once the data has been pulled future pushes will overwrite
   6.766 + *	the old data.
   6.767 + */
   6.768 +static inline unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
   6.769 +{
   6.770 +	return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len);
   6.771 +}
   6.772 +
   6.773 +extern unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta);
   6.774 +
   6.775 +static inline unsigned char *__pskb_pull(struct sk_buff *skb, unsigned int len)
   6.776 +{
   6.777 +	if (len > skb_headlen(skb) &&
   6.778 +	    !__pskb_pull_tail(skb, len-skb_headlen(skb)))
   6.779 +		return NULL;
   6.780 +	skb->len -= len;
   6.781 +	return skb->data += len;
   6.782 +}
   6.783 +
   6.784 +static inline unsigned char *pskb_pull(struct sk_buff *skb, unsigned int len)
   6.785 +{
   6.786 +	return unlikely(len > skb->len) ? NULL : __pskb_pull(skb, len);
   6.787 +}
   6.788 +
   6.789 +static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
   6.790 +{
   6.791 +	if (likely(len <= skb_headlen(skb)))
   6.792 +		return 1;
   6.793 +	if (unlikely(len > skb->len))
   6.794 +		return 0;
   6.795 +	return __pskb_pull_tail(skb, len-skb_headlen(skb)) != NULL;
   6.796 +}
   6.797 +
   6.798 +/**
   6.799 + *	skb_headroom - bytes at buffer head
   6.800 + *	@skb: buffer to check
   6.801 + *
   6.802 + *	Return the number of bytes of free space at the head of an &sk_buff.
   6.803 + */
   6.804 +static inline int skb_headroom(const struct sk_buff *skb)
   6.805 +{
   6.806 +	return skb->data - skb->head;
   6.807 +}
   6.808 +
   6.809 +/**
   6.810 + *	skb_tailroom - bytes at buffer end
   6.811 + *	@skb: buffer to check
   6.812 + *
   6.813 + *	Return the number of bytes of free space at the tail of an sk_buff
   6.814 + */
   6.815 +static inline int skb_tailroom(const struct sk_buff *skb)
   6.816 +{
   6.817 +	return skb_is_nonlinear(skb) ? 0 : skb->end - skb->tail;
   6.818 +}
   6.819 +
   6.820 +/**
   6.821 + *	skb_reserve - adjust headroom
   6.822 + *	@skb: buffer to alter
   6.823 + *	@len: bytes to move
   6.824 + *
   6.825 + *	Increase the headroom of an empty &sk_buff by reducing the tail
   6.826 + *	room. This is only allowed for an empty buffer.
   6.827 + */
   6.828 +static inline void skb_reserve(struct sk_buff *skb, unsigned int len)
   6.829 +{
   6.830 +	skb->data += len;
   6.831 +	skb->tail += len;
   6.832 +}
   6.833 +
   6.834 +/*
   6.835 + * CPUs often take a performance hit when accessing unaligned memory
   6.836 + * locations. The actual performance hit varies, it can be small if the
   6.837 + * hardware handles it or large if we have to take an exception and fix it
   6.838 + * in software.
   6.839 + *
   6.840 + * Since an ethernet header is 14 bytes network drivers often end up with
   6.841 + * the IP header at an unaligned offset. The IP header can be aligned by
   6.842 + * shifting the start of the packet by 2 bytes. Drivers should do this
   6.843 + * with:
   6.844 + *
   6.845 + * skb_reserve(NET_IP_ALIGN);
   6.846 + *
   6.847 + * The downside to this alignment of the IP header is that the DMA is now
   6.848 + * unaligned. On some architectures the cost of an unaligned DMA is high
   6.849 + * and this cost outweighs the gains made by aligning the IP header.
   6.850 + * 
   6.851 + * Since this trade off varies between architectures, we allow NET_IP_ALIGN
   6.852 + * to be overridden.
   6.853 + */
   6.854 +#ifndef NET_IP_ALIGN
   6.855 +#define NET_IP_ALIGN	2
   6.856 +#endif
   6.857 +
   6.858 +extern int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc);
   6.859 +
   6.860 +static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
   6.861 +{
   6.862 +	if (!skb->data_len) {
   6.863 +		skb->len  = len;
   6.864 +		skb->tail = skb->data + len;
   6.865 +	} else
   6.866 +		___pskb_trim(skb, len, 0);
   6.867 +}
   6.868 +
   6.869 +/**
   6.870 + *	skb_trim - remove end from a buffer
   6.871 + *	@skb: buffer to alter
   6.872 + *	@len: new length
   6.873 + *
   6.874 + *	Cut the length of a buffer down by removing data from the tail. If
   6.875 + *	the buffer is already under the length specified it is not modified.
   6.876 + */
   6.877 +static inline void skb_trim(struct sk_buff *skb, unsigned int len)
   6.878 +{
   6.879 +	if (skb->len > len)
   6.880 +		__skb_trim(skb, len);
   6.881 +}
   6.882 +
   6.883 +
   6.884 +static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
   6.885 +{
   6.886 +	if (!skb->data_len) {
   6.887 +		skb->len  = len;
   6.888 +		skb->tail = skb->data+len;
   6.889 +		return 0;
   6.890 +	}
   6.891 +	return ___pskb_trim(skb, len, 1);
   6.892 +}
   6.893 +
   6.894 +static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
   6.895 +{
   6.896 +	return (len < skb->len) ? __pskb_trim(skb, len) : 0;
   6.897 +}
   6.898 +
   6.899 +/**
   6.900 + *	skb_orphan - orphan a buffer
   6.901 + *	@skb: buffer to orphan
   6.902 + *
   6.903 + *	If a buffer currently has an owner then we call the owner's
   6.904 + *	destructor function and make the @skb unowned. The buffer continues
   6.905 + *	to exist but is no longer charged to its former owner.
   6.906 + */
   6.907 +static inline void skb_orphan(struct sk_buff *skb)
   6.908 +{
   6.909 +	if (skb->destructor)
   6.910 +		skb->destructor(skb);
   6.911 +	skb->destructor = NULL;
   6.912 +	skb->sk		= NULL;
   6.913 +}
   6.914 +
   6.915 +/**
   6.916 + *	__skb_queue_purge - empty a list
   6.917 + *	@list: list to empty
   6.918 + *
   6.919 + *	Delete all buffers on an &sk_buff list. Each buffer is removed from
   6.920 + *	the list and one reference dropped. This function does not take the
   6.921 + *	list lock and the caller must hold the relevant locks to use it.
   6.922 + */
   6.923 +extern void skb_queue_purge(struct sk_buff_head *list);
   6.924 +static inline void __skb_queue_purge(struct sk_buff_head *list)
   6.925 +{
   6.926 +	struct sk_buff *skb;
   6.927 +	while ((skb = __skb_dequeue(list)) != NULL)
   6.928 +		kfree_skb(skb);
   6.929 +}
   6.930 +
   6.931 +/**
   6.932 + *	__dev_alloc_skb - allocate an skbuff for sending
   6.933 + *	@length: length to allocate
   6.934 + *	@gfp_mask: get_free_pages mask, passed to alloc_skb
   6.935 + *
   6.936 + *	Allocate a new &sk_buff and assign it a usage count of one. The
   6.937 + *	buffer has unspecified headroom built in. Users should allocate
   6.938 + *	the headroom they think they need without accounting for the
   6.939 + *	built in space. The built in space is used for optimisations.
   6.940 + *
   6.941 + *	%NULL is returned in there is no free memory.
   6.942 + */
   6.943 +#ifndef CONFIG_HAVE_ARCH_DEV_ALLOC_SKB
   6.944 +static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
   6.945 +					      int gfp_mask)
   6.946 +{
   6.947 +	struct sk_buff *skb = alloc_skb(length + 16, gfp_mask);
   6.948 +	if (likely(skb))
   6.949 +		skb_reserve(skb, 16);
   6.950 +	return skb;
   6.951 +}
   6.952 +#else
   6.953 +extern struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask);
   6.954 +#endif
   6.955 +
   6.956 +/**
   6.957 + *	dev_alloc_skb - allocate an skbuff for sending
   6.958 + *	@length: length to allocate
   6.959 + *
   6.960 + *	Allocate a new &sk_buff and assign it a usage count of one. The
   6.961 + *	buffer has unspecified headroom built in. Users should allocate
   6.962 + *	the headroom they think they need without accounting for the
   6.963 + *	built in space. The built in space is used for optimisations.
   6.964 + *
   6.965 + *	%NULL is returned in there is no free memory. Although this function
   6.966 + *	allocates memory it can be called from an interrupt.
   6.967 + */
   6.968 +static inline struct sk_buff *dev_alloc_skb(unsigned int length)
   6.969 +{
   6.970 +	return __dev_alloc_skb(length, GFP_ATOMIC);
   6.971 +}
   6.972 +
   6.973 +/**
   6.974 + *	skb_cow - copy header of skb when it is required
   6.975 + *	@skb: buffer to cow
   6.976 + *	@headroom: needed headroom
   6.977 + *
   6.978 + *	If the skb passed lacks sufficient headroom or its data part
   6.979 + *	is shared, data is reallocated. If reallocation fails, an error
   6.980 + *	is returned and original skb is not changed.
   6.981 + *
   6.982 + *	The result is skb with writable area skb->head...skb->tail
   6.983 + *	and at least @headroom of space at head.
   6.984 + */
   6.985 +static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
   6.986 +{
   6.987 +	int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb);
   6.988 +
   6.989 +	if (delta < 0)
   6.990 +		delta = 0;
   6.991 +
   6.992 +	if (delta || skb_cloned(skb))
   6.993 +		return pskb_expand_head(skb, (delta + 15) & ~15, 0, GFP_ATOMIC);
   6.994 +	return 0;
   6.995 +}
   6.996 +
   6.997 +/**
   6.998 + *	skb_padto	- pad an skbuff up to a minimal size
   6.999 + *	@skb: buffer to pad
  6.1000 + *	@len: minimal length
  6.1001 + *
  6.1002 + *	Pads up a buffer to ensure the trailing bytes exist and are
  6.1003 + *	blanked. If the buffer already contains sufficient data it
  6.1004 + *	is untouched. Returns the buffer, which may be a replacement
  6.1005 + *	for the original, or NULL for out of memory - in which case
  6.1006 + *	the original buffer is still freed.
  6.1007 + */
  6.1008 + 
  6.1009 +static inline struct sk_buff *skb_padto(struct sk_buff *skb, unsigned int len)
  6.1010 +{
  6.1011 +	unsigned int size = skb->len;
  6.1012 +	if (likely(size >= len))
  6.1013 +		return skb;
  6.1014 +	return skb_pad(skb, len-size);
  6.1015 +}
  6.1016 +
  6.1017 +static inline int skb_add_data(struct sk_buff *skb,
  6.1018 +			       char __user *from, int copy)
  6.1019 +{
  6.1020 +	const int off = skb->len;
  6.1021 +
  6.1022 +	if (skb->ip_summed == CHECKSUM_NONE) {
  6.1023 +		int err = 0;
  6.1024 +		unsigned int csum = csum_and_copy_from_user(from,
  6.1025 +							    skb_put(skb, copy),
  6.1026 +							    copy, 0, &err);
  6.1027 +		if (!err) {
  6.1028 +			skb->csum = csum_block_add(skb->csum, csum, off);
  6.1029 +			return 0;
  6.1030 +		}
  6.1031 +	} else if (!copy_from_user(skb_put(skb, copy), from, copy))
  6.1032 +		return 0;
  6.1033 +
  6.1034 +	__skb_trim(skb, off);
  6.1035 +	return -EFAULT;
  6.1036 +}
  6.1037 +
  6.1038 +static inline int skb_can_coalesce(struct sk_buff *skb, int i,
  6.1039 +				   struct page *page, int off)
  6.1040 +{
  6.1041 +	if (i) {
  6.1042 +		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1];
  6.1043 +
  6.1044 +		return page == frag->page &&
  6.1045 +		       off == frag->page_offset + frag->size;
  6.1046 +	}
  6.1047 +	return 0;
  6.1048 +}
  6.1049 +
  6.1050 +/**
  6.1051 + *	skb_linearize - convert paged skb to linear one
  6.1052 + *	@skb: buffer to linarize
  6.1053 + *	@gfp: allocation mode
  6.1054 + *
  6.1055 + *	If there is no free memory -ENOMEM is returned, otherwise zero
  6.1056 + *	is returned and the old skb data released.
  6.1057 + */
  6.1058 +extern int __skb_linearize(struct sk_buff *skb, int gfp);
  6.1059 +static inline int skb_linearize(struct sk_buff *skb, int gfp)
  6.1060 +{
  6.1061 +	return __skb_linearize(skb, gfp);
  6.1062 +}
  6.1063 +
  6.1064 +static inline void *kmap_skb_frag(const skb_frag_t *frag)
  6.1065 +{
  6.1066 +#ifdef CONFIG_HIGHMEM
  6.1067 +	BUG_ON(in_irq());
  6.1068 +
  6.1069 +	local_bh_disable();
  6.1070 +#endif
  6.1071 +	return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ);
  6.1072 +}
  6.1073 +
  6.1074 +static inline void kunmap_skb_frag(void *vaddr)
  6.1075 +{
  6.1076 +	kunmap_atomic(vaddr, KM_SKB_DATA_SOFTIRQ);
  6.1077 +#ifdef CONFIG_HIGHMEM
  6.1078 +	local_bh_enable();
  6.1079 +#endif
  6.1080 +}
  6.1081 +
  6.1082 +#define skb_queue_walk(queue, skb) \
  6.1083 +		for (skb = (queue)->next;					\
  6.1084 +		     prefetch(skb->next), (skb != (struct sk_buff *)(queue));	\
  6.1085 +		     skb = skb->next)
  6.1086 +
  6.1087 +
  6.1088 +extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
  6.1089 +					 int noblock, int *err);
  6.1090 +extern unsigned int    datagram_poll(struct file *file, struct socket *sock,
  6.1091 +				     struct poll_table_struct *wait);
  6.1092 +extern int	       skb_copy_datagram_iovec(const struct sk_buff *from,
  6.1093 +					       int offset, struct iovec *to,
  6.1094 +					       int size);
  6.1095 +extern int	       skb_copy_and_csum_datagram_iovec(const
  6.1096 +							struct sk_buff *skb,
  6.1097 +							int hlen,
  6.1098 +							struct iovec *iov);
  6.1099 +extern void	       skb_free_datagram(struct sock *sk, struct sk_buff *skb);
  6.1100 +extern unsigned int    skb_checksum(const struct sk_buff *skb, int offset,
  6.1101 +				    int len, unsigned int csum);
  6.1102 +extern int	       skb_copy_bits(const struct sk_buff *skb, int offset,
  6.1103 +				     void *to, int len);
  6.1104 +extern unsigned int    skb_copy_and_csum_bits(const struct sk_buff *skb,
  6.1105 +					      int offset, u8 *to, int len,
  6.1106 +					      unsigned int csum);
  6.1107 +extern void	       skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
  6.1108 +extern void	       skb_split(struct sk_buff *skb,
  6.1109 +				 struct sk_buff *skb1, const u32 len);
  6.1110 +
  6.1111 +static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
  6.1112 +				       int len, void *buffer)
  6.1113 +{
  6.1114 +	int hlen = skb_headlen(skb);
  6.1115 +
  6.1116 +	if (offset + len <= hlen)
  6.1117 +		return skb->data + offset;
  6.1118 +
  6.1119 +	if (skb_copy_bits(skb, offset, buffer, len) < 0)
  6.1120 +		return NULL;
  6.1121 +
  6.1122 +	return buffer;
  6.1123 +}
  6.1124 +
  6.1125 +extern void skb_init(void);
  6.1126 +extern void skb_add_mtu(int mtu);
  6.1127 +
  6.1128 +struct skb_iter {
  6.1129 +	/* Iteration functions set these */
  6.1130 +	unsigned char *data;
  6.1131 +	unsigned int len;
  6.1132 +
  6.1133 +	/* Private to iteration */
  6.1134 +	unsigned int nextfrag;
  6.1135 +	struct sk_buff *fraglist;
  6.1136 +};
  6.1137 +
  6.1138 +/* Keep iterating until skb_iter_next returns false. */
  6.1139 +extern void skb_iter_first(const struct sk_buff *skb, struct skb_iter *i);
  6.1140 +extern int skb_iter_next(const struct sk_buff *skb, struct skb_iter *i);
  6.1141 +/* Call this if aborting loop before !skb_iter_next */
  6.1142 +extern void skb_iter_abort(const struct sk_buff *skb, struct skb_iter *i);
  6.1143 +
  6.1144 +#ifdef CONFIG_NETFILTER
  6.1145 +static inline void nf_conntrack_put(struct nf_conntrack *nfct)
  6.1146 +{
  6.1147 +	if (nfct && atomic_dec_and_test(&nfct->use))
  6.1148 +		nfct->destroy(nfct);
  6.1149 +}
  6.1150 +static inline void nf_conntrack_get(struct nf_conntrack *nfct)
  6.1151 +{
  6.1152 +	if (nfct)
  6.1153 +		atomic_inc(&nfct->use);
  6.1154 +}
  6.1155 +static inline void nf_reset(struct sk_buff *skb)
  6.1156 +{
  6.1157 +	nf_conntrack_put(skb->nfct);
  6.1158 +	skb->nfct = NULL;
  6.1159 +#ifdef CONFIG_NETFILTER_DEBUG
  6.1160 +	skb->nf_debug = 0;
  6.1161 +#endif
  6.1162 +}
  6.1163 +static inline void nf_reset_debug(struct sk_buff *skb)
  6.1164 +{
  6.1165 +#ifdef CONFIG_NETFILTER_DEBUG
  6.1166 +	skb->nf_debug = 0;
  6.1167 +#endif
  6.1168 +}
  6.1169 +
  6.1170 +#ifdef CONFIG_BRIDGE_NETFILTER
  6.1171 +static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge)
  6.1172 +{
  6.1173 +	if (nf_bridge && atomic_dec_and_test(&nf_bridge->use))
  6.1174 +		kfree(nf_bridge);
  6.1175 +}
  6.1176 +static inline void nf_bridge_get(struct nf_bridge_info *nf_bridge)
  6.1177 +{
  6.1178 +	if (nf_bridge)
  6.1179 +		atomic_inc(&nf_bridge->use);
  6.1180 +}
  6.1181 +#endif /* CONFIG_BRIDGE_NETFILTER */
  6.1182 +#else /* CONFIG_NETFILTER */
  6.1183 +static inline void nf_reset(struct sk_buff *skb) {}
  6.1184 +#endif /* CONFIG_NETFILTER */
  6.1185 +
  6.1186 +#endif	/* __KERNEL__ */
  6.1187 +#endif	/* _LINUX_SKBUFF_H */
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/linux-2.6.11-xen-sparse/net/core/dev.c	Sat May 21 19:08:56 2005 +0000
     7.3 @@ -0,0 +1,3389 @@
     7.4 +/*
     7.5 + * 	NET3	Protocol independent device support routines.
     7.6 + *
     7.7 + *		This program is free software; you can redistribute it and/or
     7.8 + *		modify it under the terms of the GNU General Public License
     7.9 + *		as published by the Free Software Foundation; either version
    7.10 + *		2 of the License, or (at your option) any later version.
    7.11 + *
    7.12 + *	Derived from the non IP parts of dev.c 1.0.19
    7.13 + * 		Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
    7.14 + *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    7.15 + *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    7.16 + *
    7.17 + *	Additional Authors:
    7.18 + *		Florian la Roche <rzsfl@rz.uni-sb.de>
    7.19 + *		Alan Cox <gw4pts@gw4pts.ampr.org>
    7.20 + *		David Hinds <dahinds@users.sourceforge.net>
    7.21 + *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
    7.22 + *		Adam Sulmicki <adam@cfar.umd.edu>
    7.23 + *              Pekka Riikonen <priikone@poesidon.pspt.fi>
    7.24 + *
    7.25 + *	Changes:
    7.26 + *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
    7.27 + *              			to 2 if register_netdev gets called
    7.28 + *              			before net_dev_init & also removed a
    7.29 + *              			few lines of code in the process.
    7.30 + *		Alan Cox	:	device private ioctl copies fields back.
    7.31 + *		Alan Cox	:	Transmit queue code does relevant
    7.32 + *					stunts to keep the queue safe.
    7.33 + *		Alan Cox	:	Fixed double lock.
    7.34 + *		Alan Cox	:	Fixed promisc NULL pointer trap
    7.35 + *		????????	:	Support the full private ioctl range
    7.36 + *		Alan Cox	:	Moved ioctl permission check into
    7.37 + *					drivers
    7.38 + *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
    7.39 + *		Alan Cox	:	100 backlog just doesn't cut it when
    7.40 + *					you start doing multicast video 8)
    7.41 + *		Alan Cox	:	Rewrote net_bh and list manager.
    7.42 + *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
    7.43 + *		Alan Cox	:	Took out transmit every packet pass
    7.44 + *					Saved a few bytes in the ioctl handler
    7.45 + *		Alan Cox	:	Network driver sets packet type before
    7.46 + *					calling netif_rx. Saves a function
    7.47 + *					call a packet.
    7.48 + *		Alan Cox	:	Hashed net_bh()
    7.49 + *		Richard Kooijman:	Timestamp fixes.
    7.50 + *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
    7.51 + *		Alan Cox	:	Device lock protection.
    7.52 + *		Alan Cox	: 	Fixed nasty side effect of device close
    7.53 + *					changes.
    7.54 + *		Rudi Cilibrasi	:	Pass the right thing to
    7.55 + *					set_mac_address()
    7.56 + *		Dave Miller	:	32bit quantity for the device lock to
    7.57 + *					make it work out on a Sparc.
    7.58 + *		Bjorn Ekwall	:	Added KERNELD hack.
    7.59 + *		Alan Cox	:	Cleaned up the backlog initialise.
    7.60 + *		Craig Metz	:	SIOCGIFCONF fix if space for under
    7.61 + *					1 device.
    7.62 + *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
    7.63 + *					is no device open function.
    7.64 + *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
    7.65 + *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
    7.66 + *		Cyrus Durgin	:	Cleaned for KMOD
    7.67 + *		Adam Sulmicki   :	Bug Fix : Network Device Unload
    7.68 + *					A network device unload needs to purge
    7.69 + *					the backlog queue.
    7.70 + *	Paul Rusty Russell	:	SIOCSIFNAME
    7.71 + *              Pekka Riikonen  :	Netdev boot-time settings code
    7.72 + *              Andrew Morton   :       Make unregister_netdevice wait
    7.73 + *              			indefinitely on dev->refcnt
    7.74 + * 		J Hadi Salim	:	- Backlog queue sampling
    7.75 + *				        - netif_rx() feedback
    7.76 + */
    7.77 +
    7.78 +#include <asm/uaccess.h>
    7.79 +#include <asm/system.h>
    7.80 +#include <linux/bitops.h>
    7.81 +#include <linux/config.h>
    7.82 +#include <linux/cpu.h>
    7.83 +#include <linux/types.h>
    7.84 +#include <linux/kernel.h>
    7.85 +#include <linux/sched.h>
    7.86 +#include <linux/string.h>
    7.87 +#include <linux/mm.h>
    7.88 +#include <linux/socket.h>
    7.89 +#include <linux/sockios.h>
    7.90 +#include <linux/errno.h>
    7.91 +#include <linux/interrupt.h>
    7.92 +#include <linux/if_ether.h>
    7.93 +#include <linux/netdevice.h>
    7.94 +#include <linux/etherdevice.h>
    7.95 +#include <linux/notifier.h>
    7.96 +#include <linux/skbuff.h>
    7.97 +#include <net/sock.h>
    7.98 +#include <linux/rtnetlink.h>
    7.99 +#include <linux/proc_fs.h>
   7.100 +#include <linux/seq_file.h>
   7.101 +#include <linux/stat.h>
   7.102 +#include <linux/if_bridge.h>
   7.103 +#include <linux/divert.h>
   7.104 +#include <net/dst.h>
   7.105 +#include <net/pkt_sched.h>
   7.106 +#include <net/checksum.h>
   7.107 +#include <linux/highmem.h>
   7.108 +#include <linux/init.h>
   7.109 +#include <linux/kmod.h>
   7.110 +#include <linux/module.h>
   7.111 +#include <linux/kallsyms.h>
   7.112 +#include <linux/netpoll.h>
   7.113 +#include <linux/rcupdate.h>
   7.114 +#include <linux/delay.h>
   7.115 +#ifdef CONFIG_NET_RADIO
   7.116 +#include <linux/wireless.h>		/* Note : will define WIRELESS_EXT */
   7.117 +#include <net/iw_handler.h>
   7.118 +#endif	/* CONFIG_NET_RADIO */
   7.119 +#include <asm/current.h>
   7.120 +
   7.121 +#include <net/ip.h>
   7.122 +#include <linux/tcp.h>
   7.123 +#include <linux/udp.h>
   7.124 +
   7.125 +
   7.126 +/* This define, if set, will randomly drop a packet when congestion
   7.127 + * is more than moderate.  It helps fairness in the multi-interface
   7.128 + * case when one of them is a hog, but it kills performance for the
   7.129 + * single interface case so it is off now by default.
   7.130 + */
   7.131 +#undef RAND_LIE
   7.132 +
   7.133 +/* Setting this will sample the queue lengths and thus congestion
   7.134 + * via a timer instead of as each packet is received.
   7.135 + */
   7.136 +#undef OFFLINE_SAMPLE
   7.137 +
   7.138 +/*
   7.139 + *	The list of packet types we will receive (as opposed to discard)
   7.140 + *	and the routines to invoke.
   7.141 + *
    7.142 + *	Why 16? Because with 16 the only overlap we get on a hash of the
   7.143 + *	low nibble of the protocol value is RARP/SNAP/X.25.
   7.144 + *
   7.145 + *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
   7.146 + *             sure which should go first, but I bet it won't make much
   7.147 + *             difference if we are running VLANs.  The good news is that
   7.148 + *             this protocol won't be in the list unless compiled in, so
    7.149 + *             the average user (w/out VLANs) will not be adversely affected.
   7.150 + *             --BLG
   7.151 + *
   7.152 + *		0800	IP
   7.153 + *		8100    802.1Q VLAN
   7.154 + *		0001	802.3
   7.155 + *		0002	AX.25
   7.156 + *		0004	802.2
   7.157 + *		8035	RARP
   7.158 + *		0005	SNAP
   7.159 + *		0805	X.25
   7.160 + *		0806	ARP
   7.161 + *		8137	IPX
   7.162 + *		0009	Localtalk
   7.163 + *		86DD	IPv6
   7.164 + */
   7.165 +
   7.166 +static DEFINE_SPINLOCK(ptype_lock);
   7.167 +static struct list_head ptype_base[16];	/* 16 way hashed list */
   7.168 +static struct list_head ptype_all;		/* Taps */
   7.169 +
   7.170 +#ifdef OFFLINE_SAMPLE
   7.171 +static void sample_queue(unsigned long dummy);
   7.172 +static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
   7.173 +#endif
   7.174 +
   7.175 +/*
    7.176 + * The @dev_base list is protected by @dev_base_lock and the rtnl
   7.177 + * semaphore.
   7.178 + *
   7.179 + * Pure readers hold dev_base_lock for reading.
   7.180 + *
   7.181 + * Writers must hold the rtnl semaphore while they loop through the
   7.182 + * dev_base list, and hold dev_base_lock for writing when they do the
   7.183 + * actual updates.  This allows pure readers to access the list even
   7.184 + * while a writer is preparing to update it.
   7.185 + *
   7.186 + * To put it another way, dev_base_lock is held for writing only to
   7.187 + * protect against pure readers; the rtnl semaphore provides the
   7.188 + * protection against other writers.
   7.189 + *
   7.190 + * See, for example usages, register_netdevice() and
   7.191 + * unregister_netdevice(), which must be called with the rtnl
   7.192 + * semaphore held.
   7.193 + */
   7.194 +struct net_device *dev_base;
   7.195 +static struct net_device **dev_tail = &dev_base;
   7.196 +DEFINE_RWLOCK(dev_base_lock);
   7.197 +
   7.198 +EXPORT_SYMBOL(dev_base);
   7.199 +EXPORT_SYMBOL(dev_base_lock);
   7.200 +
   7.201 +#define NETDEV_HASHBITS	8
   7.202 +static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
   7.203 +static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
   7.204 +
   7.205 +static inline struct hlist_head *dev_name_hash(const char *name)
   7.206 +{
   7.207 +	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
   7.208 +	return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
   7.209 +}
   7.210 +
   7.211 +static inline struct hlist_head *dev_index_hash(int ifindex)
   7.212 +{
   7.213 +	return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
   7.214 +}
   7.215 +
   7.216 +/*
   7.217 + *	Our notifier list
   7.218 + */
   7.219 +
   7.220 +static struct notifier_block *netdev_chain;
   7.221 +
   7.222 +/*
   7.223 + *	Device drivers call our routines to queue packets here. We empty the
   7.224 + *	queue in the local softnet handler.
   7.225 + */
   7.226 +DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
   7.227 +
   7.228 +#ifdef CONFIG_SYSFS
   7.229 +extern int netdev_sysfs_init(void);
   7.230 +extern int netdev_register_sysfs(struct net_device *);
   7.231 +extern void netdev_unregister_sysfs(struct net_device *);
   7.232 +#else
   7.233 +#define netdev_sysfs_init()	 	(0)
   7.234 +#define netdev_register_sysfs(dev)	(0)
   7.235 +#define	netdev_unregister_sysfs(dev)	do { } while(0)
   7.236 +#endif
   7.237 +
   7.238 +
   7.239 +/*******************************************************************************
   7.240 +
   7.241 +		Protocol management and registration routines
   7.242 +
   7.243 +*******************************************************************************/
   7.244 +
   7.245 +/*
   7.246 + *	For efficiency
   7.247 + */
   7.248 +
   7.249 +int netdev_nit;
   7.250 +
   7.251 +/*
   7.252 + *	Add a protocol ID to the list. Now that the input handler is
   7.253 + *	smarter we can dispense with all the messy stuff that used to be
   7.254 + *	here.
   7.255 + *
   7.256 + *	BEWARE!!! Protocol handlers, mangling input packets,
   7.257 + *	MUST BE last in hash buckets and checking protocol handlers
   7.258 + *	MUST start from promiscuous ptype_all chain in net_bh.
   7.259 + *	It is true now, do not change it.
   7.260 + *	Explanation follows: if protocol handler, mangling packet, will
   7.261 + *	be the first on list, it is not able to sense, that packet
   7.262 + *	is cloned and should be copied-on-write, so that it will
   7.263 + *	change it and subsequent readers will get broken packet.
   7.264 + *							--ANK (980803)
   7.265 + */
   7.266 +
   7.267 +/**
   7.268 + *	dev_add_pack - add packet handler
   7.269 + *	@pt: packet type declaration
   7.270 + *
   7.271 + *	Add a protocol handler to the networking stack. The passed &packet_type
   7.272 + *	is linked into kernel lists and may not be freed until it has been
   7.273 + *	removed from the kernel lists.
   7.274 + *
    7.275 + *	This call does not sleep, therefore it cannot
    7.276 + *	guarantee that CPUs in the middle of receiving packets
    7.277 + *	will see the new packet type (until the next received packet).
   7.278 + */
   7.279 +
   7.280 +void dev_add_pack(struct packet_type *pt)
   7.281 +{
   7.282 +	int hash;
   7.283 +
   7.284 +	spin_lock_bh(&ptype_lock);
   7.285 +	if (pt->type == htons(ETH_P_ALL)) {
   7.286 +		netdev_nit++;
   7.287 +		list_add_rcu(&pt->list, &ptype_all);
   7.288 +	} else {
   7.289 +		hash = ntohs(pt->type) & 15;
   7.290 +		list_add_rcu(&pt->list, &ptype_base[hash]);
   7.291 +	}
   7.292 +	spin_unlock_bh(&ptype_lock);
   7.293 +}
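A minimal usage sketch (handler and variable names are hypothetical) of registering a tap with dev_add_pack() and removing it again with dev_remove_pack():

	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			      struct packet_type *pt)
	{
		/* Runs for every frame; we own this reference and must free it. */
		kfree_skb(skb);
		return 0;
	}

	static struct packet_type my_tap = {
		.type = __constant_htons(ETH_P_ALL),	/* all protocols */
		.dev  = NULL,				/* all devices */
		.func = my_tap_rcv,
	};

	/* dev_add_pack(&my_tap);  ...  dev_remove_pack(&my_tap); */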
   7.294 +
   7.295 +extern void linkwatch_run_queue(void);
   7.296 +
   7.297 +
   7.298 +
   7.299 +/**
   7.300 + *	__dev_remove_pack	 - remove packet handler
   7.301 + *	@pt: packet type declaration
   7.302 + *
   7.303 + *	Remove a protocol handler that was previously added to the kernel
   7.304 + *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
   7.305 + *	from the kernel lists and can be freed or reused once this function
   7.306 + *	returns. 
   7.307 + *
   7.308 + *      The packet type might still be in use by receivers
    7.309 + *	and must not be freed until after all the CPUs have gone
   7.310 + *	through a quiescent state.
   7.311 + */
   7.312 +void __dev_remove_pack(struct packet_type *pt)
   7.313 +{
   7.314 +	struct list_head *head;
   7.315 +	struct packet_type *pt1;
   7.316 +
   7.317 +	spin_lock_bh(&ptype_lock);
   7.318 +
   7.319 +	if (pt->type == htons(ETH_P_ALL)) {
   7.320 +		netdev_nit--;
   7.321 +		head = &ptype_all;
   7.322 +	} else
   7.323 +		head = &ptype_base[ntohs(pt->type) & 15];
   7.324 +
   7.325 +	list_for_each_entry(pt1, head, list) {
   7.326 +		if (pt == pt1) {
   7.327 +			list_del_rcu(&pt->list);
   7.328 +			goto out;
   7.329 +		}
   7.330 +	}
   7.331 +
   7.332 +	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
   7.333 +out:
   7.334 +	spin_unlock_bh(&ptype_lock);
   7.335 +}
   7.336 +/**
   7.337 + *	dev_remove_pack	 - remove packet handler
   7.338 + *	@pt: packet type declaration
   7.339 + *
   7.340 + *	Remove a protocol handler that was previously added to the kernel
   7.341 + *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
   7.342 + *	from the kernel lists and can be freed or reused once this function
   7.343 + *	returns.
   7.344 + *
   7.345 + *	This call sleeps to guarantee that no CPU is looking at the packet
   7.346 + *	type after return.
   7.347 + */
   7.348 +void dev_remove_pack(struct packet_type *pt)
   7.349 +{
   7.350 +	__dev_remove_pack(pt);
   7.351 +	
   7.352 +	synchronize_net();
   7.353 +}
   7.354 +
   7.355 +/******************************************************************************
   7.356 +
   7.357 +		      Device Boot-time Settings Routines
   7.358 +
   7.359 +*******************************************************************************/
   7.360 +
   7.361 +/* Boot time configuration table */
   7.362 +static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
   7.363 +
   7.364 +/**
   7.365 + *	netdev_boot_setup_add	- add new setup entry
   7.366 + *	@name: name of the device
   7.367 + *	@map: configured settings for the device
   7.368 + *
    7.369 + *	Adds a new setup entry to the dev_boot_setup list.  The function
    7.370 + *	returns 0 on error and 1 on success.  This is a generic routine
    7.371 + *	for all netdevices.
   7.372 + */
   7.373 +static int netdev_boot_setup_add(char *name, struct ifmap *map)
   7.374 +{
   7.375 +	struct netdev_boot_setup *s;
   7.376 +	int i;
   7.377 +
   7.378 +	s = dev_boot_setup;
   7.379 +	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
   7.380 +		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
   7.381 +			memset(s[i].name, 0, sizeof(s[i].name));
   7.382 +			strcpy(s[i].name, name);
   7.383 +			memcpy(&s[i].map, map, sizeof(s[i].map));
   7.384 +			break;
   7.385 +		}
   7.386 +	}
   7.387 +
   7.388 +	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
   7.389 +}
   7.390 +
   7.391 +/**
   7.392 + *	netdev_boot_setup_check	- check boot time settings
   7.393 + *	@dev: the netdevice
   7.394 + *
   7.395 + * 	Check boot time settings for the device.
    7.396 + *	Any settings found are applied to the device so that they
    7.397 + *	can be used later during device probing.
    7.398 + *	Returns 0 if no settings are found, 1 if they are.
   7.399 + */
   7.400 +int netdev_boot_setup_check(struct net_device *dev)
   7.401 +{
   7.402 +	struct netdev_boot_setup *s = dev_boot_setup;
   7.403 +	int i;
   7.404 +
   7.405 +	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
   7.406 +		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
   7.407 +		    !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
   7.408 +			dev->irq 	= s[i].map.irq;
   7.409 +			dev->base_addr 	= s[i].map.base_addr;
   7.410 +			dev->mem_start 	= s[i].map.mem_start;
   7.411 +			dev->mem_end 	= s[i].map.mem_end;
   7.412 +			return 1;
   7.413 +		}
   7.414 +	}
   7.415 +	return 0;
   7.416 +}
   7.417 +
   7.418 +
   7.419 +/**
   7.420 + *	netdev_boot_base	- get address from boot time settings
   7.421 + *	@prefix: prefix for network device
   7.422 + *	@unit: id for network device
   7.423 + *
    7.424 + * 	Check boot time settings for the base address of the device.
    7.425 + *	Returns the configured base address, 1 if the device is
    7.426 + *	already registered (and so should not be probed), or
    7.427 + *	0 if no settings are found.
   7.428 + */
   7.429 +unsigned long netdev_boot_base(const char *prefix, int unit)
   7.430 +{
   7.431 +	const struct netdev_boot_setup *s = dev_boot_setup;
   7.432 +	char name[IFNAMSIZ];
   7.433 +	int i;
   7.434 +
   7.435 +	sprintf(name, "%s%d", prefix, unit);
   7.436 +
   7.437 +	/*
   7.438 +	 * If device already registered then return base of 1
   7.439 +	 * to indicate not to probe for this interface
   7.440 +	 */
   7.441 +	if (__dev_get_by_name(name))
   7.442 +		return 1;
   7.443 +
   7.444 +	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
   7.445 +		if (!strcmp(name, s[i].name))
   7.446 +			return s[i].map.base_addr;
   7.447 +	return 0;
   7.448 +}
   7.449 +
   7.450 +/*
   7.451 + * Saves at boot time configured settings for any netdevice.
   7.452 + */
   7.453 +int __init netdev_boot_setup(char *str)
   7.454 +{
   7.455 +	int ints[5];
   7.456 +	struct ifmap map;
   7.457 +
   7.458 +	str = get_options(str, ARRAY_SIZE(ints), ints);
   7.459 +	if (!str || !*str)
   7.460 +		return 0;
   7.461 +
   7.462 +	/* Save settings */
   7.463 +	memset(&map, 0, sizeof(map));
   7.464 +	if (ints[0] > 0)
   7.465 +		map.irq = ints[1];
   7.466 +	if (ints[0] > 1)
   7.467 +		map.base_addr = ints[2];
   7.468 +	if (ints[0] > 2)
   7.469 +		map.mem_start = ints[3];
   7.470 +	if (ints[0] > 3)
   7.471 +		map.mem_end = ints[4];
   7.472 +
   7.473 +	/* Add new entry to the list */
   7.474 +	return netdev_boot_setup_add(str, &map);
   7.475 +}
   7.476 +
   7.477 +__setup("netdev=", netdev_boot_setup);
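For reference, a kernel command line using this hook looks something like the example below (the format is assumed to be <irq>,<io>,<mem_start>,<mem_end>,<name>, with trailing integers optional):

	netdev=9,0x300,eth0	/* irq 9, I/O base 0x300, applied to "eth0" */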
   7.478 +
   7.479 +/*******************************************************************************
   7.480 +
   7.481 +			    Device Interface Subroutines
   7.482 +
   7.483 +*******************************************************************************/
   7.484 +
   7.485 +/**
   7.486 + *	__dev_get_by_name	- find a device by its name
   7.487 + *	@name: name to find
   7.488 + *
   7.489 + *	Find an interface by name. Must be called under RTNL semaphore
   7.490 + *	or @dev_base_lock. If the name is found a pointer to the device
   7.491 + *	is returned. If the name is not found then %NULL is returned. The
   7.492 + *	reference counters are not incremented so the caller must be
   7.493 + *	careful with locks.
   7.494 + */
   7.495 +
   7.496 +struct net_device *__dev_get_by_name(const char *name)
   7.497 +{
   7.498 +	struct hlist_node *p;
   7.499 +
   7.500 +	hlist_for_each(p, dev_name_hash(name)) {
   7.501 +		struct net_device *dev
   7.502 +			= hlist_entry(p, struct net_device, name_hlist);
   7.503 +		if (!strncmp(dev->name, name, IFNAMSIZ))
   7.504 +			return dev;
   7.505 +	}
   7.506 +	return NULL;
   7.507 +}
   7.508 +
   7.509 +/**
   7.510 + *	dev_get_by_name		- find a device by its name
   7.511 + *	@name: name to find
   7.512 + *
   7.513 + *	Find an interface by name. This can be called from any
   7.514 + *	context and does its own locking. The returned handle has
   7.515 + *	the usage count incremented and the caller must use dev_put() to
   7.516 + *	release it when it is no longer needed. %NULL is returned if no
   7.517 + *	matching device is found.
   7.518 + */
   7.519 +
   7.520 +struct net_device *dev_get_by_name(const char *name)
   7.521 +{
   7.522 +	struct net_device *dev;
   7.523 +
   7.524 +	read_lock(&dev_base_lock);
   7.525 +	dev = __dev_get_by_name(name);
   7.526 +	if (dev)
   7.527 +		dev_hold(dev);
   7.528 +	read_unlock(&dev_base_lock);
   7.529 +	return dev;
   7.530 +}
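Typical calling pattern (a sketch; "eth0" is illustrative): take the reference, use the device, then drop it.

	struct net_device *dev = dev_get_by_name("eth0");

	if (dev) {
		/* dev cannot be freed while we hold this reference */
		printk(KERN_DEBUG "%s: mtu %d\n", dev->name, dev->mtu);
		dev_put(dev);
	}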
   7.531 +
   7.532 +/**
   7.533 + *	__dev_get_by_index - find a device by its ifindex
   7.534 + *	@ifindex: index of device
   7.535 + *
   7.536 + *	Search for an interface by index. Returns %NULL if the device
   7.537 + *	is not found or a pointer to the device. The device has not
   7.538 + *	had its reference counter increased so the caller must be careful
   7.539 + *	about locking. The caller must hold either the RTNL semaphore
   7.540 + *	or @dev_base_lock.
   7.541 + */
   7.542 +
   7.543 +struct net_device *__dev_get_by_index(int ifindex)
   7.544 +{
   7.545 +	struct hlist_node *p;
   7.546 +
   7.547 +	hlist_for_each(p, dev_index_hash(ifindex)) {
   7.548 +		struct net_device *dev
   7.549 +			= hlist_entry(p, struct net_device, index_hlist);
   7.550 +		if (dev->ifindex == ifindex)
   7.551 +			return dev;
   7.552 +	}
   7.553 +	return NULL;
   7.554 +}
   7.555 +
   7.556 +
   7.557 +/**
   7.558 + *	dev_get_by_index - find a device by its ifindex
   7.559 + *	@ifindex: index of device
   7.560 + *
   7.561 + *	Search for an interface by index. Returns NULL if the device
   7.562 + *	is not found or a pointer to the device. The device returned has
   7.563 + *	had a reference added and the pointer is safe until the user calls
   7.564 + *	dev_put to indicate they have finished with it.
   7.565 + */
   7.566 +
   7.567 +struct net_device *dev_get_by_index(int ifindex)
   7.568 +{
   7.569 +	struct net_device *dev;
   7.570 +
   7.571 +	read_lock(&dev_base_lock);
   7.572 +	dev = __dev_get_by_index(ifindex);
   7.573 +	if (dev)
   7.574 +		dev_hold(dev);
   7.575 +	read_unlock(&dev_base_lock);
   7.576 +	return dev;
   7.577 +}
   7.578 +
   7.579 +/**
   7.580 + *	dev_getbyhwaddr - find a device by its hardware address
   7.581 + *	@type: media type of device
   7.582 + *	@ha: hardware address
   7.583 + *
   7.584 + *	Search for an interface by MAC address. Returns NULL if the device
   7.585 + *	is not found or a pointer to the device. The caller must hold the
   7.586 + *	rtnl semaphore. The returned device has not had its ref count increased
   7.587 + *	and the caller must therefore be careful about locking
   7.588 + *
   7.589 + *	BUGS:
   7.590 + *	If the API was consistent this would be __dev_get_by_hwaddr
   7.591 + */
   7.592 +
   7.593 +struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
   7.594 +{
   7.595 +	struct net_device *dev;
   7.596 +
   7.597 +	ASSERT_RTNL();
   7.598 +
   7.599 +	for (dev = dev_base; dev; dev = dev->next)
   7.600 +		if (dev->type == type &&
   7.601 +		    !memcmp(dev->dev_addr, ha, dev->addr_len))
   7.602 +			break;
   7.603 +	return dev;
   7.604 +}
   7.605 +
   7.606 +struct net_device *dev_getfirstbyhwtype(unsigned short type)
   7.607 +{
   7.608 +	struct net_device *dev;
   7.609 +
   7.610 +	rtnl_lock();
   7.611 +	for (dev = dev_base; dev; dev = dev->next) {
   7.612 +		if (dev->type == type) {
   7.613 +			dev_hold(dev);
   7.614 +			break;
   7.615 +		}
   7.616 +	}
   7.617 +	rtnl_unlock();
   7.618 +	return dev;
   7.619 +}
   7.620 +
   7.621 +EXPORT_SYMBOL(dev_getfirstbyhwtype);
   7.622 +
   7.623 +/**
   7.624 + *	dev_get_by_flags - find any device with given flags
   7.625 + *	@if_flags: IFF_* values
   7.626 + *	@mask: bitmask of bits in if_flags to check
   7.627 + *
   7.628 + *	Search for any interface with the given flags. Returns NULL if a device
   7.629 + *	is not found or a pointer to the device. The device returned has 
   7.630 + *	had a reference added and the pointer is safe until the user calls
   7.631 + *	dev_put to indicate they have finished with it.
   7.632 + */
   7.633 +
   7.634 +struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
   7.635 +{
   7.636 +	struct net_device *dev;
   7.637 +
   7.638 +	read_lock(&dev_base_lock);
   7.639 +	for (dev = dev_base; dev != NULL; dev = dev->next) {
   7.640 +		if (((dev->flags ^ if_flags) & mask) == 0) {
   7.641 +			dev_hold(dev);
   7.642 +			break;
   7.643 +		}
   7.644 +	}
   7.645 +	read_unlock(&dev_base_lock);
   7.646 +	return dev;
   7.647 +}
   7.648 +
   7.649 +/**
   7.650 + *	dev_valid_name - check if name is okay for network device
   7.651 + *	@name: name string
   7.652 + *
   7.653 + *	Network device names need to be valid file names to
    7.654 + *	allow sysfs to work.
   7.655 + */
   7.656 +static int dev_valid_name(const char *name)
   7.657 +{
   7.658 +	return !(*name == '\0' 
   7.659 +		 || !strcmp(name, ".")
   7.660 +		 || !strcmp(name, "..")
   7.661 +		 || strchr(name, '/'));
   7.662 +}
   7.663 +
   7.664 +/**
   7.665 + *	dev_alloc_name - allocate a name for a device
   7.666 + *	@dev: device
   7.667 + *	@name: name format string
   7.668 + *
    7.669 + *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
    7.670 + *	id. Not efficient for many devices; not called a lot. The caller
   7.671 + *	must hold the dev_base or rtnl lock while allocating the name and
   7.672 + *	adding the device in order to avoid duplicates. Returns the number
   7.673 + *	of the unit assigned or a negative errno code.
   7.674 + */
   7.675 +
   7.676 +int dev_alloc_name(struct net_device *dev, const char *name)
   7.677 +{
   7.678 +	int i = 0;
   7.679 +	char buf[IFNAMSIZ];
   7.680 +	const char *p;
   7.681 +	const int max_netdevices = 8*PAGE_SIZE;
   7.682 +	long *inuse;
   7.683 +	struct net_device *d;
   7.684 +
   7.685 +	p = strnchr(name, IFNAMSIZ-1, '%');
   7.686 +	if (p) {
   7.687 +		/*
   7.688 +		 * Verify the string as this thing may have come from
   7.689 +		 * the user.  There must be either one "%d" and no other "%"
   7.690 +		 * characters.
   7.691 +		 */
   7.692 +		if (p[1] != 'd' || strchr(p + 2, '%'))
   7.693 +			return -EINVAL;
   7.694 +
   7.695 +		/* Use one page as a bit array of possible slots */
   7.696 +		inuse = (long *) get_zeroed_page(GFP_ATOMIC);
   7.697 +		if (!inuse)
   7.698 +			return -ENOMEM;
   7.699 +
   7.700 +		for (d = dev_base; d; d = d->next) {
   7.701 +			if (!sscanf(d->name, name, &i))
   7.702 +				continue;
   7.703 +			if (i < 0 || i >= max_netdevices)
   7.704 +				continue;
   7.705 +
   7.706 +			/*  avoid cases where sscanf is not exact inverse of printf */
   7.707 +			snprintf(buf, sizeof(buf), name, i);
   7.708 +			if (!strncmp(buf, d->name, IFNAMSIZ))
   7.709 +				set_bit(i, inuse);
   7.710 +		}
   7.711 +
   7.712 +		i = find_first_zero_bit(inuse, max_netdevices);
   7.713 +		free_page((unsigned long) inuse);
   7.714 +	}
   7.715 +
   7.716 +	snprintf(buf, sizeof(buf), name, i);
   7.717 +	if (!__dev_get_by_name(buf)) {
   7.718 +		strlcpy(dev->name, buf, IFNAMSIZ);
   7.719 +		return i;
   7.720 +	}
   7.721 +
   7.722 +	/* It is possible to run out of possible slots
   7.723 +	 * when the name is long and there isn't enough space left
   7.724 +	 * for the digits, or if all bits are used.
   7.725 +	 */
   7.726 +	return -ENFILE;
   7.727 +}
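A usage sketch, e.g. from a driver that wants a kernel-assigned unit number (the "vif%d" format string is purely illustrative):

	int unit = dev_alloc_name(dev, "vif%d");

	if (unit < 0)
		return unit;		/* -EINVAL or -ENFILE */
	/* dev->name is now, e.g., "vif0" and unit == 0 */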
   7.728 +
   7.729 +
   7.730 +/**
   7.731 + *	dev_change_name - change name of a device
   7.732 + *	@dev: device
   7.733 + *	@newname: name (or format string) must be at least IFNAMSIZ
   7.734 + *
    7.735 + *	Change the name of a device; a format string such as "eth%d"
    7.736 + *	can be passed for wildcarding.
   7.737 + */
   7.738 +int dev_change_name(struct net_device *dev, char *newname)
   7.739 +{
   7.740 +	int err = 0;
   7.741 +
   7.742 +	ASSERT_RTNL();
   7.743 +
   7.744 +	if (dev->flags & IFF_UP)
   7.745 +		return -EBUSY;
   7.746 +
   7.747 +	if (!dev_valid_name(newname))
   7.748 +		return -EINVAL;
   7.749 +
   7.750 +	if (strchr(newname, '%')) {
   7.751 +		err = dev_alloc_name(dev, newname);
   7.752 +		if (err < 0)
   7.753 +			return err;
   7.754 +		strcpy(newname, dev->name);
   7.755 +	}
   7.756 +	else if (__dev_get_by_name(newname))
   7.757 +		return -EEXIST;
   7.758 +	else
   7.759 +		strlcpy(dev->name, newname, IFNAMSIZ);
   7.760 +
   7.761 +	err = class_device_rename(&dev->class_dev, dev->name);
   7.762 +	if (!err) {
   7.763 +		hlist_del(&dev->name_hlist);
   7.764 +		hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
   7.765 +		notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
   7.766 +	}
   7.767 +
   7.768 +	return err;
   7.769 +}
   7.770 +
   7.771 +/**
   7.772 + *	netdev_state_change - device changes state
   7.773 + *	@dev: device to cause notification
   7.774 + *
   7.775 + *	Called to indicate a device has changed state. This function calls
   7.776 + *	the notifier chains for netdev_chain and sends a NEWLINK message
   7.777 + *	to the routing socket.
   7.778 + */
   7.779 +void netdev_state_change(struct net_device *dev)
   7.780 +{
   7.781 +	if (dev->flags & IFF_UP) {
   7.782 +		notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
   7.783 +		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
   7.784 +	}
   7.785 +}
   7.786 +
   7.787 +/**
   7.788 + *	dev_load 	- load a network module
   7.789 + *	@name: name of interface
   7.790 + *
   7.791 + *	If a network interface is not present and the process has suitable
   7.792 + *	privileges this function loads the module. If module loading is not
   7.793 + *	available in this kernel then it becomes a nop.
   7.794 + */
   7.795 +
   7.796 +void dev_load(const char *name)
   7.797 +{
   7.798 +	struct net_device *dev;  
   7.799 +
   7.800 +	read_lock(&dev_base_lock);
   7.801 +	dev = __dev_get_by_name(name);
   7.802 +	read_unlock(&dev_base_lock);
   7.803 +
   7.804 +	if (!dev && capable(CAP_SYS_MODULE))
   7.805 +		request_module("%s", name);
   7.806 +}
   7.807 +
   7.808 +static int default_rebuild_header(struct sk_buff *skb)
   7.809 +{
   7.810 +	printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
   7.811 +	       skb->dev ? skb->dev->name : "NULL!!!");
   7.812 +	kfree_skb(skb);
   7.813 +	return 1;
   7.814 +}
   7.815 +
   7.816 +
   7.817 +/**
   7.818 + *	dev_open	- prepare an interface for use.
   7.819 + *	@dev:	device to open
   7.820 + *
   7.821 + *	Takes a device from down to up state. The device's private open
   7.822 + *	function is invoked and then the multicast lists are loaded. Finally
   7.823 + *	the device is moved into the up state and a %NETDEV_UP message is
   7.824 + *	sent to the netdev notifier chain.
   7.825 + *
   7.826 + *	Calling this function on an active interface is a nop. On a failure
   7.827 + *	a negative errno code is returned.
   7.828 + */
   7.829 +int dev_open(struct net_device *dev)
   7.830 +{
   7.831 +	int ret = 0;
   7.832 +
   7.833 +	/*
   7.834 +	 *	Is it already up?
   7.835 +	 */
   7.836 +
   7.837 +	if (dev->flags & IFF_UP)
   7.838 +		return 0;
   7.839 +
   7.840 +	/*
   7.841 +	 *	Is it even present?
   7.842 +	 */
   7.843 +	if (!netif_device_present(dev))
   7.844 +		return -ENODEV;
   7.845 +
   7.846 +	/*
   7.847 +	 *	Call device private open method
   7.848 +	 */
   7.849 +	set_bit(__LINK_STATE_START, &dev->state);
   7.850 +	if (dev->open) {
   7.851 +		ret = dev->open(dev);
   7.852 +		if (ret)
   7.853 +			clear_bit(__LINK_STATE_START, &dev->state);
   7.854 +	}
   7.855 +
   7.856 + 	/*
   7.857 +	 *	If it went open OK then:
   7.858 +	 */
   7.859 +
   7.860 +	if (!ret) {
   7.861 +		/*
   7.862 +		 *	Set the flags.
   7.863 +		 */
   7.864 +		dev->flags |= IFF_UP;
   7.865 +
   7.866 +		/*
   7.867 +		 *	Initialize multicasting status
   7.868 +		 */
   7.869 +		dev_mc_upload(dev);
   7.870 +
   7.871 +		/*
   7.872 +		 *	Wakeup transmit queue engine
   7.873 +		 */
   7.874 +		dev_activate(dev);
   7.875 +
   7.876 +		/*
   7.877 +		 *	... and announce new interface.
   7.878 +		 */
   7.879 +		notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
   7.880 +	}
   7.881 +	return ret;
   7.882 +}
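Administrative bring-up sketch (assumes the caller follows the usual convention of holding the rtnl semaphore, as the ioctl paths do):

	rtnl_lock();
	err = dev_open(dev);	/* 0 on success, or negative errno from dev->open */
	rtnl_unlock();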
   7.883 +
   7.884 +/**
   7.885 + *	dev_close - shutdown an interface.
   7.886 + *	@dev: device to shutdown
   7.887 + *
   7.888 + *	This function moves an active device into down state. A
   7.889 + *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
   7.890 + *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
   7.891 + *	chain.
   7.892 + */
   7.893 +int dev_close(struct net_device *dev)
   7.894 +{
   7.895 +	if (!(dev->flags & IFF_UP))
   7.896 +		return 0;
   7.897 +
   7.898 +	/*
   7.899 +	 *	Tell people we are going down, so that they can
    7.900 +	 *	prepare for it while the device is still operating.
   7.901 +	 */
   7.902 +	notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
   7.903 +
   7.904 +	dev_deactivate(dev);
   7.905 +
   7.906 +	clear_bit(__LINK_STATE_START, &dev->state);
   7.907 +
   7.908 +	/* Synchronize to scheduled poll. We cannot touch poll list,
   7.909 +	 * it can be even on different cpu. So just clear netif_running(),
   7.910 +	 * and wait when poll really will happen. Actually, the best place
   7.911 +	 * for this is inside dev->stop() after device stopped its irq
   7.912 +	 * engine, but this requires more changes in devices. */
   7.913 +
   7.914 +	smp_mb__after_clear_bit(); /* Commit netif_running(). */
   7.915 +	while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
   7.916 +		/* No hurry. */
   7.917 +		current->state = TASK_INTERRUPTIBLE;
   7.918 +		schedule_timeout(1);
   7.919 +	}
   7.920 +
   7.921 +	/*
   7.922 +	 *	Call the device specific close. This cannot fail.
   7.923 +	 *	Only if device is UP
   7.924 +	 *
   7.925 +	 *	We allow it to be called even after a DETACH hot-plug
   7.926 +	 *	event.
   7.927 +	 */
   7.928 +	if (dev->stop)
   7.929 +		dev->stop(dev);
   7.930 +
   7.931 +	/*
   7.932 +	 *	Device is now down.
   7.933 +	 */
   7.934 +
   7.935 +	dev->flags &= ~IFF_UP;
   7.936 +
   7.937 +	/*
   7.938 +	 * Tell people we are down
   7.939 +	 */
   7.940 +	notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
   7.941 +
   7.942 +	return 0;
   7.943 +}
   7.944 +
   7.945 +
   7.946 +/*
   7.947 + *	Device change register/unregister. These are not inline or static
   7.948 + *	as we export them to the world.
   7.949 + */
   7.950 +
   7.951 +/**
   7.952 + *	register_netdevice_notifier - register a network notifier block
   7.953 + *	@nb: notifier
   7.954 + *
   7.955 + *	Register a notifier to be called when network device events occur.
   7.956 + *	The notifier passed is linked into the kernel structures and must
   7.957 + *	not be reused until it has been unregistered. A negative errno code
   7.958 + *	is returned on a failure.
   7.959 + *
    7.960 + * 	When registered, all registration and up events are replayed
    7.961 + *	to the new notifier to allow it to have a race-free
    7.962 + *	view of the network device list.
   7.963 + */
   7.964 +
   7.965 +int register_netdevice_notifier(struct notifier_block *nb)
   7.966 +{
   7.967 +	struct net_device *dev;
   7.968 +	int err;
   7.969 +
   7.970 +	rtnl_lock();
   7.971 +	err = notifier_chain_register(&netdev_chain, nb);
   7.972 +	if (!err) {
   7.973 +		for (dev = dev_base; dev; dev = dev->next) {
   7.974 +			nb->notifier_call(nb, NETDEV_REGISTER, dev);
   7.975 +
   7.976 +			if (dev->flags & IFF_UP) 
   7.977 +				nb->notifier_call(nb, NETDEV_UP, dev);
   7.978 +		}
   7.979 +	}
   7.980 +	rtnl_unlock();
   7.981 +	return err;
   7.982 +}
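A minimal notifier sketch (names are hypothetical); note that NETDEV_REGISTER and NETDEV_UP are replayed for devices that already exist:

	static int my_netdev_event(struct notifier_block *nb,
				   unsigned long event, void *ptr)
	{
		struct net_device *dev = ptr;

		if (event == NETDEV_UP)
			printk(KERN_INFO "%s is up\n", dev->name);
		return NOTIFY_DONE;
	}

	static struct notifier_block my_netdev_nb = {
		.notifier_call = my_netdev_event,
	};

	/* register_netdevice_notifier(&my_netdev_nb); */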
   7.983 +
   7.984 +/**
   7.985 + *	unregister_netdevice_notifier - unregister a network notifier block
   7.986 + *	@nb: notifier
   7.987 + *
   7.988 + *	Unregister a notifier previously registered by
    7.989 + *	register_netdevice_notifier(). The notifier is unlinked from the
   7.990 + *	kernel structures and may then be reused. A negative errno code
   7.991 + *	is returned on a failure.
   7.992 + */
   7.993 +
   7.994 +int unregister_netdevice_notifier(struct notifier_block *nb)
   7.995 +{
   7.996 +	return notifier_chain_unregister(&netdev_chain, nb);
   7.997 +}
   7.998 +
   7.999 +/**
  7.1000 + *	call_netdevice_notifiers - call all network notifier blocks
  7.1001 + *      @val: value passed unmodified to notifier function
  7.1002 + *      @v:   pointer passed unmodified to notifier function
  7.1003 + *
  7.1004 + *	Call all network notifier blocks.  Parameters and return value
  7.1005 + *	are as for notifier_call_chain().
  7.1006 + */
  7.1007 +
  7.1008 +int call_netdevice_notifiers(unsigned long val, void *v)
  7.1009 +{
  7.1010 +	return notifier_call_chain(&netdev_chain, val, v);
  7.1011 +}
  7.1012 +
  7.1013 +/* When > 0 there are consumers of rx skb time stamps */
  7.1014 +static atomic_t netstamp_needed = ATOMIC_INIT(0);
  7.1015 +
  7.1016 +void net_enable_timestamp(void)
  7.1017 +{
  7.1018 +	atomic_inc(&netstamp_needed);
  7.1019 +}
  7.1020 +
  7.1021 +void net_disable_timestamp(void)
  7.1022 +{
  7.1023 +	atomic_dec(&netstamp_needed);
  7.1024 +}
  7.1025 +
  7.1026 +static inline void net_timestamp(struct timeval *stamp)
  7.1027 +{
  7.1028 +	if (atomic_read(&netstamp_needed))
  7.1029 +		do_gettimeofday(stamp);
  7.1030 +	else {
  7.1031 +		stamp->tv_sec = 0;
  7.1032 +		stamp->tv_usec = 0;
  7.1033 +	}
  7.1034 +}
  7.1035 +
  7.1036 +/*
  7.1037 + *	Support routine. Sends outgoing frames to any network
  7.1038 + *	taps currently in use.
  7.1039 + */
  7.1040 +
  7.1041 +void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
  7.1042 +{
  7.1043 +	struct packet_type *ptype;
  7.1044 +	net_timestamp(&skb->stamp);
  7.1045 +
  7.1046 +	rcu_read_lock();
  7.1047 +	list_for_each_entry_rcu(ptype, &ptype_all, list) {
  7.1048 +		/* Never send packets back to the socket
  7.1049 +		 * they originated from - MvS (miquels@drinkel.ow.org)
  7.1050 +		 */
  7.1051 +		if ((ptype->dev == dev || !ptype->dev) &&
  7.1052 +		    (ptype->af_packet_priv == NULL ||
  7.1053 +		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
  7.1054 +			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
  7.1055 +			if (!skb2)
  7.1056 +				break;
  7.1057 +
  7.1058 +			/* skb->nh should be correctly
   7.1059 +			   set by the sender, so the check below is
  7.1060 +			   just protection against buggy protocols.
  7.1061 +			 */
  7.1062 +			skb2->mac.raw = skb2->data;
  7.1063 +
  7.1064 +			if (skb2->nh.raw < skb2->data ||
  7.1065 +			    skb2->nh.raw > skb2->tail) {
  7.1066 +				if (net_ratelimit())
  7.1067 +					printk(KERN_CRIT "protocol %04x is "
  7.1068 +					       "buggy, dev %s\n",
  7.1069 +					       skb2->protocol, dev->name);
  7.1070 +				skb2->nh.raw = skb2->data;
  7.1071 +			}
  7.1072 +
  7.1073 +			skb2->h.raw = skb2->nh.raw;
  7.1074 +			skb2->pkt_type = PACKET_OUTGOING;
  7.1075 +			ptype->func(skb2, skb->dev, ptype);
  7.1076 +		}
  7.1077 +	}
  7.1078 +	rcu_read_unlock();
  7.1079 +}
  7.1080 +
  7.1081 +/*
  7.1082 + * Invalidate hardware checksum when packet is to be mangled, and
  7.1083 + * complete checksum manually on outgoing path.
  7.1084 + */
  7.1085 +int skb_checksum_help(struct sk_buff *skb, int inward)
  7.1086 +{
  7.1087 +	unsigned int csum;
  7.1088 +	int ret = 0, offset = skb->h.raw - skb->data;
  7.1089 +
  7.1090 +	if (inward) {
  7.1091 +		skb->ip_summed = CHECKSUM_NONE;
  7.1092 +		goto out;
  7.1093 +	}
  7.1094 +
  7.1095 +	if (skb_cloned(skb)) {
  7.1096 +		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
  7.1097 +		if (ret)
  7.1098 +			goto out;
  7.1099 +	}
  7.1100 +
  7.1101 +	if (offset > (int)skb->len)
  7.1102 +		BUG();
  7.1103 +	csum = skb_checksum(skb, offset, skb->len-offset, 0);
  7.1104 +
  7.1105 +	offset = skb->tail - skb->h.raw;
  7.1106 +	if (offset <= 0)
  7.1107 +		BUG();
  7.1108 +	if (skb->csum + 2 > offset)
  7.1109 +		BUG();
  7.1110 +
  7.1111 +	*(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
  7.1112 +	skb->ip_summed = CHECKSUM_NONE;
  7.1113 +out:	
  7.1114 +	return ret;
  7.1115 +}
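As a concrete illustration (assuming an IPv4/TCP packet), the fields this helper relies on line up as follows:

	/* skb->h.raw -> first byte of the TCP header
	 * skb->csum  == offsetof(struct tcphdr, check) == 16
	 * The checksum over [h.raw, tail) is folded and written into the
	 * 16-bit field at skb->h.raw + skb->csum; the pseudo-header sum is
	 * assumed to be stored in that field already, as the TCP/UDP output
	 * paths do for CHECKSUM_HW.
	 */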
  7.1116 +
  7.1117 +#ifdef CONFIG_HIGHMEM
   7.1118 +/* Actually, we should eliminate this check as soon as we know that:
   7.1119 + * 1. An IOMMU is present and allows mapping all the memory.
  7.1120 + * 2. No high memory really exists on this machine.
  7.1121 + */
  7.1122 +
  7.1123 +static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
  7.1124 +{
  7.1125 +	int i;
  7.1126 +
  7.1127 +	if (dev->features & NETIF_F_HIGHDMA)
  7.1128 +		return 0;
  7.1129 +
  7.1130 +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
  7.1131 +		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
  7.1132 +			return 1;
  7.1133 +
  7.1134 +	return 0;
  7.1135 +}
  7.1136 +#else
  7.1137 +#define illegal_highdma(dev, skb)	(0)
  7.1138 +#endif
  7.1139 +
  7.1140 +extern void skb_release_data(struct sk_buff *);
  7.1141 +
  7.1142 +/* Keep head the same: replace data */
  7.1143 +int __skb_linearize(struct sk_buff *skb, int gfp_mask)
  7.1144 +{
  7.1145 +	unsigned int size;
  7.1146 +	u8 *data;
  7.1147 +	long offset;
  7.1148 +	struct skb_shared_info *ninfo;
  7.1149 +	int headerlen = skb->data - skb->head;
  7.1150 +	int expand = (skb->tail + skb->data_len) - skb->end;
  7.1151 +
  7.1152 +	if (skb_shared(skb))
  7.1153 +		BUG();
  7.1154 +
  7.1155 +	if (expand <= 0)
  7.1156 +		expand = 0;
  7.1157 +
  7.1158 +	size = skb->end - skb->head + expand;
  7.1159 +	size = SKB_DATA_ALIGN(size);
  7.1160 +	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
  7.1161 +	if (!data)
  7.1162 +		return -ENOMEM;
  7.1163 +
  7.1164 +	/* Copy entire thing */
  7.1165 +	if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
  7.1166 +		BUG();
  7.1167 +
  7.1168 +	/* Set up shinfo */
  7.1169 +	ninfo = (struct skb_shared_info*)(data + size);
  7.1170 +	atomic_set(&ninfo->dataref, 1);
  7.1171 +	ninfo->tso_size = skb_shinfo(skb)->tso_size;
  7.1172 +	ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
  7.1173 +	ninfo->nr_frags = 0;
  7.1174 +	ninfo->frag_list = NULL;
  7.1175 +
  7.1176 +	/* Offset between the two in bytes */
  7.1177 +	offset = data - skb->head;
  7.1178 +
  7.1179 +	/* Free old data. */
  7.1180 +	skb_release_data(skb);
  7.1181 +
  7.1182 +	skb->head = data;
  7.1183 +	skb->end  = data + size;
  7.1184 +
  7.1185 +	/* Set up new pointers */
  7.1186 +	skb->h.raw   += offset;
  7.1187 +	skb->nh.raw  += offset;
  7.1188 +	skb->mac.raw += offset;
  7.1189 +	skb->tail    += offset;
  7.1190 +	skb->data    += offset;
  7.1191 +
  7.1192 +	/* We are no longer a clone, even if we were. */
  7.1193 +	skb->cloned    = 0;
  7.1194 +
  7.1195 +	skb->tail     += skb->data_len;
  7.1196 +	skb->data_len  = 0;
  7.1197 +	return 0;
  7.1198 +}
  7.1199 +
  7.1200 +#define HARD_TX_LOCK(dev, cpu) {			\
  7.1201 +	if ((dev->features & NETIF_F_LLTX) == 0) {	\
  7.1202 +		spin_lock(&dev->xmit_lock);		\
  7.1203 +		dev->xmit_lock_owner = cpu;		\
  7.1204 +	}						\
  7.1205 +}
  7.1206 +
  7.1207 +#define HARD_TX_UNLOCK(dev) {				\
  7.1208 +	if ((dev->features & NETIF_F_LLTX) == 0) {	\
  7.1209 +		dev->xmit_lock_owner = -1;		\
  7.1210 +		spin_unlock(&dev->xmit_lock);		\
  7.1211 +	}						\
  7.1212 +}
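A driver that opts out of this core locking (sketch; my_priv and tx_lock are hypothetical names) advertises NETIF_F_LLTX and serializes its own transmit path:

	static int my_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		struct my_priv *priv = dev->priv;

		spin_lock(&priv->tx_lock);	/* driver lock, not dev->xmit_lock */
		/* ... hand the skb to the hardware ring ... */
		spin_unlock(&priv->tx_lock);
		return 0;
	}

	/* In the driver's setup path:  dev->features |= NETIF_F_LLTX; */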
  7.1213 +
  7.1214 +/**
  7.1215 + *	dev_queue_xmit - transmit a buffer
  7.1216 + *	@skb: buffer to transmit
  7.1217 + *
  7.1218 + *	Queue a buffer for transmission to a network device. The caller must
  7.1219 + *	have set the device and priority and built the buffer before calling
  7.1220 + *	this function. The function can be called from an interrupt.
  7.1221 + *
  7.1222 + *	A negative errno code is returned on a failure. A success does not
  7.1223 + *	guarantee the frame will be transmitted as it may be dropped due
  7.1224 + *	to congestion or traffic shaping.
  7.1225 + */
  7.1226 +
  7.1227 +int dev_queue_xmit(struct sk_buff *skb)
  7.1228 +{
  7.1229 +	struct net_device *dev = skb->dev;
  7.1230 +	struct Qdisc *q;
  7.1231 +	int rc = -ENOMEM;
  7.1232 +
  7.1233 +	if (skb_shinfo(skb)->frag_list &&
  7.1234 +	    !(dev->features & NETIF_F_FRAGLIST) &&
  7.1235 +	    __skb_linearize(skb, GFP_ATOMIC))
  7.1236 +		goto out_kfree_skb;
  7.1237 +
  7.1238 +	/* Fragmented skb is linearized if device does not support SG,
  7.1239 +	 * or if at least one of fragments is in highmem and device
  7.1240 +	 * does not support DMA from it.
  7.1241 +	 */
  7.1242 +	if (skb_shinfo(skb)->nr_frags &&
  7.1243 +	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
  7.1244 +	    __skb_linearize(skb, GFP_ATOMIC))
  7.1245 +		goto out_kfree_skb;
  7.1246 +
  7.1247 +	/* If a checksum-deferred packet is forwarded to a device that needs a
  7.1248 +	 * checksum, correct the pointers and force checksumming.
  7.1249 +	 */
  7.1250 +	if (skb->proto_csum_blank) {
  7.1251 +		if (skb->protocol != htons(ETH_P_IP))
  7.1252 +			goto out_kfree_skb;
  7.1253 +		skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
  7.1254 +		if (skb->h.raw >= skb->tail)
  7.1255 +			goto out_kfree_skb;
  7.1256 +		switch (skb->nh.iph->protocol) {
  7.1257 +		case IPPROTO_TCP:
  7.1258 +			skb->csum = offsetof(struct tcphdr, check);
  7.1259 +			break;
  7.1260 +		case IPPROTO_UDP:
  7.1261 +			skb->csum = offsetof(struct udphdr, check);
  7.1262 +			break;
  7.1263 +		default:
  7.1264 +			goto out_kfree_skb;
  7.1265 +		}
  7.1266 +		if ((skb->h.raw + skb->csum + 2) > skb->tail)
  7.1267 +			goto out_kfree_skb;
  7.1268 +		skb->ip_summed = CHECKSUM_HW;
  7.1269 +	}
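	/* Worked example (assuming IPv4/TCP with a 20-byte IP header): the
	 * fix-up above leaves skb->h.raw = iph + 20 and skb->csum = 16
	 * (offsetof(struct tcphdr, check)); the bounds check confirms that
	 * the 2-byte checksum field at h.raw + 16 lies inside the buffer,
	 * so skb_checksum_help() below can fill it in if the outgoing
	 * device cannot checksum the packet itself.
	 */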
  7.1270 +
  7.1271 +	/* If packet is not checksummed and device does not support
  7.1272 +	 * checksumming for this protocol, complete checksumming here.
  7.1273 +	 */
  7.1274 +	if (skb->ip_summed == CHECKSUM_HW &&
  7.1275 +	    (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
  7.1276 +	     (!(dev->features & NETIF_F_IP_CSUM) ||
  7.1277 +	      skb->protocol != htons(ETH_P_IP))))
  7.1278 +	      	if (skb_checksum_help(skb, 0))
  7.1279 +	      		goto out_kfree_skb;
  7.1280 +
  7.1281 +	/* Disable soft irqs for various locks below. Also 
  7.1282 +	 * stops preemption for RCU. 
  7.1283 +	 */
  7.1284 +	local_bh_disable(); 
  7.1285 +
  7.1286 +	/* Updates of qdisc are serialized by queue_lock. 
  7.1287 +	 * The struct Qdisc which is pointed to by qdisc is now a 
  7.1288 +	 * rcu structure - it may be accessed without acquiring 
  7.1289 +	 * a lock (but the structure may be stale.) The freeing of the
  7.1290 +	 * qdisc will be deferred until it's known that there are no 
  7.1291 +	 * more references to it.
  7.1292 +	 * 
  7.1293 +	 * If the qdisc has an enqueue function, we still need to 
  7.1294 +	 * hold the queue_lock before calling it, since queue_lock
  7.1295 +	 * also serializes access to the device queue.
  7.1296 +	 */
  7.1297 +
  7.1298 +	q = rcu_dereference(dev->qdisc);
  7.1299 +#ifdef CONFIG_NET_CLS_ACT
  7.1300 +	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
  7.1301 +#endif
  7.1302 +	if (q->enqueue) {
  7.1303 +		/* Grab device queue */
  7.1304 +		spin_lock(&dev->queue_lock);
  7.1305 +
  7.1306 +		rc = q->enqueue(skb, q);
  7.1307 +
  7.1308 +		qdisc_run(dev);
  7.1309 +
  7.1310 +		spin_unlock(&dev->queue_lock);
  7.1311 +		rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
  7.1312 +		goto out;
  7.1313 +	}
  7.1314 +
  7.1315 +	/* The device has no queue. Common case for software devices:
  7.1316 +	   loopback, all the sorts of tunnels...
  7.1317 +
  7.1318 +	   Really, it is unlikely that xmit_lock protection is necessary here.
  7.1319 +	   (f.e. loopback and IP tunnels are clean ignoring statistics
  7.1320 +	   counters.)
   7.1321 +	   However, it is possible that they rely on the protection
   7.1322 +	   we provide here.
   7.1323 +
   7.1324 +	   Check this and shoot the lock. It is not prone to deadlocks.
   7.1325 +	   Either shoot the noqueue qdisc, it is even simpler 8)
  7.1326 +	 */
  7.1327 +	if (dev->flags & IFF_UP) {
  7.1328 +		int cpu = smp_processor_id(); /* ok because BHs are off */
  7.1329 +
  7.1330 +		if (dev->xmit_lock_owner != cpu) {
  7.1331 +
  7.1332 +			HARD_TX_LOCK(dev, cpu);
  7.1333 +
  7.1334 +			if (!netif_queue_stopped(dev)) {
  7.1335 +				if (netdev_nit)
  7.1336 +					dev_queue_xmit_nit(skb, dev);
  7.1337 +
  7.1338 +				rc = 0;
  7.1339 +				if (!dev->hard_start_xmit(skb, dev)) {
  7.1340 +					HARD_TX_UNLOCK(dev);
  7.1341 +					goto out;
  7.1342 +				}
  7.1343 +			}
  7.1344 +			HARD_TX_UNLOCK(dev);
  7.1345 +			if (net_ratelimit())
  7.1346 +				printk(KERN_CRIT "Virtual device %s asks to "
  7.1347 +				       "queue packet!\n", dev->name);
  7.1348 +		} else {
  7.1349 +			/* Recursion is detected! It is possible,
  7.1350 +			 * unfortunately */
  7.1351 +			if (net_ratelimit())
  7.1352 +				printk(KERN_CRIT "Dead loop on virtual device "
  7.1353 +				       "%s, fix it urgently!\n", dev->name);
  7.1354 +		}
  7.1355 +	}
  7.1356 +
  7.1357 +	rc = -ENETDOWN;
  7.1358 +	local_bh_enable();
  7.1359 +
  7.1360 +out_kfree_skb:
  7.1361 +	kfree_skb(skb);
  7.1362 +	return rc;
  7.1363 +out:
  7.1364 +	local_bh_enable();
  7.1365 +	return rc;
  7.1366 +}
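Transmit-side usage sketch (my_send is a hypothetical caller; the frame is assumed to be already built, with protocol and priority set):

	static int my_send(struct sk_buff *skb, struct net_device *dev)
	{
		int rc;

		skb->dev = dev;
		rc = dev_queue_xmit(skb);	/* consumes the skb, success or failure */
		if (rc < 0)
			printk(KERN_DEBUG "%s: xmit failed: %d\n", dev->name, rc);
		return rc;
	}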
  7.1367 +
  7.1368 +
  7.1369 +/*=======================================================================
  7.1370 +			Receiver routines
  7.1371 +  =======================================================================*/
  7.1372 +
  7.1373 +int netdev_max_backlog = 300;
  7.1374 +int weight_p = 64;            /* old backlog weight */
  7.1375 +/* These numbers are selected based on intuition and some
   7.1376 + * experimentation; if you have a more scientific way of doing this,
  7.1377 + * please go ahead and fix things.
  7.1378 + */
  7.1379 +int no_cong_thresh = 10;
  7.1380 +int no_cong = 20;
  7.1381 +int lo_cong = 100;
  7.1382 +int mod_cong = 290;
  7.1383 +
  7.1384 +DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
  7.1385 +
  7.1386 +
  7.1387 +static void get_sample_stats(int cpu)
  7.1388 +{
  7.1389 +#ifdef RAND_LIE
  7.1390 +	unsigned long rd;
  7.1391 +	int rq;
  7.1392 +#endif
  7.1393 +	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
  7.1394 +	int blog = sd->input_pkt_queue.qlen;
  7.1395 +	int avg_blog = sd->avg_blog;
  7.1396 +
  7.1397 +	avg_blog = (avg_blog >> 1) + (blog >> 1);
  7.1398 +
  7.1399 +	if (avg_blog > mod_cong) {
  7.1400 +		/* Above moderate congestion levels. */
  7.1401 +		sd->cng_level = NET_RX_CN_HIGH;
  7.1402 +#ifdef RAND_LIE
  7.1403 +		rd = net_random();
  7.1404 +		rq = rd % netdev_max_backlog;
  7.1405 +		if (rq < avg_blog) /* unlucky bastard */
  7.1406 +			sd->cng_level = NET_RX_DROP;
  7.1407 +#endif
  7.1408 +	} else if (avg_blog > lo_cong) {
  7.1409 +		sd->cng_level = NET_RX_CN_MOD;
  7.1410 +#ifdef RAND_LIE
  7.1411 +		rd = net_random();
  7.1412 +		rq = rd % netdev_max_backlog;
  7.1413 +			if (rq < avg_blog) /* unlucky bastard */
  7.1414 +				sd->cng_level = NET_RX_CN_HIGH;
  7.1415 +#endif
  7.1416 +	} else if (avg_blog > no_cong)
  7.1417 +		sd->cng_level = NET_RX_CN_LOW;
  7.1418 +	else  /* no congestion */
  7.1419 +		sd->cng_level = NET_RX_SUCCESS;
  7.1420 +
  7.1421 +	sd->avg_blog = avg_blog;
  7.1422 +}
  7.1423 +
  7.1424 +#ifdef OFFLINE_SAMPLE
  7.1425 +static void sample_queue(unsigned long dummy)
  7.1426 +{
   7.1427 +/* 10 ms or 1 ms -- i don't care -- JHS */
  7.1428 +	int next_tick = 1;
  7.1429 +	int cpu = smp_processor_id();
  7.1430 +
  7.1431 +	get_sample_stats(cpu);
  7.1432 +	next_tick += jiffies;
  7.1433 +	mod_timer(&samp_timer, next_tick);
  7.1434 +}
  7.1435 +#endif
  7.1436 +
  7.1437 +
  7.1438 +/**
  7.1439 + *	netif_rx	-	post buffer to the network code
  7.1440 + *	@skb: buffer to post
  7.1441 + *
  7.1442 + *	This function receives a packet from a device driver and queues it for
  7.1443 + *	the upper (protocol) levels to process.  It always succeeds. The buffer
  7.1444 + *	may be dropped during processing for congestion control or by the
  7.1445 + *	protocol layers.
  7.1446 + *
  7.1447 + *	return values:
  7.1448 + *	NET_RX_SUCCESS	(no congestion)
  7.1449 + *	NET_RX_CN_LOW   (low congestion)
  7.1450 + *	NET_RX_CN_MOD   (moderate congestion)
  7.1451 + *	NET_RX_CN_HIGH  (high congestion)
  7.1452 + *	NET_RX_DROP     (packet was dropped)
  7.1453 + *
  7.1454 + */
  7.1455 +
  7.1456 +int netif_rx(struct sk_buff *skb)
  7.1457 +{
  7.1458 +	int this_cpu;
  7.1459 +	struct softnet_data *queue;
  7.1460 +	unsigned long flags;
  7.1461 +
  7.1462 +#ifdef CONFIG_NETPOLL
  7.1463 +	if (skb->dev->netpoll_rx && netpoll_rx(skb)) {
  7.1464 +		kfree_skb(skb);
  7.1465 +		return NET_RX_DROP;
  7.1466 +	}
  7.1467 +#endif
  7.1468 +	
  7.1469 +	if (!skb->stamp.tv_sec)
  7.1470 +		net_timestamp(&skb->stamp);
  7.1471 +
  7.1472 +	/*
   7.1473 +	 * The code is rearranged so that the path is shortest
   7.1474 +	 * when the CPU is congested but still operating.
  7.1475 +	 */
  7.1476 +	local_irq_save(flags);
  7.1477 +	this_cpu = smp_processor_id();
  7.1478 +	queue = &__get_cpu_var(softnet_data);
  7.1479 +
  7.1480 +	__get_cpu_var(netdev_rx_stat).total++;
  7.1481 +	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
  7.1482 +		if (queue->input_pkt_queue.qlen) {
  7.1483 +			if (queue->throttle)
  7.1484 +				goto drop;
  7.1485 +
  7.1486 +enqueue:
  7.1487 +			dev_hold(skb->dev);
  7.1488 +			__skb_queue_tail(&queue->input_pkt_queue, skb);
  7.1489 +#ifndef OFFLINE_SAMPLE
  7.1490 +			get_sample_stats(this_cpu);
  7.1491 +#endif
  7.1492 +			local_irq_restore(flags);
  7.1493 +			return queue->cng_level;
  7.1494 +		}
  7.1495 +
  7.1496 +		if (queue->throttle)
  7.1497 +			queue->throttle = 0;
  7.1498 +
  7.1499 +		netif_rx_schedule(&queue->backlog_dev);
  7.1500 +		goto enqueue;
  7.1501 +	}
  7.1502 +
  7.1503 +	if (!queue->throttle) {
  7.1504 +		queue->throttle = 1;
  7.1505 +		__get_cpu_var(netdev_rx_stat).throttled++;
  7.1506 +	}
  7.1507 +
  7.1508 +drop:
  7.1509 +	__get_cpu_var(netdev_rx_stat).dropped++;
  7.1510 +	local_irq_restore(flags);
  7.1511 +
  7.1512 +	kfree_skb(skb);
  7.1513 +	return NET_RX_DROP;
  7.1514 +}
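Driver receive-path sketch (priv and its stats field are hypothetical driver-private names):

	skb->protocol = eth_type_trans(skb, dev);
	if (netif_rx(skb) == NET_RX_DROP)
		priv->stats.rx_dropped++;	/* backlog was full or throttled */
	dev->last_rx = jiffies;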
  7.1515 +
  7.1516 +int netif_rx_ni(struct sk_buff *skb)
  7.1517 +{
  7.1518 +	int err;
  7.1519 +
  7.1520 +	preempt_disable();
  7.1521 +	err = netif_rx(skb);
  7.1522 +	if (local_softirq_pending())
  7.1523 +		do_softirq();
  7.1524 +	preempt_enable();
  7.1525 +
  7.1526 +	return err;
  7.1527 +}
  7.1528 +
  7.1529 +EXPORT_SYMBOL(netif_rx_ni);
  7.1530 +
  7.1531 +static __inline__ void skb_bond(struct sk_buff *skb)
  7.1532 +{
  7.1533 +	struct net_device *dev = skb->dev;
  7.1534 +
  7.1535 +	if (dev->master) {
  7.1536 +		skb->real_dev = skb->dev;
  7.1537 +		skb->dev = dev->master;
  7.1538 +	}
  7.1539 +}
  7.1540 +
  7.1541 +static void net_tx_action(struct softirq_action *h)
  7.1542 +{
  7.1543 +	struct softnet_data *sd = &__get_cpu_var(softnet_data);
  7.1544 +
  7.1545 +	if (sd->completion_queue) {
  7.1546 +		struct sk_buff *clist;
  7.1547 +
  7.1548 +		local_irq_disable();
  7.1549 +		clist = sd->completion_queue;
  7.1550 +		sd->completion_queue = NULL;
  7.1551 +		local_irq_enable();
  7.1552 +
  7.1553 +		while (clist) {
  7.1554 +			struct sk_buff *skb = clist;
  7.1555 +			clist = clist->next;
  7.1556 +
  7.1557 +			BUG_TRAP(!atomic_read(&skb->users));
  7.1558 +			__kfree_skb(skb);
  7.1559 +		}
  7.1560 +	}
  7.1561 +
  7.1562 +	if (sd->output_queue) {
  7.1563 +		struct net_device *head;
  7.1564 +
  7.1565 +		local_irq_disable();
  7.1566 +		head = sd->output_queue;
  7.1567 +		sd->output_queue = NULL;
  7.1568 +		local_irq_enable();
  7.1569 +
  7.1570 +		while (head) {
  7.1571 +			struct net_device *dev = head;
  7.1572 +			head = head->next_sched;
  7.1573 +
  7.1574 +			smp_mb__before_clear_bit();
  7.1575 +			clear_bit(__LINK_STATE_SCHED, &dev->state);
  7.1576 +
  7.1577 +			if (spin_trylock(&dev->queue_lock)) {
  7.1578 +				qdisc_run(dev);
  7.1579 +				spin_unlock(&dev->queue_lock);
  7.1580 +			} else {
  7.1581 +				netif_schedule(dev);
  7.1582 +			}
  7.1583 +		}
  7.1584 +	}
  7.1585 +}
  7.1586 +
  7.1587 +static __inline__ int deliver_skb(struct sk_buff *skb,
  7.1588 +				  struct packet_type *pt_prev)
  7.1589 +{
  7.1590 +	atomic_inc(&skb->users);
  7.1591 +	return pt_prev->func(skb, skb->dev, pt_prev);
  7.1592 +}
  7.1593 +
  7.1594 +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
  7.1595 +int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);
  7.1596 +
  7.1597 +static __inline__ int handle_bridge(struct sk_buff **pskb,
  7.1598 +				    struct packet_type **pt_prev, int *ret)
  7.1599 +{
  7.1600 +	struct net_bridge_port *port;
  7.1601 +
  7.1602 +	if ((*pskb)->pkt_type == PACKET_LOOPBACK ||
  7.1603 +	    (port = rcu_dereference((*pskb)->dev->br_port)) == NULL)
  7.1604 +		return 0;
  7.1605 +
  7.1606 +	if (*pt_prev) {
  7.1607 +		*ret = deliver_skb(*pskb, *pt_prev);
  7.1608 +		*pt_prev = NULL;
  7.1609 +	} 
  7.1610 +	
  7.1611 +	return br_handle_frame_hook(port, pskb);
  7.1612 +}
  7.1613 +#else
  7.1614 +#define handle_bridge(skb, pt_prev, ret)	(0)
  7.1615 +#endif
  7.1616 +
  7.1617 +#ifdef CONFIG_NET_CLS_ACT
   7.1618 +/* TODO: Maybe we should just force sch_ingress to be compiled in
   7.1619 + * when CONFIG_NET_CLS_ACT is? Otherwise we execute some useless
   7.1620 + * instructions (a compare and two extra stores) when it is not
   7.1621 + * enabled but CONFIG_NET_CLS_ACT is.
   7.1622 + * NOTE: This doesn't remove any functionality; if you don't have
   7.1623 + * the ingress scheduler, you just can't add policies on ingress.
  7.1624 + *
  7.1625 + */
  7.1626 +static int ing_filter(struct sk_buff *skb) 
  7.1627 +{
  7.1628 +	struct Qdisc *q;
  7.1629 +	struct net_device *dev = skb->dev;
  7.1630 +	int result = TC_ACT_OK;
  7.1631 +	
  7.1632 +	if (dev->qdisc_ingress) {
  7.1633 +		__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
  7.1634 +		if (MAX_RED_LOOP < ttl++) {
  7.1635 +			printk("Redir loop detected Dropping packet (%s->%s)\n",
  7.1636 +				skb->input_dev?skb->input_dev->name:"??",skb->dev->name);
  7.1637 +			return TC_ACT_SHOT;
  7.1638 +		}
  7.1639 +
  7.1640 +		skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
  7.1641 +
  7.1642 +		skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
  7.1643 +		if (NULL == skb->input_dev) {
  7.1644 +			skb->input_dev = skb->dev;
  7.1645 +			printk("ing_filter:  fixed  %s out %s\n",skb->input_dev->name,skb->dev->name);
  7.1646 +		}
  7.1647 +		spin_lock(&dev->ingress_lock);
  7.1648 +		if ((q = dev->qdisc_ingress) != NULL)
  7.1649 +			result = q->enqueue(skb, q);
  7.1650 +		spin_unlock(&dev->ingress_lock);
  7.1651 +
  7.1652 +	}
  7.1653 +
  7.1654 +	return result;
  7.1655 +}
  7.1656 +#endif
  7.1657 +
  7.1658 +int netif_receive_skb(struct sk_buff *skb)
  7.1659 +{
  7.1660 +	struct packet_type *ptype, *pt_prev;
  7.1661 +	int ret = NET_RX_DROP;
  7.1662 +	unsigned short type;
  7.1663 +
  7.1664 +#ifdef CONFIG_NETPOLL
  7.1665 +	if (skb->dev->netpoll_rx && skb->dev->poll && netpoll_rx(skb)) {
  7.1666 +		kfree_skb(skb);
  7.1667 +		return NET_RX_DROP;
  7.1668 +	}
  7.1669 +#endif
  7.1670 +
  7.1671 +	if (!skb->stamp.tv_sec)
  7.1672 +		net_timestamp(&skb->stamp);
  7.1673 +
  7.1674 +	skb_bond(skb);
  7.1675 +
  7.1676 +	__get_cpu_var(netdev_rx_stat).total++;
  7.1677 +
  7.1678 +	skb->h.raw = skb->nh.raw = skb->data;
  7.1679 +	skb->mac_len = skb->nh.raw - skb->mac.raw;
  7.1680 +
  7.1681 +	pt_prev = NULL;
  7.1682 +
  7.1683 +	rcu_read_lock();
  7.1684 +
  7.1685 +#ifdef CONFIG_NET_CLS_ACT
  7.1686 +	if (skb->tc_verd & TC_NCLS) {
  7.1687 +		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
  7.1688 +		goto ncls;
  7.1689 +	}
  7.1690 +#endif
  7.1691 +
  7.1692 +	switch (skb->ip_summed) {
  7.1693 +	case CHECKSUM_UNNECESSARY:
  7.1694 +		skb->proto_csum_valid = 1;
  7.1695 +		break;
  7.1696 +	case CHECKSUM_HW:
  7.1697 +		/* XXX Implement me. */
  7.1698 +	default:
  7.1699 +		skb->proto_csum_valid = 0;
  7.1700 +		break;
  7.1701 +	}
  7.1702 +
  7.1703 +	list_for_each_entry_rcu(ptype, &ptype_all, list) {
  7.1704 +		if (!ptype->dev || ptype->dev == skb->dev) {
  7.1705 +			if (pt_prev) 
  7.1706 +				ret = deliver_skb(skb, pt_prev);
  7.1707 +			pt_prev = ptype;
  7.1708 +		}
  7.1709 +	}
  7.1710 +
  7.1711 +#ifdef CONFIG_NET_CLS_ACT
  7.1712 +	if (pt_prev) {
  7.1713 +		ret = deliver_skb(skb, pt_prev);
   7.1714 +		pt_prev = NULL; /* no one else should process this after */
  7.1715 +	} else {
  7.1716 +		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
  7.1717 +	}
  7.1718 +
  7.1719 +	ret = ing_filter(skb);
  7.1720 +
  7.1721 +	if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
  7.1722 +		kfree_skb(skb);
  7.1723 +		goto out;
  7.1724 +	}
  7.1725 +
  7.1726 +	skb->tc_verd = 0;
  7.1727 +ncls:
  7.1728 +#endif
  7.1729 +
  7.1730 +	handle_diverter(skb);
  7.1731 +
  7.1732 +	if (handle_bridge(&skb, &pt_prev, &ret))
  7.1733 +		goto out;
  7.1734 +
  7.1735 +	type = skb->protocol;
  7.1736 +	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
  7.1737 +		if (ptype->type == type &&
  7.1738 +		    (!ptype->dev || ptype->dev == skb->dev)) {
  7.1739 +			if (pt_prev) 
  7.1740 +				ret = deliver_skb(skb, pt_prev);
  7.1741 +			pt_prev = ptype;
  7.1742 +		}
  7.1743 +	}
  7.1744 +
  7.1745 +	if (pt_prev) {
  7.1746 +		ret = pt_prev->func(skb, skb->dev, pt_prev);
  7.1747 +	} else {
  7.1748 +		kfree_skb(skb);
   7.1749 +		/* Jamal, now you will not be able to escape explaining
   7.1750 +		 * to me how you were going to use this. :-)
  7.1751 +		 */
  7.1752 +		ret = NET_RX_DROP;
  7.1753 +	}
  7.1754 +
  7.1755 +out:
  7.1756 +	rcu_read_unlock();
  7.1757 +	return ret;
  7.1758 +}
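
The ip_summed switch above is where the new proto_csum_valid bit gets set for packets whose transport checksum has already been verified (or never needs verifying). A minimal sketch, with an illustrative function name that is not part of this patch, of how a receive path would mark an skb so that netif_receive_skb() propagates that state:

    static void example_mark_rx_checksum(struct sk_buff *skb, int hw_verified)
    {
    	/* CHECKSUM_UNNECESSARY makes the switch in netif_receive_skb()
    	 * set skb->proto_csum_valid = 1, so later forwarding hops can
    	 * skip software checksum verification. */
    	if (hw_verified)
    		skb->ip_summed = CHECKSUM_UNNECESSARY;
    	else
    		skb->ip_summed = CHECKSUM_NONE;	/* verify in software */
    }
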
  7.1759 +
  7.1760 +static int process_backlog(struct net_device *backlog_dev, int *budget)
  7.1761 +{
  7.1762 +	int work = 0;
  7.1763 +	int quota = min(backlog_dev->quota, *budget);
  7.1764 +	struct softnet_data *queue = &__get_cpu_var(softnet_data);
  7.1765 +	unsigned long start_time = jiffies;
  7.1766 +
  7.1767 +	for (;;) {
  7.1768 +		struct sk_buff *skb;
  7.1769 +		struct net_device *dev;
  7.1770 +
  7.1771 +		local_irq_disable();
  7.1772 +		skb = __skb_dequeue(&queue->input_pkt_queue);
  7.1773 +		if (!skb)
  7.1774 +			goto job_done;
  7.1775 +		local_irq_enable();
  7.1776 +
  7.1777 +		dev = skb->dev;
  7.1778 +
  7.1779 +		netif_receive_skb(skb);
  7.1780 +
  7.1781 +		dev_put(dev);
  7.1782 +
  7.1783 +		work++;
  7.1784 +
  7.1785 +		if (work >= quota || jiffies - start_time > 1)
  7.1786 +			break;
  7.1787 +
  7.1788 +	}
  7.1789 +
  7.1790 +	backlog_dev->quota -= work;
  7.1791 +	*budget -= work;
  7.1792 +	return -1;
  7.1793 +
  7.1794 +job_done:
  7.1795 +	backlog_dev->quota -= work;
  7.1796 +	*budget -= work;
  7.1797 +
  7.1798 +	list_del(&backlog_dev->poll_list);
  7.1799 +	smp_mb__before_clear_bit();
  7.1800 +	netif_poll_enable(backlog_dev);
  7.1801 +
  7.1802 +	if (queue->throttle)
  7.1803 +		queue->throttle = 0;
  7.1804 +	local_irq_enable();
  7.1805 +	return 0;
  7.1806 +}
  7.1807 +
  7.1808 +static void net_rx_action(struct softirq_action *h)
  7.1809 +{
  7.1810 +	struct softnet_data *queue = &__get_cpu_var(softnet_data);
  7.1811 +	unsigned long start_time = jiffies;
  7.1812 +	int budget = netdev_max_backlog;
  7.1813 +
  7.1814 +	
  7.1815 +	local_irq_disable();
  7.1816 +
  7.1817 +	while (!list_empty(&queue->poll_list)) {
  7.1818 +		struct net_device *dev;
  7.1819 +
  7.1820 +		if (budget <= 0 || jiffies - start_time > 1)
  7.1821 +			goto softnet_break;
  7.1822 +
  7.1823 +		local_irq_enable();
  7.1824 +
  7.1825 +		dev = list_entry(queue->poll_list.next,
  7.1826 +				 struct net_device, poll_list);
  7.1827 +
  7.1828 +		if (dev->quota <= 0 || dev->poll(dev, &budget)) {
  7.1829 +			local_irq_disable();
  7.1830 +			list_del(&dev->poll_list);
  7.1831 +			list_add_tail(&dev->poll_list, &queue->poll_list);
  7.1832 +			if (dev->quota < 0)
  7.1833 +				dev->quota += dev->weight;
  7.1834 +			else
  7.1835 +				dev->quota = dev->weight;
  7.1836 +		} else {
  7.1837 +			dev_put(dev);
  7.1838 +			local_irq_disable();
  7.1839 +		}
  7.1840 +	}
  7.1841 +out:
  7.1842 +	local_irq_enable();
  7.1843 +	return;
  7.1844 +
  7.1845 +softnet_break:
  7.1846 +	__get_cpu_var(netdev_rx_stat).time_squeeze++;
  7.1847 +	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
  7.1848 +	goto out;
  7.1849 +}
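
net_rx_action() above drives the 2.6.11-era polling contract: an interrupt handler schedules the device onto the per-CPU poll_list, and the driver's ->poll() consumes at most the budget it is handed. A sketch of that contract; the example_* helpers are hypothetical driver routines:

    /* Hypothetical driver helpers, assumed to be defined elsewhere. */
    static int example_rx_one(struct net_device *dev);
    static void example_enable_rx_irq(struct net_device *dev);
    static void example_disable_rx_irq(struct net_device *dev);

    static int example_poll(struct net_device *dev, int *budget)
    {
    	int limit = min(dev->quota, *budget);
    	int done = 0;

    	while (done < limit && example_rx_one(dev))
    		done++;

    	*budget -= done;
    	dev->quota -= done;

    	if (done < limit) {
    		netif_rx_complete(dev);		/* ring drained: leave poll_list */
    		example_enable_rx_irq(dev);
    		return 0;
    	}
    	return 1;				/* more work: stay on poll_list */
    }

    static irqreturn_t example_interrupt(int irq, void *dev_id, struct pt_regs *regs)
    {
    	struct net_device *dev = dev_id;

    	if (netif_rx_schedule_prep(dev)) {	/* atomically claim polling */
    		example_disable_rx_irq(dev);
    		__netif_rx_schedule(dev);
    	}
    	return IRQ_HANDLED;
    }
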
  7.1850 +
  7.1851 +static gifconf_func_t * gifconf_list [NPROTO];
  7.1852 +
  7.1853 +/**
  7.1854 + *	register_gifconf	-	register a SIOCGIF handler
  7.1855 + *	@family: Address family
  7.1856 + *	@gifconf: Function handler
  7.1857 + *
  7.1858 + *	Register protocol dependent address dumping routines. The handler
  7.1859 + *	that is passed must not be freed or reused until it has been replaced
  7.1860 + *	by another handler.
  7.1861 + */
  7.1862 +int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
  7.1863 +{
  7.1864 +	if (family >= NPROTO)
  7.1865 +		return -EINVAL;
  7.1866 +	gifconf_list[family] = gifconf;
  7.1867 +	return 0;
  7.1868 +}
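
The kernel-doc above describes the per-family SIOCGIFCONF hook. As a sketch, a protocol would plug in like this at init time; the handler here is hypothetical (in the real tree the IPv4 code registers its own handler for PF_INET):

    static int example_gifconf(struct net_device *dev, char __user *buf, int len)
    {
    	/* Write this device's address records into buf, or, when buf is
    	 * NULL, return only the space such records would occupy. */
    	return 0;
    }

    static int __init example_af_init(void)
    {
    	return register_gifconf(PF_INET, example_gifconf);
    }
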
  7.1869 +
  7.1870 +
  7.1871 +/*
  7.1872 + *	Map an interface index to its name (SIOCGIFNAME)
  7.1873 + */
  7.1874 +
  7.1875 +/*
  7.1876 + *	We need this ioctl for efficient implementation of the
  7.1877 + *	if_indextoname() function required by the IPv6 API.  Without
  7.1878 + *	it, we would have to search all the interfaces to find a
  7.1879 + *	match.  --pb
  7.1880 + */
  7.1881 +
  7.1882 +static int dev_ifname(struct ifreq __user *arg)
  7.1883 +{
  7.1884 +	struct net_device *dev;
  7.1885 +	struct ifreq ifr;
  7.1886 +
  7.1887 +	/*
  7.1888 +	 *	Fetch the caller's info block.
  7.1889 +	 */
  7.1890 +
  7.1891 +	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
  7.1892 +		return -EFAULT;
  7.1893 +
  7.1894 +	read_lock(&dev_base_lock);
  7.1895 +	dev = __dev_get_by_index(ifr.ifr_ifindex);
  7.1896 +	if (!dev) {
  7.1897 +		read_unlock(&dev_base_lock);
  7.1898 +		return -ENODEV;
  7.1899 +	}
  7.1900 +
  7.1901 +	strcpy(ifr.ifr_name, dev->name);
  7.1902 +	read_unlock(&dev_base_lock);
  7.1903 +
  7.1904 +	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
  7.1905 +		return -EFAULT;
  7.1906 +	return 0;
  7.1907 +}
  7.1908 +
  7.1909 +/*
  7.1910 + *	Perform a SIOCGIFCONF call. This structure will change
  7.1911 + *	size eventually, and there is nothing I can do about it.
  7.1912 + *	Thus we will need a 'compatibility mode'.
  7.1913 + */
  7.1914 +
  7.1915 +static int dev_ifconf(char __user *arg)
  7.1916 +{
  7.1917 +	struct ifconf ifc;
  7.1918 +	struct net_device *dev;
  7.1919 +	char __user *pos;
  7.1920 +	int len;
  7.1921 +	int total;
  7.1922 +	int i;
  7.1923 +
  7.1924 +	/*
  7.1925 +	 *	Fetch the caller's info block.
  7.1926 +	 */
  7.1927 +
  7.1928 +	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
  7.1929 +		return -EFAULT;
  7.1930 +
  7.1931 +	pos = ifc.ifc_buf;
  7.1932 +	len = ifc.ifc_len;
  7.1933 +
  7.1934 +	/*
  7.1935 +	 *	Loop over the interfaces, and write an info block for each.
  7.1936 +	 */
  7.1937 +
  7.1938 +	total = 0;
  7.1939 +	for (dev = dev_base; dev; dev = dev->next) {
  7.1940 +		for (i = 0; i < NPROTO; i++) {
  7.1941 +			if (gifconf_list[i]) {
  7.1942 +				int done;
  7.1943 +				if (!pos)
  7.1944 +					done = gifconf_list[i](dev, NULL, 0);
  7.1945 +				else
  7.1946 +					done = gifconf_list[i](dev, pos + total,
  7.1947 +							       len - total);
  7.1948 +				if (done < 0)
  7.1949 +					return -EFAULT;
  7.1950 +				total += done;
  7.1951 +			}
  7.1952 +		}
  7.1953 +  	}
  7.1954 +
  7.1955 +	/*
  7.1956 +	 *	All done.  Write the updated control block back to the caller.
  7.1957 +	 */
  7.1958 +	ifc.ifc_len = total;
  7.1959 +
  7.1960 +	/*
  7.1961 +	 * 	Both BSD and Solaris return 0 here, so we do too.
  7.1962 +	 */
  7.1963 +	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
  7.1964 +}
  7.1965 +
  7.1966 +#ifdef CONFIG_PROC_FS
  7.1967 +/*
  7.1968 + *	This is invoked by the /proc filesystem handler to display a device
  7.1969 + *	in detail.
  7.1970 + */
  7.1971 +static __inline__ struct net_device *dev_get_idx(loff_t pos)
  7.1972 +{
  7.1973 +	struct net_device *dev;
  7.1974 +	loff_t i;
  7.1975 +
  7.1976 +	for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
  7.1977 +
  7.1978 +	return i == pos ? dev : NULL;
  7.1979 +}
  7.1980 +
  7.1981 +void *dev_seq_start(struct seq_file *seq, loff_t *pos)
  7.1982 +{
  7.1983 +	read_lock(&dev_base_lock);
  7.1984 +	return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
  7.1985 +}
  7.1986 +
  7.1987 +void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  7.1988 +{
  7.1989 +	++*pos;
  7.1990 +	return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
  7.1991 +}
  7.1992 +
  7.1993 +void dev_seq_stop(struct seq_file *seq, void *v)
  7.1994 +{
  7.1995 +	read_unlock(&dev_base_lock);
  7.1996 +}
  7.1997 +
  7.1998 +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
  7.1999 +{
  7.2000 +	if (dev->get_stats) {
  7.2001 +		struct net_device_stats *stats = dev->get_stats(dev);
  7.2002 +
  7.2003 +		seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
  7.2004 +				"%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
  7.2005 +			   dev->name, stats->rx_bytes, stats->rx_packets,
  7.2006 +			   stats->rx_errors,
  7.2007 +			   stats->rx_dropped + stats->rx_missed_errors,
  7.2008 +			   stats->rx_fifo_errors,
  7.2009 +			   stats->rx_length_errors + stats->rx_over_errors +
  7.2010 +			     stats->rx_crc_errors + stats->rx_frame_errors,
  7.2011 +			   stats->rx_compressed, stats->multicast,
  7.2012 +			   stats->tx_bytes, stats->tx_packets,
  7.2013 +			   stats->tx_errors, stats->tx_dropped,
  7.2014 +			   stats->tx_fifo_errors, stats->collisions,
  7.2015 +			   stats->tx_carrier_errors +
  7.2016 +			     stats->tx_aborted_errors +
  7.2017 +			     stats->tx_window_errors +
  7.2018 +			     stats->tx_heartbeat_errors,
  7.2019 +			   stats->tx_compressed);
  7.2020 +	} else
  7.2021 +		seq_printf(seq, "%6s: No statistics available.\n", dev->name);
  7.2022 +}
  7.2023 +
  7.2024 +/*
  7.2025 + *	Called from the PROCfs module. This now uses the new arbitrary sized
  7.2026 + *	/proc/net interface to create /proc/net/dev
  7.2027 + */
  7.2028 +static int dev_seq_show(struct seq_file *seq, void *v)
  7.2029 +{
  7.2030 +	if (v == SEQ_START_TOKEN)
  7.2031 +		seq_puts(seq, "Inter-|   Receive                            "
  7.2032 +			      "                    |  Transmit\n"
  7.2033 +			      " face |bytes    packets errs drop fifo frame "
  7.2034 +			      "compressed multicast|bytes    packets errs "
  7.2035 +			      "drop fifo colls carrier compressed\n");
  7.2036 +	else
  7.2037 +		dev_seq_printf_stats(seq, v);
  7.2038 +	return 0;
  7.2039 +}
  7.2040 +
  7.2041 +static struct netif_rx_stats *softnet_get_online(loff_t *pos)
  7.2042 +{
  7.2043 +	struct netif_rx_stats *rc = NULL;
  7.2044 +
  7.2045 +	while (*pos < NR_CPUS)
  7.2046 +	       	if (cpu_online(*pos)) {
  7.2047 +			rc = &per_cpu(netdev_rx_stat, *pos);
  7.2048 +			break;
  7.2049 +		} else
  7.2050 +			++*pos;
  7.2051 +	return rc;
  7.2052 +}
  7.2053 +
  7.2054 +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
  7.2055 +{
  7.2056 +	return softnet_get_online(pos);
  7.2057 +}
  7.2058 +
  7.2059 +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  7.2060 +{
  7.2061 +	++*pos;
  7.2062 +	return softnet_get_online(pos);
  7.2063 +}
  7.2064 +
  7.2065 +static void softnet_seq_stop(struct seq_file *seq, void *v)
  7.2066 +{
  7.2067 +}
  7.2068 +
  7.2069 +static int softnet_seq_show(struct seq_file *seq, void *v)
  7.2070 +{
  7.2071 +	struct netif_rx_stats *s = v;
  7.2072 +
  7.2073 +	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
  7.2074 +		   s->total, s->dropped, s->time_squeeze, s->throttled,
  7.2075 +		   s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
  7.2076 +		   s->fastroute_deferred_out,
  7.2077 +#if 0
  7.2078 +		   s->fastroute_latency_reduction
  7.2079 +#else
  7.2080 +		   s->cpu_collision
  7.2081 +#endif
  7.2082 +		  );
  7.2083 +	return 0;
  7.2084 +}
  7.2085 +
  7.2086 +static struct seq_operations dev_seq_ops = {
  7.2087 +	.start = dev_seq_start,
  7.2088 +	.next  = dev_seq_next,
  7.2089 +	.stop  = dev_seq_stop,
  7.2090 +	.show  = dev_seq_show,
  7.2091 +};
  7.2092 +
  7.2093 +static int dev_seq_open(struct inode *inode, struct file *file)
  7.2094 +{
  7.2095 +	return seq_open(file, &dev_seq_ops);
  7.2096 +}
  7.2097 +
  7.2098 +static struct file_operations dev_seq_fops = {
  7.2099 +	.owner	 = THIS_MODULE,
  7.2100 +	.open    = dev_seq_open,
  7.2101 +	.read    = seq_read,
  7.2102 +	.llseek  = seq_lseek,
  7.2103 +	.release = seq_release,
  7.2104 +};
  7.2105 +
  7.2106 +static struct seq_operations softnet_seq_ops = {
  7.2107 +	.start = softnet_seq_start,
  7.2108 +	.next  = softnet_seq_next,
  7.2109 +	.stop  = softnet_seq_stop,
  7.2110 +	.show  = softnet_seq_show,
  7.2111 +};
  7.2112 +
  7.2113 +static int softnet_seq_open(struct inode *inode, struct file *file)
  7.2114 +{
  7.2115 +	return seq_open(file, &softnet_seq_ops);
  7.2116 +}
  7.2117 +
  7.2118 +static struct file_operations softnet_seq_fops = {
  7.2119 +	.owner	 = THIS_MODULE,
  7.2120 +	.open    = softnet_seq_open,
  7.2121 +	.read    = seq_read,
  7.2122 +	.llseek  = seq_lseek,
  7.2123 +	.release = seq_release,
  7.2124 +};
  7.2125 +
  7.2126 +#ifdef WIRELESS_EXT
  7.2127 +extern int wireless_proc_init(void);
  7.2128 +#else
  7.2129 +#define wireless_proc_init() 0
  7.2130 +#endif
  7.2131 +
  7.2132 +static int __init dev_proc_init(void)
  7.2133 +{
  7.2134 +	int rc = -ENOMEM;
  7.2135 +
  7.2136 +	if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
  7.2137 +		goto out;
  7.2138 +	if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
  7.2139 +		goto out_dev;
  7.2140 +	if (wireless_proc_init())
  7.2141 +		goto out_softnet;
  7.2142 +	rc = 0;
  7.2143 +out:
  7.2144 +	return rc;
  7.2145 +out_softnet:
  7.2146 +	proc_net_remove("softnet_stat");
  7.2147 +out_dev:
  7.2148 +	proc_net_remove("dev");
  7.2149 +	goto out;
  7.2150 +}
  7.2151 +#else
  7.2152 +#define dev_proc_init() 0
  7.2153 +#endif	/* CONFIG_PROC_FS */
  7.2154 +
  7.2155 +
  7.2156 +/**
  7.2157 + *	netdev_set_master	-	set up master/slave pair
  7.2158 + *	@slave: slave device
  7.2159 + *	@master: new master device
  7.2160 + *
  7.2161 + *	Changes the master device of the slave. Pass %NULL to break the
  7.2162 + *	bonding. The caller must hold the RTNL semaphore. On a failure
  7.2163 + *	a negative errno code is returned. On success the reference counts
  7.2164 + *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
  7.2165 + *	function returns zero.
  7.2166 + */
  7.2167 +int netdev_set_master(struct net_device *slave, struct net_device *master)
  7.2168 +{
  7.2169 +	struct net_device *old = slave->master;
  7.2170 +
  7.2171 +	ASSERT_RTNL();
  7.2172 +
  7.2173 +	if (master) {
  7.2174 +		if (old)
  7.2175 +			return -EBUSY;
  7.2176 +		dev_hold(master);
  7.2177 +	}
  7.2178 +
  7.2179 +	slave->master = master;
  7.2180 +	
  7.2181 +	synchronize_net();
  7.2182 +
  7.2183 +	if (old)
  7.2184 +		dev_put(old);
  7.2185 +
  7.2186 +	if (master)
  7.2187 +		slave->flags |= IFF_SLAVE;
  7.2188 +	else
  7.2189 +		slave->flags &= ~IFF_SLAVE;
  7.2190 +
  7.2191 +	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
  7.2192 +	return 0;
  7.2193 +}
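
A sketch of the enslave/release pairing that a bonding-style driver is expected to follow around netdev_set_master(); error handling is trimmed and the function names are illustrative:

    static int example_enslave(struct net_device *bond, struct net_device *slave)
    {
    	int err;

    	rtnl_lock();
    	err = netdev_set_master(slave, bond);	/* holds a reference on bond */
    	rtnl_unlock();
    	return err;
    }

    static void example_release(struct net_device *slave)
    {
    	rtnl_lock();
    	netdev_set_master(slave, NULL);		/* drops the reference, clears IFF_SLAVE */
    	rtnl_unlock();
    }
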
  7.2194 +
  7.2195 +/**
  7.2196 + *	dev_set_promiscuity	- update promiscuity count on a device
  7.2197 + *	@dev: device
  7.2198 + *	@inc: modifier
  7.2199 + *
   7.2200 + *	Add or remove promiscuity from a device. While the count in the device
   7.2201 + *	remains above zero the interface remains promiscuous. Once it hits zero
   7.2202 + *	the device reverts to normal filtering operation. A negative @inc
  7.2203 + *	value is used to drop promiscuity on the device.
  7.2204 + */
  7.2205 +void dev_set_promiscuity(struct net_device *dev, int inc)
  7.2206 +{
  7.2207 +	unsigned short old_flags = dev->flags;
  7.2208 +
  7.2209 +	dev->flags |= IFF_PROMISC;
  7.2210 +	if ((dev->promiscuity += inc) == 0)
  7.2211 +		dev->flags &= ~IFF_PROMISC;
  7.2212 +	if (dev->flags ^ old_flags) {
  7.2213 +		dev_mc_upload(dev);
  7.2214 +		printk(KERN_INFO "device %s %s promiscuous mode\n",
  7.2215 +		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
  7.2216 +		       					       "left");
  7.2217 +	}
  7.2218 +}
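
Because promiscuity is a counter rather than a flag, every +1 must eventually be balanced by a -1. A minimal sketch (illustrative names) of the pattern users of this interface follow:

    static void example_start_sniffing(struct net_device *dev)
    {
    	dev_set_promiscuity(dev, 1);	/* interface enters promiscuous mode */
    }

    static void example_stop_sniffing(struct net_device *dev)
    {
    	dev_set_promiscuity(dev, -1);	/* leaves promiscuous mode once the count hits 0 */
    }
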
  7.2219 +
  7.2220 +/**
  7.2221 + *	dev_set_allmulti	- update allmulti count on a device
  7.2222 + *	@dev: device
  7.2223 + *	@inc: modifier
  7.2224 + *
   7.2225 + *	Enable or disable reception of all multicast frames on a device. While
   7.2226 + *	the count in the device remains above zero the interface keeps listening
   7.2227 + *	to all multicast addresses. Once it hits zero the device reverts to normal
  7.2228 + *	filtering operation. A negative @inc value is used to drop the counter
  7.2229 + *	when releasing a resource needing all multicasts.
  7.2230 + */
  7.2231 +
  7.2232 +void dev_set_allmulti(struct net_device *dev, int inc)
  7.2233 +{
  7.2234 +	unsigned short old_flags = dev->flags;
  7.2235 +
  7.2236 +	dev->flags |= IFF_ALLMULTI;
  7.2237 +	if ((dev->allmulti += inc) == 0)
  7.2238 +		dev->flags &= ~IFF_ALLMULTI;
  7.2239 +	if (dev->flags ^ old_flags)
  7.2240 +		dev_mc_upload(dev);
  7.2241 +}
  7.2242 +
  7.2243 +unsigned dev_get_flags(const struct net_device *dev)
  7.2244 +{
  7.2245 +	unsigned flags;
  7.2246 +
  7.2247 +	flags = (dev->flags & ~(IFF_PROMISC |
  7.2248 +				IFF_ALLMULTI |
  7.2249 +				IFF_RUNNING)) | 
  7.2250 +		(dev->gflags & (IFF_PROMISC |
  7.2251 +				IFF_ALLMULTI));
  7.2252 +
  7.2253 +	if (netif_running(dev) && netif_carrier_ok(dev))
  7.2254 +		flags |= IFF_RUNNING;
  7.2255 +
  7.2256 +	return flags;
  7.2257 +}
  7.2258 +
  7.2259 +int dev_change_flags(struct net_device *dev, unsigned flags)
  7.2260 +{
  7.2261 +	int ret;
  7.2262 +	int old_flags = dev->flags;
  7.2263 +
  7.2264 +	/*
  7.2265 +	 *	Set the flags on our device.
  7.2266 +	 */
  7.2267 +
  7.2268 +	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
  7.2269 +			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
  7.2270 +			       IFF_AUTOMEDIA)) |
  7.2271 +		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
  7.2272 +				    IFF_ALLMULTI));
  7.2273 +
  7.2274 +	/*
  7.2275 +	 *	Load in the correct multicast list now the flags have changed.
  7.2276 +	 */
  7.2277 +
  7.2278 +	dev_mc_upload(dev);
  7.2279 +
  7.2280 +	/*
   7.2281 +	 *	Have we downed the interface? We handle IFF_UP ourselves
  7.2282 +	 *	according to user attempts to set it, rather than blindly
  7.2283 +	 *	setting it.
  7.2284 +	 */
  7.2285 +
  7.2286 +	ret = 0;
  7.2287 +	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
  7.2288 +		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
  7.2289 +
  7.2290 +		if (!ret)
  7.2291 +			dev_mc_upload(dev);
  7.2292 +	}
  7.2293 +
  7.2294 +	if (dev->flags & IFF_UP &&
  7.2295 +	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
  7.2296 +					  IFF_VOLATILE)))
  7.2297 +		notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
  7.2298 +
  7.2299 +	if ((flags ^ dev->gflags) & IFF_PROMISC) {
  7.2300 +		int inc = (flags & IFF_PROMISC) ? +1 : -1;
  7.2301 +		dev->gflags ^= IFF_PROMISC;
  7.2302 +		dev_set_promiscuity(dev, inc);
  7.2303 +	}
  7.2304 +
  7.2305 +	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
   7.2306 +	   is important. Some (broken) drivers set IFF_PROMISC when
   7.2307 +	   IFF_ALLMULTI is requested, without asking us and without reporting it.
  7.2308 +	 */
  7.2309 +	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
  7.2310 +		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
  7.2311 +		dev->gflags ^= IFF_ALLMULTI;
  7.2312 +		dev_set_allmulti(dev, inc);
  7.2313 +	}
  7.2314 +
  7.2315 +	if (old_flags ^ dev->flags)
  7.2316 +		rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
  7.2317 +
  7.2318 +	return ret;
  7.2319 +}
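
dev_get_flags() and dev_change_flags() are the in-kernel equivalent of the SIOCGIFFLAGS/SIOCSIFFLAGS pair handled below. A sketch, assuming the caller may take the RTNL semaphore, of bringing an interface up:

    static int example_bring_up(struct net_device *dev)
    {
    	int err;

    	rtnl_lock();
    	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
    	rtnl_unlock();
    	return err;
    }
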
  7.2320 +
  7.2321 +int dev_set_mtu(struct net_device *dev, int new_mtu)
  7.2322 +{
  7.2323 +	int err;
  7.2324 +
  7.2325 +	if (new_mtu == dev->mtu)
  7.2326 +		return 0;
  7.2327 +
  7.2328 +	/*	MTU must be positive.	 */
  7.2329 +	if (new_mtu < 0)
  7.2330 +		return -EINVAL;
  7.2331 +
  7.2332 +	if (!netif_device_present(dev))
  7.2333 +		return -ENODEV;
  7.2334 +
  7.2335 +	err = 0;
  7.2336 +	if (dev->change_mtu)
  7.2337 +		err = dev->change_mtu(dev, new_mtu);
  7.2338 +	else
  7.2339 +		dev->mtu = new_mtu;
  7.2340 +	if (!err && dev->flags & IFF_UP)
  7.2341 +		notifier_call_chain(&netdev_chain,
  7.2342 +				    NETDEV_CHANGEMTU, dev);
  7.2343 +	return err;
  7.2344 +}
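
A corresponding sketch for dev_set_mtu(); serialising against the ioctl paths with the RTNL semaphore is assumed:

    static int example_set_mtu(struct net_device *dev, int mtu)
    {
    	int err;

    	rtnl_lock();
    	err = dev_set_mtu(dev, mtu);	/* raises NETDEV_CHANGEMTU on success */
    	rtnl_unlock();
    	return err;
    }
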
  7.2345 +
  7.2346 +
  7.2347 +/*
  7.2348 + *	Perform the SIOCxIFxxx calls.
  7.2349 + */
  7.2350 +static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
  7.2351 +{
  7.2352 +	int err;
  7.2353 +	struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
  7.2354 +
  7.2355 +	if (!dev)
  7.2356 +		return -ENODEV;
  7.2357 +
  7.2358 +	switch (cmd) {
  7.2359 +		case SIOCGIFFLAGS:	/* Get interface flags */
  7.2360 +			ifr->ifr_flags = dev_get_flags(dev);
  7.2361 +			return 0;
  7.2362 +
  7.2363 +		case SIOCSIFFLAGS:	/* Set interface flags */
  7.2364 +			return dev_change_flags(dev, ifr->ifr_flags);
  7.2365 +
  7.2366 +		case SIOCGIFMETRIC:	/* Get the metric on the interface
  7.2367 +					   (currently unused) */
  7.2368 +			ifr->ifr_metric = 0;
  7.2369 +			return 0;
  7.2370 +
  7.2371 +		case SIOCSIFMETRIC:	/* Set the metric on the interface
  7.2372 +					   (currently unused) */
  7.2373 +			return -EOPNOTSUPP;
  7.2374 +
  7.2375 +		case SIOCGIFMTU:	/* Get the MTU of a device */
  7.2376 +			ifr->ifr_mtu = dev->mtu;
  7.2377 +			return 0;
  7.2378 +
  7.2379 +		case SIOCSIFMTU:	/* Set the MTU of a device */
  7.2380 +			return dev_set_mtu(dev, ifr->ifr_mtu);
  7.2381 +
  7.2382 +		case SIOCGIFHWADDR:
  7.2383 +			if (!dev->addr_len)
  7.2384 +				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
  7.2385 +			else
  7.2386 +				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
  7.2387 +				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
  7.2388 +			ifr->ifr_hwaddr.sa_family = dev->type;
  7.2389 +			return 0;
  7.2390 +
  7.2391 +		case SIOCSIFHWADDR:
  7.2392 +			if (!dev->set_mac_address)
  7.2393 +				return -EOPNOTSUPP;
  7.2394 +			if (ifr->ifr_hwaddr.sa_family != dev->type)
  7.2395 +				return -EINVAL;
  7.2396 +			if (!netif_device_present(dev))
  7.2397 +				return -ENODEV;
  7.2398 +			err = dev->set_mac_address(dev, &ifr->ifr_hwaddr);
  7.2399 +			if (!err)
  7.2400 +				notifier_call_chain(&netdev_chain,
  7.2401 +						    NETDEV_CHANGEADDR, dev);
  7.2402 +			return err;
  7.2403 +
  7.2404 +		case SIOCSIFHWBROADCAST:
  7.2405 +			if (ifr->ifr_hwaddr.sa_family != dev->type)
  7.2406 +				return -EINVAL;
  7.2407 +			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
  7.2408 +			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
  7.2409 +			notifier_call_chain(&netdev_chain,
  7.2410 +					    NETDEV_CHANGEADDR, dev);
  7.2411 +			return 0;
  7.2412 +
  7.2413 +		case SIOCGIFMAP:
  7.2414 +			ifr->ifr_map.mem_start = dev->mem_start;
  7.2415 +			ifr->ifr_map.mem_end   = dev->mem_end;
  7.2416 +			ifr->ifr_map.base_addr = dev->base_addr;
  7.2417 +			ifr->ifr_map.irq       = dev->irq;
  7.2418 +			ifr->ifr_map.dma       = dev->dma;
  7.2419 +			ifr->ifr_map.port      = dev->if_port;
  7.2420 +			return 0;
  7.2421 +
  7.2422 +		case SIOCSIFMAP:
  7.2423 +			if (dev->set_config) {
  7.2424 +				if (!netif_device_present(dev))
  7.2425 +					return -ENODEV;
  7.2426 +				return dev->set_config(dev, &ifr->ifr_map);
  7.2427 +			}
  7.2428 +			return -EOPNOTSUPP;
  7.2429 +
  7.2430 +		case SIOCADDMULTI:
  7.2431 +			if (!dev->set_multicast_list ||
  7.2432 +			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
  7.2433 +				return -EINVAL;
  7.2434 +			if (!netif_device_present(dev))
  7.2435 +				return -ENODEV;
  7.2436 +			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
  7.2437 +					  dev->addr_len, 1);
  7.2438 +
  7.2439 +		case SIOCDELMULTI:
  7.2440 +			if (!dev->set_multicast_list ||
  7.2441 +			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
  7.2442 +				return -EINVAL;
  7.2443 +			if (!netif_device_present(dev))
  7.2444 +				return -ENODEV;
  7.2445 +			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
  7.2446 +					     dev->addr_len, 1);
  7.2447 +
  7.2448 +		case SIOCGIFINDEX:
  7.2449 +			ifr->ifr_ifindex = dev->ifindex;
  7.2450 +			return 0;
  7.2451 +
  7.2452 +		case SIOCGIFTXQLEN:
  7.2453 +			ifr->ifr_qlen = dev->tx_queue_len;
  7.2454 +			return 0;
  7.2455 +
  7.2456 +		case SIOCSIFTXQLEN:
  7.2457 +			if (ifr->ifr_qlen < 0)
  7.2458 +				return -EINVAL;
  7.2459 +			dev->tx_queue_len = ifr->ifr_qlen;
  7.2460 +			return 0;
  7.2461 +
  7.2462 +		case SIOCSIFNAME:
  7.2463 +			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
  7.2464 +			return dev_change_name(dev, ifr->ifr_newname);
  7.2465 +
  7.2466 +		/*
  7.2467 +		 *	Unknown or private ioctl
  7.2468 +		 */
  7.2469 +
  7.2470 +		default:
  7.2471 +			if ((cmd >= SIOCDEVPRIVATE &&
  7.2472 +			    cmd <= SIOCDEVPRIVATE + 15) ||
  7.2473 +			    cmd == SIOCBONDENSLAVE ||
  7.2474 +			    cmd == SIOCBONDRELEASE ||
  7.2475 +			    cmd == SIOCBONDSETHWADDR ||
  7.2476 +			    cmd == SIOCBONDSLAVEINFOQUERY ||
  7.2477 +			    cmd == SIOCBONDINFOQUERY ||
  7.2478 +			    cmd == SIOCBONDCHANGEACTIVE ||
  7.2479 +			    cmd == SIOCGMIIPHY ||
  7.2480 +			    cmd == SIOCGMIIREG ||
  7.2481 +			    cmd == SIOCSMIIREG ||
  7.2482 +			    cmd == SIOCBRADDIF ||
  7.2483 +			    cmd == SIOCBRDELIF ||
  7.2484 +			    cmd == SIOCWANDEV) {
  7.2485 +				err = -EOPNOTSUPP;
  7.2486 +				if (dev->do_ioctl) {
  7.2487 +					if (netif_device_present(dev))
  7.2488 +						err = dev->do_ioctl(dev, ifr,
  7.2489 +								    cmd);
  7.2490 +					else
  7.2491 +						err = -ENODEV;
  7.2492 +				}
  7.2493 +			} else
  7.2494 +				err = -EINVAL;
  7.2495 +
  7.2496 +	}
  7.2497 +	return err;
  7.2498 +}
  7.2499 +
  7.2500 +/*
  7.2501 + *	This function handles all "interface"-type I/O control requests. The actual
  7.2502 + *	'doing' part of this is dev_ifsioc above.
  7.2503 + */
  7.2504 +
  7.2505 +/**
  7.2506 + *	dev_ioctl	-	network device ioctl
  7.2507 + *	@cmd: command to issue
  7.2508 + *	@arg: pointer to a struct ifreq in user space
  7.2509 + *
  7.2510 + *	Issue ioctl functions to devices. This is normally called by the
  7.2511 + *	user space syscall interfaces but can sometimes be useful for
  7.2512 + *	other purposes. The return value is the return from the syscall if
  7.2513 + *	positive or a negative errno code on error.
  7.2514 + */
  7.2515 +
  7.2516 +int dev_ioctl(unsigned int cmd, void __user *arg)
  7.2517 +{
  7.2518 +	struct ifreq ifr;
  7.2519 +	int ret;
  7.2520 +	char *colon;
  7.2521 +
  7.2522 +	/* One special case: SIOCGIFCONF takes ifconf argument
  7.2523 +	   and requires shared lock, because it sleeps writing
  7.2524 +	   to user space.
  7.2525 +	 */
  7.2526 +
  7.2527 +	if (cmd == SIOCGIFCONF) {
  7.2528 +		rtnl_shlock();
  7.2529 +		ret = dev_ifconf((char __user *) arg);
  7.2530 +		rtnl_shunlock();
  7.2531 +		return ret;
  7.2532 +	}
  7.2533 +	if (cmd == SIOCGIFNAME)
  7.2534 +		return dev_ifname((struct ifreq __user *)arg);
  7.2535 +
  7.2536 +	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
  7.2537 +		return -EFAULT;
  7.2538 +
  7.2539 +	ifr.ifr_name[IFNAMSIZ-1] = 0;
  7.2540 +
  7.2541 +	colon = strchr(ifr.ifr_name, ':');
  7.2542 +	if (colon)
  7.2543 +		*colon = 0;
  7.2544 +
  7.2545 +	/*
  7.2546 +	 *	See which interface the caller is talking about.
  7.2547 +	 */
  7.2548 +
  7.2549 +	switch (cmd) {
  7.2550 +		/*
  7.2551 +		 *	These ioctl calls:
  7.2552 +		 *	- can be done by all.
  7.2553 +		 *	- atomic and do not require locking.
  7.2554 +		 *	- return a value
  7.2555 +		 */
  7.2556 +		case SIOCGIFFLAGS:
  7.2557 +		case SIOCGIFMETRIC:
  7.2558 +		case SIOCGIFMTU:
  7.2559 +		case SIOCGIFHWADDR:
  7.2560 +		case SIOCGIFSLAVE:
  7.2561 +		case SIOCGIFMAP:
  7.2562 +		case SIOCGIFINDEX:
  7.2563 +		case SIOCGIFTXQLEN:
  7.2564 +			dev_load(ifr.ifr_name);
  7.2565 +			read_lock(&dev_base_lock);
  7.2566 +			ret = dev_ifsioc(&ifr, cmd);
  7.2567 +			read_unlock(&dev_base_lock);
  7.2568 +			if (!ret) {
  7.2569 +				if (colon)
  7.2570 +					*colon = ':';
  7.2571 +				if (copy_to_user(arg, &ifr,
  7.2572 +						 sizeof(struct ifreq)))
  7.2573 +					ret = -EFAULT;
  7.2574 +			}
  7.2575 +			return ret;
  7.2576 +
  7.2577 +		case SIOCETHTOOL:
  7.2578 +			dev_load(ifr.ifr_name);
  7.2579 +			rtnl_lock();
  7.2580 +			ret = dev_ethtool(&ifr);
  7.2581 +			rtnl_unlock();
  7.2582 +			if (!ret) {
  7.2583 +				if (colon)
  7.2584 +					*colon = ':';
  7.2585 +				if (copy_to_user(arg, &ifr,
  7.2586 +						 sizeof(struct ifreq)))
  7.2587 +					ret = -EFAULT;
  7.2588 +			}
  7.2589 +			return ret;
  7.2590 +
  7.2591 +		/*
  7.2592 +		 *	These ioctl calls:
  7.2593 +		 *	- require superuser power.
  7.2594 +		 *	- require strict serialization.
  7.2595 +		 *	- return a value
  7.2596 +		 */
  7.2597 +		case SIOCGMIIPHY:
  7.2598 +		case SIOCGMIIREG:
  7.2599 +		case SIOCSIFNAME:
  7.2600 +			if (!capable(CAP_NET_ADMIN))
  7.2601 +				return -EPERM;
  7.2602 +			dev_load(ifr.ifr_name);
  7.2603 +			rtnl_lock();
  7.2604 +			ret = dev_ifsioc(&ifr, cmd);
  7.2605 +			rtnl_unlock();
  7.2606 +			if (!ret) {
  7.2607 +				if (colon)
  7.2608 +					*colon = ':';
  7.2609 +				if (copy_to_user(arg, &ifr,
  7.2610 +						 sizeof(struct ifreq)))
  7.2611 +					ret = -EFAULT;
  7.2612 +			}
  7.2613 +			return ret;
  7.2614 +
  7.2615 +		/*
  7.2616 +		 *	These ioctl calls:
  7.2617 +		 *	- require superuser power.
  7.2618 +		 *	- require strict serialization.
  7.2619 +		 *	- do not return a value
  7.2620 +		 */
  7.2621 +		case SIOCSIFFLAGS:
  7.2622 +		case SIOCSIFMETRIC:
  7.2623 +		case SIOCSIFMTU:
  7.2624 +		case SIOCSIFMAP:
  7.2625 +		case SIOCSIFHWADDR:
  7.2626 +		case SIOCSIFSLAVE:
  7.2627 +		case SIOCADDMULTI:
  7.2628 +		case SIOCDELMULTI:
  7.2629 +		case SIOCSIFHWBROADCAST:
  7.2630 +		case SIOCSIFTXQLEN:
  7.2631 +		case SIOCSMIIREG:
  7.2632 +		case SIOCBONDENSLAVE:
  7.2633 +		case SIOCBONDRELEASE:
  7.2634 +		case SIOCBONDSETHWADDR:
  7.2635 +		case SIOCBONDSLAVEINFOQUERY:
  7.2636 +		case SIOCBONDINFOQUERY:
  7.2637 +		case SIOCBONDCHANGEACTIVE:
  7.2638 +		case SIOCBRADDIF:
  7.2639 +		case SIOCBRDELIF:
  7.2640 +			if (!capable(CAP_NET_ADMIN))
  7.2641 +				return -EPERM;
  7.2642 +			dev_load(ifr.ifr_name);
  7.2643 +			rtnl_lock();
  7.2644 +			ret = dev_ifsioc(&ifr, cmd);
  7.2645 +			rtnl_unlock();
  7.2646 +			return ret;
  7.2647 +
  7.2648 +		case SIOCGIFMEM:
  7.2649 +			/* Get the per device memory space. We can add this but
  7.2650 +			 * currently do not support it */
  7.2651 +		case SIOCSIFMEM:
  7.2652 +			/* Set the per device memory buffer space.
  7.2653 +			 * Not applicable in our case */
  7.2654 +		case SIOCSIFLINK:
  7.2655 +			return -EINVAL;
  7.2656 +
  7.2657 +		/*
  7.2658 +		 *	Unknown or private ioctl.
  7.2659 +		 */
  7.2660 +		default:
  7.2661 +			if (cmd == SIOCWANDEV ||
  7.2662 +			    (cmd >= SIOCDEVPRIVATE &&
  7.2663 +			     cmd <= SIOCDEVPRIVATE + 15)) {
  7.2664 +				dev_load(ifr.ifr_name);
  7.2665 +				rtnl_lock();
  7.2666 +				ret = dev_ifsioc(&ifr, cmd);
  7.2667 +				rtnl_unlock();
  7.2668 +				if (!ret && copy_to_user(arg, &ifr,
  7.2669 +							 sizeof(struct ifreq)))
  7.2670 +					ret = -EFAULT;
  7.2671 +				return ret;
  7.2672 +			}
  7.2673 +#ifdef WIRELESS_EXT
  7.2674 +			/* Take care of Wireless Extensions */
  7.2675 +			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
  7.2676 +				/* If command is `set a parameter', or
  7.2677 +				 * `get the encoding parameters', check if
  7.2678 +				 * the user has the right to do it */
  7.2679 +				if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) {
  7.2680 +					if (!capable(CAP_NET_ADMIN))
  7.2681 +						return -EPERM;
  7.2682 +				}
  7.2683 +				dev_load(ifr.ifr_name);
  7.2684 +				rtnl_lock();
  7.2685 +				/* Follow me in net/core/wireless.c */
  7.2686 +				ret = wireless_process_ioctl(&ifr, cmd);
  7.2687 +				rtnl_unlock();
  7.2688 +				if (IW_IS_GET(cmd) &&
  7.2689 +				    copy_to_user(arg, &ifr,
  7.2690 +					    	 sizeof(struct ifreq)))
  7.2691 +					ret = -EFAULT;
  7.2692 +				return ret;
  7.2693 +			}
  7.2694 +#endif	/* WIRELESS_EXT */
  7.2695 +			return -EINVAL;
  7.2696 +	}
  7.2697 +}
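
For reference, the userspace side of the ioctl plumbing above: a SIOCGIFMTU request enters dev_ioctl() and is answered by dev_ifsioc(). A minimal, hypothetical caller, assuming an interface named eth0 exists:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <net/if.h>

    int main(void)
    {
    	struct ifreq ifr;
    	int fd = socket(AF_INET, SOCK_DGRAM, 0);

    	memset(&ifr, 0, sizeof(ifr));
    	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
    	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)	/* a "get": no CAP_NET_ADMIN needed */
    		printf("%s mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);
    	close(fd);
    	return 0;
    }
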
  7.2698 +
  7.2699 +
  7.2700 +/**
  7.2701 + *	dev_new_index	-	allocate an ifindex
  7.2702 + *
  7.2703 + *	Returns a suitable unique value for a new device interface
  7.2704 + *	number.  The caller must hold the rtnl semaphore or the
  7.2705 + *	dev_base_lock to be sure it remains unique.
  7.2706 + */
  7.2707 +static int dev_new_index(void)
  7.2708 +{
  7.2709 +	static int ifindex;
  7.2710 +	for (;;) {
  7.2711 +		if (++ifindex <= 0)
  7.2712 +			ifindex = 1;
  7.2713 +		if (!__dev_get_by_index(ifindex))
  7.2714 +			return ifindex;
  7.2715 +	}
  7.2716 +}
  7.2717 +
  7.2718 +static int dev_boot_phase = 1;
  7.2719 +
   7.2720 +/* Delayed registration/unregistration */
  7.2721 +static DEFINE_SPINLOCK(net_todo_list_lock);
  7.2722 +static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
  7.2723 +
  7.2724 +static inline void net_set_todo(struct net_device *dev)
  7.2725 +{
  7.2726 +	spin_lock(&net_todo_list_lock);
  7.2727 +	list_add_tail(&dev->todo_list, &net_todo_list);
  7.2728 +	spin_unlock(&net_todo_list_lock);
  7.2729 +}
  7.2730 +
  7.2731 +/**
  7.2732 + *	register_netdevice	- register a network device
  7.2733 + *	@dev: device to register
  7.2734 + *
  7.2735 + *	Take a completed network device structure and add it to the kernel
  7.2736 + *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
  7.2737 + *	chain. 0 is returned on success. A negative errno code is returned
  7.2738 + *	on a failure to set up the device, or if the name is a duplicate.
  7.2739 + *
  7.2740 + *	Callers must hold the rtnl semaphore. You may want
  7.2741 + *	register_netdev() instead of this.
  7.2742 + *
  7.2743 + *	BUGS:
  7.2744 + *	The locking appears insufficient to guarantee two parallel registers
  7.2745 + *	will not get the same name.
  7.2746 + */
  7.2747 +
  7.2748 +int register_netdevice(struct net_device *dev)
  7.2749 +{
  7.2750 +	struct hlist_head *head;
  7.2751 +	struct hlist_node *p;
  7.2752 +	int ret;
  7.2753 +
  7.2754 +	BUG_ON(dev_boot_phase);
  7.2755 +	ASSERT_RTNL();
  7.2756 +
  7.2757 +	/* When net_device's are persistent, this will be fatal. */
  7.2758 +	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
  7.2759 +
  7.2760 +	spin_lock_init(&dev->queue_lock);
  7.2761 +	spin_lock_init(&dev->xmit_lock);
  7.2762 +	dev->xmit_lock_owner = -1;
  7.2763 +#ifdef CONFIG_NET_CLS_ACT
  7.2764 +	spin_lock_init(&dev->ingress_lock);
  7.2765 +#endif
  7.2766 +
  7.2767 +	ret = alloc_divert_blk(dev);
  7.2768 +	if (ret)
  7.2769 +		goto out;
  7.2770 +
  7.2771 +	dev->iflink = -1;
  7.2772 +
  7.2773 +	/* Init, if this function is available */
  7.2774 +	if (dev->init) {
  7.2775 +		ret = dev->init(dev);
  7.2776 +		if (ret) {
  7.2777 +			if (ret > 0)
  7.2778 +				ret = -EIO;
  7.2779 +			goto out_err;
  7.2780 +		}
  7.2781 +	}
  7.2782 + 
  7.2783 +	if (!dev_valid_name(dev->name)) {
  7.2784 +		ret = -EINVAL;
  7.2785 +		goto out_err;
  7.2786 +	}
  7.2787 +
  7.2788 +	dev->ifindex = dev_new_index();
  7.2789 +	if (dev->iflink == -1)
  7.2790 +		dev->iflink = dev->ifindex;
  7.2791 +
  7.2792 +	/* Check for existence of name */
  7.2793 +	head = dev_name_hash(dev->name);
  7.2794 +	hlist_for_each(p, head) {
  7.2795 +		struct net_device *d
  7.2796 +			= hlist_entry(p, struct net_device, name_hlist);
  7.2797 +		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
  7.2798 +			ret = -EEXIST;
  7.2799 + 			goto out_err;
  7.2800 +		}
  7.2801 + 	}
  7.2802 +
  7.2803 +	/* Fix illegal SG+CSUM combinations. */
  7.2804 +	if ((dev->features & NETIF_F_SG) &&
  7.2805 +	    !(dev->features & (NETIF_F_IP_CSUM |
  7.2806 +			       NETIF_F_NO_CSUM |
  7.2807 +			       NETIF_F_HW_CSUM))) {
  7.2808 +		printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
  7.2809 +		       dev->name);
  7.2810 +		dev->features &= ~NETIF_F_SG;
  7.2811 +	}
  7.2812 +
  7.2813 +	/* TSO requires that SG is present as well. */
  7.2814 +	if ((dev->features & NETIF_F_TSO) &&
  7.2815 +	    !(dev->features & NETIF_F_SG)) {
  7.2816 +		printk("%s: Dropping NETIF_F_TSO since no SG feature.\n",
  7.2817 +		       dev->name);
  7.2818 +		dev->features &= ~NETIF_F_TSO;
  7.2819 +	}
  7.2820 +
  7.2821 +	/*
   7.2822 +	 *	Install a nil rebuild_header routine that should never be
   7.2823 +	 *	called; it is used only as a bug trap.
  7.2824 +	 */
  7.2825 +
  7.2826 +	if (!dev->rebuild_header)
  7.2827 +		dev->rebuild_header = default_rebuild_header;
  7.2828 +
  7.2829 +	/*
   7.2830 +	 *	Default initial state at registration is that the
  7.2831 +	 *	device is present.
  7.2832 +	 */
  7.2833 +
  7.2834 +	set_bit(__LINK_STATE_PRESENT, &dev->state);
  7.2835 +
  7.2836 +	dev->next = NULL;
  7.2837 +	dev_init_scheduler(dev);
  7.2838 +	write_lock_bh(&dev_base_lock);
  7.2839 +	*dev_tail = dev;
  7.2840 +	dev_tail = &dev->next;
  7.2841 +	hlist_add_head(&dev->name_hlist, head);
  7.2842 +	hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
  7.2843 +	dev_hold(dev);
  7.2844 +	dev->reg_state = NETREG_REGISTERING;
  7.2845 +	write_unlock_bh(&dev_base_lock);
  7.2846 +
   7.2847 +	/* Notify protocols that a new device appeared. */
  7.2848 +	notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
  7.2849 +
  7.2850 +	/* Finish registration after unlock */
  7.2851 +	net_set_todo(dev);
  7.2852 +	ret = 0;
  7.2853 +
  7.2854 +out:
  7.2855 +	return ret;
  7.2856 +out_err:
  7.2857 +	free_divert_blk(dev);
  7.2858 +	goto out;
  7.2859 +}
  7.2860 +
  7.2861 +/**
  7.2862 + *	register_netdev	- register a network device
  7.2863 + *	@dev: device to register
  7.2864 + *
  7.2865 + *	Take a completed network device structure and add it to the kernel
  7.2866 + *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
  7.2867 + *	chain. 0 is returned on success. A negative errno code is returned
  7.2868 + *	on a failure to set up the device, or if the name is a duplicate.
  7.2869 + *
   7.2870 + *	This is a wrapper around register_netdevice that takes the rtnl semaphore
  7.2871 + *	and expands the device name if you passed a format string to
  7.2872 + *	alloc_netdev.
  7.2873 + */
  7.2874 +int register_netdev(struct net_device *dev)
  7.2875 +{
  7.2876 +	int err;
  7.2877 +
  7.2878 +	rtnl_lock();
  7.2879 +
  7.2880 +	/*
  7.2881 +	 * If the name is a format string the caller wants us to do a
  7.2882 +	 * name allocation.
  7.2883 +	 */
  7.2884 +	if (strchr(dev->name, '%')) {
  7.2885 +		err = dev_alloc_name(dev, dev->name);
  7.2886 +		if (err < 0)
  7.2887 +			goto out;
  7.2888 +	}
  7.2889 +	
  7.2890 +	/*
  7.2891 +	 * Back compatibility hook. Kill this one in 2.5
  7.2892 +	 */
  7.2893 +	if (dev->name[0] == 0 || dev->name[0] == ' ') {
  7.2894 +		err = dev_alloc_name(dev, "eth%d");
  7.2895 +		if (err < 0)
  7.2896 +			goto out;
  7.2897 +	}
  7.2898 +
  7.2899 +	err = register_netdevice(dev);
  7.2900 +out:
  7.2901 +	rtnl_unlock();
  7.2902 +	return err;
  7.2903 +}
  7.2904 +EXPORT_SYMBOL(register_netdev);
  7.2905 +
  7.2906 +/*
  7.2907 + * netdev_wait_allrefs - wait until all references are gone.
  7.2908 + *
  7.2909 + * This is called when unregistering network devices.
  7.2910 + *
  7.2911 + * Any protocol or device that holds a reference should register
   7.2912 + * for netdevice notification, and clean up and put back the
  7.2913 + * reference if they receive an UNREGISTER event.
  7.2914 + * We can get stuck here if buggy protocols don't correctly
  7.2915 + * call dev_put. 
  7.2916 + */
  7.2917 +static void netdev_wait_allrefs(struct net_device *dev)
  7.2918 +{
  7.2919 +	unsigned long rebroadcast_time, warning_time;
  7.2920 +
  7.2921 +	rebroadcast_time = warning_time = jiffies;
  7.2922 +	while (atomic_read(&dev->refcnt) != 0) {
  7.2923 +		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
  7.2924 +			rtnl_shlock();
  7.2925 +
  7.2926 +			/* Rebroadcast unregister notification */
  7.2927 +			notifier_call_chain(&netdev_chain,
  7.2928 +					    NETDEV_UNREGISTER, dev);
  7.2929 +
  7.2930 +			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
  7.2931 +				     &dev->state)) {
  7.2932 +				/* We must not have linkwatch events
  7.2933 +				 * pending on unregister. If this
  7.2934 +				 * happens, we simply run the queue
  7.2935 +				 * unscheduled, resulting in a noop
  7.2936 +				 * for this device.
  7.2937 +				 */
  7.2938 +				linkwatch_run_queue();
  7.2939 +			}
  7.2940 +
  7.2941 +			rtnl_shunlock();
  7.2942 +
  7.2943 +			rebroadcast_time = jiffies;
  7.2944 +		}
  7.2945 +
  7.2946 +		msleep(250);
  7.2947 +
  7.2948 +		if (time_after(jiffies, warning_time + 10 * HZ)) {
  7.2949 +			printk(KERN_EMERG "unregister_netdevice: "
  7.2950 +			       "waiting for %s to become free. Usage "
  7.2951 +			       "count = %d\n",
  7.2952 +			       dev->name, atomic_read(&dev->refcnt));
  7.2953 +			warning_time = jiffies;
  7.2954 +		}
  7.2955 +	}
  7.2956 +}
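
The comment above netdev_wait_allrefs() asks reference holders to register a netdevice notifier and drop their reference on NETDEV_UNREGISTER, otherwise this loop spins printing the "waiting for %s to become free" warning. A sketch of that pattern (names illustrative):

    static struct net_device *example_held_dev;

    static int example_netdev_event(struct notifier_block *nb,
    				unsigned long event, void *ptr)
    {
    	struct net_device *dev = ptr;

    	if (event == NETDEV_UNREGISTER && dev == example_held_dev) {
    		dev_put(example_held_dev);	/* release our long-lived reference */
    		example_held_dev = NULL;
    	}
    	return NOTIFY_DONE;
    }

    static struct notifier_block example_netdev_notifier = {
    	.notifier_call = example_netdev_event,
    };

    /* registered once at init time:
     *	register_netdevice_notifier(&example_netdev_notifier);
     */
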
  7.2957 +
  7.2958 +/* The sequence is:
  7.2959 + *
  7.2960 + *	rtnl_lock();
  7.2961 + *	...
  7.2962 + *	register_netdevice(x1);
  7.2963 + *	register_netdevice(x2);
  7.2964 + *	...
  7.2965 + *	unregister_netdevice(y1);
  7.2966 + *	unregister_netdevice(y2);
  7.2967 + *      ...
  7.2968 + *	rtnl_unlock();
  7.2969 + *	free_netdev(y1);
  7.2970 + *	free_netdev(y2);
  7.2971 + *
  7.2972 + * We are invoked by rtnl_unlock() after it drops the semaphore.
  7.2973 + * This allows us to deal with problems:
  7.2974 + * 1) We can create/delete sysfs objects which invoke hotplug
  7.2975 + *    without deadlocking with linkwatch via keventd.
  7.2976 + * 2) Since we run with the RTNL semaphore not held, we can sleep
  7.2977 + *    safely in order to wait for the netdev refcnt to drop to zero.
  7.2978 + */
  7.2979 +static DECLARE_MUTEX(net_todo_run_mutex);
  7.2980 +void netdev_run_todo(void)
  7.2981 +{
  7.2982 +	struct list_head list = LIST_HEAD_INIT(list);
  7.2983 +	int err;
  7.2984 +
  7.2985 +
   7.2986 +	/* Need to guard against multiple CPUs getting out of order. */
  7.2987 +	down(&net_todo_run_mutex);
  7.2988 +
  7.2989 +	/* Not safe to do outside the semaphore.  We must not return
  7.2990 +	 * until all unregister events invoked by the local processor
  7.2991 +	 * have been completed (either by this todo run, or one on
  7.2992 +	 * another cpu).
  7.2993 +	 */
  7.2994 +	if (list_empty(&net_todo_list))
  7.2995 +		goto out;
  7.2996 +
  7.2997 +	/* Snapshot list, allow later requests */
  7.2998 +	spin_lock(&net_todo_list_lock);
  7.2999 +	list_splice_init(&net_todo_list, &list);
  7.3000 +	spin_unlock(&net_todo_list_lock);
  7.3001 +		
  7.3002 +	while (!list_empty(&list)) {
  7.3003 +		struct net_device *dev
  7.3004 +			= list_entry(list.next, struct net_device, todo_list);
  7.3005 +		list_del(&dev->todo_list);
  7.3006 +
  7.3007 +		switch(dev->reg_state) {
  7.3008 +		case NETREG_REGISTERING:
  7.3009 +			err = netdev_register_sysfs(dev);
  7.3010 +			if (err)
  7.3011 +				printk(KERN_ERR "%s: failed sysfs registration (%d)\n",
  7.3012 +				       dev->name, err);
  7.3013 +			dev->reg_state = NETREG_REGISTERED;
  7.3014 +			break;
  7.3015 +
  7.3016 +		case NETREG_UNREGISTERING:
  7.3017 +			netdev_unregister_sysfs(dev);
  7.3018 +			dev->reg_state = NETREG_UNREGISTERED;
  7.3019 +
  7.3020 +			netdev_wait_allrefs(dev);
  7.3021 +
  7.3022 +			/* paranoia */
  7.3023 +			BUG_ON(atomic_read(&dev->refcnt));
  7.3024 +			BUG_TRAP(!dev->ip_ptr);
  7.3025 +			BUG_TRAP(!dev->ip6_ptr);
  7.3026 +			BUG_TRAP(!dev->dn_ptr);
  7.3027 +
  7.3028 +
   7.3029 +			/* This must be the very last action;
   7.3030 +			 * after this, 'dev' may point to freed memory.
  7.3031 +			 */
  7.3032 +			if (dev->destructor)
  7.3033 +				dev->destructor(dev);
  7.3034 +			break;
  7.3035 +
  7.3036 +		default:
  7.3037 +			printk(KERN_ERR "network todo '%s' but state %d\n",
  7.3038 +			       dev->name, dev->reg_state);
  7.3039 +			break;
  7.3040 +		}
  7.3041 +	}
  7.3042 +
  7.3043 +out:
  7.3044 +	up(&net_todo_run_mutex);
  7.3045 +}
  7.3046 +
  7.3047 +/**
  7.3048 + *	alloc_netdev - allocate network device
  7.3049 + *	@sizeof_priv:	size of private data to allocate space for
  7.3050 + *	@name:		device name format string
  7.3051 + *	@setup:		callback to initialize device
  7.3052 + *
  7.3053 + *	Allocates a struct net_device with private data area for driver use
  7.3054 + *	and performs basic initialization.
  7.3055 + */
  7.3056 +struct net_device *alloc_netdev(int sizeof_priv, const char *name,
  7.3057 +		void (*setup)(struct net_device *))
  7.3058 +{
  7.3059 +	void *p;
  7.3060 +	struct net_device *dev;
  7.3061 +	int alloc_size;
  7.3062 +
  7.3063 +	/* ensure 32-byte alignment of both the device and private area */
  7.3064 +	alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
  7.3065 +	alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
  7.3066 +
  7.3067 +	p = kmalloc(alloc_size, GFP_KERNEL);
  7.3068 +	if (!p) {
  7.3069 +		printk(KERN_ERR "alloc_dev: Unable to allocate device.\n");
  7.3070 +		return NULL;
  7.3071 +	}
  7.3072 +	memset(p, 0, alloc_size);
  7.3073 +
  7.3074 +	dev = (struct net_device *)
  7.3075 +		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
  7.3076 +	dev->padded = (char *)dev - (char *)p;
  7.3077 +
  7.3078 +	if (sizeof_priv)
  7.3079 +		dev->priv = netdev_priv(dev);
  7.3080 +
  7.3081 +	setup(dev);
  7.3082 +	strcpy(dev->name, name);
  7.3083 +	return dev;
  7.3084 +}
  7.3085 +EXPORT_SYMBOL(alloc_netdev);
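
Taken together with register_netdev(), unregister_netdev() and free_netdev() below, this gives the allocate/register/unregister/free lifecycle a driver is expected to follow. A sketch, assuming the standard ether_setup() helper and treating the "xnet%d" name and example_* identifiers as stand-ins:

    struct example_priv {
    	struct net_device_stats stats;
    };

    static struct net_device *example_dev;

    static void example_setup(struct net_device *dev)
    {
    	ether_setup(dev);			/* Ethernet defaults */
    }

    static int __init example_init(void)
    {
    	example_dev = alloc_netdev(sizeof(struct example_priv),
    				   "xnet%d", example_setup);
    	if (!example_dev)
    		return -ENOMEM;
    	if (register_netdev(example_dev)) {	/* expands "xnet%d" to xnet0, ... */
    		free_netdev(example_dev);
    		return -ENODEV;
    	}
    	return 0;
    }

    static void __exit example_exit(void)
    {
    	unregister_netdev(example_dev);
    	free_netdev(example_dev);	/* refcount is zero once unregister returns */
    }
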
  7.3086 +
  7.3087 +/**
  7.3088 + *	free_netdev - free network device
  7.3089 + *	@dev: device
  7.3090 + *
  7.3091 + *	This function does the last stage of destroying an allocated device 
  7.3092 + * 	interface. The reference to the device object is released.  
  7.3093 + *	If this is the last reference then it will be freed.
  7.3094 + */
  7.3095 +void free_netdev(struct net_device *dev)
  7.3096 +{
  7.3097 +#ifdef CONFIG_SYSFS
   7.3098 +	/*  Compatibility with error handling in drivers */
  7.3099 +	if (dev->reg_state == NETREG_UNINITIALIZED) {
  7.3100 +		kfree((char *)dev - dev->padded);
  7.3101 +		return;
  7.3102 +	}
  7.3103 +
  7.3104 +	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
  7.3105 +	dev->reg_state = NETREG_RELEASED;
  7.3106 +
  7.3107 +	/* will free via class release */
  7.3108 +	class_device_put(&dev->class_dev);
  7.3109 +#else
  7.3110 +	kfree((char *)dev - dev->padded);
  7.3111 +#endif
  7.3112 +}
  7.3113 + 
  7.3114 +/* Synchronize with packet receive processing. */
  7.3115 +void synchronize_net(void) 
  7.3116 +{
  7.3117 +	might_sleep();
  7.3118 +	synchronize_kernel();
  7.3119 +}
  7.3120 +
  7.3121 +/**
  7.3122 + *	unregister_netdevice - remove device from the kernel
  7.3123 + *	@dev: device
  7.3124 + *
  7.3125 + *	This function shuts down a device interface and removes it
  7.3126 + *	from the kernel tables. On success 0 is returned, on a failure
  7.3127 + *	a negative errno code is returned.
  7.3128 + *
  7.3129 + *	Callers must hold the rtnl semaphore.  You may want
  7.3130 + *	unregister_netdev() instead of this.
  7.3131 + */
  7.3132 +
  7.3133 +int unregister_netdevice(struct net_device *dev)
  7.3134 +{
  7.3135 +	struct net_device *d, **dp;
  7.3136 +
  7.3137 +	BUG_ON(dev_boot_phase);
  7.3138 +	ASSERT_RTNL();
  7.3139 +
   7.3140 +	/* Some devices call this without ever having registered, to unwind a failed initialization. */
  7.3141 +	if (dev->reg_state == NETREG_UNINITIALIZED) {
  7.3142 +		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
  7.3143 +				  "was registered\n", dev->name, dev);
  7.3144 +		return -ENODEV;
  7.3145 +	}
  7.3146 +
  7.3147 +	BUG_ON(dev->reg_state != NETREG_REGISTERED);
  7.3148 +
  7.3149 +	/* If device is running, close it first. */
  7.3150 +	if (dev->flags & IFF_UP)
  7.3151 +		dev_close(dev);
  7.3152 +
  7.3153 +	/* And unlink it from device chain. */
  7.3154 +	for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
  7.3155 +		if (d == dev) {
  7.3156 +			write_lock_bh(&dev_base_lock);
  7.3157 +			hlist_del(&dev->name_hlist);
  7.3158 +			hlist_del(&dev->index_hlist);
  7.3159 +			if (dev_tail == &dev->next)
  7.3160 +				dev_tail = dp;
  7.3161 +			*dp = d->next;
  7.3162 +			write_unlock_bh(&dev_base_lock);
  7.3163 +			break;
  7.3164 +		}
  7.3165 +	}
  7.3166 +	if (!d) {
  7.3167 +		printk(KERN_ERR "unregister net_device: '%s' not found\n",
  7.3168 +		       dev->name);
  7.3169 +		return -ENODEV;
  7.3170 +	}
  7.3171 +
  7.3172 +	dev->reg_state = NETREG_UNREGISTERING;
  7.3173 +
  7.3174 +	synchronize_net();
  7.3175 +
  7.3176 +	/* Shutdown queueing discipline. */
  7.3177 +	dev_shutdown(dev);
  7.3178 +
  7.3179 +	
   7.3180 +	/* Notify protocols that we are about to destroy
   7.3181 +	   this device. They should clean up all of their state.
  7.3182 +	*/
  7.3183 +	notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
  7.3184 +	
  7.3185 +	/*
  7.3186 +	 *	Flush the multicast chain
  7.3187 +	 */
  7.3188 +	dev_mc_discard(dev);
  7.3189 +
  7.3190 +	if (dev->uninit)
  7.3191 +		dev->uninit(dev);
  7.3192 +
  7.3193 +	/* Notifier chain MUST detach us from master device. */
  7.3194 +	BUG_TRAP(!dev->master);
  7.3195 +
  7.3196 +	free_divert_blk(dev);
  7.3197 +
  7.3198 +	/* Finish processing unregister after unlock */
  7.3199 +	net_set_todo(dev);
  7.3200 +
  7.3201 +	synchronize_net();
  7.3202 +
  7.3203 +	dev_put(dev);
  7.3204 +	return 0;
  7.3205 +}
  7.3206 +
  7.3207 +/**
  7.3208 + *	unregister_netdev - remove device from the kernel
  7.3209 + *	@dev: device
  7.3210 + *
  7.3211 + *	This function shuts down a device interface and removes it
  7.3212 + *	from the kernel tables. On success 0 is returned, on a failure
  7.3213 + *	a negative errno code is returned.
  7.3214 + *
  7.3215 + *	This is just a wrapper for unregister_netdevice that takes
  7.3216 + *	the rtnl semaphore.  In general you want to use this and not
  7.3217 + *	unregister_netdevice.
  7.3218 + */
  7.3219 +void unregister_netdev(struct net_device *dev)
  7.3220 +{
  7.3221 +	rtnl_lock();
  7.3222 +	unregister_netdevice(dev);
  7.3223 +	rtnl_unlock();
  7.3224 +}
  7.3225 +
  7.3226 +EXPORT_SYMBOL(unregister_netdev);
  7.3227 +
  7.3228 +#ifdef CONFIG_HOTPLUG_CPU
  7.3229 +static int dev_cpu_callback(struct notifier_block *nfb,
  7.3230 +			    unsigned long action,
  7.3231 +			    void *ocpu)
  7.3232 +{
  7.3233 +	struct sk_buff **list_skb;
  7.3234 +	struct net_device **list_net;
  7.3235 +	struct sk_buff *skb;
  7.3236 +	unsigned int cpu, oldcpu = (unsigned long)ocpu;
  7.3237 +	struct softnet_data *sd, *oldsd;
  7.3238 +
  7.3239 +	if (action != CPU_DEAD)
  7.3240 +		return NOTIFY_OK;
  7.3241 +
  7.3242 +	local_irq_disable();
  7.3243 +	cpu = smp_processor_id();
  7.3244 +	sd = &per_cpu(softnet_data, cpu);
  7.3245 +	oldsd = &per_cpu(softnet_data, oldcpu);
  7.3246 +
  7.3247 +	/* Find end of our completion_queue. */
  7.3248 +	list_skb = &sd->completion_queue;
  7.3249 +	while (*list_skb)
  7.3250 +		list_skb = &(*list_skb)->next;
  7.3251 +	/* Append completion queue from offline CPU. */
  7.3252 +	*list_skb = oldsd->completion_queue;
  7.3253 +	oldsd->completion_queue = NULL;
  7.3254 +
  7.3255 +	/* Find end of our output_queue. */
  7.3256 +	list_net = &sd->output_queue;
  7.3257 +	while (*list_net)
  7.3258 +		list_net = &(*list_net)->next_sched;
  7.3259 +	/* Append output queue from offline CPU. */
  7.3260 +	*list_net = oldsd->output_queue;
  7.3261 +	oldsd->output_queue = NULL;
  7.3262 +
  7.3263 +	raise_softirq_irqoff(NET_TX_SOFTIRQ);
  7.3264 +	local_irq_enable();
  7.3265 +
  7.3266 +	/* Process offline CPU's input_pkt_queue */
  7.3267 +	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
  7.3268 +		netif_rx(skb);
  7.3269 +
  7.3270 +	return NOTIFY_OK;
  7.3271 +}
  7.3272 +#endif /* CONFIG_HOTPLUG_CPU */
  7.3273 +
  7.3274 +
  7.3275 +/*
  7.3276 + *	Initialize the DEV module. At boot time this walks the device list and
  7.3277 + *	unhooks any devices that fail to initialise (normally hardware not
  7.3278 + *	present) and leaves us with a valid list of present and active devices.
  7.3279 + *
  7.3280 + */
  7.3281 +
  7.3282 +/*
  7.3283 + *       This is called single threaded during boot, so no need
  7.3284 + *       to take the rtnl semaphore.
  7.3285 + */
  7.3286 +static int __init net_dev_init(void)
  7.3287 +{
  7.3288 +	int i, rc = -ENOMEM;
  7.3289 +
  7.3290 +	BUG_ON(!dev_boot_phase);
  7.3291 +
  7.3292 +	net_random_init();
  7.3293 +
  7.3294 +	if (dev_proc_init())
  7.3295 +		goto out;
  7.3296 +
  7.3297 +	if (netdev_sysfs_init())
  7.3298 +		goto out;
  7.3299 +
  7.3300 +	INIT_LIST_HEAD(&ptype_all);
  7.3301 +	for (i = 0; i < 16; i++) 
  7.3302 +		INIT_LIST_HEAD(&ptype_base[i]);
  7.3303 +
  7.3304 +	for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
  7.3305 +		INIT_HLIST_HEAD(&dev_name_head[i]);
  7.3306 +
  7.3307 +	for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
  7.3308 +		INIT_HLIST_HEAD(&dev_index_head[i]);
  7.3309 +
  7.3310 +	/*
  7.3311 +	 *	Initialise the packet receive queues.
  7.3312 +	 */
  7.3313 +
  7.3314 +	for (i = 0; i < NR_CPUS; i++) {
  7.3315 +		struct softnet_data *queue;
  7.3316 +
  7.3317 +		queue = &per_cpu(softnet_data, i);
  7.3318 +		skb_queue_head_init(&queue->input_pkt_queue);
  7.3319 +		queue->throttle = 0;
  7.3320 +		queue->cng_level = 0;
  7.3321 +		queue->avg_blog = 10; /* arbitrary non-zero */
  7.3322 +		queue->completion_queue = NULL;
  7.3323 +		INIT_LIST_HEAD(&queue->poll_list);
  7.3324 +		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
  7.3325 +		queue->backlog_dev.weight = weight_p;
  7.3326 +		queue->backlog_dev.poll = process_backlog;
  7.3327 +		atomic_set(&queue->backlog_dev.refcnt, 1);
  7.3328 +	}
  7.3329 +
  7.3330 +#ifdef OFFLINE_SAMPLE
  7.3331 +	samp_timer.expires = jiffies + (10 * HZ);
  7.3332 +	add_timer(&samp_timer);
  7.3333 +#endif
  7.3334 +
  7.3335 +	dev_boot_phase = 0;
  7.3336 +
  7.3337 +	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
  7.3338 +	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
  7.3339 +
  7.3340 +	hotcpu_notifier(dev_cpu_callback, 0);
  7.3341 +	dst_init();
  7.3342 +	dev_mcast_init();
  7.3343 +	rc = 0;
  7.3344 +out:
  7.3345 +	return rc;
  7.3346 +}
  7.3347 +
  7.3348 +subsys_initcall(net_dev_init);
  7.3349 +
  7.3350 +EXPORT_SYMBOL(__dev_get_by_index);
  7.3351 +EXPORT_SYMBOL(__dev_get_by_name);
  7.3352 +EXPORT_SYMBOL(__dev_remove_pack);
  7.3353 +EXPORT_SYMBOL(__skb_linearize);
  7.3354 +EXPORT_SYMBOL(dev_add_pack);
  7.3355 +EXPORT_SYMBOL(dev_alloc_name);
  7.3356 +EXPORT_SYMBOL(dev_close);
  7.3357 +EXPORT_SYMBOL(dev_get_by_flags);
  7.3358 +EXPORT_SYMBOL(dev_get_by_index);
  7.3359 +EXPORT_SYMBOL(dev_get_by_name);
  7.3360 +EXPORT_SYMBOL(dev_ioctl);
  7.3361 +EXPORT_SYMBOL(dev_open);
  7.3362 +EXPORT_SYMBOL(dev_queue_xmit);
  7.3363 +EXPORT_SYMBOL(dev_remove_pack);
  7.3364 +EXPORT_SYMBOL(dev_set_allmulti);
  7.3365 +EXPORT_SYMBOL(dev_set_promiscuity);
  7.3366 +EXPORT_SYMBOL(dev_change_flags);
  7.3367 +EXPORT_SYMBOL(dev_set_mtu);
  7.3368 +EXPORT_SYMBOL(free_netdev);
  7.3369 +EXPORT_SYMBOL(netdev_boot_setup_check);
  7.3370 +EXPORT_SYMBOL(netdev_set_master);
  7.3371 +EXPORT_SYMBOL(netdev_state_change);
  7.3372 +EXPORT_SYMBOL(netif_receive_skb);
  7.3373 +EXPORT_SYMBOL(netif_rx);
  7.3374 +EXPORT_SYMBOL(register_gifconf);
  7.3375 +EXPORT_SYMBOL(register_netdevice);
  7.3376 +EXPORT_SYMBOL(register_netdevice_notifier);
  7.3377 +EXPORT_SYMBOL(skb_checksum_help);
  7.3378 +EXPORT_SYMBOL(synchronize_net);
  7.3379 +EXPORT_SYMBOL(unregister_netdevice);
  7.3380 +EXPORT_SYMBOL(unregister_netdevice_notifier);
  7.3381 +EXPORT_SYMBOL(net_enable_timestamp);
  7.3382 +EXPORT_SYMBOL(net_disable_timestamp);
  7.3383 +
  7.3384 +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
  7.3385 +EXPORT_SYMBOL(br_handle_frame_hook);
  7.3386 +#endif
  7.3387 +
  7.3388 +#ifdef CONFIG_KMOD
  7.3389 +EXPORT_SYMBOL(dev_load);
  7.3390 +#endif
  7.3391 +
  7.3392 +EXPORT_PER_CPU_SYMBOL(softnet_data);
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/linux-2.6.11-xen-sparse/net/core/skbuff.c	Sat May 21 19:08:56 2005 +0000
     8.3 @@ -0,0 +1,1523 @@
     8.4 +/*
     8.5 + *	Routines having to do with the 'struct sk_buff' memory handlers.
     8.6 + *
     8.7 + *	Authors:	Alan Cox <iiitac@pyr.swan.ac.uk>
     8.8 + *			Florian La Roche <rzsfl@rz.uni-sb.de>
     8.9 + *
    8.10 + *	Version:	$Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $
    8.11 + *
    8.12 + *	Fixes:
    8.13 + *		Alan Cox	:	Fixed the worst of the load
    8.14 + *					balancer bugs.
    8.15 + *		Dave Platt	:	Interrupt stacking fix.
    8.16 + *	Richard Kooijman	:	Timestamp fixes.
    8.17 + *		Alan Cox	:	Changed buffer format.
    8.18 + *		Alan Cox	:	destructor hook for AF_UNIX etc.
    8.19 + *		Linus Torvalds	:	Better skb_clone.
    8.20 + *		Alan Cox	:	Added skb_copy.
    8.21 + *		Alan Cox	:	Added all the changed routines Linus
    8.22 + *					only put in the headers
    8.23 + *		Ray VanTassle	:	Fixed --skb->lock in free
    8.24 + *		Alan Cox	:	skb_copy copy arp field
    8.25 + *		Andi Kleen	:	slabified it.
    8.26 + *		Robert Olsson	:	Removed skb_head_pool
    8.27 + *
    8.28 + *	NOTE:
    8.29 + *		The __skb_ routines should be called with interrupts
    8.30 + *	disabled, or you better be *real* sure that the operation is atomic
    8.31 + *	with respect to whatever list is being frobbed (e.g. via lock_sock()
    8.32 + *	or via disabling bottom half handlers, etc).
    8.33 + *
    8.34 + *	This program is free software; you can redistribute it and/or
    8.35 + *	modify it under the terms of the GNU General Public License
    8.36 + *	as published by the Free Software Foundation; either version
    8.37 + *	2 of the License, or (at your option) any later version.
    8.38 + */
    8.39 +
    8.40 +/*
    8.41 + *	The functions in this file will not compile correctly with gcc 2.4.x
    8.42 + */
    8.43 +
    8.44 +#include <linux/config.h>
    8.45 +#include <linux/module.h>
    8.46 +#include <linux/types.h>
    8.47 +#include <linux/kernel.h>
    8.48 +#include <linux/sched.h>
    8.49 +#include <linux/mm.h>
    8.50 +#include <linux/interrupt.h>
    8.51 +#include <linux/in.h>
    8.52 +#include <linux/inet.h>
    8.53 +#include <linux/slab.h>
    8.54 +#include <linux/netdevice.h>
    8.55 +#ifdef CONFIG_NET_CLS_ACT
    8.56 +#include <net/pkt_sched.h>
    8.57 +#endif
    8.58 +#include <linux/string.h>
    8.59 +#include <linux/skbuff.h>
    8.60 +#include <linux/cache.h>
    8.61 +#include <linux/rtnetlink.h>
    8.62 +#include <linux/init.h>
    8.63 +#include <linux/highmem.h>
    8.64 +
    8.65 +#include <net/protocol.h>
    8.66 +#include <net/dst.h>
    8.67 +#include <net/sock.h>
    8.68 +#include <net/checksum.h>
    8.69 +#include <net/xfrm.h>
    8.70 +
    8.71 +#include <asm/uaccess.h>
    8.72 +#include <asm/system.h>
    8.73 +
    8.74 +static kmem_cache_t *skbuff_head_cache;
    8.75 +
    8.76 +/*
    8.77 + *	Keep out-of-line to prevent kernel bloat.
    8.78 + *	__builtin_return_address is not used because it is not always
    8.79 + *	reliable.
    8.80 + */
    8.81 +
    8.82 +/**
    8.83 + *	skb_over_panic	- 	private function
    8.84 + *	@skb: buffer
    8.85 + *	@sz: size
    8.86 + *	@here: address
    8.87 + *
    8.88 + *	Out of line support code for skb_put(). Not user callable.
    8.89 + */
    8.90 +void skb_over_panic(struct sk_buff *skb, int sz, void *here)
    8.91 +{
     8.92 +	printk(KERN_INFO "skput:over: %p:%d put:%d dev:%s\n",
    8.93 +		here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>");
    8.94 +	BUG();
    8.95 +}
    8.96 +
    8.97 +/**
    8.98 + *	skb_under_panic	- 	private function
    8.99 + *	@skb: buffer
   8.100 + *	@sz: size
   8.101 + *	@here: address
   8.102 + *
   8.103 + *	Out of line support code for skb_push(). Not user callable.
   8.104 + */
   8.105 +
   8.106 +void skb_under_panic(struct sk_buff *skb, int sz, void *here)
   8.107 +{
    8.108 +	printk(KERN_INFO "skput:under: %p:%d put:%d dev:%s\n",
   8.109 +               here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>");
   8.110 +	BUG();
   8.111 +}
   8.112 +
   8.113 +/* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
   8.114 + *	'private' fields and also do memory statistics to find all the
   8.115 + *	[BEEP] leaks.
   8.116 + *
   8.117 + */
   8.118 +
   8.119 +/**
   8.120 + *	alloc_skb	-	allocate a network buffer
   8.121 + *	@size: size to allocate
   8.122 + *	@gfp_mask: allocation mask
   8.123 + *
   8.124 + *	Allocate a new &sk_buff. The returned buffer has no headroom and a
    8.125 + *	tail room of @size bytes. The object has a reference count of one.
   8.126 + *	The return is the buffer. On a failure the return is %NULL.
   8.127 + *
   8.128 + *	Buffers may only be allocated from interrupts using a @gfp_mask of
   8.129 + *	%GFP_ATOMIC.
   8.130 + */
   8.131 +struct sk_buff *alloc_skb(unsigned int size, int gfp_mask)
   8.132 +{
   8.133 +	struct sk_buff *skb;
   8.134 +	u8 *data;
   8.135 +
   8.136 +	/* Get the HEAD */
   8.137 +	skb = kmem_cache_alloc(skbuff_head_cache,
   8.138 +			       gfp_mask & ~__GFP_DMA);
   8.139 +	if (!skb)
   8.140 +		goto out;
   8.141 +
   8.142 +	/* Get the DATA. Size must match skb_add_mtu(). */
   8.143 +	size = SKB_DATA_ALIGN(size);
   8.144 +	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
   8.145 +	if (!data)
   8.146 +		goto nodata;
   8.147 +
   8.148 +	memset(skb, 0, offsetof(struct sk_buff, truesize));
   8.149 +	skb->truesize = size + sizeof(struct sk_buff);
   8.150 +	atomic_set(&skb->users, 1);
   8.151 +	skb->head = data;
   8.152 +	skb->data = data;
   8.153 +	skb->tail = data;
   8.154 +	skb->end  = data + size;
   8.155 +
   8.156 +	atomic_set(&(skb_shinfo(skb)->dataref), 1);
   8.157 +	skb_shinfo(skb)->nr_frags  = 0;
   8.158 +	skb_shinfo(skb)->tso_size = 0;
   8.159 +	skb_shinfo(skb)->tso_segs = 0;
   8.160 +	skb_shinfo(skb)->frag_list = NULL;
   8.161 +out:
   8.162 +	return skb;
   8.163 +nodata:
   8.164 +	kmem_cache_free(skbuff_head_cache, skb);
   8.165 +	skb = NULL;
   8.166 +	goto out;
   8.167 +}
   8.168 +
   8.169 +/**
   8.170 + *	alloc_skb_from_cache	-	allocate a network buffer
   8.171 + *	@cp: kmem_cache from which to allocate the data area
   8.172 + *           (object size must be big enough for @size bytes + skb overheads)
   8.173 + *	@size: size to allocate
   8.174 + *	@gfp_mask: allocation mask
   8.175 + *
   8.176 + *	Allocate a new &sk_buff. The returned buffer has no headroom and
    8.177 + *	tail room of @size bytes. The object has a reference count of one.
   8.178 + *	The return is the buffer. On a failure the return is %NULL.
   8.179 + *
   8.180 + *	Buffers may only be allocated from interrupts using a @gfp_mask of
   8.181 + *	%GFP_ATOMIC.
   8.182 + */
   8.183 +struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
   8.184 +				     unsigned int size, int gfp_mask)
   8.185 +{
   8.186 +	struct sk_buff *skb;
   8.187 +	u8 *data;
   8.188 +
   8.189 +	/* Get the HEAD */
   8.190 +	skb = kmem_cache_alloc(skbuff_head_cache,
   8.191 +			       gfp_mask & ~__GFP_DMA);
   8.192 +	if (!skb)
   8.193 +		goto out;
   8.194 +
   8.195 +	/* Get the DATA. */
   8.196 +	size = SKB_DATA_ALIGN(size);
   8.197 +	data = kmem_cache_alloc(cp, gfp_mask);
   8.198 +	if (!data)
   8.199 +		goto nodata;
   8.200 +
   8.201 +	memset(skb, 0, offsetof(struct sk_buff, truesize));
   8.202 +	skb->truesize = size + sizeof(struct sk_buff);
   8.203 +	atomic_set(&skb->users, 1);
   8.204 +	skb->head = data;
   8.205 +	skb->data = data;
   8.206 +	skb->tail = data;
   8.207 +	skb->end  = data + size;
   8.208 +
   8.209 +	atomic_set(&(skb_shinfo(skb)->dataref), 1);
   8.210 +	skb_shinfo(skb)->nr_frags  = 0;
   8.211 +	skb_shinfo(skb)->tso_size = 0;
   8.212 +	skb_shinfo(skb)->tso_segs = 0;
   8.213 +	skb_shinfo(skb)->frag_list = NULL;
   8.214 +out:
   8.215 +	return skb;
   8.216 +nodata:
   8.217 +	kmem_cache_free(skbuff_head_cache, skb);
   8.218 +	skb = NULL;
   8.219 +	goto out;
   8.220 +}
   8.221 +
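
Both allocators hand back a buffer with no headroom, so callers normally reserve header space right away. A minimal, hypothetical sketch of that pattern (the 16-byte reservation and the helper name are assumptions, not part of this changeset):

#include <linux/skbuff.h>

/* Hypothetical helper: allocate a receive buffer and reserve a little
 * headroom for the link-layer header before any data is placed in it. */
static struct sk_buff *example_alloc_rx_buf(unsigned int len)
{
	struct sk_buff *skb = alloc_skb(len + 16, GFP_ATOMIC);

	if (skb != NULL)
		skb_reserve(skb, 16);	/* leave 16 bytes of headroom */
	return skb;
}
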
   8.222 +
   8.223 +static void skb_drop_fraglist(struct sk_buff *skb)
   8.224 +{
   8.225 +	struct sk_buff *list = skb_shinfo(skb)->frag_list;
   8.226 +
   8.227 +	skb_shinfo(skb)->frag_list = NULL;
   8.228 +
   8.229 +	do {
   8.230 +		struct sk_buff *this = list;
   8.231 +		list = list->next;
   8.232 +		kfree_skb(this);
   8.233 +	} while (list);
   8.234 +}
   8.235 +
   8.236 +static void skb_clone_fraglist(struct sk_buff *skb)
   8.237 +{
   8.238 +	struct sk_buff *list;
   8.239 +
   8.240 +	for (list = skb_shinfo(skb)->frag_list; list; list = list->next)
   8.241 +		skb_get(list);
   8.242 +}
   8.243 +
   8.244 +void skb_release_data(struct sk_buff *skb)
   8.245 +{
   8.246 +	if (!skb->cloned ||
   8.247 +	    atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
   8.248 +		if (skb_shinfo(skb)->nr_frags) {
   8.249 +			int i;
   8.250 +			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
   8.251 +				put_page(skb_shinfo(skb)->frags[i].page);
   8.252 +		}
   8.253 +
   8.254 +		if (skb_shinfo(skb)->frag_list)
   8.255 +			skb_drop_fraglist(skb);
   8.256 +
   8.257 +		kfree(skb->head);
   8.258 +	}
   8.259 +}
   8.260 +
   8.261 +/*
   8.262 + *	Free an skbuff by memory without cleaning the state.
   8.263 + */
   8.264 +void kfree_skbmem(struct sk_buff *skb)
   8.265 +{
   8.266 +	skb_release_data(skb);
   8.267 +	kmem_cache_free(skbuff_head_cache, skb);
   8.268 +}
   8.269 +
   8.270 +/**
   8.271 + *	__kfree_skb - private function
   8.272 + *	@skb: buffer
   8.273 + *
   8.274 + *	Free an sk_buff. Release anything attached to the buffer.
   8.275 + *	Clean the state. This is an internal helper function. Users should
   8.276 + *	always call kfree_skb
   8.277 + */
   8.278 +
   8.279 +void __kfree_skb(struct sk_buff *skb)
   8.280 +{
   8.281 +	if (skb->list) {
   8.282 +	 	printk(KERN_WARNING "Warning: kfree_skb passed an skb still "
   8.283 +		       "on a list (from %p).\n", NET_CALLER(skb));
   8.284 +		BUG();
   8.285 +	}
   8.286 +
   8.287 +	dst_release(skb->dst);
   8.288 +#ifdef CONFIG_XFRM
   8.289 +	secpath_put(skb->sp);
   8.290 +#endif
   8.291 +	if(skb->destructor) {
   8.292 +		if (in_irq())
   8.293 +			printk(KERN_WARNING "Warning: kfree_skb on "
   8.294 +					    "hard IRQ %p\n", NET_CALLER(skb));
   8.295 +		skb->destructor(skb);
   8.296 +	}
   8.297 +#ifdef CONFIG_NETFILTER
   8.298 +	nf_conntrack_put(skb->nfct);
   8.299 +#ifdef CONFIG_BRIDGE_NETFILTER
   8.300 +	nf_bridge_put(skb->nf_bridge);
   8.301 +#endif
   8.302 +#endif
    8.303 +/* XXX: Is this still necessary? - JHS */
   8.304 +#ifdef CONFIG_NET_SCHED
   8.305 +	skb->tc_index = 0;
   8.306 +#ifdef CONFIG_NET_CLS_ACT
   8.307 +	skb->tc_verd = 0;
   8.308 +	skb->tc_classid = 0;
   8.309 +#endif
   8.310 +#endif
   8.311 +
   8.312 +	kfree_skbmem(skb);
   8.313 +}
   8.314 +
   8.315 +/**
   8.316 + *	skb_clone	-	duplicate an sk_buff
   8.317 + *	@skb: buffer to clone
   8.318 + *	@gfp_mask: allocation priority
   8.319 + *
   8.320 + *	Duplicate an &sk_buff. The new one is not owned by a socket. Both
   8.321 + *	copies share the same packet data but not structure. The new
    8.322 + *	buffer has a reference count of 1. If the allocation fails, the
    8.323 + *	function returns %NULL; otherwise the new buffer is returned.
    8.324 + *
    8.325 + *	If this function is called from an interrupt, @gfp_mask must be
    8.326 + *	%GFP_ATOMIC.
   8.327 + */
   8.328 +
   8.329 +struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
   8.330 +{
   8.331 +	struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
   8.332 +
   8.333 +	if (!n) 
   8.334 +		return NULL;
   8.335 +
   8.336 +#define C(x) n->x = skb->x
   8.337 +
   8.338 +	n->next = n->prev = NULL;
   8.339 +	n->list = NULL;
   8.340 +	n->sk = NULL;
   8.341 +	C(stamp);
   8.342 +	C(dev);
   8.343 +	C(real_dev);
   8.344 +	C(h);
   8.345 +	C(nh);
   8.346 +	C(mac);
   8.347 +	C(dst);
   8.348 +	dst_clone(skb->dst);
   8.349 +	C(sp);
   8.350 +#ifdef CONFIG_INET
   8.351 +	secpath_get(skb->sp);
   8.352 +#endif
   8.353 +	memcpy(n->cb, skb->cb, sizeof(skb->cb));
   8.354 +	C(len);
   8.355 +	C(data_len);
   8.356 +	C(csum);
   8.357 +	C(local_df);
   8.358 +	n->cloned = 1;
   8.359 +	C(proto_csum_valid);
   8.360 +	C(proto_csum_blank);
   8.361 +	C(pkt_type);
   8.362 +	C(ip_summed);
   8.363 +	C(priority);
   8.364 +	C(protocol);
   8.365 +	C(security);
   8.366 +	n->destructor = NULL;
   8.367 +#ifdef CONFIG_NETFILTER
   8.368 +	C(nfmark);
   8.369 +	C(nfcache);
   8.370 +	C(nfct);
   8.371 +	nf_conntrack_get(skb->nfct);
   8.372 +	C(nfctinfo);
   8.373 +#ifdef CONFIG_NETFILTER_DEBUG
   8.374 +	C(nf_debug);
   8.375 +#endif
   8.376 +#ifdef CONFIG_BRIDGE_NETFILTER
   8.377 +	C(nf_bridge);
   8.378 +	nf_bridge_get(skb->nf_bridge);
   8.379 +#endif
   8.380 +#endif /*CONFIG_NETFILTER*/
   8.381 +#if defined(CONFIG_HIPPI)
   8.382 +	C(private);
   8.383 +#endif
   8.384 +#ifdef CONFIG_NET_SCHED
   8.385 +	C(tc_index);
   8.386 +#ifdef CONFIG_NET_CLS_ACT
   8.387 +	n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
   8.388 +	n->tc_verd = CLR_TC_OK2MUNGE(skb->tc_verd);
   8.389 +	n->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
   8.390 +	C(input_dev);
   8.391 +	C(tc_classid);
   8.392 +#endif
   8.393 +
   8.394 +#endif
   8.395 +	C(truesize);
   8.396 +	atomic_set(&n->users, 1);
   8.397 +	C(head);
   8.398 +	C(data);
   8.399 +	C(tail);
   8.400 +	C(end);
   8.401 +
   8.402 +	atomic_inc(&(skb_shinfo(skb)->dataref));
   8.403 +	skb->cloned = 1;
   8.404 +
   8.405 +	return n;
   8.406 +}
   8.407 +
   8.408 +static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
   8.409 +{
   8.410 +	/*
   8.411 +	 *	Shift between the two data areas in bytes
   8.412 +	 */
   8.413 +	unsigned long offset = new->data - old->data;
   8.414 +
   8.415 +	new->list	= NULL;
   8.416 +	new->sk		= NULL;
   8.417 +	new->dev	= old->dev;
   8.418 +	new->real_dev	= old->real_dev;
   8.419 +	new->priority	= old->priority;
   8.420 +	new->protocol	= old->protocol;
   8.421 +	new->dst	= dst_clone(old->dst);
   8.422 +#ifdef CONFIG_INET
   8.423 +	new->sp		= secpath_get(old->sp);
   8.424 +#endif
   8.425 +	new->h.raw	= old->h.raw + offset;
   8.426 +	new->nh.raw	= old->nh.raw + offset;
   8.427 +	new->mac.raw	= old->mac.raw + offset;
   8.428 +	memcpy(new->cb, old->cb, sizeof(old->cb));
   8.429 +	new->local_df	= old->local_df;
   8.430 +	new->pkt_type	= old->pkt_type;
   8.431 +	new->stamp	= old->stamp;
   8.432 +	new->destructor = NULL;
   8.433 +	new->security	= old->security;
   8.434 +#ifdef CONFIG_NETFILTER
   8.435 +	new->nfmark	= old->nfmark;
   8.436 +	new->nfcache	= old->nfcache;
   8.437 +	new->nfct	= old->nfct;
   8.438 +	nf_conntrack_get(old->nfct);
   8.439 +	new->nfctinfo	= old->nfctinfo;
   8.440 +#ifdef CONFIG_NETFILTER_DEBUG
   8.441 +	new->nf_debug	= old->nf_debug;
   8.442 +#endif
   8.443 +#ifdef CONFIG_BRIDGE_NETFILTER
   8.444 +	new->nf_bridge	= old->nf_bridge;
   8.445 +	nf_bridge_get(old->nf_bridge);
   8.446 +#endif
   8.447 +#endif
   8.448 +#ifdef CONFIG_NET_SCHED
   8.449 +#ifdef CONFIG_NET_CLS_ACT
   8.450 +	new->tc_verd = old->tc_verd;
   8.451 +#endif
   8.452 +	new->tc_index	= old->tc_index;
   8.453 +#endif
   8.454 +	atomic_set(&new->users, 1);
   8.455 +	skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
   8.456 +	skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
   8.457 +}
   8.458 +
   8.459 +/**
   8.460 + *	skb_copy	-	create private copy of an sk_buff
   8.461 + *	@skb: buffer to copy
   8.462 + *	@gfp_mask: allocation priority
   8.463 + *
   8.464 + *	Make a copy of both an &sk_buff and its data. This is used when the
   8.465 + *	caller wishes to modify the data and needs a private copy of the
   8.466 + *	data to alter. Returns %NULL on failure or the pointer to the buffer
   8.467 + *	on success. The returned buffer has a reference count of 1.
   8.468 + *
    8.469 + *	As a by-product this function converts a non-linear &sk_buff to a
    8.470 + *	linear one, so that the &sk_buff becomes completely private and the
    8.471 + *	caller is allowed to modify all the data of the returned buffer. This
    8.472 + *	means that this function is not recommended for use when only the
    8.473 + *	header is going to be modified. Use pskb_copy() instead.
   8.474 + */
   8.475 +
   8.476 +struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
   8.477 +{
   8.478 +	int headerlen = skb->data - skb->head;
   8.479 +	/*
   8.480 +	 *	Allocate the copy buffer
   8.481 +	 */
   8.482 +	struct sk_buff *n = alloc_skb(skb->end - skb->head + skb->data_len,
   8.483 +				      gfp_mask);
   8.484 +	if (!n)
   8.485 +		return NULL;
   8.486 +
   8.487 +	/* Set the data pointer */
   8.488 +	skb_reserve(n, headerlen);
   8.489 +	/* Set the tail pointer and length */
   8.490 +	skb_put(n, skb->len);
   8.491 +	n->csum	     = skb->csum;
   8.492 +	n->ip_summed = skb->ip_summed;
   8.493 +
   8.494 +	if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
   8.495 +		BUG();
   8.496 +
   8.497 +	copy_skb_header(n, skb);
   8.498 +	return n;
   8.499 +}
   8.500 +
   8.501 +
   8.502 +/**
   8.503 + *	pskb_copy	-	create copy of an sk_buff with private head.
   8.504 + *	@skb: buffer to copy
   8.505 + *	@gfp_mask: allocation priority
   8.506 + *
    8.507 + *	Make a copy of both an &sk_buff and part of its data, located
    8.508 + *	in the header. Fragmented data remain shared. This is used when
    8.509 + *	the caller wishes to modify only the header of the &sk_buff and
    8.510 + *	needs a private copy of the header to alter. Returns %NULL on failure
   8.511 + *	or the pointer to the buffer on success.
   8.512 + *	The returned buffer has a reference count of 1.
   8.513 + */
   8.514 +
   8.515 +struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask)
   8.516 +{
   8.517 +	/*
   8.518 +	 *	Allocate the copy buffer
   8.519 +	 */
   8.520 +	struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask);
   8.521 +
   8.522 +	if (!n)
   8.523 +		goto out;
   8.524 +
   8.525 +	/* Set the data pointer */
   8.526 +	skb_reserve(n, skb->data - skb->head);
   8.527 +	/* Set the tail pointer and length */
   8.528 +	skb_put(n, skb_headlen(skb));
   8.529 +	/* Copy the bytes */
   8.530 +	memcpy(n->data, skb->data, n->len);
   8.531 +	n->csum	     = skb->csum;
   8.532 +	n->ip_summed = skb->ip_summed;
   8.533 +
   8.534 +	n->data_len  = skb->data_len;
   8.535 +	n->len	     = skb->len;
   8.536 +
   8.537 +	if (skb_shinfo(skb)->nr_frags) {
   8.538 +		int i;
   8.539 +
   8.540 +		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
   8.541 +			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
   8.542 +			get_page(skb_shinfo(n)->frags[i].page);
   8.543 +		}
   8.544 +		skb_shinfo(n)->nr_frags = i;
   8.545 +	}
   8.546 +
   8.547 +	if (skb_shinfo(skb)->frag_list) {
   8.548 +		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
   8.549 +		skb_clone_fraglist(n);
   8.550 +	}
   8.551 +
   8.552 +	copy_skb_header(n, skb);
   8.553 +out:
   8.554 +	return n;
   8.555 +}
   8.556 +
   8.557 +/**
   8.558 + *	pskb_expand_head - reallocate header of &sk_buff
   8.559 + *	@skb: buffer to reallocate
   8.560 + *	@nhead: room to add at head
   8.561 + *	@ntail: room to add at tail
   8.562 + *	@gfp_mask: allocation priority
   8.563 + *
    8.564 + *	Expands (or creates an identical copy, if @nhead and @ntail are zero)
    8.565 + *	the header of the skb. The &sk_buff itself is not changed and MUST have
    8.566 + *	a reference count of 1. Returns zero on success, or an error code if
    8.567 + *	the expansion failed; in the latter case the &sk_buff is left unchanged.
   8.568 + *
   8.569 + *	All the pointers pointing into skb header may change and must be
   8.570 + *	reloaded after call to this function.
   8.571 + */
   8.572 +
   8.573 +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask)
   8.574 +{
   8.575 +	int i;
   8.576 +	u8 *data;
   8.577 +	int size = nhead + (skb->end - skb->head) + ntail;
   8.578 +	long off;
   8.579 +
   8.580 +	if (skb_shared(skb))
   8.581 +		BUG();
   8.582 +
   8.583 +	size = SKB_DATA_ALIGN(size);
   8.584 +
   8.585 +	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
   8.586 +	if (!data)
   8.587 +		goto nodata;
   8.588 +
   8.589 +	/* Copy only real data... and, alas, header. This should be
   8.590 +	 * optimized for the cases when header is void. */
   8.591 +	memcpy(data + nhead, skb->head, skb->tail - skb->head);
   8.592 +	memcpy(data + size, skb->end, sizeof(struct skb_shared_info));
   8.593 +
   8.594 +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
   8.595 +		get_page(skb_shinfo(skb)->frags[i].page);
   8.596 +
   8.597 +	if (skb_shinfo(skb)->frag_list)
   8.598 +		skb_clone_fraglist(skb);
   8.599 +
   8.600 +	skb_release_data(skb);
   8.601 +
   8.602 +	off = (data + nhead) - skb->head;
   8.603 +
   8.604 +	skb->head     = data;
   8.605 +	skb->end      = data + size;
   8.606 +	skb->data    += off;
   8.607 +	skb->tail    += off;
   8.608 +	skb->mac.raw += off;
   8.609 +	skb->h.raw   += off;
   8.610 +	skb->nh.raw  += off;
   8.611 +	skb->cloned   = 0;
   8.612 +	atomic_set(&skb_shinfo(skb)->dataref, 1);
   8.613 +	return 0;
   8.614 +
   8.615 +nodata:
   8.616 +	return -ENOMEM;
   8.617 +}
   8.618 +
   8.619 +/* Make private copy of skb with writable head and some headroom */
   8.620 +
   8.621 +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
   8.622 +{
   8.623 +	struct sk_buff *skb2;
   8.624 +	int delta = headroom - skb_headroom(skb);
   8.625 +
   8.626 +	if (delta <= 0)
   8.627 +		skb2 = pskb_copy(skb, GFP_ATOMIC);
   8.628 +	else {
   8.629 +		skb2 = skb_clone(skb, GFP_ATOMIC);
   8.630 +		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
   8.631 +					     GFP_ATOMIC)) {
   8.632 +			kfree_skb(skb2);
   8.633 +			skb2 = NULL;
   8.634 +		}
   8.635 +	}
   8.636 +	return skb2;
   8.637 +}
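
The usual pattern around skb_realloc_headroom is shown in the hedged sketch below: if the buffer cannot take another header, obtain a private copy with more headroom, drop the original reference, and only then push the new header (the 8-byte header size and the helper name are arbitrary assumptions):

#include <linux/skbuff.h>
#include <linux/string.h>

/* Illustrative only: prepend an 8-byte encapsulation header, expanding
 * the head first if the skb does not have enough headroom. */
static struct sk_buff *example_push_header(struct sk_buff *skb)
{
	if (skb_headroom(skb) < 8) {
		struct sk_buff *nskb = skb_realloc_headroom(skb, 8);

		kfree_skb(skb);		/* drop our reference to the original */
		if (nskb == NULL)
			return NULL;
		skb = nskb;
	}
	memset(skb_push(skb, 8), 0, 8);	/* zero-filled dummy header */
	return skb;
}
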
   8.638 +
   8.639 +
   8.640 +/**
   8.641 + *	skb_copy_expand	-	copy and expand sk_buff
   8.642 + *	@skb: buffer to copy
   8.643 + *	@newheadroom: new free bytes at head
   8.644 + *	@newtailroom: new free bytes at tail
   8.645 + *	@gfp_mask: allocation priority
   8.646 + *
   8.647 + *	Make a copy of both an &sk_buff and its data and while doing so
   8.648 + *	allocate additional space.
   8.649 + *
   8.650 + *	This is used when the caller wishes to modify the data and needs a
   8.651 + *	private copy of the data to alter as well as more space for new fields.
   8.652 + *	Returns %NULL on failure or the pointer to the buffer
   8.653 + *	on success. The returned buffer has a reference count of 1.
   8.654 + *
   8.655 + *	You must pass %GFP_ATOMIC as the allocation priority if this function
   8.656 + *	is called from an interrupt.
   8.657 + *
   8.658 + *	BUG ALERT: ip_summed is not copied. Why does this work? Is it used
   8.659 + *	only by netfilter in the cases when checksum is recalculated? --ANK
   8.660 + */
   8.661 +struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
   8.662 +				int newheadroom, int newtailroom, int gfp_mask)
   8.663 +{
   8.664 +	/*
   8.665 +	 *	Allocate the copy buffer
   8.666 +	 */
   8.667 +	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
   8.668 +				      gfp_mask);
   8.669 +	int head_copy_len, head_copy_off;
   8.670 +
   8.671 +	if (!n)
   8.672 +		return NULL;
   8.673 +
   8.674 +	skb_reserve(n, newheadroom);
   8.675 +
   8.676 +	/* Set the tail pointer and length */
   8.677 +	skb_put(n, skb->len);
   8.678 +
   8.679 +	head_copy_len = skb_headroom(skb);
   8.680 +	head_copy_off = 0;
   8.681 +	if (newheadroom <= head_copy_len)
   8.682 +		head_copy_len = newheadroom;
   8.683 +	else
   8.684 +		head_copy_off = newheadroom - head_copy_len;
   8.685 +
   8.686 +	/* Copy the linear header and data. */
   8.687 +	if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
   8.688 +			  skb->len + head_copy_len))
   8.689 +		BUG();
   8.690 +
   8.691 +	copy_skb_header(n, skb);
   8.692 +
   8.693 +	return n;
   8.694 +}
   8.695 +
   8.696 +/**
   8.697 + *	skb_pad			-	zero pad the tail of an skb
   8.698 + *	@skb: buffer to pad
   8.699 + *	@pad: space to pad
   8.700 + *
   8.701 + *	Ensure that a buffer is followed by a padding area that is zero
   8.702 + *	filled. Used by network drivers which may DMA or transfer data
   8.703 + *	beyond the buffer end onto the wire.
   8.704 + *
    8.705 + *	May return %NULL in out-of-memory cases, in which case the buffer is freed.
   8.706 + */
   8.707 + 
   8.708 +struct sk_buff *skb_pad(struct sk_buff *skb, int pad)
   8.709 +{
   8.710 +	struct sk_buff *nskb;
   8.711 +	
   8.712 +	/* If the skbuff is non linear tailroom is always zero.. */
   8.713 +	if (skb_tailroom(skb) >= pad) {
   8.714 +		memset(skb->data+skb->len, 0, pad);
   8.715 +		return skb;
   8.716 +	}
   8.717 +	
   8.718 +	nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC);
   8.719 +	kfree_skb(skb);
   8.720 +	if (nskb)
   8.721 +		memset(nskb->data+nskb->len, 0, pad);
   8.722 +	return nskb;
   8.723 +}	
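
Because skb_pad frees the buffer when it has to reallocate and that reallocation fails, callers must use its return value rather than the original pointer. A hedged transmit-side sketch, with ETH_ZLEN standing in for the minimum frame length of a hypothetical driver:

#include <linux/skbuff.h>
#include <linux/if_ether.h>

/* Illustrative only: pad short frames with zeroes before transmission. */
static struct sk_buff *example_pad_short_frame(struct sk_buff *skb)
{
	int pad = ETH_ZLEN - skb->len;

	if (pad > 0) {
		skb = skb_pad(skb, pad);	/* may free skb and return NULL */
		if (skb != NULL)
			skb_put(skb, pad);	/* account for the padding */
	}
	return skb;
}
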
   8.724 + 
    8.725 +/* Trims skb to length len. It can change skb pointers if "realloc" is 1.
    8.726 + * If realloc==0 and trimming is impossible without changing the data,
    8.727 + * it is a BUG().
   8.728 + */
   8.729 +
   8.730 +int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc)
   8.731 +{
   8.732 +	int offset = skb_headlen(skb);
   8.733 +	int nfrags = skb_shinfo(skb)->nr_frags;
   8.734 +	int i;
   8.735 +
   8.736 +	for (i = 0; i < nfrags; i++) {
   8.737 +		int end = offset + skb_shinfo(skb)->frags[i].size;
   8.738 +		if (end > len) {
   8.739 +			if (skb_cloned(skb)) {
   8.740 +				if (!realloc)
   8.741 +					BUG();
   8.742 +				if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
   8.743 +					return -ENOMEM;
   8.744 +			}
   8.745 +			if (len <= offset) {
   8.746 +				put_page(skb_shinfo(skb)->frags[i].page);
   8.747 +				skb_shinfo(skb)->nr_frags--;
   8.748 +			} else {
   8.749 +				skb_shinfo(skb)->frags[i].size = len - offset;
   8.750 +			}
   8.751 +		}
   8.752 +		offset = end;
   8.753 +	}
   8.754 +
   8.755 +	if (offset < len) {
   8.756 +		skb->data_len -= skb->len - len;
   8.757 +		skb->len       = len;
   8.758 +	} else {
   8.759 +		if (len <= skb_headlen(skb)) {
   8.760 +			skb->len      = len;
   8.761 +			skb->data_len = 0;
   8.762 +			skb->tail     = skb->data + len;
   8.763 +			if (skb_shinfo(skb)->frag_list && !skb_cloned(skb))
   8.764 +				skb_drop_fraglist(skb);
   8.765 +		} else {
   8.766 +			skb->data_len -= skb->len - len;
   8.767 +			skb->len       = len;
   8.768 +		}
   8.769 +	}
   8.770 +
   8.771 +	return 0;
   8.772 +}
   8.773 +
   8.774 +/**
   8.775 + *	__pskb_pull_tail - advance tail of skb header
   8.776 + *	@skb: buffer to reallocate
   8.777 + *	@delta: number of bytes to advance tail
   8.778 + *
    8.779 + *	The function only makes sense on a fragmented &sk_buff: it expands
    8.780 + *	the header, moving its tail forward and copying the necessary
    8.781 + *	data from the fragmented part.
   8.782 + *
   8.783 + *	&sk_buff MUST have reference count of 1.
   8.784 + *
    8.785 + *	Returns %NULL (and the &sk_buff is unchanged) if the pull failed,
    8.786 + *	or the value of the new tail of the skb on success.
   8.787 + *
   8.788 + *	All the pointers pointing into skb header may change and must be
   8.789 + *	reloaded after call to this function.
   8.790 + */
   8.791 +
   8.792 +/* Moves tail of skb head forward, copying data from fragmented part,
   8.793 + * when it is necessary.
   8.794 + * 1. It may fail due to malloc failure.
   8.795 + * 2. It may change skb pointers.
   8.796 + *
   8.797 + * It is pretty complicated. Luckily, it is called only in exceptional cases.
   8.798 + */
   8.799 +unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
   8.800 +{
    8.801 +	/* If the skb does not have enough free space at the tail, get new
    8.802 +	 * storage, plus 128 bytes for future expansion. If there is enough
    8.803 +	 * tail room, reallocate without expansion only if the skb is cloned.
   8.804 +	 */
   8.805 +	int i, k, eat = (skb->tail + delta) - skb->end;
   8.806 +
   8.807 +	if (eat > 0 || skb_cloned(skb)) {
   8.808 +		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
   8.809 +				     GFP_ATOMIC))
   8.810 +			return NULL;
   8.811 +	}
   8.812 +
   8.813 +	if (skb_copy_bits(skb, skb_headlen(skb), skb->tail, delta))
   8.814 +		BUG();
   8.815 +
    8.816 +	/* Optimization: no fragments, so no reason to pre-estimate the
    8.817 +	 * size of the pulled pages. Superb.
   8.818 +	 */
   8.819 +	if (!skb_shinfo(skb)->frag_list)
   8.820 +		goto pull_pages;
   8.821 +
   8.822 +	/* Estimate size of pulled pages. */
   8.823 +	eat = delta;
   8.824 +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
   8.825 +		if (skb_shinfo(skb)->frags[i].size >= eat)
   8.826 +			goto pull_pages;
   8.827 +		eat -= skb_shinfo(skb)->frags[i].size;
   8.828 +	}
   8.829 +
    8.830 +	/* If we need to update the frag list, we are in trouble.
    8.831 +	 * Certainly, it is possible to add an offset to the skb data,
    8.832 +	 * but given that pulling is expected to be a very rare
    8.833 +	 * operation, it is worth fighting further bloat of the skb
    8.834 +	 * head and doing the painful work here instead.
    8.835 +	 * Pure masochism, indeed. 8)8)
   8.836 +	 */
   8.837 +	if (eat) {
   8.838 +		struct sk_buff *list = skb_shinfo(skb)->frag_list;
   8.839 +		struct sk_buff *clone = NULL;
   8.840 +		struct sk_buff *insp = NULL;
   8.841 +
   8.842 +		do {
   8.843 +			if (!list)
   8.844 +				BUG();
   8.845 +
   8.846 +			if (list->len <= eat) {
   8.847 +				/* Eaten as whole. */
   8.848 +				eat -= list->len;
   8.849 +				list = list->next;
   8.850 +				insp = list;
   8.851 +			} else {
   8.852 +				/* Eaten partially. */
   8.853 +
   8.854 +				if (skb_shared(list)) {
   8.855 +					/* Sucks! We need to fork list. :-( */
   8.856 +					clone = skb_clone(list, GFP_ATOMIC);
   8.857 +					if (!clone)
   8.858 +						return NULL;
   8.859 +					insp = list->next;
   8.860 +					list = clone;
   8.861 +				} else {
   8.862 +					/* This may be pulled without
   8.863 +					 * problems. */
   8.864 +					insp = list;
   8.865 +				}
   8.866 +				if (!pskb_pull(list, eat)) {
   8.867 +					if (clone)
   8.868 +						kfree_skb(clone);
   8.869 +					return NULL;
   8.870 +				}
   8.871 +				break;
   8.872 +			}
   8.873 +		} while (eat);
   8.874 +
   8.875 +		/* Free pulled out fragments. */
   8.876 +		while ((list = skb_shinfo(skb)->frag_list) != insp) {
   8.877 +			skb_shinfo(skb)->frag_list = list->next;
   8.878 +			kfree_skb(list);
   8.879 +		}
   8.880 +		/* And insert new clone at head. */
   8.881 +		if (clone) {
   8.882 +			clone->next = list;
   8.883 +			skb_shinfo(skb)->frag_list = clone;
   8.884 +		}
   8.885 +	}
   8.886 +	/* Success! Now we may commit changes to skb data. */
   8.887 +
   8.888 +pull_pages:
   8.889 +	eat = delta;
   8.890 +	k = 0;
   8.891 +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
   8.892 +		if (skb_shinfo(skb)->frags[i].size <= eat) {
   8.893 +			put_page(skb_shinfo(skb)->frags[i].page);
   8.894 +			eat -= skb_shinfo(skb)->frags[i].size;
   8.895 +		} else {
   8.896 +			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
   8.897 +			if (eat) {
   8.898 +				skb_shinfo(skb)->frags[k].page_offset += eat;
   8.899 +				skb_shinfo(skb)->frags[k].size -= eat;
   8.900 +				eat = 0;
   8.901 +			}
   8.902 +			k++;
   8.903 +		}
   8.904 +	}
   8.905 +	skb_shinfo(skb)->nr_frags = k;
   8.906 +
   8.907 +	skb->tail     += delta;
   8.908 +	skb->data_len -= delta;
   8.909 +
   8.910 +	return skb->tail;
   8.911 +}
   8.912 +
   8.913 +/* Copy some data bits from skb to kernel buffer. */
   8.914 +
   8.915 +int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
   8.916 +{
   8.917 +	int i, copy;
   8.918 +	int start = skb_headlen(skb);
   8.919 +
   8.920 +	if (offset > (int)skb->len - len)
   8.921 +		goto fault;
   8.922 +
   8.923 +	/* Copy header. */
   8.924 +	if ((copy = start - offset) > 0) {
   8.925 +		if (copy > len)
   8.926 +			copy = len;
   8.927 +		memcpy(to, skb->data + offset, copy);
   8.928 +		if ((len -= copy) == 0)
   8.929 +			return 0;
   8.930 +		offset += copy;
   8.931 +		to     += copy;
   8.932 +	}
   8.933 +
   8.934 +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
   8.935 +		int end;
   8.936 +
   8.937 +		BUG_TRAP(start <= offset + len);
   8.938 +
   8.939 +		end = start + skb_shinfo(skb)->frags[i].size;
   8.940 +		if ((copy = end - offset) > 0) {
   8.941 +			u8 *vaddr;
   8.942 +
   8.943 +			if (copy > len)
   8.944 +				copy = len;
   8.945 +
   8.946 +			vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
   8.947 +			memcpy(to,
   8.948 +			       vaddr + skb_shinfo(skb)->frags[i].page_offset+
   8.949 +			       offset - start, copy);
   8.950 +			kunmap_skb_frag(vaddr);
   8.951 +
   8.952 +			if ((len -= copy) == 0)
   8.953 +				return 0;
   8.954 +			offset += copy;
   8.955 +			to     += copy;
   8.956 +		}
   8.957 +		start = end;
   8.958 +	}
   8.959 +
   8.960 +	if (skb_shinfo(skb)->frag_list) {
   8.961 +		struct sk_buff *list = skb_shinfo(skb)->frag_list;
   8.962 +
   8.963 +		for (; list; list = list->next) {
   8.964 +			int end;
   8.965 +
   8.966 +			BUG_TRAP(start <= offset + len);
   8.967 +
   8.968 +			end = start + list->len;
   8.969 +			if ((copy = end - offset) > 0) {
   8.970 +				if (copy > len)
   8.971 +					copy = len;
   8.972 +				if (skb_copy_bits(list, offset - start,
   8.973 +						  to, copy))
   8.974 +					goto fault;
   8.975 +				if ((len -= copy) == 0)
   8.976 +					return 0;
   8.977 +				offset += copy;
   8.978 +				to     += copy;
   8.979 +			}
   8.980 +			start = end;
   8.981 +		}
   8.982 +	}
   8.983 +	if (!len)
   8.984 +		return 0;
   8.985 +
   8.986 +fault:
   8.987 +	return -EFAULT;
   8.988 +}
   8.989 +
   8.990 +/* Keep iterating until skb_iter_next returns false. */
   8.991 +void skb_iter_first(const struct sk_buff *skb, struct skb_iter *i)
   8.992 +{
   8.993 +	i->len = skb_headlen(skb);
   8.994 +	i->data = (unsigned char *)skb->data;
   8.995 +	i->nextfrag = 0;
   8.996 +	i->fraglist = NULL;
   8.997 +}
   8.998 +
   8.999 +int skb_iter_next(const struct sk_buff *skb, struct skb_iter *i)
  8.1000 +{
  8.1001 +	/* Unmap previous, if not head fragment. */
  8.1002 +	if (i->nextfrag)
  8.1003 +		kunmap_skb_frag(i->data);
  8.1004 +
  8.1005 +	if (i->fraglist) {
  8.1006 +	fraglist:
  8.1007 +		/* We're iterating through fraglist. */
  8.1008 +		if (i->nextfrag < skb_shinfo(i->fraglist)->nr_frags) {
  8.1009 +			i->data = kmap_skb_frag(&skb_shinfo(i->fraglist)
  8.1010 +						->frags[i->nextfrag]);
  8.1011 +			i->len = skb_shinfo(i->fraglist)->frags[i->nextfrag]
  8.1012 +				.size;
  8.1013 +			i->nextfrag++;
  8.1014 +			return 1;
  8.1015 +		}
  8.1016 +		/* Fragments with fragments?  Too hard! */
  8.1017 +		BUG_ON(skb_shinfo(i->fraglist)->frag_list);
  8.1018 +		i->fraglist = i->fraglist->next;
  8.1019 +		if (!i->fraglist)
  8.1020 +			goto end;
  8.1021 +
  8.1022 +		i->len = skb_headlen(i->fraglist);
  8.1023 +		i->data = i->fraglist->data;
  8.1024 +		i->nextfrag = 0;
  8.1025 +		return 1;
  8.1026 +	}
  8.1027 +
  8.1028 +	if (i->nextfrag < skb_shinfo(skb)->nr_frags) {
  8.1029 +		i->data = kmap_skb_frag(&skb_shinfo(skb)->frags[i->nextfrag]);
  8.1030 +		i->len = skb_shinfo(skb)->frags[i->nextfrag].size;
  8.1031 +		i->nextfrag++;
  8.1032 +		return 1;
  8.1033 +	}
  8.1034 +
  8.1035 +	i->fraglist = skb_shinfo(skb)->frag_list;
  8.1036 +	if (i->fraglist)
  8.1037 +		goto fraglist;
  8.1038 +
  8.1039 +end:
  8.1040 +	/* Bug trap for callers */
  8.1041 +	i->data = NULL;
  8.1042 +	return 0;
  8.1043 +}
  8.1044 +
  8.1045 +void skb_iter_abort(const struct sk_buff *skb, struct skb_iter *i)
  8.1046 +{
  8.1047 +	/* Unmap previous, if not head fragment. */
  8.1048 +	if (i->data && i->nextfrag)
  8.1049 +		kunmap_skb_frag(i->data);
  8.1050 +	/* Bug trap for callers */
  8.1051 +	i->data = NULL;
  8.1052 +}
  8.1053 +
  8.1054 +/* Checksum skb data. */
  8.1055 +
  8.1056 +unsigned int skb_checksum(const struct sk_buff *skb, int offset,
  8.1057 +			  int len, unsigned int csum)
  8.1058 +{
  8.1059 +	int start = skb_headlen(skb);
  8.1060 +	int i, copy = start - offset;
  8.1061 +	int pos = 0;
  8.1062 +
  8.1063 +	/* Checksum header. */
  8.1064 +	if (copy > 0) {
  8.1065 +		if (copy > len)
  8.1066 +			copy = len;
  8.1067 +		csum = csum_partial(skb->data + offset, copy, csum);
  8.1068 +		if ((len -= copy) == 0)
  8.1069 +			return csum;
  8.1070 +		offset += copy;
  8.1071 +		pos	= copy;
  8.1072 +	}
  8.1073 +
  8.1074 +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  8.1075 +		int end;
  8.1076 +
  8.1077 +		BUG_TRAP(start <= offset + len);
  8.1078 +
  8.1079 +		end = start + skb_shinfo(skb)->frags[i].size;
  8.1080 +		if ((copy = end - offset) > 0) {
  8.1081 +			unsigned int csum2;
  8.1082 +			u8 *vaddr;
  8.1083 +			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  8.1084 +
  8.1085 +			if (copy > len)
  8.1086 +				copy = len;
  8.1087 +			vaddr = kmap_skb_frag(frag);
  8.1088 +			csum2 = csum_partial(vaddr + frag->page_offset +
  8.1089 +					     offset - start, copy, 0);
  8.1090 +			kunmap_skb_frag(vaddr);
  8.1091 +			csum = csum_block_add(csum, csum2, pos);
  8.1092 +			if (!(len -= copy))
  8.1093 +				return csum;
  8.1094 +			offset += copy;
  8.1095 +			pos    += copy;
  8.1096 +		}
  8.1097 +		start = end;
  8.1098 +	}
  8.1099 +
  8.1100 +	if (skb_shinfo(skb)->frag_list) {
  8.1101 +		struct sk_buff *list = skb_shinfo(skb)->frag_list;
  8.1102 +
  8.1103 +		for (; list; list = list->next) {
  8.1104 +			int end;
  8.1105 +
  8.1106 +			BUG_TRAP(start <= offset + len);
  8.1107 +
  8.1108 +			end = start + list->len;
  8.1109 +			if ((copy = end - offset) > 0) {
  8.1110 +				unsigned int csum2;
  8.1111 +				if (copy > len)
  8.1112 +					copy = len;
  8.1113 +				csum2 = skb_checksum(list, offset - start,
  8.1114 +						     copy, 0);
  8.1115 +				csum = csum_block_add(csum, csum2, pos);
  8.1116 +				if ((len -= copy) == 0)
  8.1117 +					return csum;
  8.1118 +				offset += copy;
  8.1119 +				pos    += copy;
  8.1120 +			}
  8.1121 +			start = end;
  8.1122 +		}
  8.1123 +	}
  8.1124 +	if (len)
  8.1125 +		BUG();
  8.1126 +
  8.1127 +	return csum;
  8.1128 +}
  8.1129 +
  8.1130 +/* Both of above in one bottle. */
  8.1131 +
  8.1132 +unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
  8.1133 +				    u8 *to, int len, unsigned int csum)
  8.1134 +{
  8.1135 +	int start = skb_headlen(skb);
  8.1136 +	int i, copy = start - offset;
  8.1137 +	int pos = 0;
  8.1138 +
  8.1139 +	/* Copy header. */
  8.1140 +	if (copy > 0) {
  8.1141 +		if (copy > len)
  8.1142 +			copy = len;
  8.1143 +		csum = csum_partial_copy_nocheck(skb->data + offset, to,
  8.1144 +						 copy, csum);
  8.1145 +		if ((len -= copy) == 0)
  8.1146 +			return csum;
  8.1147 +		offset += copy;
  8.1148 +		to     += copy;
  8.1149 +		pos	= copy;
  8.1150 +	}
  8.1151 +
  8.1152 +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  8.1153 +		int end;
  8.1154 +
  8.1155 +		BUG_TRAP(start <= offset + len);
  8.1156 +
  8.1157 +		end = start + skb_shinfo(skb)->frags[i].size;
  8.1158 +		if ((copy = end - offset) > 0) {
  8.1159 +			unsigned int csum2;
  8.1160 +			u8 *vaddr;
  8.1161 +			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  8.1162 +
  8.1163 +			if (copy > len)
  8.1164 +				copy = len;
  8.1165 +			vaddr = kmap_skb_frag(frag);
  8.1166 +			csum2 = csum_partial_copy_nocheck(vaddr +
  8.1167 +							  frag->page_offset +
  8.1168 +							  offset - start, to,
  8.1169 +							  copy, 0);
  8.1170 +			kunmap_skb_frag(vaddr);
  8.1171 +			csum = csum_block_add(csum, csum2, pos);
  8.1172 +			if (!(len -= copy))
  8.1173 +				return csum;
  8.1174 +			offset += copy;
  8.1175 +			to     += copy;
  8.1176 +			pos    += copy;
  8.1177 +		}
  8.1178 +		start = end;
  8.1179 +	}
  8.1180 +
  8.1181 +	if (skb_shinfo(skb)->frag_list) {
  8.1182 +		struct sk_buff *list = skb_shinfo(skb)->frag_list;
  8.1183 +
  8.1184 +		for (; list; list = list->next) {
  8.1185 +			unsigned int csum2;
  8.1186 +			int end;
  8.1187 +
  8.1188 +			BUG_TRAP(start <= offset + len);
  8.1189 +
  8.1190 +			end = start + list->len;
  8.1191 +			if ((copy = end - offset) > 0) {
  8.1192 +				if (copy > len)
  8.1193 +					copy = len;
  8.1194 +				csum2 = skb_copy_and_csum_bits(list,
  8.1195 +							       offset - start,
  8.1196 +							       to, copy, 0);
  8.1197 +				csum = csum_block_add(csum, csum2, pos);
  8.1198 +				if ((len -= copy) == 0)
  8.1199 +					return csum;
  8.1200 +				offset += copy;
  8.1201 +				to     += copy;
  8.1202 +				pos    += copy;
  8.1203 +			}
  8.1204 +			start = end;
  8.1205 +		}
  8.1206 +	}
  8.1207 +	if (len)
  8.1208 +		BUG();
  8.1209 +	return csum;
  8.1210 +}
  8.1211 +
  8.1212 +void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
  8.1213 +{
  8.1214 +	unsigned int csum;
  8.1215 +	long csstart;
  8.1216 +
  8.1217 +	if (skb->ip_summed == CHECKSUM_HW)
  8.1218 +		csstart = skb->h.raw - skb->data;
  8.1219 +	else
  8.1220 +		csstart = skb_headlen(skb);
  8.1221 +
  8.1222 +	if (csstart > skb_headlen(skb))
  8.1223 +		BUG();
  8.1224 +
  8.1225 +	memcpy(to, skb->data, csstart);
  8.1226 +
  8.1227 +	csum = 0;
  8.1228 +	if (csstart != skb->len)
  8.1229 +		csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
  8.1230 +					      skb->len - csstart, 0);
  8.1231 +
  8.1232 +	if (skb->ip_summed == CHECKSUM_HW) {
  8.1233 +		long csstuff = csstart + skb->csum;
  8.1234 +
  8.1235 +		*((unsigned short *)(to + csstuff)) = csum_fold(csum);
  8.1236 +	}
  8.1237 +}
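
When a packet arrives with its protocol checksum left blank (the new proto_csum_blank case), the whole thing can still be checksummed in software via skb_checksum and folded to 16 bits. A hedged sketch; the starting offset is assumed to be supplied by the caller, for example the transport header offset:

#include <linux/skbuff.h>
#include <net/checksum.h>

/* Illustrative only: checksum from 'start' to the end of the packet and
 * fold the 32-bit partial sum down to the usual 16-bit form. */
static unsigned short example_fold_csum(const struct sk_buff *skb, int start)
{
	unsigned int csum = skb_checksum(skb, start, skb->len - start, 0);

	return csum_fold(csum);
}
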
  8.1238 +
  8.1239 +/**
  8.1240 + *	skb_dequeue - remove from the head of the queue
  8.1241 + *	@list: list to dequeue from
  8.1242 + *
  8.1243 + *	Remove the head of the list. The list lock is taken so the function
  8.1244 + *	may be used safely with other locking list functions. The head item is
  8.1245 + *	returned or %NULL if the list is empty.
  8.1246 + */
  8.1247 +
  8.1248 +struct sk_buff *skb_dequeue(struct sk_buff_head *list)
  8.1249 +{
  8.1250 +	unsigned long flags;
  8.1251 +	struct sk_buff *result;
  8.1252 +
  8.1253 +	spin_lock_irqsave(&list->lock, flags);
  8.1254 +	result = __skb_dequeue(list);
  8.1255 +	spin_unlock_irqrestore(&list->lock, flags);
  8.1256 +	return result;
  8.1257 +}
  8.1258 +
  8.1259 +/**
  8.1260 + *	skb_dequeue_tail - remove from the tail of the queue
  8.1261 + *	@list: list to dequeue from
  8.1262 + *
  8.1263 + *	Remove the tail of the list. The list lock is taken so the function
  8.1264 + *	may be used safely with other locking list functions. The tail item is
  8.1265 + *	returned or %NULL if the list is empty.
  8.1266 + */
  8.1267 +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
  8.1268 +{
  8.1269 +	unsigned long flags;
  8.1270 +	struct sk_buff *result;
  8.1271 +
  8.1272 +	spin_lock_irqsave(&list->lock, flags);
  8.1273 +	result = __skb_dequeue_tail(list);
  8.1274 +	spin_unlock_irqrestore(&list->lock, flags);
  8.1275 +	return result;
  8.1276 +}
  8.1277 +
  8.1278 +/**
  8.1279 + *	skb_queue_purge - empty a list
  8.1280 + *	@list: list to empty
  8.1281 + *
  8.1282 + *	Delete all buffers on an &sk_buff list. Each buffer is removed from
  8.1283 + *	the list and one reference dropped. This function takes the list
  8.1284 + *	lock and is atomic with respect to other list locking functions.
  8.1285 + */
  8.1286 +void skb_queue_purge(struct sk_buff_head *list)
  8.1287 +{
  8.1288 +	struct sk_buff *skb;
  8.1289 +	while ((skb = skb_dequeue(list)) != NULL)
  8.1290 +		kfree_skb(skb);
  8.1291 +}
  8.1292 +
  8.1293 +/**
  8.1294 + *	skb_queue_head - queue a buffer at the list head
  8.1295 + *	@list: list to use
  8.1296 + *	@newsk: buffer to queue
  8.1297 + *
  8.1298 + *	Queue a buffer at the start of the list. This function takes the
   8.1299 + *	list lock and can be used safely with other locking &sk_buff
   8.1300 + *	functions.
  8.1301 + *
  8.1302 + *	A buffer cannot be placed on two lists at the same time.
  8.1303 + */
  8.1304 +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
  8.1305 +{
  8.1306 +	unsigned long flags;
  8.1307 +
  8.1308 +	spin_lock_irqsave(&list->lock, flags);
  8.1309 +	__skb_queue_head(list, newsk);
  8.1310 +	spin_unlock_irqrestore(&list->lock, flags);
  8.1311 +}
  8.1312 +
  8.1313 +/**
  8.1314 + *	skb_queue_tail - queue a buffer at the list tail
  8.1315 + *	@list: list to use
  8.1316 + *	@newsk: buffer to queue
  8.1317 + *
  8.1318 + *	Queue a buffer at the tail of the list. This function takes the
   8.1319 + *	list lock and can be used safely with other locking &sk_buff
   8.1320 + *	functions.
  8.1321 + *
  8.1322 + *	A buffer cannot be placed on two lists at the same time.
  8.1323 + */
  8.1324 +void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
  8.1325 +{
  8.1326 +	unsigned long flags;
  8.1327 +
  8.1328 +	spin_lock_irqsave(&list->lock, flags);
  8.1329 +	__skb_queue_tail(list, newsk);
  8.1330 +	spin_unlock_irqrestore(&list->lock, flags);
  8.1331 +}
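
These queue helpers take the per-list lock internally, so one context can feed buffers while another drains them without extra locking. A hedged producer/consumer sketch (the queue variable and function names are assumptions; the queue must be set up once with skb_queue_head_init):

#include <linux/skbuff.h>

static struct sk_buff_head example_rxq;	/* initialise with skb_queue_head_init(&example_rxq) */

/* Producer side, e.g. called from an interrupt handler. */
static void example_enqueue(struct sk_buff *skb)
{
	skb_queue_tail(&example_rxq, skb);
}

/* Consumer side: remove and free everything currently queued. */
static void example_drain(void)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(&example_rxq)) != NULL)
		kfree_skb(skb);
}
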
  8.1332 +/**
  8.1333 + *	skb_unlink	-	remove a buffer from a list
  8.1334 + *	@skb: buffer to remove
  8.1335 + *
   8.1336 + *	Remove a packet from the list it is on. The list lock is taken and
   8.1337 + *	this function is atomic with respect to other list-locked calls.
  8.1338 + *
  8.1339 + *	Works even without knowing the list it is sitting on, which can be
  8.1340 + *	handy at times. It also means that THE LIST MUST EXIST when you
  8.1341 + *	unlink. Thus a list must have its contents unlinked before it is
  8.1342 + *	destroyed.
  8.1343 + */
  8.1344 +void skb_unlink(struct sk_buff *skb)
  8.1345 +{
  8.1346 +	struct sk_buff_head *list = skb->list;
  8.1347 +
  8.1348 +	if (list) {
  8.1349 +		unsigned long flags;
  8.1350 +
  8.1351 +		spin_lock_irqsave(&list->lock, flags);
  8.1352 +		if (skb->list == list)
  8.1353 +			__skb_unlink(skb, skb->list);
  8.1354 +		spin_unlock_irqrestore(&list->lock, flags);
  8.1355 +	}
  8.1356 +}
  8.1357 +
  8.1358 +
  8.1359 +/**
  8.1360 + *	skb_append	-	append a buffer
  8.1361 + *	@old: buffer to insert after
  8.1362 + *	@newsk: buffer to insert
  8.1363 + *
  8.1364 + *	Place a packet after a given packet in a list. The list locks are taken
  8.1365 + *	and this function is atomic with respect to other list locked calls.
  8.1366 + *	A buffer cannot be placed on two lists at the same time.
  8.1367 + */
  8.1368 +
  8.1369 +void skb_append(struct sk_buff *old, struct sk_buff *newsk)
  8.1370 +{
  8.1371 +	unsigned long flags;
  8.1372 +
  8.1373 +	spin_lock_irqsave(&old->list->lock, flags);
  8.1374 +	__skb_append(old, newsk);
  8.1375 +	spin_unlock_irqrestore(&old->list->lock, flags);
  8.1376 +}
  8.1377 +
  8.1378 +
  8.1379 +/**
  8.1380 + *	skb_insert	-	insert a buffer
  8.1381 + *	@old: buffer to insert before
  8.1382 + *	@newsk: buffer to insert
  8.1383 + *
  8.1384 + *	Place a packet before a given packet in a list. The list locks are taken
  8.1385 + *	and this function is atomic with respect to other list locked calls
  8.1386 + *	A buffer cannot be placed on two lists at the same time.
  8.1387 + */
  8.1388 +
  8.1389 +void skb_insert(struct sk_buff *old, struct sk_buff *newsk)
  8.1390 +{
  8.1391 +	unsigned long flags;
  8.1392 +
  8.1393 +	spin_lock_irqsave(&old->list->lock, flags);
  8.1394 +	__skb_insert(newsk, old->prev, old, old->list);
  8.1395 +	spin_unlock_irqrestore(&old->list->lock, flags);
  8.1396 +}
  8.1397 +
  8.1398 +#if 0
  8.1399 +/*
  8.1400 + * 	Tune the memory allocator for a new MTU size.
  8.1401 + */
  8.1402 +void skb_add_mtu(int mtu)
  8.1403 +{
  8.1404 +	/* Must match allocation in alloc_skb */
  8.1405 +	mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info);
  8.1406 +
  8.1407 +	kmem_add_cache_size(mtu);
  8.1408 +}
  8.1409 +#endif
  8.1410 +
  8.1411 +static inline void skb_split_inside_header(struct sk_buff *skb,
  8.1412 +					   struct sk_buff* skb1,
  8.1413 +					   const u32 len, const int pos)
  8.1414 +{
  8.1415 +	int i;
  8.1416 +
  8.1417 +	memcpy(skb_put(skb1, pos - len), skb->data + len, pos - len);
  8.1418 +
  8.1419 +	/* And move data appendix as is. */
  8.1420 +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
  8.1421 +		skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
  8.1422 +
  8.1423 +	skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
  8.1424 +	skb_shinfo(skb)->nr_frags  = 0;
  8.1425 +	skb1->data_len		   = skb->data_len;
  8.1426 +	skb1->len		   += skb1->data_len;
  8.1427 +	skb->data_len		   = 0;
  8.1428 +	skb->len		   = len;
  8.1429 +	skb->tail		   = skb->data + len;
  8.1430 +}
  8.1431 +
  8.1432 +static inline void skb_split_no_header(struct sk_buff *skb,
  8.1433 +				       struct sk_buff* skb1,
  8.1434 +				       const u32 len, int pos)
  8.1435 +{
  8.1436 +	int i, k = 0;
  8.1437 +	const int nfrags = skb_shinfo(skb)->nr_frags;
  8.1438 +
  8.1439 +	skb_shinfo(skb)->nr_frags = 0;
  8.1440 +	skb1->len		  = skb1->data_len = skb->len - len;
  8.1441 +	skb->len		  = len;
  8.1442 +	skb->data_len		  = len - pos;
  8.1443 +
  8.1444 +	for (i = 0; i < nfrags; i++) {
  8.1445 +		int size = skb_shinfo(skb)->frags[i].size;
  8.1446 +
  8.1447 +		if (pos + size > len) {
  8.1448 +			skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
  8.1449 +
  8.1450 +			if (pos < len) {
   8.1451 +				/* Split the frag.
   8.1452 +				 * We have two variants in this case:
   8.1453 +				 * 1. Move the whole frag to the second
   8.1454 +				 *    part, if possible. E.g. this is
   8.1455 +				 *    mandatory for TUX, where splitting
   8.1456 +				 *    is expensive.
   8.1457 +				 * 2. Split accurately; we do the latter.
  8.1458 +				 */
  8.1459 +				get_page(skb_shinfo(skb)->frags[i].page);
  8.1460 +				skb_shinfo(skb1)->frags[0].page_offset += len - pos;
  8.1461 +				skb_shinfo(skb1)->frags[0].size -= len - pos;
  8.1462 +				skb_shinfo(skb)->frags[i].size	= len - pos;
  8.1463 +				skb_shinfo(skb)->nr_frags++;
  8.1464 +			}
  8.1465 +			k++;
  8.1466 +		} else
  8.1467 +			skb_shinfo(skb)->nr_frags++;
  8.1468 +		pos += size;
  8.1469 +	}
  8.1470 +	skb_shinfo(skb1)->nr_frags = k;
  8.1471 +}
  8.1472 +
  8.1473 +/**
   8.1474 + * skb_split - split a fragmented skb into two parts at length len.
  8.1475 + */
  8.1476 +void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
  8.1477 +{
  8.1478 +	int pos = skb_headlen(skb);
  8.1479 +
  8.1480 +	if (len < pos)	/* Split line is inside header. */
  8.1481 +		skb_split_inside_header(skb, skb1, len, pos);
  8.1482 +	else		/* Second chunk has no header, nothing to copy. */
  8.1483 +		skb_split_no_header(skb, skb1, len, pos);
  8.1484 +}
  8.1485 +
  8.1486 +void __init skb_init(void)
  8.1487 +{
  8.1488 +	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
  8.1489 +					      sizeof(struct sk_buff),
  8.1490 +					      0,
  8.1491 +					      SLAB_HWCACHE_ALIGN,
  8.1492 +					      NULL, NULL);
  8.1493 +	if (!skbuff_head_cache)
  8.1494 +		panic("cannot create skbuff cache");
  8.1495 +}
  8.1496 +
  8.1497 +EXPORT_SYMBOL(___pskb_trim);
  8.1498 +EXPORT_SYMBOL(__kfree_skb);
  8.1499 +EXPORT_SYMBOL(__pskb_pull_tail);
  8.1500 +EXPORT_SYMBOL(alloc_skb);
  8.1501 +EXPORT_SYMBOL(pskb_copy);
  8.1502 +EXPORT_SYMBOL(pskb_expand_head);
  8.1503 +EXPORT_SYMBOL(skb_checksum);
  8.1504 +EXPORT_SYMBOL(skb_clone);
  8.1505 +EXPORT_SYMBOL(skb_clone_fraglist);
  8.1506 +EXPORT_SYMBOL(skb_copy);
  8.1507 +EXPORT_SYMBOL(skb_copy_and_csum_bits);
  8.1508 +EXPORT_SYMBOL(skb_copy_and_csum_dev);
  8.1509 +EXPORT_SYMBOL(skb_copy_bits);
  8.1510 +EXPORT_SYMBOL(skb_copy_expand);
  8.1511 +EXPORT_SYMBOL(skb_over_panic);
  8.1512 +EXPORT_SYMBOL(skb_pad);
  8.1513 +EXPORT_SYMBOL(skb_realloc_headroom);
  8.1514 +EXPORT_SYMBOL(skb_under_panic);
  8.1515 +EXPORT_SYMBOL(skb_dequeue);
  8.1516 +EXPORT_SYMBOL(skb_dequeue_tail);
  8.1517 +EXPORT_SYMBOL(skb_insert);
  8.1518 +EXPORT_SYMBOL(skb_queue_purge);
  8.1519 +EXPORT_SYMBOL(skb_queue_head);
  8.1520 +EXPORT_SYMBOL(skb_queue_tail);
  8.1521 +EXPORT_SYMBOL(skb_unlink);
  8.1522 +EXPORT_SYMBOL(skb_append);
  8.1523 +EXPORT_SYMBOL(skb_split);
  8.1524 +EXPORT_SYMBOL(skb_iter_first);
  8.1525 +EXPORT_SYMBOL(skb_iter_next);
  8.1526 +EXPORT_SYMBOL(skb_iter_abort);
     9.1 --- a/xen/include/public/io/netif.h	Sat May 21 12:40:44 2005 +0000
     9.2 +++ b/xen/include/public/io/netif.h	Sat May 21 19:08:56 2005 +0000
     9.3 @@ -12,7 +12,8 @@
     9.4  typedef struct {
     9.5      memory_t addr;   /*  0: Machine address of packet.  */
     9.6      MEMORY_PADDING;
     9.7 -    u16      id;     /*  8: Echoed in response message. */
     9.8 +    u16      csum_blank:1; /* Proto csum field blank?   */
     9.9 +    u16      id:15;  /*  8: Echoed in response message. */
    9.10      u16      size;   /* 10: Packet size in bytes.       */
    9.11  } PACKED netif_tx_request_t; /* 12 bytes */
    9.12  
    9.13 @@ -29,7 +30,8 @@ typedef struct {
    9.14  typedef struct {
    9.15      memory_t addr;   /*  0: Machine address of packet.              */
    9.16      MEMORY_PADDING;
    9.17 -    u16      id;     /*  8:  */
    9.18 +    u16      csum_valid:1; /* Protocol checksum is validated?       */
    9.19 +    u16      id:15;  /*  8:  */
    9.20      s16      status; /* 10: -ve: BLKIF_RSP_* ; +ve: Rx'ed pkt size. */
    9.21  } PACKED netif_rx_response_t; /* 12 bytes */
    9.22
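
To show how these one-bit hints are meant to line up with the new sk_buff flags, here is a hedged sketch of the translation a backend might do on each ring slot; the function and variable names are assumptions, and the real logic lives in the netback/netfront files touched earlier in this changeset:

/* Illustrative only: carry the checksum hints between ring descriptors
 * and the sk_buff bits added by this changeset. Assumes the usual
 * netback includes for struct sk_buff and the netif_* types. */
static void example_tx_csum_hint(struct sk_buff *skb,
                                 netif_tx_request_t *txreq)
{
	/* Guest left the protocol checksum blank for us to fill in later. */
	skb->proto_csum_blank = txreq->csum_blank;
}

static void example_rx_csum_hint(netif_rx_response_t *rxresp,
                                 struct sk_buff *skb)
{
	/* Tell the guest the protocol checksum has already been verified. */
	rxresp->csum_valid = skb->proto_csum_valid;
}
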