
xen-2.4.16/net/dev.c @ 97:ce656d157bbf

bitkeeper revision 1.15.1.9 (3e2fd418NdZYqL3KPr6URqd77DwnJQ)

Add shadow ring in the RX direction to protect buffers from guest
tampering once they have been passed to the hypervisor. This
is under the umbrella of eventual zero-copy network code.
author akw27@plucky.localdomain
date Thu Jan 23 11:38:00 2003 +0000 (2003-01-23)
parents e84c63b9a807
children 91f5e18965d9 cb2688ed1a23
/*
 * NET3 Protocol independent device support routines.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/delay.h>
#include <linux/lib.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/brlock.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/pkt_sched.h>

#include <linux/event.h>

#define BUG_TRAP ASSERT
#define notifier_call_chain(_a,_b,_c) ((void)0)
#define rtmsg_ifinfo(_a,_b,_c) ((void)0)
#define rtnl_lock() ((void)0)
#define rtnl_unlock() ((void)0)
#define dst_init() ((void)0)
struct net_device *the_dev = NULL;

/*
 * Device drivers call our routines to queue packets here. We empty the
 * queue in the local softnet handler.
 */
struct softnet_data softnet_data[NR_CPUS] __cacheline_aligned;

/*****************************************************************************

                        Device Interface Subroutines

*****************************************************************************/
/**
 * __dev_get_by_name - find a device by its name
 * @name: name to find
 *
 * Find an interface by name. Must be called under RTNL semaphore
 * or @dev_base_lock. If the name is found a pointer to the device
 * is returned. If the name is not found then %NULL is returned. The
 * reference counters are not incremented so the caller must be
 * careful with locks.
 */

struct net_device *__dev_get_by_name(const char *name)
{
    struct net_device *dev;

    for (dev = dev_base; dev != NULL; dev = dev->next) {
        if (strncmp(dev->name, name, IFNAMSIZ) == 0)
            return dev;
    }
    return NULL;
}
/**
 * dev_get_by_name - find a device by its name
 * @name: name to find
 *
 * Find an interface by name. This can be called from any
 * context and does its own locking. The returned handle has
 * the usage count incremented and the caller must use dev_put() to
 * release it when it is no longer needed. %NULL is returned if no
 * matching device is found.
 */

struct net_device *dev_get_by_name(const char *name)
{
    struct net_device *dev;

    read_lock(&dev_base_lock);
    dev = __dev_get_by_name(name);
    if (dev)
        dev_hold(dev);
    read_unlock(&dev_base_lock);
    return dev;
}
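
/*
 * Usage sketch (illustrative only, not part of the original file): a caller
 * of dev_get_by_name() must balance the reference taken here with
 * dev_put(). The name "eth0" is just an example.
 *
 *     struct net_device *dev = dev_get_by_name("eth0");
 *     if (dev != NULL) {
 *         printk(KERN_INFO "%s: ifindex %d\n", dev->name, dev->ifindex);
 *         dev_put(dev);    // release the reference taken by the lookup
 *     }
 */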
/*
 * Return value is changed to int to prevent illegal usage in future.
 * It is still legal to use it to check for device existence.
 *
 * The user should understand that the result returned by this function
 * is meaningless unless it was issued under the rtnl semaphore.
 */

/**
 * dev_get - test if a device exists
 * @name: name to test for
 *
 * Test if a name exists. Returns true if the name is found. In order
 * to be sure the name is not allocated or removed during the test the
 * caller must hold the rtnl semaphore.
 *
 * This function exists primarily for backwards compatibility with older
 * drivers.
 */

int dev_get(const char *name)
{
    struct net_device *dev;

    read_lock(&dev_base_lock);
    dev = __dev_get_by_name(name);
    read_unlock(&dev_base_lock);
    return dev != NULL;
}
/**
 * __dev_get_by_index - find a device by its ifindex
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns %NULL if the device
 * is not found or a pointer to the device. The device has not
 * had its reference counter increased so the caller must be careful
 * about locking. The caller must hold either the RTNL semaphore
 * or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(int ifindex)
{
    struct net_device *dev;

    for (dev = dev_base; dev != NULL; dev = dev->next) {
        if (dev->ifindex == ifindex)
            return dev;
    }
    return NULL;
}
/**
 * dev_get_by_index - find a device by its ifindex
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns %NULL if the device
 * is not found or a pointer to the device. The device returned has
 * had a reference added and the pointer is safe until the user calls
 * dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(int ifindex)
{
    struct net_device *dev;

    read_lock(&dev_base_lock);
    dev = __dev_get_by_index(ifindex);
    if (dev)
        dev_hold(dev);
    read_unlock(&dev_base_lock);
    return dev;
}
/**
 * dev_getbyhwaddr - find a device by its hardware address
 * @type: media type of device
 * @ha: hardware address
 *
 * Search for an interface by MAC address. Returns %NULL if the device
 * is not found or a pointer to the device. The caller must hold the
 * rtnl semaphore. The returned device has not had its ref count increased
 * and the caller must therefore be careful about locking.
 *
 * BUGS:
 * If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
{
    struct net_device *dev;

    for (dev = dev_base; dev != NULL; dev = dev->next) {
        if (dev->type == type &&
            memcmp(dev->dev_addr, ha, dev->addr_len) == 0)
            return dev;
    }
    return NULL;
}
/**
 * dev_alloc_name - allocate a name for a device
 * @dev: device
 * @name: name format string
 *
 * Passed a format string - eg "lt%d" - it will try and find a suitable
 * id. Not efficient for many devices, not called a lot. The caller
 * must hold the dev_base or rtnl lock while allocating the name and
 * adding the device in order to avoid duplicates. Returns the number
 * of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
    int i;
    char buf[32];
    char *p;

    /*
     * Verify the string as this thing may have come from
     * the user. There must be either one "%d" and no other "%"
     * characters, or no "%" characters at all.
     */
    p = strchr(name, '%');
    if (p && (p[1] != 'd' || strchr(p+2, '%')))
        return -EINVAL;

    /*
     * If you need over 100 please also fix the algorithm...
     */
    for (i = 0; i < 100; i++) {
        snprintf(buf, sizeof(buf), name, i);
        if (__dev_get_by_name(buf) == NULL) {
            strcpy(dev->name, buf);
            return i;
        }
    }
    return -ENFILE; /* Over 100 of the things .. bail out! */
}
/**
 * dev_alloc - allocate a network device and name
 * @name: name format string
 * @err: error return pointer
 *
 * Passed a format string, eg. "lt%d", it will allocate a network device
 * and space for the name. %NULL is returned if no memory is available.
 * If the allocation succeeds then the name is assigned and the
 * device pointer returned. %NULL is returned if the name allocation
 * failed. The cause of an error is returned as a negative errno code
 * in the variable @err points to.
 *
 * The caller must hold the @dev_base or RTNL locks when doing this in
 * order to avoid duplicate name allocations.
 */

struct net_device *dev_alloc(const char *name, int *err)
{
    struct net_device *dev = kmalloc(sizeof(struct net_device), GFP_KERNEL);
    if (dev == NULL) {
        *err = -ENOBUFS;
        return NULL;
    }
    memset(dev, 0, sizeof(struct net_device));
    *err = dev_alloc_name(dev, name);
    if (*err < 0) {
        kfree(dev);
        return NULL;
    }
    return dev;
}
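
/*
 * Allocation sketch (illustrative only, not part of the original file):
 * how dev_alloc() combines with the "%d" name template. The "xen%d"
 * pattern and the surrounding caller are assumptions.
 *
 *     int err;
 *     struct net_device *dev = dev_alloc("xen%d", &err);
 *     if (dev == NULL)
 *         return err;         // -ENOBUFS, or dev_alloc_name()'s error
 *     // dev->name is now e.g. "xen0"; initialise it, then register:
 *     err = register_netdevice(dev);
 */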
/**
 * netdev_state_change - device changes state
 * @dev: device to cause notification
 *
 * Called to indicate a device has changed state. This function calls
 * the notifier chains for netdev_chain and sends a NEWLINK message
 * to the routing socket.
 */

void netdev_state_change(struct net_device *dev)
{
    if (dev->flags & IFF_UP) {
        notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
        rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
    }
}
#ifdef CONFIG_KMOD

/**
 * dev_load - load a network module
 * @name: name of interface
 *
 * If a network interface is not present and the process has suitable
 * privileges this function loads the module. If module loading is not
 * available in this kernel then it becomes a nop.
 */

void dev_load(const char *name)
{
    if (!dev_get(name) && capable(CAP_SYS_MODULE))
        request_module(name);
}

#else

extern inline void dev_load(const char *unused){;}

#endif
static int default_rebuild_header(struct sk_buff *skb)
{
    printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
           skb->dev ? skb->dev->name : "NULL!!!");
    kfree_skb(skb);
    return 1;
}
/**
 * dev_open - prepare an interface for use.
 * @dev: device to open
 *
 * Takes a device from down to up state. The device's private open
 * function is invoked and then the multicast lists are loaded. Finally
 * the device is moved into the up state and a %NETDEV_UP message is
 * sent to the netdev notifier chain.
 *
 * Calling this function on an active interface is a nop. On a failure
 * a negative errno code is returned.
 */

int dev_open(struct net_device *dev)
{
    int ret = 0;

    /*
     * Is it already up?
     */

    if (dev->flags&IFF_UP)
        return 0;

    /*
     * Is it even present?
     */
    if (!netif_device_present(dev))
        return -ENODEV;

    /*
     * Call device private open method
     */
    if (try_inc_mod_count(dev->owner)) {
        if (dev->open) {
            ret = dev->open(dev);
            if (ret != 0 && dev->owner)
                __MOD_DEC_USE_COUNT(dev->owner);
        }
    } else {
        ret = -ENODEV;
    }

    /*
     * If it went open OK then:
     */

    if (ret == 0)
    {
        /*
         * Set the flags.
         */
        dev->flags |= IFF_UP;

        set_bit(__LINK_STATE_START, &dev->state);

        /*
         * Initialize multicasting status
         */
        dev_mc_upload(dev);

        /*
         * Wakeup transmit queue engine
         */
        dev_activate(dev);

        /*
         * ... and announce new interface.
         */
        notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
    }
    return(ret);
}
/**
 * dev_close - shutdown an interface.
 * @dev: device to shutdown
 *
 * This function moves an active device into down state. A
 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 * chain.
 */

int dev_close(struct net_device *dev)
{
    if (!(dev->flags&IFF_UP))
        return 0;

    /*
     * Tell people we are going down, so that they can
     * prepare for death while the device is still operating.
     */
    notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);

    dev_deactivate(dev);

    clear_bit(__LINK_STATE_START, &dev->state);

    /*
     * Call the device specific close. This cannot fail.
     * Only if device is UP
     *
     * We allow it to be called even after a DETACH hot-plug
     * event.
     */

    if (dev->stop)
        dev->stop(dev);

    /*
     * Device is now down.
     */

    dev->flags &= ~IFF_UP;

    /*
     * Tell people we are down
     */
    notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);

    /*
     * Drop the module refcount
     */
    if (dev->owner)
        __MOD_DEC_USE_COUNT(dev->owner);

    return(0);
}
#ifdef CONFIG_HIGHMEM
/* Actually, we should eliminate this check as soon as we know that:
 * 1. IOMMU is present and allows mapping of all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int
illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
    int i;

    if (dev->features&NETIF_F_HIGHDMA)
        return 0;

    for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
        if (skb_shinfo(skb)->frags[i].page >= highmem_start_page)
            return 1;

    return 0;
}
#else
#define illegal_highdma(dev, skb) (0)
#endif
/**
 * dev_queue_xmit - transmit a buffer
 * @skb: buffer to transmit
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * A negative errno code is returned on a failure. A success does not
 * guarantee the frame will be transmitted as it may be dropped due
 * to congestion or traffic shaping.
 */

int dev_queue_xmit(struct sk_buff *skb)
{
    struct net_device *dev = skb->dev;
    struct Qdisc *q;

    if (skb_shinfo(skb)->frag_list &&
        !(dev->features&NETIF_F_FRAGLIST) &&
        skb_linearize(skb, GFP_ATOMIC) != 0) {
        kfree_skb(skb);
        return -ENOMEM;
    }

    /* Fragmented skb is linearized if device does not support SG,
     * or if at least one of fragments is in highmem and device
     * does not support DMA from it.
     */
    if (skb_shinfo(skb)->nr_frags &&
        (!(dev->features&NETIF_F_SG) || illegal_highdma(dev, skb)) &&
        skb_linearize(skb, GFP_ATOMIC) != 0) {
        kfree_skb(skb);
        return -ENOMEM;
    }

    /* Grab device queue */
    spin_lock_bh(&dev->queue_lock);
    q = dev->qdisc;
    if (q->enqueue) {
        int ret = q->enqueue(skb, q);

        qdisc_run(dev);

        spin_unlock_bh(&dev->queue_lock);
        return ret == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : ret;
    }

    /* The device has no queue. Common case for software devices:
       loopback, all sorts of tunnels...

       Really, it is unlikely that xmit_lock protection is necessary
       here. (f.e. loopback and IP tunnels are clean, ignoring statistics
       counters.) However, it is possible that they rely on the protection
       made by us here.

       Check this and take the lock: it is not prone to deadlocks.
       Or give such devices a noqueue qdisc, which is even simpler 8)
     */
    if (dev->flags&IFF_UP) {
        int cpu = smp_processor_id();

        if (dev->xmit_lock_owner != cpu) {
            spin_unlock(&dev->queue_lock);
            spin_lock(&dev->xmit_lock);
            dev->xmit_lock_owner = cpu;

            if (!netif_queue_stopped(dev)) {
                if (dev->hard_start_xmit(skb, dev) == 0) {
                    dev->xmit_lock_owner = -1;
                    spin_unlock_bh(&dev->xmit_lock);
                    return 0;
                }
            }
            dev->xmit_lock_owner = -1;
            spin_unlock_bh(&dev->xmit_lock);
            kfree_skb(skb);
            return -ENETDOWN;
        } else {
            /* Recursion is detected! It is possible, unfortunately */
        }
    }
    spin_unlock_bh(&dev->queue_lock);

    kfree_skb(skb);
    return -ENETDOWN;
}
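
/*
 * Transmit sketch (illustrative only, not part of the original file): the
 * state a caller must set up before dev_queue_xmit(); the length and
 * payload handling here are assumptions.
 *
 *     struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);
 *     if (skb == NULL)
 *         return -ENOMEM;
 *     memcpy(skb_put(skb, len), frame, len);  // build the frame
 *     skb->dev = the_dev;                     // choose the device
 *     return dev_queue_xmit(skb);             // consumes skb on all paths
 */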
/*=======================================================================
                        Receiver routines
  =======================================================================*/

int netdev_max_backlog = 300;
/* These numbers are selected based on intuition and some
 * experimentation; if you have a more scientific way of doing this
 * please go ahead and fix things.
 */
int no_cong_thresh = 10;
int no_cong = 20;
int lo_cong = 100;
int mod_cong = 290;

struct netif_rx_stats netdev_rx_stat[NR_CPUS];
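
/*
 * Worked example (illustrative note, not in the original source): the
 * thresholds above are applied by get_sample_stats() below to a moving
 * average computed as avg_blog = avg_blog/2 + backlog/2. For instance,
 * with avg_blog = 120 and an instantaneous backlog of 40 the new average
 * is 60 + 20 = 80: above no_cong (20) but below lo_cong (100), so the
 * congestion level reported is NET_RX_CN_LOW.
 */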
#ifdef CONFIG_NET_HW_FLOWCONTROL
atomic_t netdev_dropping = ATOMIC_INIT(0);
static unsigned long netdev_fc_mask = 1;
unsigned long netdev_fc_xoff = 0;
spinlock_t netdev_fc_lock = SPIN_LOCK_UNLOCKED;

static struct
{
    void (*stimul)(struct net_device *);
    struct net_device *dev;
} netdev_fc_slots[BITS_PER_LONG];

int netdev_register_fc(struct net_device *dev, void (*stimul)(struct net_device *dev))
{
    int bit = 0;
    unsigned long flags;

    spin_lock_irqsave(&netdev_fc_lock, flags);
    if (netdev_fc_mask != ~0UL) {
        bit = ffz(netdev_fc_mask);
        netdev_fc_slots[bit].stimul = stimul;
        netdev_fc_slots[bit].dev = dev;
        set_bit(bit, &netdev_fc_mask);
        clear_bit(bit, &netdev_fc_xoff);
    }
    spin_unlock_irqrestore(&netdev_fc_lock, flags);
    return bit;
}

void netdev_unregister_fc(int bit)
{
    unsigned long flags;

    spin_lock_irqsave(&netdev_fc_lock, flags);
    if (bit > 0) {
        netdev_fc_slots[bit].stimul = NULL;
        netdev_fc_slots[bit].dev = NULL;
        clear_bit(bit, &netdev_fc_mask);
        clear_bit(bit, &netdev_fc_xoff);
    }
    spin_unlock_irqrestore(&netdev_fc_lock, flags);
}

static void netdev_wakeup(void)
{
    unsigned long xoff;

    spin_lock(&netdev_fc_lock);
    xoff = netdev_fc_xoff;
    netdev_fc_xoff = 0;
    while (xoff) {
        int i = ffz(~xoff);
        xoff &= ~(1<<i);
        netdev_fc_slots[i].stimul(netdev_fc_slots[i].dev);
    }
    spin_unlock(&netdev_fc_lock);
}
#endif
static void get_sample_stats(int cpu)
{
    int blog = softnet_data[cpu].input_pkt_queue.qlen;
    int avg_blog = softnet_data[cpu].avg_blog;

    avg_blog = (avg_blog >> 1) + (blog >> 1);

    if (avg_blog > mod_cong) {
        /* Above moderate congestion levels. */
        softnet_data[cpu].cng_level = NET_RX_CN_HIGH;
    } else if (avg_blog > lo_cong) {
        softnet_data[cpu].cng_level = NET_RX_CN_MOD;
    } else if (avg_blog > no_cong)
        softnet_data[cpu].cng_level = NET_RX_CN_LOW;
    else /* no congestion */
        softnet_data[cpu].cng_level = NET_RX_SUCCESS;

    softnet_data[cpu].avg_blog = avg_blog;
}
/**
 * netif_rx - post buffer to the network code
 * @skb: buffer to post
 *
 * This function receives a packet from a device driver and queues it for
 * the upper (protocol) levels to process. It always succeeds. The buffer
 * may be dropped during processing for congestion control or by the
 * protocol layers.
 *
 * return values:
 * NET_RX_SUCCESS (no congestion)
 * NET_RX_CN_LOW (low congestion)
 * NET_RX_CN_MOD (moderate congestion)
 * NET_RX_CN_HIGH (high congestion)
 * NET_RX_DROP (packet was dropped)
 */

int netif_rx(struct sk_buff *skb)
{
#ifdef CONFIG_SMP
    unsigned long cpu_mask;
#endif
    struct task_struct *p;
    int this_cpu = smp_processor_id();
    struct softnet_data *queue;
    unsigned long flags;
    net_vif_t *vif;

    if (skb->stamp.tv_sec == 0)
        get_fast_time(&skb->stamp);

    /* The code is rearranged so that the path is shortest when the
     * CPU is congested but still operating.
     */
    queue = &softnet_data[this_cpu];

    local_irq_save(flags);

    netdev_rx_stat[this_cpu].total++;

    if ( skb->src_vif == VIF_UNKNOWN_INTERFACE )
        skb->src_vif = VIF_PHYSICAL_INTERFACE;

    if ( skb->dst_vif == VIF_UNKNOWN_INTERFACE )
        net_get_target_vif(skb);

    if ( (vif = sys_vif_list[skb->dst_vif]) == NULL )
    {
        /* The target vif does not exist. */
        goto drop;
    }

    /* This lock-and-walk of the task list isn't really necessary, and is
     * an artifact of the old code. The vif contains a pointer to the skb
     * list we are going to queue the packet in, so the lock and the inner
     * loop could be removed.
     *
     * The argument against this is a possible race in which a domain is
     * killed as packets are being delivered to it. This would result in
     * the dest vif vanishing before we can deliver to it.
     */

    if ( skb->dst_vif >= VIF_PHYSICAL_INTERFACE )
    {
        read_lock(&tasklist_lock);
        p = &idle0_task;
        do {
            if ( p->domain != vif->domain ) continue;
            if ( vif->skb_list.qlen > 100 ) break;
            skb_queue_tail(&vif->skb_list, skb);
            cpu_mask = mark_hyp_event(p, _HYP_EVENT_NET_RX);
            read_unlock(&tasklist_lock);
            goto found;
        }
        while ( (p = p->next_task) != &idle0_task );
        read_unlock(&tasklist_lock);
        goto drop;
    }

drop:
    netdev_rx_stat[this_cpu].dropped++;
    local_irq_restore(flags);
    kfree_skb(skb);
    return NET_RX_DROP;

found:
    hyp_event_notify(cpu_mask);
    local_irq_restore(flags);
    return 0;
}
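
/*
 * Receive-path sketch (illustrative only, not part of the original file):
 * how a physical driver hands a frame to netif_rx(); the descriptor
 * fields are assumptions for the example.
 *
 *     struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);
 *     if (skb == NULL)
 *         return;                          // out of memory: drop
 *     memcpy(skb_put(skb, len), data, len);
 *     skb->dev = dev;
 *     skb->protocol = eth_type_trans(skb, dev);
 *     netif_rx(skb);                       // queues to the target vif
 */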
/* Deliver skb to an old protocol, which is not well threaded
   or which does not understand shared skbs.
 */
static int deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int last)
{
    static spinlock_t net_bh_lock = SPIN_LOCK_UNLOCKED;
    int ret = NET_RX_DROP;

    if (!last) {
        skb = skb_clone(skb, GFP_ATOMIC);
        if (skb == NULL)
            return ret;
    }
    if (skb_is_nonlinear(skb) && skb_linearize(skb, GFP_ATOMIC) != 0) {
        kfree_skb(skb);
        return ret;
    }

    /* The assumption (a correct one) is that old protocols
       did not depend on BHs other than NET_BH and TIMER_BH.
     */

    /* Emulate NET_BH with special spinlock */
    spin_lock(&net_bh_lock);

    /* Disable timers and wait for all timers completion */
    tasklet_disable(bh_task_vec+TIMER_BH);

    ret = pt->func(skb, skb->dev, pt);

    tasklet_hi_enable(bh_task_vec+TIMER_BH);
    spin_unlock(&net_bh_lock);
    return ret;
}
static void net_tx_action(struct softirq_action *h)
{
    int cpu = smp_processor_id();

    if (softnet_data[cpu].completion_queue) {
        struct sk_buff *clist;

        local_irq_disable();
        clist = softnet_data[cpu].completion_queue;
        softnet_data[cpu].completion_queue = NULL;
        local_irq_enable();

        while (clist != NULL) {
            struct sk_buff *skb = clist;
            clist = clist->next;

            BUG_TRAP(atomic_read(&skb->users) == 0);
            __kfree_skb(skb);
        }
    }

    if (softnet_data[cpu].output_queue) {
        struct net_device *head;

        local_irq_disable();
        head = softnet_data[cpu].output_queue;
        softnet_data[cpu].output_queue = NULL;
        local_irq_enable();

        while (head != NULL) {
            struct net_device *dev = head;
            head = head->next_sched;

            smp_mb__before_clear_bit();
            clear_bit(__LINK_STATE_SCHED, &dev->state);

            if (spin_trylock(&dev->queue_lock)) {
                qdisc_run(dev);
                spin_unlock(&dev->queue_lock);
            } else {
                netif_schedule(dev);
            }
        }
    }
}
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
void (*br_handle_frame_hook)(struct sk_buff *skb) = NULL;
#endif

static __inline__ int handle_bridge(struct sk_buff *skb,
                                    struct packet_type *pt_prev)
{
    int ret = NET_RX_DROP;

    if (pt_prev) {
        if (!pt_prev->data)
            ret = deliver_to_old_ones(pt_prev, skb, 0);
        else {
            atomic_inc(&skb->users);
            ret = pt_prev->func(skb, skb->dev, pt_prev);
        }
    }

#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
    br_handle_frame_hook(skb);
#endif
    return ret;
}
#ifdef CONFIG_NET_DIVERT
static inline void handle_diverter(struct sk_buff *skb)
{
    /* if diversion is supported on device, then divert */
    if (skb->dev->divert && skb->dev->divert->divert)
        divert_frame(skb);
}
#endif /* CONFIG_NET_DIVERT */
void flush_rx_queue(void)
{
    struct sk_buff *skb;
    shared_info_t *s = current->shared_info;
    net_ring_t *net_ring;
    net_shadow_ring_t *shadow_ring;
    unsigned int i, nvif;
    rx_shadow_entry_t *rx;

    /* I have changed this to batch flush all vifs for a guest
     * at once, whenever this is called. Since the guest is about to be
     * scheduled and issued an RX interrupt for one nic, it might as well
     * receive all pending traffic although it will still only get
     * interrupts about rings that pass the event marker.
     *
     * If this doesn't make sense, _HYP_EVENT_NET_RX can be modified to
     * represent individual interrupts as _EVENT_NET_RX and the outer for
     * loop can be replaced with a translation to the specific NET
     * interrupt to serve. --akw
     */

    clear_bit(_HYP_EVENT_NET_RX, &current->hyp_events);

    for (nvif = 0; nvif < current->num_net_vifs; nvif++)
    {
        net_ring = current->net_vif_list[nvif]->net_ring;
        shadow_ring = current->net_vif_list[nvif]->shadow_ring;
        while ( (skb = skb_dequeue(&current->net_vif_list[nvif]->skb_list))
                != NULL )
        {
            /*
             * Write the virtual MAC address into the destination field
             * of the ethernet packet. Furthermore, do the same for ARP
             * reply packets. This is easy because the virtual MAC address
             * is always 00-00-00-00-00-00.
             *
             * Actually, the MAC address is now all zeros, except for the
             * second sixteen bits, which are the per-host vif id.
             * (so eth0 should be 00-00-..., eth1 is 00-01-...)
             */
            memset(skb->mac.ethernet->h_dest, 0, ETH_ALEN);
            *(unsigned int *)(skb->mac.ethernet->h_dest + 1) = nvif;
            if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
            {
                memset(skb->nh.raw + 18, 0, ETH_ALEN);
                *(unsigned int *)(skb->nh.raw + 18 + 1) = nvif;
            }

            i = net_ring->rx_cons;
            if ( i != net_ring->rx_prod )
            {
                if ( shadow_ring->rx_ring[i].status == RING_STATUS_OK )
                {
                    rx = shadow_ring->rx_ring + i;
                    if ( (skb->len + ETH_HLEN) < rx->size )
                        rx->size = skb->len + ETH_HLEN;
                    copy_to_user((void *)rx->addr, skb->mac.raw, rx->size);
                    copy_to_user(net_ring->rx_ring+i, rx, sizeof(rx_entry_t));
                }
                net_ring->rx_cons = (i+1) & (RX_RING_SIZE-1);
                if ( net_ring->rx_cons == net_ring->rx_event )
                    set_bit(_EVENT_NET_RX_FOR_VIF(nvif), &s->events);
            }
            kfree_skb(skb);
        }
    }
}
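
/*
 * Sketch of the trust boundary (illustrative summary, not extra code): the
 * loop above consumes descriptors only from the hypervisor-private shadow
 * ring, so a guest that rewrites net_ring->rx_ring[i] after posting it
 * cannot redirect where the hypervisor copies the frame:
 *
 *     rx = shadow_ring->rx_ring + i;               // trusted snapshot
 *     copy_to_user((void *)rx->addr, skb->mac.raw, rx->size);
 *     copy_to_user(net_ring->rx_ring + i, rx, sizeof(rx_entry_t));
 *                                                  // result published back
 */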
/*
 * Map an interface index to its name (SIOCGIFNAME)
 */

/*
 * We need this ioctl for efficient implementation of the
 * if_indextoname() function required by the IPv6 API. Without
 * it, we would have to search all the interfaces to find a
 * match. --pb
 */

static int dev_ifname(struct ifreq *arg)
{
    struct net_device *dev;
    struct ifreq ifr;

    /*
     * Fetch the caller's info block.
     */

    if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
        return -EFAULT;

    read_lock(&dev_base_lock);
    dev = __dev_get_by_index(ifr.ifr_ifindex);
    if (!dev) {
        read_unlock(&dev_base_lock);
        return -ENODEV;
    }

    strcpy(ifr.ifr_name, dev->name);
    read_unlock(&dev_base_lock);

    if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
        return -EFAULT;
    return 0;
}
/**
 * netdev_set_master - set up master/slave pair
 * @slave: slave device
 * @master: new master device
 *
 * Changes the master device of the slave. Pass %NULL to break the
 * bonding. The caller must hold the RTNL semaphore. On a failure
 * a negative errno code is returned. On success the reference counts
 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
 * function returns zero.
 */

int netdev_set_master(struct net_device *slave, struct net_device *master)
{
    struct net_device *old = slave->master;

    if (master) {
        if (old)
            return -EBUSY;
        dev_hold(master);
    }

    br_write_lock_bh(BR_NETPROTO_LOCK);
    slave->master = master;
    br_write_unlock_bh(BR_NETPROTO_LOCK);

    if (old)
        dev_put(old);

    if (master)
        slave->flags |= IFF_SLAVE;
    else
        slave->flags &= ~IFF_SLAVE;

    rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
    return 0;
}
/**
 * dev_set_promiscuity - update promiscuity count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove promiscuity from a device. While the count in the device
 * remains above zero the interface remains promiscuous. Once it hits zero
 * the device reverts back to normal filtering operation. A negative inc
 * value is used to drop promiscuity on the device.
 */

void dev_set_promiscuity(struct net_device *dev, int inc)
{
    unsigned short old_flags = dev->flags;

    dev->flags |= IFF_PROMISC;
    if ((dev->promiscuity += inc) == 0)
        dev->flags &= ~IFF_PROMISC;
    if (dev->flags^old_flags) {
#ifdef CONFIG_NET_FASTROUTE
        if (dev->flags&IFF_PROMISC) {
            netdev_fastroute_obstacles++;
            dev_clear_fastroute(dev);
        } else
            netdev_fastroute_obstacles--;
#endif
        dev_mc_upload(dev);
        printk(KERN_INFO "device %s %s promiscuous mode\n",
               dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left");
    }
}
/**
 * dev_set_allmulti - update allmulti count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove reception of all multicast frames to a device. While the
 * count in the device remains above zero the interface remains listening
 * to all multicast frames. Once it hits zero the device reverts back to
 * normal filtering operation. A negative @inc value is used to drop the
 * counter when releasing a resource needing all multicasts.
 */

void dev_set_allmulti(struct net_device *dev, int inc)
{
    unsigned short old_flags = dev->flags;

    dev->flags |= IFF_ALLMULTI;
    if ((dev->allmulti += inc) == 0)
        dev->flags &= ~IFF_ALLMULTI;
    if (dev->flags^old_flags)
        dev_mc_upload(dev);
}
int dev_change_flags(struct net_device *dev, unsigned flags)
{
    int ret;
    int old_flags = dev->flags;

    /*
     * Set the flags on our device.
     */

    dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP|IFF_DYNAMIC|
                           IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) |
                 (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI));

    /*
     * Load in the correct multicast list now the flags have changed.
     */

    dev_mc_upload(dev);

    /*
     * Have we downed the interface? We handle IFF_UP ourselves
     * according to user attempts to set it, rather than blindly
     * setting it.
     */

    ret = 0;
    if ((old_flags^flags)&IFF_UP) /* Bit is different? */
    {
        ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);

        if (ret == 0)
            dev_mc_upload(dev);
    }

    if (dev->flags&IFF_UP &&
        ((old_flags^dev->flags)&~(IFF_UP|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE)))
        notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);

    if ((flags^dev->gflags)&IFF_PROMISC) {
        int inc = (flags&IFF_PROMISC) ? +1 : -1;
        dev->gflags ^= IFF_PROMISC;
        dev_set_promiscuity(dev, inc);
    }

    /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
       is important. Some (broken) drivers set IFF_PROMISC, when
       IFF_ALLMULTI is requested, not asking us and not reporting.
     */
    if ((flags^dev->gflags)&IFF_ALLMULTI) {
        int inc = (flags&IFF_ALLMULTI) ? +1 : -1;
        dev->gflags ^= IFF_ALLMULTI;
        dev_set_allmulti(dev, inc);
    }

    if (old_flags^dev->flags)
        rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags^dev->flags);

    return ret;
}
/*
 * Perform the SIOCxIFxxx calls.
 */

static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
{
    struct net_device *dev;
    int err;

    if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL)
        return -ENODEV;

    switch(cmd)
    {
    case SIOCGIFFLAGS: /* Get interface flags */
        ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI|IFF_RUNNING))
                         |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI));
        if (netif_running(dev) && netif_carrier_ok(dev))
            ifr->ifr_flags |= IFF_RUNNING;
        return 0;

    case SIOCSIFFLAGS: /* Set interface flags */
        return dev_change_flags(dev, ifr->ifr_flags);

    case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */
        ifr->ifr_metric = 0;
        return 0;

    case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */
        return -EOPNOTSUPP;

    case SIOCGIFMTU: /* Get the MTU of a device */
        ifr->ifr_mtu = dev->mtu;
        return 0;

    case SIOCSIFMTU: /* Set the MTU of a device */
        if (ifr->ifr_mtu == dev->mtu)
            return 0;

        /*
         * MTU must be positive.
         */

        if (ifr->ifr_mtu < 0)
            return -EINVAL;

        if (!netif_device_present(dev))
            return -ENODEV;

        if (dev->change_mtu)
            err = dev->change_mtu(dev, ifr->ifr_mtu);
        else {
            dev->mtu = ifr->ifr_mtu;
            err = 0;
        }
        if (!err && dev->flags&IFF_UP)
            notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev);
        return err;
    case SIOCGIFHWADDR:
        memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, MAX_ADDR_LEN);
        ifr->ifr_hwaddr.sa_family = dev->type;
        return 0;

    case SIOCSIFHWADDR:
        if (dev->set_mac_address == NULL)
            return -EOPNOTSUPP;
        if (ifr->ifr_hwaddr.sa_family != dev->type)
            return -EINVAL;
        if (!netif_device_present(dev))
            return -ENODEV;
        err = dev->set_mac_address(dev, &ifr->ifr_hwaddr);
        if (!err)
            notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
        return err;

    case SIOCSIFHWBROADCAST:
        if (ifr->ifr_hwaddr.sa_family != dev->type)
            return -EINVAL;
        memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN);
        notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
        return 0;

    case SIOCGIFMAP:
        ifr->ifr_map.mem_start = dev->mem_start;
        ifr->ifr_map.mem_end = dev->mem_end;
        ifr->ifr_map.base_addr = dev->base_addr;
        ifr->ifr_map.irq = dev->irq;
        ifr->ifr_map.dma = dev->dma;
        ifr->ifr_map.port = dev->if_port;
        return 0;

    case SIOCSIFMAP:
        if (dev->set_config) {
            if (!netif_device_present(dev))
                return -ENODEV;
            return dev->set_config(dev, &ifr->ifr_map);
        }
        return -EOPNOTSUPP;

    case SIOCADDMULTI:
        if (dev->set_multicast_list == NULL ||
            ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
            return -EINVAL;
        if (!netif_device_present(dev))
            return -ENODEV;
        dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, dev->addr_len, 1);
        return 0;

    case SIOCDELMULTI:
        if (dev->set_multicast_list == NULL ||
            ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
            return -EINVAL;
        if (!netif_device_present(dev))
            return -ENODEV;
        dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, dev->addr_len, 1);
        return 0;

    case SIOCGIFINDEX:
        ifr->ifr_ifindex = dev->ifindex;
        return 0;

    case SIOCGIFTXQLEN:
        ifr->ifr_qlen = dev->tx_queue_len;
        return 0;

    case SIOCSIFTXQLEN:
        if (ifr->ifr_qlen < 0)
            return -EINVAL;
        dev->tx_queue_len = ifr->ifr_qlen;
        return 0;

    case SIOCSIFNAME:
        if (dev->flags&IFF_UP)
            return -EBUSY;
        if (__dev_get_by_name(ifr->ifr_newname))
            return -EEXIST;
        memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ);
        dev->name[IFNAMSIZ-1] = 0;
        notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
        return 0;
#ifdef WIRELESS_EXT
    case SIOCGIWSTATS:
        return dev_iwstats(dev, ifr);
#endif /* WIRELESS_EXT */

    /*
     * Unknown or private ioctl
     */

    default:
        if ((cmd >= SIOCDEVPRIVATE &&
             cmd <= SIOCDEVPRIVATE + 15) ||
            cmd == SIOCBONDENSLAVE ||
            cmd == SIOCBONDRELEASE ||
            cmd == SIOCBONDSETHWADDR ||
            cmd == SIOCBONDSLAVEINFOQUERY ||
            cmd == SIOCBONDINFOQUERY ||
            cmd == SIOCBONDCHANGEACTIVE ||
            cmd == SIOCETHTOOL ||
            cmd == SIOCGMIIPHY ||
            cmd == SIOCGMIIREG ||
            cmd == SIOCSMIIREG) {
            if (dev->do_ioctl) {
                if (!netif_device_present(dev))
                    return -ENODEV;
                return dev->do_ioctl(dev, ifr, cmd);
            }
            return -EOPNOTSUPP;
        }

#ifdef WIRELESS_EXT
        if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
            if (dev->do_ioctl) {
                if (!netif_device_present(dev))
                    return -ENODEV;
                return dev->do_ioctl(dev, ifr, cmd);
            }
            return -EOPNOTSUPP;
        }
#endif /* WIRELESS_EXT */

    }
    return -EINVAL;
}
/*
 * This function handles all "interface"-type I/O control requests. The
 * actual 'doing' part of this is dev_ifsioc above.
 */

/**
 * dev_ioctl - network device ioctl
 * @cmd: command to issue
 * @arg: pointer to a struct ifreq in user space
 *
 * Issue ioctl functions to devices. This is normally called by the
 * user space syscall interfaces but can sometimes be useful for
 * other purposes. The return value is the return from the syscall if
 * positive or a negative errno code on error.
 */
int dev_ioctl(unsigned int cmd, void *arg)
{
    struct ifreq ifr;
    int ret;
    char *colon;

    /* One special case: SIOCGIFCONF takes ifconf argument
       and requires shared lock, because it sleeps writing
       to user space.
     */

    if (cmd == SIOCGIFCONF) {
        return -ENOSYS;
    }
    if (cmd == SIOCGIFNAME) {
        return dev_ifname((struct ifreq *)arg);
    }

    if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
        return -EFAULT;

    ifr.ifr_name[IFNAMSIZ-1] = 0;

    colon = strchr(ifr.ifr_name, ':');
    if (colon)
        *colon = 0;

    /*
     * See which interface the caller is talking about.
     */
    switch(cmd)
    {
    /*
     * These ioctl calls:
     * - can be done by all.
     * - atomic and do not require locking.
     * - return a value
     */

    case SIOCGIFFLAGS:
    case SIOCGIFMETRIC:
    case SIOCGIFMTU:
    case SIOCGIFHWADDR:
    case SIOCGIFSLAVE:
    case SIOCGIFMAP:
    case SIOCGIFINDEX:
    case SIOCGIFTXQLEN:
        dev_load(ifr.ifr_name);
        read_lock(&dev_base_lock);
        ret = dev_ifsioc(&ifr, cmd);
        read_unlock(&dev_base_lock);
        if (!ret) {
            if (colon)
                *colon = ':';
            if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
                return -EFAULT;
        }
        return ret;
    /*
     * These ioctl calls:
     * - require superuser power.
     * - require strict serialization.
     * - return a value
     */

    case SIOCETHTOOL:
    case SIOCGMIIPHY:
    case SIOCGMIIREG:
        if (!capable(CAP_NET_ADMIN))
            return -EPERM;
        dev_load(ifr.ifr_name);
        dev_probe_lock();
        rtnl_lock();
        ret = dev_ifsioc(&ifr, cmd);
        rtnl_unlock();
        dev_probe_unlock();
        if (!ret) {
            if (colon)
                *colon = ':';
            if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
                return -EFAULT;
        }
        return ret;
    /*
     * These ioctl calls:
     * - require superuser power.
     * - require strict serialization.
     * - do not return a value
     */

    case SIOCSIFFLAGS:
    case SIOCSIFMETRIC:
    case SIOCSIFMTU:
    case SIOCSIFMAP:
    case SIOCSIFHWADDR:
    case SIOCSIFSLAVE:
    case SIOCADDMULTI:
    case SIOCDELMULTI:
    case SIOCSIFHWBROADCAST:
    case SIOCSIFTXQLEN:
    case SIOCSIFNAME:
    case SIOCSMIIREG:
    case SIOCBONDENSLAVE:
    case SIOCBONDRELEASE:
    case SIOCBONDSETHWADDR:
    case SIOCBONDSLAVEINFOQUERY:
    case SIOCBONDINFOQUERY:
    case SIOCBONDCHANGEACTIVE:
        if (!capable(CAP_NET_ADMIN))
            return -EPERM;
        dev_load(ifr.ifr_name);
        dev_probe_lock();
        rtnl_lock();
        ret = dev_ifsioc(&ifr, cmd);
        rtnl_unlock();
        dev_probe_unlock();
        return ret;

    case SIOCGIFMEM:
        /* Get the per device memory space. We can add this but currently
           do not support it */
    case SIOCSIFMEM:
        /* Set the per device memory buffer space. Not applicable in our case */
    case SIOCSIFLINK:
        return -EINVAL;

    /*
     * Unknown or private ioctl.
     */

    default:
        if (cmd >= SIOCDEVPRIVATE &&
            cmd <= SIOCDEVPRIVATE + 15) {
            dev_load(ifr.ifr_name);
            dev_probe_lock();
            rtnl_lock();
            ret = dev_ifsioc(&ifr, cmd);
            rtnl_unlock();
            dev_probe_unlock();
            if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))
                return -EFAULT;
            return ret;
        }
#ifdef WIRELESS_EXT
        /* Take care of Wireless Extensions */
        if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
            /* If command is `set a parameter', or
             * `get the encoding parameters', check if
             * the user has the right to do it */
            if (IW_IS_SET(cmd) || (cmd == SIOCGIWENCODE)) {
                if (!capable(CAP_NET_ADMIN))
                    return -EPERM;
            }
            dev_load(ifr.ifr_name);
            rtnl_lock();
            ret = dev_ifsioc(&ifr, cmd);
            rtnl_unlock();
            if (!ret && IW_IS_GET(cmd) &&
                copy_to_user(arg, &ifr, sizeof(struct ifreq)))
                return -EFAULT;
            return ret;
        }
#endif /* WIRELESS_EXT */
        return -EINVAL;
    }
}
/**
 * dev_new_index - allocate an ifindex
 *
 * Returns a suitable unique value for a new device interface
 * number. The caller must hold the rtnl semaphore or the
 * dev_base_lock to be sure it remains unique.
 */

int dev_new_index(void)
{
    static int ifindex;
    for (;;) {
        if (++ifindex <= 0)
            ifindex = 1;
        if (__dev_get_by_index(ifindex) == NULL)
            return ifindex;
    }
}
static int dev_boot_phase = 1;

/**
 * register_netdevice - register a network device
 * @dev: device to register
 *
 * Take a completed network device structure and add it to the kernel
 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 * chain. 0 is returned on success. A negative errno code is returned
 * on a failure to set up the device, or if the name is a duplicate.
 *
 * Callers must hold the rtnl semaphore. See the comment at the
 * end of Space.c for details about the locking. You may want
 * register_netdev() instead of this.
 *
 * BUGS:
 * The locking appears insufficient to guarantee two parallel registers
 * will not get the same name.
 */

int net_dev_init(void);

int register_netdevice(struct net_device *dev)
{
    struct net_device *d, **dp;
#ifdef CONFIG_NET_DIVERT
    int ret;
#endif

    spin_lock_init(&dev->queue_lock);
    spin_lock_init(&dev->xmit_lock);
    dev->xmit_lock_owner = -1;
#ifdef CONFIG_NET_FASTROUTE
    dev->fastpath_lock = RW_LOCK_UNLOCKED;
#endif

    if (dev_boot_phase)
        net_dev_init();

#ifdef CONFIG_NET_DIVERT
    ret = alloc_divert_blk(dev);
    if (ret)
        return ret;
#endif /* CONFIG_NET_DIVERT */

    dev->iflink = -1;

    /* Init, if this function is available */
    if (dev->init && dev->init(dev) != 0) {
#ifdef CONFIG_NET_DIVERT
        free_divert_blk(dev);
#endif
        return -EIO;
    }

    dev->ifindex = dev_new_index();
    if (dev->iflink == -1)
        dev->iflink = dev->ifindex;

    /* Check for existence, and append to tail of chain */
    for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
        if (d == dev || strcmp(d->name, dev->name) == 0) {
#ifdef CONFIG_NET_DIVERT
            free_divert_blk(dev);
#endif
            return -EEXIST;
        }
    }

    /*
     * A nil rebuild_header routine that should never be called;
     * it is used just as a bug trap.
     */

    if (dev->rebuild_header == NULL)
        dev->rebuild_header = default_rebuild_header;

    /*
     * Default initial state at registry is that the
     * device is present.
     */

    set_bit(__LINK_STATE_PRESENT, &dev->state);

    dev->next = NULL;
    dev_init_scheduler(dev);
    write_lock_bh(&dev_base_lock);
    *dp = dev;
    dev_hold(dev);
    dev->deadbeaf = 0;
    write_unlock_bh(&dev_base_lock);

    /* Notify protocols, that a new device appeared. */
    notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);

    return 0;
}
/**
 * netdev_finish_unregister - complete unregistration
 * @dev: device
 *
 * Destroy and free a dead device. A value of zero is returned on
 * success.
 */

int netdev_finish_unregister(struct net_device *dev)
{
    BUG_TRAP(dev->ip_ptr == NULL);
    BUG_TRAP(dev->ip6_ptr == NULL);
    BUG_TRAP(dev->dn_ptr == NULL);

    if (!dev->deadbeaf) {
        printk(KERN_ERR "Freeing alive device %p, %s\n", dev, dev->name);
        return 0;
    }
#ifdef NET_REFCNT_DEBUG
    printk(KERN_DEBUG "netdev_finish_unregister: %s%s.\n", dev->name,
           (dev->features & NETIF_F_DYNALLOC) ? "" : ", old style");
#endif
    if (dev->destructor)
        dev->destructor(dev);
    if (dev->features & NETIF_F_DYNALLOC)
        kfree(dev);
    return 0;
}
/**
 * unregister_netdevice - remove device from the kernel
 * @dev: device
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables. On success 0 is returned, on a failure
 * a negative errno code is returned.
 *
 * Callers must hold the rtnl semaphore. See the comment at the
 * end of Space.c for details about the locking. You may want
 * unregister_netdev() instead of this.
 */

int unregister_netdevice(struct net_device *dev)
{
    unsigned long now, warning_time;
    struct net_device *d, **dp;

    /* If device is running, close it first. */
    if (dev->flags & IFF_UP)
        dev_close(dev);

    BUG_TRAP(dev->deadbeaf == 0);
    dev->deadbeaf = 1;

    /* And unlink it from device chain. */
    for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
        if (d == dev) {
            write_lock_bh(&dev_base_lock);
            *dp = d->next;
            write_unlock_bh(&dev_base_lock);
            break;
        }
    }

    if (d == NULL) {
        printk(KERN_DEBUG "unregister_netdevice: device %s/%p never was registered\n", dev->name, dev);
        return -ENODEV;
    }

    /* Synchronize to net_rx_action. */
    br_write_lock_bh(BR_NETPROTO_LOCK);
    br_write_unlock_bh(BR_NETPROTO_LOCK);

    if (dev_boot_phase == 0) {

        /* Shutdown queueing discipline. */
        dev_shutdown(dev);

        /* Notify protocols, that we are about to destroy
           this device. They should clean all the things.
         */
        notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);

        /*
         * Flush the multicast chain
         */
        dev_mc_discard(dev);
    }

    if (dev->uninit)
        dev->uninit(dev);

    /* Notifier chain MUST detach us from master device. */
    BUG_TRAP(dev->master == NULL);

#ifdef CONFIG_NET_DIVERT
    free_divert_blk(dev);
#endif

    if (dev->features & NETIF_F_DYNALLOC) {
#ifdef NET_REFCNT_DEBUG
        if (atomic_read(&dev->refcnt) != 1)
            printk(KERN_DEBUG "unregister_netdevice: holding %s refcnt=%d\n",
                   dev->name, atomic_read(&dev->refcnt)-1);
#endif
        dev_put(dev);
        return 0;
    }

    /* Last reference is our one */
    if (atomic_read(&dev->refcnt) == 1) {
        dev_put(dev);
        return 0;
    }

#ifdef NET_REFCNT_DEBUG
    printk("unregister_netdevice: waiting %s refcnt=%d\n",
           dev->name, atomic_read(&dev->refcnt));
#endif

    /* EXPLANATION. If dev->refcnt is not now 1 (our own reference)
       it means that someone in the kernel still has a reference
       to this device and we cannot release it.

       "New style" devices have destructors, hence we can return from this
       function and the destructor will do all the work later. As of kernel
       2.4.0 there are very few "new style" devices.

       "Old style" devices expect that the device is free of any references
       upon exit from this function.
       We cannot return from this function until all such references have
       fallen away. This is because the caller of this function will
       probably immediately kfree(*dev) and then be unloaded via
       sys_delete_module.

       So, we linger until all references fall away. The duration of the
       linger is basically unbounded! It is driven by, for example, the
       current setting of sysctl_ipfrag_time.

       After 1 second, we start to rebroadcast unregister notifications
       in the hope that careless clients will release the device.
     */

    now = warning_time = jiffies;
    while (atomic_read(&dev->refcnt) != 1) {
        if ((jiffies - now) > 1*HZ) {
            /* Rebroadcast unregister notification */
            notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
        }
        mdelay(250);
        if ((jiffies - warning_time) > 10*HZ) {
            printk(KERN_EMERG "unregister_netdevice: waiting for %s to "
                   "become free. Usage count = %d\n",
                   dev->name, atomic_read(&dev->refcnt));
            warning_time = jiffies;
        }
    }
    dev_put(dev);
    return 0;
}
/*
 * Initialize the DEV module. At boot time this walks the device list and
 * unhooks any devices that fail to initialise (normally hardware not
 * present) and leaves us with a valid list of present and active devices.
 */

extern void net_device_init(void);
extern void ip_auto_config(void);
#ifdef CONFIG_NET_DIVERT
extern void dv_init(void);
#endif /* CONFIG_NET_DIVERT */

/*
 * Callers must hold the rtnl semaphore. See the comment at the
 * end of Space.c for details about the locking.
 */
int __init net_dev_init(void)
{
    struct net_device *dev, **dp;
    int i;

    if (!dev_boot_phase)
        return 0;

    /*
     * KAF: this was done in socket_init, but that top-half stuff is gone.
     */
    skb_init();

    /*
     * Initialise the packet receive queues.
     */

    for (i = 0; i < NR_CPUS; i++) {
        struct softnet_data *queue;

        queue = &softnet_data[i];
        skb_queue_head_init(&queue->input_pkt_queue);
        queue->throttle = 0;
        queue->cng_level = 0;
        queue->avg_blog = 10; /* arbitrary non-zero */
        queue->completion_queue = NULL;
    }

    /*
     * Add the devices.
     * If the call to dev->init fails, the dev is removed
     * from the chain disconnecting the device until the
     * next reboot.
     *
     * NB At boot phase networking is dead. No locking is required.
     * But we still preserve dev_base_lock for sanity.
     */

    dp = &dev_base;
    while ((dev = *dp) != NULL) {
        spin_lock_init(&dev->queue_lock);
        spin_lock_init(&dev->xmit_lock);

        dev->xmit_lock_owner = -1;
        dev->iflink = -1;
        dev_hold(dev);

        /*
         * Allocate name. If the init() fails
         * the name will be reissued correctly.
         */
        if (strchr(dev->name, '%'))
            dev_alloc_name(dev, dev->name);

        if (dev->init && dev->init(dev)) {
            /*
             * It failed to come up. It will be unhooked later.
             * dev_alloc_name can now advance to next suitable
             * name that is checked next.
             */
            dev->deadbeaf = 1;
            dp = &dev->next;
        } else {
            dp = &dev->next;
            dev->ifindex = dev_new_index();
            if (dev->iflink == -1)
                dev->iflink = dev->ifindex;
            if (dev->rebuild_header == NULL)
                dev->rebuild_header = default_rebuild_header;
            dev_init_scheduler(dev);
            set_bit(__LINK_STATE_PRESENT, &dev->state);
        }
    }

    /*
     * Unhook devices that failed to come up
     */
    dp = &dev_base;
    while ((dev = *dp) != NULL) {
        if (dev->deadbeaf) {
            write_lock_bh(&dev_base_lock);
            *dp = dev->next;
            write_unlock_bh(&dev_base_lock);
            dev_put(dev);
        } else {
            dp = &dev->next;
        }
    }

    dev_boot_phase = 0;

    open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
    //open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);

    dst_init();
    dev_mcast_init();

#ifdef CONFIG_NET_SCHED
    pktsched_init();
#endif

    /*
     * Initialise network devices
     */

    net_device_init();

    return 0;
}
/*
 * do_net_update:
 *
 * Called from the guest OS to notify updates to its transmit and/or
 * receive descriptor rings.
 */

#define TX_RING_INC(_i)    (((_i)+1) & (TX_RING_SIZE-1))
#define RX_RING_INC(_i)    (((_i)+1) & (RX_RING_SIZE-1))
#define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
#define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
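
/*
 * Note (illustrative, not in the original source): these macros assume
 * TX_RING_SIZE and RX_RING_SIZE are powers of two, so masking with
 * (SIZE-1) is a cheap modulo. For example, assuming RX_RING_SIZE == 256:
 *
 *     RX_RING_INC(255)     == 256 & 255 == 0     // wraps to the start
 *     RX_RING_ADD(250, 10) == 260 & 255 == 4
 */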
long do_net_update(void)
{
    shared_info_t *shared = current->shared_info;
    net_ring_t *net_ring;
    net_shadow_ring_t *shadow_ring;
    net_vif_t *current_vif;
    unsigned int i, j;
    struct sk_buff *skb;
    tx_entry_t tx;

    for ( j = 0; j < current->num_net_vifs; j++)
    {
        current_vif = current->net_vif_list[j];
        net_ring = current_vif->net_ring;

        /* First, we send out pending TX descriptors if they exist on this
         * ring.
         */

        for ( i = net_ring->tx_cons; i != net_ring->tx_prod; i = TX_RING_INC(i) )
        {
            if ( copy_from_user(&tx, net_ring->tx_ring+i, sizeof(tx)) )
                continue;

            if ( TX_RING_INC(i) == net_ring->tx_event )
                set_bit(_EVENT_NET_TX_FOR_VIF(j), &shared->events);

            skb = alloc_skb(tx.size, GFP_KERNEL);
            if ( skb == NULL ) continue;
            skb_put(skb, tx.size);
            if ( copy_from_user(skb->data, (void *)tx.addr, tx.size) )
            {
                kfree_skb(skb);
                continue;
            }
            skb->dev = the_dev;

            if ( skb->len < 16 )
            {
                kfree_skb(skb);
                continue;
            }

            memcpy(skb->data + ETH_ALEN, skb->dev->dev_addr, ETH_ALEN);

            switch ( ntohs(*(unsigned short *)(skb->data + 12)) )
            {
            case ETH_P_ARP:
                skb->protocol = __constant_htons(ETH_P_ARP);
                if ( skb->len < 42 ) break;
                memcpy(skb->data + 22, skb->dev->dev_addr, 6);
                break;
            case ETH_P_IP:
                skb->protocol = __constant_htons(ETH_P_IP);
                break;
            default:
                kfree_skb(skb);
                skb = NULL;
                break;
            }

            if ( skb != NULL )
            {
                skb->protocol = eth_type_trans(skb, skb->dev);
                skb->src_vif = current_vif->id;
                net_get_target_vif(skb);
                if ( skb->dst_vif > VIF_PHYSICAL_INTERFACE )
                {
                    (void)netif_rx(skb);
                }
                else if ( skb->dst_vif == VIF_PHYSICAL_INTERFACE )
                {
                    skb_push(skb, skb->dev->hard_header_len);
                    dev_queue_xmit(skb);
                }
                else
                {
                    kfree_skb(skb);
                }
            }
        }
        net_ring->tx_cons = i;

        /* Next, pull any new RX descriptors across to the shadow ring.
         * Note that in the next revision, these will reference PTEs and the
         * code here will have to validate reference and flush counts, copy
         * the descriptor, change the ownership to dom0 and invalidate the
         * client's version of the page.
         */

        shadow_ring = current_vif->shadow_ring;

        for ( i = shadow_ring->rx_prod; i != net_ring->rx_prod; i = RX_RING_INC(i) )
        {
            /* This copy assumes that rx_shadow_entry_t is an extension of
             * rx_net_entry_t; extra fields must be tacked on to the end.
             */
            if ( copy_from_user( shadow_ring->rx_ring+i, net_ring->rx_ring+i,
                                 sizeof (rx_entry_t) ) )
            {
                shadow_ring->rx_ring[i].status = RING_STATUS_ERR_CFU;
            } else {
                shadow_ring->rx_ring[i].status = RING_STATUS_OK;
            }
        }

        shadow_ring->rx_prod = net_ring->rx_prod;
    }

    return 0;
}
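
/*
 * Guest-side sketch (an assumption for illustration; the hypercall wrapper
 * name is hypothetical and not from this tree): the handshake
 * do_net_update() expects when a guest posts an RX buffer.
 *
 *     i = net_ring->rx_prod;
 *     net_ring->rx_ring[i].addr = (unsigned long)buf;   // guest buffer
 *     net_ring->rx_ring[i].size = buf_len;
 *     net_ring->rx_prod = RX_RING_INC(i);
 *     HYPERVISOR_net_update();   // hypothetical trap into do_net_update()
 */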
int setup_network_devices(void)
{
    int ret;
    struct net_device *dev = dev_get_by_name("eth0");

    if ( dev == NULL )
    {
        printk("Could not find device eth0\n");
        return 0;
    }

    ret = dev_open(dev);
    if ( ret != 0 )
    {
        printk("Error opening device eth0 for use (%d)\n", ret);
        return 0;
    }
    printk("Device eth0 opened and ready for use\n");
    the_dev = dev;

    return 1;
}