ia64/xen-unstable

view xen/net/dev.c @ 665:63045128dbbb

bitkeeper revision 1.380 (3f27db60mH7HcLNPwteU9ODmiRrLTg)

dev.c:
Fix network tx events
author kaf24@scramble.cl.cam.ac.uk
date Wed Jul 30 14:51:12 2003 +0000 (2003-07-30)
parents 8772d7598f1e
children 6c29b3a63599
line source
/*
 * NET3 Protocol independent device support routines.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/config.h>
#include <linux/delay.h>
#include <linux/lib.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/brlock.h>
#include <linux/init.h>
#include <linux/module.h>

#include <linux/event.h>
#include <asm/domain_page.h>
#include <asm/pgalloc.h>

#include <xeno/perfc.h>

#define BUG_TRAP ASSERT
#define notifier_call_chain(_a,_b,_c) ((void)0)
#define rtmsg_ifinfo(_a,_b,_c) ((void)0)
#define rtnl_lock() ((void)0)
#define rtnl_unlock() ((void)0)

#if 0
#define DPRINTK(_f, _a...) printk(_f , ## _a)
#else
#define DPRINTK(_f, _a...) ((void)0)
#endif

#define TX_RING_INC(_i)    (((_i)+1) & (TX_RING_SIZE-1))
#define RX_RING_INC(_i)    (((_i)+1) & (RX_RING_SIZE-1))
#define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
#define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
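
/*
 * These ring macros assume TX_RING_SIZE and RX_RING_SIZE are powers of
 * two, so that masking with (SIZE-1) is equivalent to reduction modulo
 * SIZE. A worked example, assuming a hypothetical ring size of 256:
 *
 *     (255 + 1) & (256 - 1) == 0     index wraps from last slot to first
 *     ( 10 + 4) & (256 - 1) == 14    ordinary advance within the ring
 */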
static void make_tx_response(net_vif_t     *vif,
                             unsigned short id,
                             unsigned char  st);
static void make_rx_response(net_vif_t     *vif,
                             unsigned short id,
                             unsigned short size,
                             unsigned char  st,
                             unsigned char  off);

struct net_device *the_dev = NULL;
/*
 * Transmitted packets are fragmented, so we can copy the important headers
 * before checking them for validity. This avoids the need for page protection.
 */
/* Ethernet + IP headers */
#define PKT_PROT_LEN (ETH_HLEN + 20)
static kmem_cache_t *net_header_cachep;
/**
 * __dev_get_by_name - find a device by its name
 * @name: name to find
 *
 * Find an interface by name. Must be called under RTNL semaphore
 * or @dev_base_lock. If the name is found a pointer to the device
 * is returned. If the name is not found then %NULL is returned. The
 * reference counters are not incremented so the caller must be
 * careful with locks.
 */

struct net_device *__dev_get_by_name(const char *name)
{
    struct net_device *dev;

    for (dev = dev_base; dev != NULL; dev = dev->next) {
        if (strncmp(dev->name, name, IFNAMSIZ) == 0)
            return dev;
    }
    return NULL;
}
/**
 * dev_get_by_name - find a device by its name
 * @name: name to find
 *
 * Find an interface by name. This can be called from any
 * context and does its own locking. The returned handle has
 * the usage count incremented and the caller must use dev_put() to
 * release it when it is no longer needed. %NULL is returned if no
 * matching device is found.
 */

struct net_device *dev_get_by_name(const char *name)
{
    struct net_device *dev;

    read_lock(&dev_base_lock);
    dev = __dev_get_by_name(name);
    if (dev)
        dev_hold(dev);
    read_unlock(&dev_base_lock);
    return dev;
}
/**
 * dev_get - test if a device exists
 * @name: name to test for
 *
 * Test if a name exists. Returns true if the name is found. In order
 * to be sure the name is not allocated or removed during the test the
 * caller must hold the rtnl semaphore.
 *
 * This function exists primarily for backwards compatibility with
 * older drivers.
 */

int dev_get(const char *name)
{
    struct net_device *dev;

    read_lock(&dev_base_lock);
    dev = __dev_get_by_name(name);
    read_unlock(&dev_base_lock);
    return dev != NULL;
}
/**
 * __dev_get_by_index - find a device by its ifindex
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns %NULL if the device
 * is not found or a pointer to the device. The device has not
 * had its reference counter increased so the caller must be careful
 * about locking. The caller must hold either the RTNL semaphore
 * or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(int ifindex)
{
    struct net_device *dev;

    for (dev = dev_base; dev != NULL; dev = dev->next) {
        if (dev->ifindex == ifindex)
            return dev;
    }
    return NULL;
}
/**
 * dev_get_by_index - find a device by its ifindex
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns NULL if the device
 * is not found or a pointer to the device. The device returned has
 * had a reference added and the pointer is safe until the user calls
 * dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(int ifindex)
{
    struct net_device *dev;

    read_lock(&dev_base_lock);
    dev = __dev_get_by_index(ifindex);
    if (dev)
        dev_hold(dev);
    read_unlock(&dev_base_lock);
    return dev;
}
/**
 * dev_getbyhwaddr - find a device by its hardware address
 * @type: media type of device
 * @ha: hardware address
 *
 * Search for an interface by MAC address. Returns NULL if the device
 * is not found or a pointer to the device. The caller must hold the
 * rtnl semaphore. The returned device has not had its ref count increased
 * and the caller must therefore be careful about locking.
 *
 * BUGS:
 * If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
{
    struct net_device *dev;

    for (dev = dev_base; dev != NULL; dev = dev->next) {
        if (dev->type == type &&
            memcmp(dev->dev_addr, ha, dev->addr_len) == 0)
            return dev;
    }
    return NULL;
}
/**
 * dev_alloc_name - allocate a name for a device
 * @dev: device
 * @name: name format string
 *
 * Passed a format string - eg "lt%d" it will try and find a suitable
 * id. Not efficient for many devices, not called a lot. The caller
 * must hold the dev_base or rtnl lock while allocating the name and
 * adding the device in order to avoid duplicates. Returns the number
 * of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
    int i;
    char buf[32];
    char *p;

    /*
     * Verify the string as this thing may have come from
     * the user. There must be either one "%d" and no other "%"
     * characters, or no "%" characters at all.
     */
    p = strchr(name, '%');
    if (p && (p[1] != 'd' || strchr(p+2, '%')))
        return -EINVAL;

    /*
     * If you need over 100 please also fix the algorithm...
     */
    for (i = 0; i < 100; i++) {
        snprintf(buf, sizeof(buf), name, i);
        if (__dev_get_by_name(buf) == NULL) {
            strcpy(dev->name, buf);
            return i;
        }
    }
    return -ENFILE; /* Over 100 of the things .. bail out! */
}
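
/*
 * A minimal usage sketch (hypothetical caller): passing a format such as
 * "eth%d" assigns the lowest unused unit number.
 *
 *     int unit = dev_alloc_name(dev, "eth%d");
 *     if (unit < 0)
 *         return unit;         (unit is -EINVAL or -ENFILE on failure)
 *
 * On success dev->name now reads, e.g., "eth0".
 */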
/**
 * dev_alloc - allocate a network device and name
 * @name: name format string
 * @err: error return pointer
 *
 * Passed a format string, eg. "lt%d", it will allocate a network device
 * and space for the name. %NULL is returned if no memory is available.
 * If the allocation succeeds then the name is assigned and the
 * device pointer returned. %NULL is returned if the name allocation
 * failed. The cause of an error is returned as a negative errno code
 * in the variable @err points to.
 *
 * The caller must hold the @dev_base or RTNL locks when doing this in
 * order to avoid duplicate name allocations.
 */

struct net_device *dev_alloc(const char *name, int *err)
{
    struct net_device *dev = kmalloc(sizeof(struct net_device), GFP_KERNEL);
    if (dev == NULL) {
        *err = -ENOBUFS;
        return NULL;
    }
    memset(dev, 0, sizeof(struct net_device));
    *err = dev_alloc_name(dev, name);
    if (*err < 0) {
        kfree(dev);
        return NULL;
    }
    return dev;
}
/**
 * netdev_state_change - device changes state
 * @dev: device to cause notification
 *
 * Called to indicate a device has changed state. This function calls
 * the notifier chains for netdev_chain and sends a NEWLINK message
 * to the routing socket.
 */

void netdev_state_change(struct net_device *dev)
{
    if (dev->flags&IFF_UP) {
        notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
        rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
    }
}
#ifdef CONFIG_KMOD

/**
 * dev_load - load a network module
 * @name: name of interface
 *
 * If a network interface is not present and the process has suitable
 * privileges this function loads the module. If module loading is not
 * available in this kernel then it becomes a nop.
 */

void dev_load(const char *name)
{
    if (!dev_get(name) && capable(CAP_SYS_MODULE))
        request_module(name);
}

#else

extern inline void dev_load(const char *unused){;}

#endif
static int default_rebuild_header(struct sk_buff *skb)
{
    printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
           skb->dev ? skb->dev->name : "NULL!!!");
    kfree_skb(skb);
    return 1;
}
/**
 * dev_open - prepare an interface for use.
 * @dev: device to open
 *
 * Takes a device from down to up state. The device's private open
 * function is invoked and then the multicast lists are loaded. Finally
 * the device is moved into the up state and a %NETDEV_UP message is
 * sent to the netdev notifier chain.
 *
 * Calling this function on an active interface is a nop. On a failure
 * a negative errno code is returned.
 */

int dev_open(struct net_device *dev)
{
    int ret = 0;

    /*
     * Is it already up?
     */
    if (dev->flags&IFF_UP)
        return 0;

    /*
     * Is it even present?
     */
    if (!netif_device_present(dev))
        return -ENODEV;

    /*
     * Call device private open method
     */
    if (try_inc_mod_count(dev->owner)) {
        if (dev->open) {
            ret = dev->open(dev);
            if (ret != 0 && dev->owner)
                __MOD_DEC_USE_COUNT(dev->owner);
        }
    } else {
        ret = -ENODEV;
    }

    /*
     * If it went open OK then:
     */
    if (ret == 0)
    {
        /*
         * Set the flags.
         */
        dev->flags |= IFF_UP;

        set_bit(__LINK_STATE_START, &dev->state);

        /*
         * Initialize multicasting status
         */
        dev_mc_upload(dev);

        /*
         * Wakeup transmit queue engine
         */
        dev_activate(dev);

        /*
         * ... and announce new interface.
         */
        notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
    }
    return ret;
}
/**
 * dev_close - shutdown an interface.
 * @dev: device to shutdown
 *
 * This function moves an active device into down state. A
 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 * chain.
 */

int dev_close(struct net_device *dev)
{
    if (!(dev->flags&IFF_UP))
        return 0;

    /*
     * Tell people we are going down, so that they can
     * prepare for the device's death while it is still operating.
     */
    notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);

    dev_deactivate(dev);

    clear_bit(__LINK_STATE_START, &dev->state);

    /*
     * Call the device specific close. This cannot fail.
     * Only if device is UP
     *
     * We allow it to be called even after a DETACH hot-plug
     * event.
     */
    if (dev->stop)
        dev->stop(dev);

    /*
     * Device is now down.
     */
    dev->flags &= ~IFF_UP;

    /*
     * Tell people we are down
     */
    notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);

    /*
     * Drop the module refcount
     */
    if (dev->owner)
        __MOD_DEC_USE_COUNT(dev->owner);

    return 0;
}
#ifdef CONFIG_HIGHMEM
/* Actually, we should eliminate this check as soon as we know that:
 * 1. An IOMMU is present and allows mapping of all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int
illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
    int i;

    if (dev->features&NETIF_F_HIGHDMA)
        return 0;

    for (i=0; i<skb_shinfo(skb)->nr_frags; i++)
        if (skb_shinfo(skb)->frags[i].page >= highmem_start_page)
            return 1;

    return 0;
}
#else
#define illegal_highdma(dev, skb) (0)
#endif
/*=======================================================================
                           Receiver routines
  =======================================================================*/

struct netif_rx_stats netdev_rx_stat[NR_CPUS];
void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
{
    rx_shadow_entry_t *rx;
    unsigned long *ptep;
    struct pfn_info *old_page, *new_page, *pte_page;
    unsigned int i;
    unsigned short size;
    unsigned char offset, status = RING_STATUS_OK;

    memcpy(skb->mac.ethernet->h_dest, vif->vmac, ETH_ALEN);
    if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
        memcpy(skb->nh.raw + 18, vif->vmac, ETH_ALEN);

    /*
     * Slightly gross: we need the page_lock so that we can do PTE checking.
     * However, we take it slightly early so that it can protect the update
     * of rx_cons. This saves us from grabbing two locks.
     */
    spin_lock(&vif->domain->page_lock);

    if ( (i = vif->rx_cons) == vif->rx_prod )
    {
        spin_unlock(&vif->domain->page_lock);
        perfc_incr(net_rx_capacity_drop);
        return;
    }
    rx = vif->rx_shadow_ring + i;
    vif->rx_cons = RX_RING_INC(i);

    size   = (unsigned short)skb->len;
    offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK);

    /* Release the page-table page. */
    pte_page = frame_table + (rx->pte_ptr >> PAGE_SHIFT);
    put_page_type(pte_page);
    put_page_tot(pte_page);

    old_page = frame_table + rx->buf_pfn;
    new_page = skb->pf;

    ptep = map_domain_mem(rx->pte_ptr);

    if ( (*ptep & _PAGE_PRESENT) )
    {
        /* Bail out if the PTE has been reused under our feet. */
        list_add(&old_page->list, &vif->domain->pg_head);
        old_page->flags = vif->domain->domain;
        unmap_domain_mem(ptep);
        spin_unlock(&vif->domain->page_lock);
        status = RING_STATUS_BAD_PAGE;
        goto out;
    }

    /* Give the new page to the domain, marking it writeable. */
    new_page->tot_count = new_page->type_count = 1;
    new_page->flags = vif->domain->domain | PGT_writeable_page | PG_need_flush;
    list_add(&new_page->list, &vif->domain->pg_head);

    /* Patch the PTE to map the new page as writeable. */
    machine_to_phys_mapping[new_page - frame_table]
        = machine_to_phys_mapping[old_page - frame_table];
    *ptep = (*ptep & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
        (((new_page - frame_table) << PAGE_SHIFT) & PAGE_MASK);

    unmap_domain_mem(ptep);

    spin_unlock(&vif->domain->page_lock);

    /* Our skbuff now points at the guest's old frame. */
    skb->pf = old_page;

    /* Updates must happen before releasing the descriptor. */
    smp_wmb();

    /*
     * NB. The remote flush here should be safe, as we hold no locks. The
     * network driver that called us should also have no nasty locks.
     */
    if ( rx->flush_count == (unsigned short)
         atomic_read(&tlb_flush_count[vif->domain->processor]) )
    {
        perfc_incr(net_rx_tlbflush);
        flush_tlb_cpu(vif->domain->processor);
    }

    perfc_incr(net_rx_delivered);

    /* record this so they can be billed */
    vif->total_packets_received++;
    vif->total_bytes_received += size;

 out:
    make_rx_response(vif, rx->id, size, status, offset);
}
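
/*
 * The PTE patch in deliver_packet() above is the zero-copy "page flip":
 * the low flag bits of the PTE are preserved while the frame number is
 * swapped for that of the freshly filled buffer. A worked sketch with
 * hypothetical values, assuming 4kB pages (PAGE_SHIFT == 12):
 *
 *     old PTE   0x12345067   ->  frame 0x12345, flags 0x067
 *     new frame 0x6789a
 *     new PTE   (0x067 | _PAGE_RW | _PAGE_PRESENT) | (0x6789a << 12)
 *               == 0x6789a067
 *
 * The guest's old frame then becomes the free buffer backing skb->pf.
 */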
/**
 * netif_rx - post buffer to the network code
 * @skb: buffer to post
 *
 * This function receives a packet from a device driver and queues it for
 * the upper (protocol) levels to process. It always succeeds. The buffer
 * may be dropped during processing for congestion control or by the
 * protocol layers.
 *
 * return values:
 * NET_RX_SUCCESS (no congestion)
 * NET_RX_DROP    (packet was dropped)
 */

int netif_rx(struct sk_buff *skb)
{
    int offset, this_cpu = smp_processor_id();
    unsigned long flags;

    local_irq_save(flags);

    ASSERT(skb->skb_type == SKB_ZERO_COPY);

    /*
     * Offset will include 16 bytes padding from dev_alloc_skb, 14 bytes for
     * ethernet header, plus any other alignment padding added by the driver.
     */
    offset = (int)skb->data & ~PAGE_MASK;
    skb->head = (u8 *)map_domain_mem(((skb->pf - frame_table) << PAGE_SHIFT));
    skb->data = skb->nh.raw = skb->head + offset;
    skb->tail = skb->data + skb->len;
    skb_push(skb, ETH_HLEN);
    skb->mac.raw = skb->data;

    netdev_rx_stat[this_cpu].total++;

    if ( skb->dst_vif == NULL )
        skb->dst_vif = net_get_target_vif(skb->data, skb->len, skb->src_vif);

    if ( !VIF_LOCAL(skb->dst_vif) )
        skb->dst_vif = find_vif_by_id(0);

    deliver_packet(skb, skb->dst_vif);
    put_vif(skb->dst_vif);

    unmap_domain_mem(skb->head);
    kfree_skb(skb);
    local_irq_restore(flags);
    return NET_RX_SUCCESS;
}
/*************************************************************
 * NEW TRANSMIT SCHEDULER
 *
 * NB. We ought also to only send a limited number of bytes to the NIC
 * for transmission at any one time (to avoid head-of-line blocking).
 * However, driver rings are small enough that they provide a reasonable
 * limit.
 *
 * eg. 3c905 has 16 descriptors  == 8 packets,   at 100Mbps
 *     e1000 has 256 descriptors == 128 packets, at 1000Mbps
 *     tg3 has 512 descriptors   == 256 packets, at 1000Mbps
 *
 * So, worst case is tg3 with 256 1500-byte packets == 375kB.
 * This would take 3ms, and represents our worst-case HoL blocking cost.
 *
 * We think this is reasonable.
 */
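
/*
 * Worked figures for the tg3 case above: 256 packets x 1500 bytes
 * = 384000 bytes (~375kB). At 1000Mbps (125000000 bytes/s) the ring
 * drains in 384000 / 125000000 s, i.e. roughly 3ms, which is the
 * quoted worst-case head-of-line blocking cost.
 */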
struct list_head net_schedule_list;
spinlock_t net_schedule_list_lock;

static int __on_net_schedule_list(net_vif_t *vif)
{
    return vif->list.next != NULL;
}

static void remove_from_net_schedule_list(net_vif_t *vif)
{
    unsigned long flags;
    spin_lock_irqsave(&net_schedule_list_lock, flags);
    ASSERT(__on_net_schedule_list(vif));
    list_del(&vif->list);
    vif->list.next = NULL;
    put_vif(vif);
    spin_unlock_irqrestore(&net_schedule_list_lock, flags);
}

static void add_to_net_schedule_list_tail(net_vif_t *vif)
{
    unsigned long flags;
    if ( __on_net_schedule_list(vif) ) return;
    spin_lock_irqsave(&net_schedule_list_lock, flags);
    if ( !__on_net_schedule_list(vif) )
    {
        list_add_tail(&vif->list, &net_schedule_list);
        get_vif(vif);
    }
    spin_unlock_irqrestore(&net_schedule_list_lock, flags);
}
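
/*
 * Note the check-lock-recheck pattern in add_to_net_schedule_list_tail():
 * the unlocked test is a cheap fast path, but it can race with a
 * concurrent add, so the test is repeated under net_schedule_list_lock
 * before the vif is linked and its reference count raised.
 */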
/* Destructor function for tx skbs. */
static void tx_skb_release(struct sk_buff *skb)
{
    int i;
    net_vif_t *vif = skb->src_vif;
    unsigned long flags;

    spin_lock_irqsave(&vif->domain->page_lock, flags);
    for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
        put_page_tot(skb_shinfo(skb)->frags[i].page);
    spin_unlock_irqrestore(&vif->domain->page_lock, flags);

    if ( skb->skb_type == SKB_NODATA )
        kmem_cache_free(net_header_cachep, skb->head);

    skb_shinfo(skb)->nr_frags = 0;

    make_tx_response(vif, skb->guest_id, RING_STATUS_OK);

    put_vif(vif);
}
static void net_tx_action(unsigned long unused)
{
    struct net_device *dev = the_dev;
    struct list_head *ent;
    struct sk_buff *skb;
    net_vif_t *vif;
    tx_shadow_entry_t *tx;

    spin_lock(&dev->xmit_lock);
    while ( !netif_queue_stopped(dev) &&
            !list_empty(&net_schedule_list) )
    {
        /* Get a vif from the list with work to do. */
        ent = net_schedule_list.next;
        vif = list_entry(ent, net_vif_t, list);
        get_vif(vif);
        remove_from_net_schedule_list(vif);
        if ( vif->tx_cons == vif->tx_prod )
        {
            put_vif(vif);
            continue;
        }

        if ( (skb = alloc_skb_nodata(GFP_ATOMIC)) == NULL )
        {
            printk("Out of memory in net_tx_action()!\n");
            add_to_net_schedule_list_tail(vif);
            put_vif(vif);
            break;
        }

        /* Pick an entry from the transmit queue. */
        tx = &vif->tx_shadow_ring[vif->tx_cons];
        vif->tx_cons = TX_RING_INC(vif->tx_cons);
        if ( vif->tx_cons != vif->tx_prod )
            add_to_net_schedule_list_tail(vif);

        skb->destructor = tx_skb_release;

        skb->head = skb->data = tx->header;
        skb->end  = skb->tail = skb->head + PKT_PROT_LEN;

        skb->dev      = the_dev;
        skb->src_vif  = vif;
        skb->dst_vif  = NULL;
        skb->mac.raw  = skb->data;
        skb->guest_id = tx->id;

        skb_shinfo(skb)->frags[0].page        = frame_table +
            (tx->payload >> PAGE_SHIFT);
        skb_shinfo(skb)->frags[0].size        = tx->size - PKT_PROT_LEN;
        skb_shinfo(skb)->frags[0].page_offset = tx->payload & ~PAGE_MASK;
        skb_shinfo(skb)->nr_frags = 1;

        skb->data_len = tx->size - PKT_PROT_LEN;
        skb->len      = tx->size;

        /* record the transmission so they can be billed */
        vif->total_packets_sent++;
        vif->total_bytes_sent += tx->size;

        /* Is the NIC crap? */
        if ( !(dev->features & NETIF_F_SG) )
            skb_linearize(skb, GFP_KERNEL);

        /* Transmit should always work, or the queue would be stopped. */
        if ( dev->hard_start_xmit(skb, dev) != 0 )
        {
            printk("Weird failure in hard_start_xmit!\n");
            kfree_skb(skb);
            break;
        }
    }
    spin_unlock(&dev->xmit_lock);
}
DECLARE_TASKLET_DISABLED(net_tx_tasklet, net_tx_action, 0);

static inline void maybe_schedule_tx_action(void)
{
    smp_mb();
    if ( !netif_queue_stopped(the_dev) &&
         !list_empty(&net_schedule_list) )
        tasklet_schedule(&net_tx_tasklet);
}
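
/*
 * The smp_mb() above ensures that any preceding ring and list updates
 * (e.g. the producer-index publication in do_net_update() below) are
 * visible before the queue-stopped and list-empty tests are made, so a
 * needed wakeup is not missed.
 */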
/*
 * We need this ioctl for efficient implementation of the
 * if_indextoname() function required by the IPv6 API. Without
 * it, we would have to search all the interfaces to find a
 * match. --pb
 */

static int dev_ifname(struct ifreq *arg)
{
    struct net_device *dev;
    struct ifreq ifr;

    /*
     * Fetch the caller's info block.
     */
    if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
        return -EFAULT;

    read_lock(&dev_base_lock);
    dev = __dev_get_by_index(ifr.ifr_ifindex);
    if (!dev) {
        read_unlock(&dev_base_lock);
        return -ENODEV;
    }

    strcpy(ifr.ifr_name, dev->name);
    read_unlock(&dev_base_lock);

    if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
        return -EFAULT;
    return 0;
}
/**
 * netdev_set_master - set up master/slave pair
 * @slave: slave device
 * @master: new master device
 *
 * Changes the master device of the slave. Pass %NULL to break the
 * bonding. The caller must hold the RTNL semaphore. On a failure
 * a negative errno code is returned. On success the reference counts
 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
 * function returns zero.
 */

int netdev_set_master(struct net_device *slave, struct net_device *master)
{
    struct net_device *old = slave->master;

    if (master) {
        if (old)
            return -EBUSY;
        dev_hold(master);
    }

    br_write_lock_bh(BR_NETPROTO_LOCK);
    slave->master = master;
    br_write_unlock_bh(BR_NETPROTO_LOCK);

    if (old)
        dev_put(old);

    if (master)
        slave->flags |= IFF_SLAVE;
    else
        slave->flags &= ~IFF_SLAVE;

    rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
    return 0;
}
/**
 * dev_set_promiscuity - update promiscuity count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove promiscuity from a device. While the count in the device
 * remains above zero the interface remains promiscuous. Once it hits zero
 * the device reverts back to normal filtering operation. A negative inc
 * value is used to drop promiscuity on the device.
 */

void dev_set_promiscuity(struct net_device *dev, int inc)
{
    unsigned short old_flags = dev->flags;

    dev->flags |= IFF_PROMISC;
    if ((dev->promiscuity += inc) == 0)
        dev->flags &= ~IFF_PROMISC;
    if (dev->flags^old_flags) {
#ifdef CONFIG_NET_FASTROUTE
        if (dev->flags&IFF_PROMISC) {
            netdev_fastroute_obstacles++;
            dev_clear_fastroute(dev);
        } else
            netdev_fastroute_obstacles--;
#endif
        dev_mc_upload(dev);
        printk(KERN_INFO "device %s %s promiscuous mode\n",
               dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left");
    }
}
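
/*
 * Usage sketch: callers bracket a promiscuous session with matched
 * increments and decrements, e.g.
 *
 *     dev_set_promiscuity(dev, 1);     enter: count 0 -> 1, flag set
 *     ...
 *     dev_set_promiscuity(dev, -1);    leave: count 1 -> 0, flag cleared
 *
 * The interface remains promiscuous for as long as the count is non-zero.
 */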
/**
 * dev_set_allmulti - update allmulti count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove reception of all multicast frames to a device. While the
 * count in the device remains above zero the interface remains listening
 * to all multicast frames. Once it hits zero the device reverts back to
 * normal filtering operation. A negative @inc value is used to drop the
 * counter when releasing a resource needing all multicasts.
 */

void dev_set_allmulti(struct net_device *dev, int inc)
{
    unsigned short old_flags = dev->flags;

    dev->flags |= IFF_ALLMULTI;
    if ((dev->allmulti += inc) == 0)
        dev->flags &= ~IFF_ALLMULTI;
    if (dev->flags^old_flags)
        dev_mc_upload(dev);
}
int dev_change_flags(struct net_device *dev, unsigned flags)
{
    int ret;
    int old_flags = dev->flags;

    /*
     * Set the flags on our device.
     */
    dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP|IFF_DYNAMIC|
                           IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) |
        (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI));

    /*
     * Load in the correct multicast list now the flags have changed.
     */
    dev_mc_upload(dev);

    /*
     * Have we downed the interface? We handle IFF_UP ourselves
     * according to user attempts to set it, rather than blindly
     * setting it.
     */
    ret = 0;
    if ((old_flags^flags)&IFF_UP) /* Bit is different? */
    {
        ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);

        if (ret == 0)
            dev_mc_upload(dev);
    }

    if (dev->flags&IFF_UP &&
        ((old_flags^dev->flags)&
         ~(IFF_UP|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE)))
        notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);

    if ((flags^dev->gflags)&IFF_PROMISC) {
        int inc = (flags&IFF_PROMISC) ? +1 : -1;
        dev->gflags ^= IFF_PROMISC;
        dev_set_promiscuity(dev, inc);
    }

    /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
       is important. Some (broken) drivers set IFF_PROMISC when
       IFF_ALLMULTI is requested, without asking us and without reporting.
     */
    if ((flags^dev->gflags)&IFF_ALLMULTI) {
        int inc = (flags&IFF_ALLMULTI) ? +1 : -1;
        dev->gflags ^= IFF_ALLMULTI;
        dev_set_allmulti(dev, inc);
    }

    if (old_flags^dev->flags)
        rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags^dev->flags);

    return ret;
}
/*
 * Perform the SIOCxIFxxx calls.
 */

static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
{
    struct net_device *dev;
    int err;

    if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL)
        return -ENODEV;

    switch (cmd)
    {
    case SIOCGIFFLAGS: /* Get interface flags */
        ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI|IFF_RUNNING))
            |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI));
        if (netif_running(dev) && netif_carrier_ok(dev))
            ifr->ifr_flags |= IFF_RUNNING;
        return 0;

    case SIOCSIFFLAGS: /* Set interface flags */
        return dev_change_flags(dev, ifr->ifr_flags);

    case SIOCGIFMETRIC: /* Get the metric on the interface */
        ifr->ifr_metric = 0;
        return 0;

    case SIOCSIFMETRIC: /* Set the metric on the interface */
        return -EOPNOTSUPP;

    case SIOCGIFMTU: /* Get the MTU of a device */
        ifr->ifr_mtu = dev->mtu;
        return 0;

    case SIOCSIFMTU: /* Set the MTU of a device */
        if (ifr->ifr_mtu == dev->mtu)
            return 0;

        /*
         * MTU must be positive.
         */
        if (ifr->ifr_mtu < 0)
            return -EINVAL;

        if (!netif_device_present(dev))
            return -ENODEV;

        if (dev->change_mtu)
            err = dev->change_mtu(dev, ifr->ifr_mtu);
        else {
            dev->mtu = ifr->ifr_mtu;
            err = 0;
        }
        if (!err && dev->flags&IFF_UP)
            notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev);
        return err;

    case SIOCGIFHWADDR:
        memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, MAX_ADDR_LEN);
        ifr->ifr_hwaddr.sa_family = dev->type;
        return 0;

    case SIOCSIFHWADDR:
        if (dev->set_mac_address == NULL)
            return -EOPNOTSUPP;
        if (ifr->ifr_hwaddr.sa_family != dev->type)
            return -EINVAL;
        if (!netif_device_present(dev))
            return -ENODEV;
        err = dev->set_mac_address(dev, &ifr->ifr_hwaddr);
        if (!err)
            notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
        return err;

    case SIOCSIFHWBROADCAST:
        if (ifr->ifr_hwaddr.sa_family != dev->type)
            return -EINVAL;
        memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN);
        notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
        return 0;

    case SIOCGIFMAP:
        ifr->ifr_map.mem_start = dev->mem_start;
        ifr->ifr_map.mem_end   = dev->mem_end;
        ifr->ifr_map.base_addr = dev->base_addr;
        ifr->ifr_map.irq       = dev->irq;
        ifr->ifr_map.dma       = dev->dma;
        ifr->ifr_map.port      = dev->if_port;
        return 0;

    case SIOCSIFMAP:
        if (dev->set_config) {
            if (!netif_device_present(dev))
                return -ENODEV;
            return dev->set_config(dev, &ifr->ifr_map);
        }
        return -EOPNOTSUPP;

    case SIOCADDMULTI:
        if (dev->set_multicast_list == NULL ||
            ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
            return -EINVAL;
        if (!netif_device_present(dev))
            return -ENODEV;
        dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, dev->addr_len, 1);
        return 0;

    case SIOCDELMULTI:
        if (dev->set_multicast_list == NULL ||
            ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
            return -EINVAL;
        if (!netif_device_present(dev))
            return -ENODEV;
        dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, dev->addr_len, 1);
        return 0;

    case SIOCGIFINDEX:
        ifr->ifr_ifindex = dev->ifindex;
        return 0;

    case SIOCSIFNAME:
        if (dev->flags&IFF_UP)
            return -EBUSY;
        if (__dev_get_by_name(ifr->ifr_newname))
            return -EEXIST;
        memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ);
        dev->name[IFNAMSIZ-1] = 0;
        notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
        return 0;

#ifdef WIRELESS_EXT
    case SIOCGIWSTATS:
        return dev_iwstats(dev, ifr);
#endif /* WIRELESS_EXT */

        /*
         * Unknown or private ioctl
         */
    default:
        if ((cmd >= SIOCDEVPRIVATE &&
             cmd <= SIOCDEVPRIVATE + 15) ||
            cmd == SIOCBONDENSLAVE ||
            cmd == SIOCBONDRELEASE ||
            cmd == SIOCBONDSETHWADDR ||
            cmd == SIOCBONDSLAVEINFOQUERY ||
            cmd == SIOCBONDINFOQUERY ||
            cmd == SIOCBONDCHANGEACTIVE ||
            cmd == SIOCETHTOOL ||
            cmd == SIOCGMIIPHY ||
            cmd == SIOCGMIIREG ||
            cmd == SIOCSMIIREG) {
            if (dev->do_ioctl) {
                if (!netif_device_present(dev))
                    return -ENODEV;
                return dev->do_ioctl(dev, ifr, cmd);
            }
            return -EOPNOTSUPP;
        }

#ifdef WIRELESS_EXT
        if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
            if (dev->do_ioctl) {
                if (!netif_device_present(dev))
                    return -ENODEV;
                return dev->do_ioctl(dev, ifr, cmd);
            }
            return -EOPNOTSUPP;
        }
#endif /* WIRELESS_EXT */
    }

    return -EINVAL;
}
/*
 * This function handles all "interface"-type I/O control requests. The actual
 * 'doing' part of this is dev_ifsioc above.
 */

/**
 * dev_ioctl - network device ioctl
 * @cmd: command to issue
 * @arg: pointer to a struct ifreq in user space
 *
 * Issue ioctl functions to devices. This is normally called by the
 * user space syscall interfaces but can sometimes be useful for
 * other purposes. The return value is the return from the syscall if
 * positive or a negative errno code on error.
 */
int dev_ioctl(unsigned int cmd, void *arg)
{
    struct ifreq ifr;
    int ret;
    char *colon;

    /* One special case: SIOCGIFCONF takes an ifconf argument
       and requires a shared lock, because it sleeps writing
       to user space.
     */
    if (cmd == SIOCGIFCONF) {
        return -ENOSYS;
    }
    if (cmd == SIOCGIFNAME) {
        return dev_ifname((struct ifreq *)arg);
    }

    if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
        return -EFAULT;

    ifr.ifr_name[IFNAMSIZ-1] = 0;

    colon = strchr(ifr.ifr_name, ':');
    if (colon)
        *colon = 0;

    /*
     * See which interface the caller is talking about.
     */
    switch (cmd)
    {
        /*
         * These ioctl calls:
         * - can be done by all.
         * - atomic and do not require locking.
         * - return a value
         */
    case SIOCGIFFLAGS:
    case SIOCGIFMETRIC:
    case SIOCGIFMTU:
    case SIOCGIFHWADDR:
    case SIOCGIFSLAVE:
    case SIOCGIFMAP:
    case SIOCGIFINDEX:
        dev_load(ifr.ifr_name);
        read_lock(&dev_base_lock);
        ret = dev_ifsioc(&ifr, cmd);
        read_unlock(&dev_base_lock);
        if (!ret) {
            if (colon)
                *colon = ':';
            if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
                return -EFAULT;
        }
        return ret;

        /*
         * These ioctl calls:
         * - require superuser power.
         * - require strict serialization.
         * - return a value
         */
    case SIOCETHTOOL:
    case SIOCGMIIPHY:
    case SIOCGMIIREG:
        if (!capable(CAP_NET_ADMIN))
            return -EPERM;
        dev_load(ifr.ifr_name);
        dev_probe_lock();
        rtnl_lock();
        ret = dev_ifsioc(&ifr, cmd);
        rtnl_unlock();
        dev_probe_unlock();
        if (!ret) {
            if (colon)
                *colon = ':';
            if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
                return -EFAULT;
        }
        return ret;

        /*
         * These ioctl calls:
         * - require superuser power.
         * - require strict serialization.
         * - do not return a value
         */
    case SIOCSIFFLAGS:
    case SIOCSIFMETRIC:
    case SIOCSIFMTU:
    case SIOCSIFMAP:
    case SIOCSIFHWADDR:
    case SIOCSIFSLAVE:
    case SIOCADDMULTI:
    case SIOCDELMULTI:
    case SIOCSIFHWBROADCAST:
    case SIOCSIFNAME:
    case SIOCSMIIREG:
    case SIOCBONDENSLAVE:
    case SIOCBONDRELEASE:
    case SIOCBONDSETHWADDR:
    case SIOCBONDSLAVEINFOQUERY:
    case SIOCBONDINFOQUERY:
    case SIOCBONDCHANGEACTIVE:
        if (!capable(CAP_NET_ADMIN))
            return -EPERM;
        dev_load(ifr.ifr_name);
        dev_probe_lock();
        rtnl_lock();
        ret = dev_ifsioc(&ifr, cmd);
        rtnl_unlock();
        dev_probe_unlock();
        return ret;

    case SIOCGIFMEM:
        /* Get the per device memory space. We can add this but currently
           do not support it */
    case SIOCSIFMEM:
        /* Set the per device memory buffer space. */
    case SIOCSIFLINK:
        return -EINVAL;

        /*
         * Unknown or private ioctl.
         */
    default:
        if (cmd >= SIOCDEVPRIVATE &&
            cmd <= SIOCDEVPRIVATE + 15) {
            dev_load(ifr.ifr_name);
            dev_probe_lock();
            rtnl_lock();
            ret = dev_ifsioc(&ifr, cmd);
            rtnl_unlock();
            dev_probe_unlock();
            if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))
                return -EFAULT;
            return ret;
        }
#ifdef WIRELESS_EXT
        /* Take care of Wireless Extensions */
        if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
            /* If command is `set a parameter', or
             * `get the encoding parameters', check if
             * the user has the right to do it */
            if (IW_IS_SET(cmd) || (cmd == SIOCGIWENCODE)) {
                if (!capable(CAP_NET_ADMIN))
                    return -EPERM;
            }
            dev_load(ifr.ifr_name);
            rtnl_lock();
            ret = dev_ifsioc(&ifr, cmd);
            rtnl_unlock();
            if (!ret && IW_IS_GET(cmd) &&
                copy_to_user(arg, &ifr,
                             sizeof(struct ifreq)))
                return -EFAULT;
            return ret;
        }
#endif /* WIRELESS_EXT */
        return -EINVAL;
    }
}
/**
 * dev_new_index - allocate an ifindex
 *
 * Returns a suitable unique value for a new device interface
 * number. The caller must hold the rtnl semaphore or the
 * dev_base_lock to be sure it remains unique.
 */

int dev_new_index(void)
{
    static int ifindex;
    for (;;) {
        if (++ifindex <= 0)
            ifindex = 1;
        if (__dev_get_by_index(ifindex) == NULL)
            return ifindex;
    }
}

static int dev_boot_phase = 1;
/**
 * register_netdevice - register a network device
 * @dev: device to register
 *
 * Take a completed network device structure and add it to the kernel
 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 * chain. 0 is returned on success. A negative errno code is returned
 * on a failure to set up the device, or if the name is a duplicate.
 *
 * Callers must hold the rtnl semaphore. See the comment at the
 * end of Space.c for details about the locking. You may want
 * register_netdev() instead of this.
 *
 * BUGS:
 * The locking appears insufficient to guarantee two parallel registers
 * will not get the same name.
 */

int net_dev_init(void);

int register_netdevice(struct net_device *dev)
{
    struct net_device *d, **dp;
#ifdef CONFIG_NET_DIVERT
    int ret;
#endif

    spin_lock_init(&dev->queue_lock);
    spin_lock_init(&dev->xmit_lock);
    dev->xmit_lock_owner = -1;
#ifdef CONFIG_NET_FASTROUTE
    dev->fastpath_lock = RW_LOCK_UNLOCKED;
#endif

    if (dev_boot_phase)
        net_dev_init();

#ifdef CONFIG_NET_DIVERT
    ret = alloc_divert_blk(dev);
    if (ret)
        return ret;
#endif /* CONFIG_NET_DIVERT */

    dev->iflink = -1;

    /* Init, if this function is available */
    if (dev->init && dev->init(dev) != 0) {
#ifdef CONFIG_NET_DIVERT
        free_divert_blk(dev);
#endif
        return -EIO;
    }

    dev->ifindex = dev_new_index();
    if (dev->iflink == -1)
        dev->iflink = dev->ifindex;

    /* Check for existence, and append to tail of chain */
    for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
        if (d == dev || strcmp(d->name, dev->name) == 0) {
#ifdef CONFIG_NET_DIVERT
            free_divert_blk(dev);
#endif
            return -EEXIST;
        }
    }

    /*
     * Nil rebuild_header routine: it should never be called and
     * is used just as a bug trap.
     */
    if (dev->rebuild_header == NULL)
        dev->rebuild_header = default_rebuild_header;

    /*
     * Default initial state at registry is that the
     * device is present.
     */
    set_bit(__LINK_STATE_PRESENT, &dev->state);

    dev->next = NULL;
    dev_init_scheduler(dev);
    write_lock_bh(&dev_base_lock);
    *dp = dev;
    dev_hold(dev);
    dev->deadbeaf = 0;
    write_unlock_bh(&dev_base_lock);

    /* Notify protocols that a new device appeared. */
    notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);

    return 0;
}
/**
 * netdev_finish_unregister - complete unregistration
 * @dev: device
 *
 * Destroy and free a dead device. A value of zero is returned on
 * success.
 */

int netdev_finish_unregister(struct net_device *dev)
{
    BUG_TRAP(dev->ip_ptr == NULL);
    BUG_TRAP(dev->ip6_ptr == NULL);
    BUG_TRAP(dev->dn_ptr == NULL);

    if (!dev->deadbeaf) {
        printk(KERN_ERR "Freeing alive device %p, %s\n",
               dev, dev->name);
        return 0;
    }
#ifdef NET_REFCNT_DEBUG
    printk(KERN_DEBUG "netdev_finish_unregister: %s%s.\n", dev->name,
           (dev->features & NETIF_F_DYNALLOC) ? "" : ", old style");
#endif
    if (dev->destructor)
        dev->destructor(dev);
    if (dev->features & NETIF_F_DYNALLOC)
        kfree(dev);
    return 0;
}
/**
 * unregister_netdevice - remove device from the kernel
 * @dev: device
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables. On success 0 is returned, on a failure
 * a negative errno code is returned.
 *
 * Callers must hold the rtnl semaphore. See the comment at the
 * end of Space.c for details about the locking. You may want
 * unregister_netdev() instead of this.
 */

int unregister_netdevice(struct net_device *dev)
{
    unsigned long now, warning_time;
    struct net_device *d, **dp;

    /* If device is running, close it first. */
    if (dev->flags & IFF_UP)
        dev_close(dev);

    BUG_TRAP(dev->deadbeaf == 0);
    dev->deadbeaf = 1;

    /* And unlink it from device chain. */
    for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
        if (d == dev) {
            write_lock_bh(&dev_base_lock);
            *dp = d->next;
            write_unlock_bh(&dev_base_lock);
            break;
        }
    }
    if (d == NULL) {
        printk(KERN_DEBUG "unregister_netdevice: device %s/%p"
               " not registered\n", dev->name, dev);
        return -ENODEV;
    }

    /* Synchronize to net_rx_action. */
    br_write_lock_bh(BR_NETPROTO_LOCK);
    br_write_unlock_bh(BR_NETPROTO_LOCK);

    if (dev_boot_phase == 0) {

        /* Shutdown queueing discipline. */
        dev_shutdown(dev);

        /* Notify protocols that we are about to destroy
           this device. They should clean all the things.
         */
        notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);

        /*
         * Flush the multicast chain
         */
        dev_mc_discard(dev);
    }

    if (dev->uninit)
        dev->uninit(dev);

    /* Notifier chain MUST detach us from master device. */
    BUG_TRAP(dev->master == NULL);

#ifdef CONFIG_NET_DIVERT
    free_divert_blk(dev);
#endif

    if (dev->features & NETIF_F_DYNALLOC) {
#ifdef NET_REFCNT_DEBUG
        if (atomic_read(&dev->refcnt) != 1)
            printk(KERN_DEBUG "unregister_netdevice: holding %s refcnt=%d\n",
                   dev->name, atomic_read(&dev->refcnt)-1);
#endif
        dev_put(dev);
        return 0;
    }

    /* Last reference is our one */
    if (atomic_read(&dev->refcnt) == 1) {
        dev_put(dev);
        return 0;
    }

#ifdef NET_REFCNT_DEBUG
    printk("unregister_netdevice: waiting %s refcnt=%d\n",
           dev->name, atomic_read(&dev->refcnt));
#endif

    /* EXPLANATION. If dev->refcnt is not now 1 (our own reference)
       it means that someone in the kernel still has a reference
       to this device and we cannot release it.

       "New style" devices have destructors, hence we can return from this
       function and the destructor will do all the work later. As of kernel
       2.4.0 there are very few "new style" devices.

       "Old style" devices expect that the device is free of any references
       upon exit from this function.
       We cannot return from this function until all such references have
       fallen away. This is because the caller of this function will probably
       immediately kfree(*dev) and then be unloaded via sys_delete_module.

       So, we linger until all references fall away. The duration of the
       linger is basically unbounded! It is driven by, for example, the
       current setting of sysctl_ipfrag_time.

       After 1 second, we start to rebroadcast unregister notifications
       in the hope that careless clients will release the device.
     */

    now = warning_time = jiffies;
    while (atomic_read(&dev->refcnt) != 1) {
        if ((jiffies - now) > 1*HZ) {
            /* Rebroadcast unregister notification */
            notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
        }
        mdelay(250);
        if ((jiffies - warning_time) > 10*HZ) {
            printk(KERN_EMERG "unregister_netdevice: waiting for %s to "
                   "become free. Usage count = %d\n",
                   dev->name, atomic_read(&dev->refcnt));
            warning_time = jiffies;
        }
    }
    dev_put(dev);
    return 0;
}
/*
 * Initialize the DEV module. At boot time this walks the device list and
 * unhooks any devices that fail to initialise (normally hardware not
 * present) and leaves us with a valid list of present and active devices.
 */

extern void net_device_init(void);
extern void ip_auto_config(void);
#ifdef CONFIG_NET_DIVERT
extern void dv_init(void);
#endif /* CONFIG_NET_DIVERT */

/*
 * Callers must hold the rtnl semaphore. See the comment at the
 * end of Space.c for details about the locking.
 */
int __init net_dev_init(void)
{
    struct net_device *dev, **dp;

    if ( !dev_boot_phase )
        return 0;

    skb_init();

    net_header_cachep = kmem_cache_create(
        "net_header_cache",
        (PKT_PROT_LEN + sizeof(void *) - 1) & ~(sizeof(void *) - 1),
        0, SLAB_HWCACHE_ALIGN, NULL, NULL);

    spin_lock_init(&net_schedule_list_lock);
    INIT_LIST_HEAD(&net_schedule_list);

    /*
     * Add the devices.
     * If the call to dev->init fails, the dev is removed
     * from the chain, disconnecting the device until the
     * next reboot.
     *
     * NB. At boot phase networking is dead. No locking is required.
     * But we still preserve dev_base_lock for sanity.
     */
    dp = &dev_base;
    while ((dev = *dp) != NULL) {
        spin_lock_init(&dev->queue_lock);
        spin_lock_init(&dev->xmit_lock);

        dev->xmit_lock_owner = -1;
        dev->iflink = -1;
        dev_hold(dev);

        /*
         * Allocate name. If the init() fails
         * the name will be reissued correctly.
         */
        if (strchr(dev->name, '%'))
            dev_alloc_name(dev, dev->name);

        if (dev->init && dev->init(dev)) {
            /*
             * It failed to come up. It will be unhooked later.
             * dev_alloc_name can now advance to the next suitable
             * name that is checked next.
             */
            dev->deadbeaf = 1;
            dp = &dev->next;
        } else {
            dp = &dev->next;
            dev->ifindex = dev_new_index();
            if (dev->iflink == -1)
                dev->iflink = dev->ifindex;
            if (dev->rebuild_header == NULL)
                dev->rebuild_header = default_rebuild_header;
            dev_init_scheduler(dev);
            set_bit(__LINK_STATE_PRESENT, &dev->state);
        }
    }

    /*
     * Unhook devices that failed to come up
     */
    dp = &dev_base;
    while ((dev = *dp) != NULL) {
        if (dev->deadbeaf) {
            write_lock_bh(&dev_base_lock);
            *dp = dev->next;
            write_unlock_bh(&dev_base_lock);
            dev_put(dev);
        } else {
            dp = &dev->next;
        }
    }

    dev_boot_phase = 0;

    dev_mcast_init();

    /*
     * Initialise network devices
     */
    net_device_init();

    return 0;
}
inline int init_tx_header(u8 *data, unsigned int len, struct net_device *dev)
{
    memcpy(data + ETH_ALEN, dev->dev_addr, ETH_ALEN);

    switch ( ntohs(*(unsigned short *)(data + 12)) )
    {
    case ETH_P_ARP:
        if ( len < 42 ) break;
        memcpy(data + 22, dev->dev_addr, ETH_ALEN);
        return ETH_P_ARP;
    case ETH_P_IP:
        return ETH_P_IP;
    }
    return 0;
}
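
/*
 * Offsets used in init_tx_header() above, for reference (Ethernet + ARP):
 *
 *     data + 0    destination MAC (6 bytes)
 *     data + 6    source MAC (6 bytes)         <- rewritten to dev->dev_addr
 *     data + 12   ethertype (2 bytes)
 *     data + 22   ARP sender hardware address  <- also rewritten for ARP
 *
 * A minimal ARP packet is 42 bytes (14-byte Ethernet header + 28-byte ARP
 * payload), hence the len < 42 check.
 */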
/*
 * do_net_update:
 *
 * Called from guest OS to notify updates to its transmit and/or receive
 * descriptor rings.
 */
long do_net_update(void)
{
    net_ring_t *shared_rings;
    net_vif_t *vif;
    net_idx_t *shared_idxs;
    unsigned int i, j, idx;
    struct sk_buff *skb, *interdom_skb = NULL;
    tx_req_entry_t tx;
    rx_req_entry_t rx;
    unsigned long pte_pfn, buf_pfn;
    struct pfn_info *pte_page, *buf_page;
    unsigned long *ptep;
    net_vif_t *target;
    u8 *g_data;
    unsigned short protocol;

    for ( idx = 0; idx < MAX_DOMAIN_VIFS; idx++ )
    {
        if ( (vif = current->net_vif_list[idx]) == NULL )
            break;

        shared_idxs  = vif->shared_idxs;
        shared_rings = vif->shared_rings;

        /*
         * PHASE 1 -- TRANSMIT RING
         */

        /*
         * Collect up new transmit buffers. We collect up to the guest OS's
         * new producer index, but take care not to catch up with our own
         * consumer index.
         */
        j = vif->tx_prod;
        for ( i = vif->tx_req_cons;
              (i != shared_idxs->tx_req_prod) &&
                  (((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1);
              i = TX_RING_INC(i) )
        {
            tx = shared_rings->tx_ring[i].req;
            target = VIF_DROP;

            if ( (tx.size < PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) )
            {
                DPRINTK("Bad packet size: %d\n", tx.size);
                make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
                continue;
            }

            /* No crossing a page boundary as the payload mustn't fragment. */
            if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE )
            {
                DPRINTK("tx.addr: %lx, size: %u, end: %lu\n",
                        tx.addr, tx.size, (tx.addr & ~PAGE_MASK) + tx.size);
                make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
                continue;
            }

            buf_pfn  = tx.addr >> PAGE_SHIFT;
            buf_page = frame_table + buf_pfn;
            spin_lock_irq(&current->page_lock);
            if ( (buf_pfn >= max_page) ||
                 ((buf_page->flags & PG_domain_mask) != current->domain) )
            {
                DPRINTK("Bad page frame\n");
                spin_unlock_irq(&current->page_lock);
                make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
                continue;
            }

            g_data = map_domain_mem(tx.addr);

            protocol = __constant_htons(
                init_tx_header(g_data, tx.size, the_dev));
            if ( protocol == 0 )
            {
                make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
                goto tx_unmap_and_continue;
            }

            target = net_get_target_vif(g_data, tx.size, vif);

            if ( VIF_LOCAL(target) )
            {
                /* Local delivery */
                if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL )
                {
                    make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
                    put_vif(target);
                    goto tx_unmap_and_continue;
                }

                skb->src_vif  = vif;
                skb->dst_vif  = target;
                skb->protocol = protocol;

                /*
                 * We don't need a well-formed skb as netif_rx will fill these
                 * fields in as necessary. All we actually need is the right
                 * page offset in skb->data, and the right length in skb->len.
                 * Note that the correct address/length *excludes* link header.
                 */
                skb->head = (u8 *)map_domain_mem(
                    ((skb->pf - frame_table) << PAGE_SHIFT));
                skb->data = skb->head + 18;
                memcpy(skb->data, g_data, tx.size);
                skb->data += ETH_HLEN;
                skb->len = tx.size - ETH_HLEN;
                unmap_domain_mem(skb->head);

                /*
                 * We must defer netif_rx until we have released the current
                 * domain's page_lock, or we may deadlock on SMP.
                 */
                interdom_skb = skb;

                make_tx_response(vif, tx.id, RING_STATUS_OK);
            }
            else if ( (target == VIF_PHYS) || IS_PRIV(current) )
            {
                vif->tx_shadow_ring[j].id     = tx.id;
                vif->tx_shadow_ring[j].size   = tx.size;
                vif->tx_shadow_ring[j].header =
                    kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
                if ( vif->tx_shadow_ring[j].header == NULL )
                {
                    make_tx_response(vif, tx.id, RING_STATUS_OK);
                    goto tx_unmap_and_continue;
                }

                memcpy(vif->tx_shadow_ring[j].header, g_data, PKT_PROT_LEN);
                vif->tx_shadow_ring[j].payload = tx.addr + PKT_PROT_LEN;
                get_page_tot(buf_page);
                j = TX_RING_INC(j);
            }
            else
            {
                make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
            }

        tx_unmap_and_continue:
            unmap_domain_mem(g_data);
            spin_unlock_irq(&current->page_lock);
            if ( interdom_skb != NULL )
            {
                (void)netif_rx(interdom_skb);
                interdom_skb = NULL;
            }
        }

        vif->tx_req_cons = i;

        if ( vif->tx_prod != j )
        {
            smp_mb(); /* Let other CPUs see new descriptors first. */
            vif->tx_prod = j;
            add_to_net_schedule_list_tail(vif);
            maybe_schedule_tx_action();
        }

        /*
         * PHASE 2 -- RECEIVE RING
         */

        /*
         * Collect up new receive buffers. We collect up to the guest OS's
         * new producer index, but take care not to catch up with our own
         * consumer index.
         */
        j = vif->rx_prod;
        for ( i = vif->rx_req_cons;
              (i != shared_idxs->rx_req_prod) &&
                  (((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1);
              i = RX_RING_INC(i) )
        {
            rx = shared_rings->rx_ring[i].req;

            pte_pfn  = rx.addr >> PAGE_SHIFT;
            pte_page = frame_table + pte_pfn;

            spin_lock_irq(&current->page_lock);
            if ( (pte_pfn >= max_page) ||
                 ((pte_page->flags & (PG_type_mask | PG_domain_mask)) !=
                  (PGT_l1_page_table | current->domain)) )
            {
                DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n",
                        current->domain, pte_pfn, max_page, pte_page->flags);
                spin_unlock_irq(&current->page_lock);
                make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
                continue;
            }

            ptep = map_domain_mem(rx.addr);

            if ( !(*ptep & _PAGE_PRESENT) )
            {
                DPRINTK("Invalid PTE passed down (not present)\n");
                make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
                goto rx_unmap_and_continue;
            }

            buf_pfn  = *ptep >> PAGE_SHIFT;
            buf_page = frame_table + buf_pfn;

            if ( ((buf_page->flags & (PG_type_mask | PG_domain_mask)) !=
                  (PGT_writeable_page | current->domain)) ||
                 (buf_page->tot_count != 1) )
            {
                DPRINTK("Need a mapped-once writeable page (%ld/%ld/%08lx)\n",
                        buf_page->type_count, buf_page->tot_count,
                        buf_page->flags);
                make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
                goto rx_unmap_and_continue;
            }

            /*
             * The pte they passed was good, so take it away from them. We
             * also lock down the page-table page, so it doesn't go away.
             */
            get_page_type(pte_page);
            get_page_tot(pte_page);
            *ptep &= ~_PAGE_PRESENT;
            buf_page->flags = buf_page->type_count = buf_page->tot_count = 0;
            list_del(&buf_page->list);

            vif->rx_shadow_ring[j].id          = rx.id;
            vif->rx_shadow_ring[j].pte_ptr     = rx.addr;
            vif->rx_shadow_ring[j].buf_pfn     = buf_pfn;
            vif->rx_shadow_ring[j].flush_count = (unsigned short)
                atomic_read(&tlb_flush_count[smp_processor_id()]);
            j = RX_RING_INC(j);

        rx_unmap_and_continue:
            unmap_domain_mem(ptep);
            spin_unlock_irq(&current->page_lock);
        }

        vif->rx_req_cons = i;

        if ( vif->rx_prod != j )
        {
            smp_mb(); /* Let other CPUs see new descriptors first. */
            vif->rx_prod = j;
        }
    }

    return 0;
}
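
/*
 * For context, a sketch of the guest side of the protocol implemented by
 * do_net_update() above (illustrative only; the real code lives in the
 * guest's frontend driver, and the hypercall wrapper name here is an
 * assumption): the guest fills in a request, publishes its producer
 * index, and then traps into the hypervisor, never letting the producer
 * lap the hypervisor's response producer.
 *
 *     i = shared_idxs->tx_req_prod;
 *     shared_rings->tx_ring[i].req.id   = cookie;
 *     shared_rings->tx_ring[i].req.addr = buf_machine_addr;
 *     shared_rings->tx_ring[i].req.size = len;
 *     shared_idxs->tx_req_prod = TX_RING_INC(i);
 *     HYPERVISOR_net_update();
 */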
static void make_tx_response(net_vif_t     *vif,
                             unsigned short id,
                             unsigned char  st)
{
    unsigned long flags;
    unsigned int  pos;
    tx_resp_entry_t *resp;

    /* Place on the response ring for the relevant domain. */
    spin_lock_irqsave(&vif->tx_lock, flags);
    pos  = vif->tx_resp_prod;
    resp = &vif->shared_rings->tx_ring[pos].resp;
    resp->id     = id;
    resp->status = st;
    pos = TX_RING_INC(pos);
    vif->tx_resp_prod = vif->shared_idxs->tx_resp_prod = pos;
    if ( pos == vif->shared_idxs->tx_event )
    {
        unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
        guest_event_notify(cpu_mask);
    }
    spin_unlock_irqrestore(&vif->tx_lock, flags);
}
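
/*
 * Note the event-throttling scheme above: an _EVENT_NET notification is
 * raised only when the response producer crosses the guest's tx_event
 * mark, so a guest that keeps tx_event several slots ahead of the
 * producer receives one event per batch of responses rather than one
 * per packet. The receive path below uses the same trick with rx_event.
 */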
static void make_rx_response(net_vif_t     *vif,
                             unsigned short id,
                             unsigned short size,
                             unsigned char  st,
                             unsigned char  off)
{
    unsigned long flags;
    unsigned int  pos;
    rx_resp_entry_t *resp;

    /* Place on the response ring for the relevant domain. */
    spin_lock_irqsave(&vif->rx_lock, flags);
    pos  = vif->rx_resp_prod;
    resp = &vif->shared_rings->rx_ring[pos].resp;
    resp->id     = id;
    resp->size   = size;
    resp->status = st;
    resp->offset = off;
    pos = RX_RING_INC(pos);
    vif->rx_resp_prod = vif->shared_idxs->rx_resp_prod = pos;
    if ( pos == vif->shared_idxs->rx_event )
    {
        unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
        guest_event_notify(cpu_mask);
    }
    spin_unlock_irqrestore(&vif->rx_lock, flags);
}
int setup_network_devices(void)
{
    int ret;
    extern char opt_ifname[];
    struct net_device *dev = dev_get_by_name(opt_ifname);

    if ( dev == NULL )
    {
        printk("Could not find device %s\n", opt_ifname);
        return 0;
    }

    ret = dev_open(dev);
    if ( ret != 0 )
    {
        printk("Error opening device %s for use (%d)\n", opt_ifname, ret);
        return 0;
    }
    printk("Device %s opened and ready for use.\n", opt_ifname);
    the_dev = dev;

    tasklet_enable(&net_tx_tasklet);

    return 1;
}