ia64/xen-unstable

view xen/net/dev.c @ 779:30c521db4c71

bitkeeper revision 1.475 (3f796d3bblNbmAFIYOUxNwNgsLILNg)

network.c, dev.c, keyhandler.c:
Stronger SMP memory barriers in the network code.
author kaf24@scramble.cl.cam.ac.uk
date Tue Sep 30 11:47:07 2003 +0000 (2003-09-30)
parents c54c6595f6ae
children 3f26e93f5e01
line source
1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
10 #include <asm/uaccess.h>
11 #include <asm/system.h>
12 #include <asm/bitops.h>
13 #include <linux/config.h>
14 #include <linux/delay.h>
15 #include <linux/lib.h>
16 #include <linux/types.h>
17 #include <linux/sched.h>
18 #include <linux/mm.h>
19 #include <linux/socket.h>
20 #include <linux/sockios.h>
21 #include <linux/errno.h>
22 #include <linux/interrupt.h>
23 #include <linux/if_ether.h>
24 #include <linux/netdevice.h>
25 #include <linux/etherdevice.h>
26 #include <linux/skbuff.h>
27 #include <linux/brlock.h>
28 #include <linux/init.h>
29 #include <linux/module.h>
31 #include <linux/event.h>
32 #include <asm/domain_page.h>
33 #include <asm/pgalloc.h>
35 #include <xeno/perfc.h>
37 #define BUG_TRAP ASSERT
38 #define notifier_call_chain(_a,_b,_c) ((void)0)
39 #define rtmsg_ifinfo(_a,_b,_c) ((void)0)
40 #define rtnl_lock() ((void)0)
41 #define rtnl_unlock() ((void)0)
43 #if 0
44 #define DPRINTK(_f, _a...) printk(_f , ## _a)
45 #else
46 #define DPRINTK(_f, _a...) ((void)0)
47 #endif
49 #define TX_RING_INC(_i) (((_i)+1) & (TX_RING_SIZE-1))
50 #define RX_RING_INC(_i) (((_i)+1) & (RX_RING_SIZE-1))
51 #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
52 #define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
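/* NB. The ring macros above rely on TX_RING_SIZE and RX_RING_SIZE being
 * powers of two: indices wrap via a bitwise AND with (size - 1) rather
 * than a modulo operation. */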
54 static struct sk_buff_head rx_skb_queue[NR_CPUS] __cacheline_aligned;
56 static int get_tx_bufs(net_vif_t *vif);
58 static void __make_tx_response(net_vif_t *vif,
59 unsigned short id,
60 unsigned char st);
61 static void make_rx_response(net_vif_t *vif,
62 unsigned short id,
63 unsigned short size,
64 unsigned char st,
65 unsigned char off);
67 struct net_device *the_dev = NULL;
69 /*
70 * Transmitted packets are fragmented, so we can copy the important headers
71 * before checking them for validity. This avoids the need for page protection.
72 */
73 /* Ethernet + IP headers (20 bytes covers a minimal IPv4 header, no options) */
74 #define PKT_PROT_LEN (ETH_HLEN + 20)
75 static kmem_cache_t *net_header_cachep;
77 /**
78 * __dev_get_by_name - find a device by its name
79 * @name: name to find
80 *
81 * Find an interface by name. Must be called under RTNL semaphore
82 * or @dev_base_lock. If the name is found a pointer to the device
83 * is returned. If the name is not found then %NULL is returned. The
84 * reference counters are not incremented so the caller must be
85 * careful with locks.
86 */
89 struct net_device *__dev_get_by_name(const char *name)
90 {
91 struct net_device *dev;
93 for (dev = dev_base; dev != NULL; dev = dev->next) {
94 if (strncmp(dev->name, name, IFNAMSIZ) == 0)
95 return dev;
96 }
97 return NULL;
98 }
100 /**
101 * dev_get_by_name - find a device by its name
102 * @name: name to find
103 *
104 * Find an interface by name. This can be called from any
105 * context and does its own locking. The returned handle has
106 * the usage count incremented and the caller must use dev_put() to
107 * release it when it is no longer needed. %NULL is returned if no
108 * matching device is found.
109 */
111 struct net_device *dev_get_by_name(const char *name)
112 {
113 struct net_device *dev;
115 read_lock(&dev_base_lock);
116 dev = __dev_get_by_name(name);
117 if (dev)
118 dev_hold(dev);
119 read_unlock(&dev_base_lock);
120 return dev;
121 }
123 /**
124 * dev_get - test if a device exists
125 * @name: name to test for
126 *
127 * Test if a name exists. Returns true if the name is found. In order
128 * to be sure the name is not allocated or removed during the test the
129 * caller must hold the rtnl semaphore.
130 *
131 * This function primarily exists for back compatibility with older
132 * drivers.
133 */
135 int dev_get(const char *name)
136 {
137 struct net_device *dev;
139 read_lock(&dev_base_lock);
140 dev = __dev_get_by_name(name);
141 read_unlock(&dev_base_lock);
142 return dev != NULL;
143 }
145 /**
146 * __dev_get_by_index - find a device by its ifindex
147 * @ifindex: index of device
148 *
149 * Search for an interface by index. Returns a pointer to the device,
150 * or %NULL if the device is not found. The device has not
151 * had its reference counter increased so the caller must be careful
152 * about locking. The caller must hold either the RTNL semaphore
153 * or @dev_base_lock.
154 */
156 struct net_device * __dev_get_by_index(int ifindex)
157 {
158 struct net_device *dev;
160 for (dev = dev_base; dev != NULL; dev = dev->next) {
161 if (dev->ifindex == ifindex)
162 return dev;
163 }
164 return NULL;
165 }
168 /**
169 * dev_get_by_index - find a device by its ifindex
170 * @ifindex: index of device
171 *
172 * Search for an interface by index. Returns a pointer to the device,
173 * or NULL if the device is not found. The device returned has
174 * had a reference added and the pointer is safe until the user calls
175 * dev_put to indicate they have finished with it.
176 */
178 struct net_device * dev_get_by_index(int ifindex)
179 {
180 struct net_device *dev;
182 read_lock(&dev_base_lock);
183 dev = __dev_get_by_index(ifindex);
184 if (dev)
185 dev_hold(dev);
186 read_unlock(&dev_base_lock);
187 return dev;
188 }
190 /**
191 * dev_getbyhwaddr - find a device by its hardware address
192 * @type: media type of device
193 * @ha: hardware address
194 *
195 * Search for an interface by MAC address. Returns a pointer to the device,
196 * or NULL if the device is not found. The caller must hold the
197 * rtnl semaphore. The returned device has not had its ref count increased
198 * and the caller must therefore be careful about locking
199 *
200 * BUGS:
201 * If the API was consistent this would be __dev_get_by_hwaddr
202 */
204 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
205 {
206 struct net_device *dev;
208 for (dev = dev_base; dev != NULL; dev = dev->next) {
209 if (dev->type == type &&
210 memcmp(dev->dev_addr, ha, dev->addr_len) == 0)
211 return dev;
212 }
213 return NULL;
214 }
216 /**
217 * dev_alloc_name - allocate a name for a device
218 * @dev: device
219 * @name: name format string
220 *
221 * Passed a format string - e.g. "lt%d" - it will try to find a suitable
222 * id. Not efficient for many devices, but it is not called often. The caller
223 * must hold the dev_base or rtnl lock while allocating the name and
224 * adding the device in order to avoid duplicates. Returns the number
225 * of the unit assigned or a negative errno code.
226 */
228 int dev_alloc_name(struct net_device *dev, const char *name)
229 {
230 int i;
231 char buf[32];
232 char *p;
234 /*
235 * Verify the string as this thing may have come from
236 * the user. There must be either one "%d" and no other "%"
237 * characters, or no "%" characters at all.
238 */
239 p = strchr(name, '%');
240 if (p && (p[1] != 'd' || strchr(p+2, '%')))
241 return -EINVAL;
243 /*
244 * If you need over 100 please also fix the algorithm...
245 */
246 for (i = 0; i < 100; i++) {
247 snprintf(buf,sizeof(buf),name,i);
248 if (__dev_get_by_name(buf) == NULL) {
249 strcpy(dev->name, buf);
250 return i;
251 }
252 }
253 return -ENFILE; /* Over 100 of the things .. bail out! */
254 }
256 /**
257 * dev_alloc - allocate a network device and name
258 * @name: name format string
259 * @err: error return pointer
260 *
261 * Passed a format string, eg. "lt%d", it will allocate a network device
262 * and space for the name. %NULL is returned if no memory is available.
263 * If the allocation succeeds then the name is assigned and the
264 * device pointer returned. %NULL is returned if the name allocation
265 * failed. The cause of an error is returned as a negative errno code
266 * in the variable that @err points to.
267 *
268 * The caller must hold the @dev_base or RTNL locks when doing this in
269 * order to avoid duplicate name allocations.
270 */
272 struct net_device *dev_alloc(const char *name, int *err)
273 {
274 struct net_device *dev=kmalloc(sizeof(struct net_device), GFP_KERNEL);
275 if (dev == NULL) {
276 *err = -ENOBUFS;
277 return NULL;
278 }
279 memset(dev, 0, sizeof(struct net_device));
280 *err = dev_alloc_name(dev, name);
281 if (*err < 0) {
282 kfree(dev);
283 return NULL;
284 }
285 return dev;
286 }
288 /**
289 * netdev_state_change - device changes state
290 * @dev: device to cause notification
291 *
292 * Called to indicate a device has changed state. This function calls
293 * the notifier chains for netdev_chain and sends a NEWLINK message
294 * to the routing socket.
295 */
297 void netdev_state_change(struct net_device *dev)
298 {
299 if (dev->flags&IFF_UP) {
300 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
301 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
302 }
303 }
306 #ifdef CONFIG_KMOD
308 /**
309 * dev_load - load a network module
310 * @name: name of interface
311 *
312 * If a network interface is not present and the process has suitable
313 * privileges this function loads the module. If module loading is not
314 * available in this kernel then it becomes a nop.
315 */
317 void dev_load(const char *name)
318 {
319 if (!dev_get(name) && capable(CAP_SYS_MODULE))
320 request_module(name);
321 }
323 #else
325 extern inline void dev_load(const char *unused){;}
327 #endif
329 static int default_rebuild_header(struct sk_buff *skb)
330 {
331 printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
332 skb->dev ? skb->dev->name : "NULL!!!");
333 kfree_skb(skb);
334 return 1;
335 }
337 /**
338 * dev_open - prepare an interface for use.
339 * @dev: device to open
340 *
341 * Takes a device from down to up state. The device's private open
342 * function is invoked and then the multicast lists are loaded. Finally
343 * the device is moved into the up state and a %NETDEV_UP message is
344 * sent to the netdev notifier chain.
345 *
346 * Calling this function on an active interface is a nop. On a failure
347 * a negative errno code is returned.
348 */
350 int dev_open(struct net_device *dev)
351 {
352 int ret = 0;
354 /*
355 * Is it already up?
356 */
358 if (dev->flags&IFF_UP)
359 return 0;
361 /*
362 * Is it even present?
363 */
364 if (!netif_device_present(dev))
365 return -ENODEV;
367 /*
368 * Call device private open method
369 */
370 if (try_inc_mod_count(dev->owner)) {
371 if (dev->open) {
372 ret = dev->open(dev);
373 if (ret != 0 && dev->owner)
374 __MOD_DEC_USE_COUNT(dev->owner);
375 }
376 } else {
377 ret = -ENODEV;
378 }
380 /*
381 * If it went open OK then:
382 */
384 if (ret == 0)
385 {
386 /*
387 * Set the flags.
388 */
389 dev->flags |= IFF_UP;
391 set_bit(__LINK_STATE_START, &dev->state);
393 /*
394 * Initialize multicasting status
395 */
396 dev_mc_upload(dev);
398 /*
399 * Wakeup transmit queue engine
400 */
401 dev_activate(dev);
403 /*
404 * ... and announce new interface.
405 */
406 notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
407 }
408 return(ret);
409 }
412 /**
413 * dev_close - shutdown an interface.
414 * @dev: device to shutdown
415 *
416 * This function moves an active device into down state. A
417 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
418 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
419 * chain.
420 */
422 int dev_close(struct net_device *dev)
423 {
424 if (!(dev->flags&IFF_UP))
425 return 0;
427 /*
428 * Tell people we are going down, so that they can
429 * prepare for the shutdown while the device is still operating.
430 */
431 notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
433 dev_deactivate(dev);
435 clear_bit(__LINK_STATE_START, &dev->state);
437 /*
438 * Call the device-specific close. This cannot fail,
439 * and is only done if the device is UP.
440 *
441 * We allow it to be called even after a DETACH hot-plug
442 * event.
443 */
445 if (dev->stop)
446 dev->stop(dev);
448 /*
449 * Device is now down.
450 */
452 dev->flags &= ~IFF_UP;
454 /*
455 * Tell people we are down
456 */
457 notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
459 /*
460 * Drop the module refcount
461 */
462 if (dev->owner)
463 __MOD_DEC_USE_COUNT(dev->owner);
465 return(0);
466 }
469 #ifdef CONFIG_HIGHMEM
470 /* Actually, we should eliminate this check as soon as we know that:
471 * 1. An IOMMU is present and allows us to map all of memory.
472 * 2. No high memory really exists on this machine.
473 */
475 static inline int
476 illegal_highdma(struct net_device *dev, struct sk_buff *skb)
477 {
478 int i;
480 if (dev->features&NETIF_F_HIGHDMA)
481 return 0;
483 for (i=0; i<skb_shinfo(skb)->nr_frags; i++)
484 if (skb_shinfo(skb)->frags[i].page >= highmem_start_page)
485 return 1;
487 return 0;
488 }
489 #else
490 #define illegal_highdma(dev, skb) (0)
491 #endif
494 /*=======================================================================
495 Receiver routines
496 =======================================================================*/
498 struct netif_rx_stats netdev_rx_stat[NR_CPUS];
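/*
 * deliver_packet: hand a received packet to a guest by page-flipping. The
 * machine frame holding the packet data is mapped into the guest's page
 * tables in place of the receive buffer page the guest posted; the skbuff
 * takes over the guest's old buffer frame, and a response is queued on the
 * vif's receive ring.
 */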
500 void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
501 {
502 rx_shadow_entry_t *rx;
503 unsigned long *ptep;
504 struct pfn_info *old_page, *new_page, *pte_page;
505 unsigned int i;
506 unsigned short size;
507 unsigned char offset, status = RING_STATUS_OK;
509 memcpy(skb->mac.ethernet->h_dest, vif->vmac, ETH_ALEN);
510 if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
511 memcpy(skb->nh.raw + 18, vif->vmac, ETH_ALEN);
513 /*
514 * Slightly gross: we need the page_lock so that we can do PTE checking.
515 * However, we take it slightly early so that it can protect the update
516 * of rx_cons. This saves us from grabbing two locks.
517 */
518 spin_lock(&vif->domain->page_lock);
520 if ( (i = vif->rx_cons) == vif->rx_prod )
521 {
522 spin_unlock(&vif->domain->page_lock);
523 perfc_incr(net_rx_capacity_drop);
524 return;
525 }
526 rx = vif->rx_shadow_ring + i;
527 vif->rx_cons = RX_RING_INC(i);
529 size = (unsigned short)skb->len;
530 offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK);
532 /* Release the page-table page. */
533 pte_page = frame_table + (rx->pte_ptr >> PAGE_SHIFT);
534 put_page_type(pte_page);
535 put_page_tot(pte_page);
537 old_page = frame_table + rx->buf_pfn;
538 new_page = skb->pf;
540 ptep = map_domain_mem(rx->pte_ptr);
542 if ( (*ptep & _PAGE_PRESENT) )
543 {
544 /* Bail out if the PTE has been reused under our feet. */
545 list_add(&old_page->list, &vif->domain->pg_head);
546 old_page->flags = vif->domain->domain;
547 unmap_domain_mem(ptep);
548 spin_unlock(&vif->domain->page_lock);
549 status = RING_STATUS_BAD_PAGE;
550 goto out;
551 }
553 /* Give the new page to the domain, marking it writeable. */
554 new_page->tot_count = new_page->type_count = 1;
555 new_page->flags = vif->domain->domain | PGT_writeable_page | PG_need_flush;
556 list_add(&new_page->list, &vif->domain->pg_head);
558 /* Patch the PTE to map the new page as writeable. */
559 machine_to_phys_mapping[new_page - frame_table]
560 = machine_to_phys_mapping[old_page - frame_table];
561 *ptep = (*ptep & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
562 (((new_page - frame_table) << PAGE_SHIFT) & PAGE_MASK);
564 unmap_domain_mem(ptep);
566 spin_unlock(&vif->domain->page_lock);
568 /* Our skbuff now points at the guest's old frame. */
569 skb->pf = old_page;
571 /* Updates must happen before releasing the descriptor. */
572 smp_wmb();
574 /*
575 * NB. The remote flush here should be safe, as we hold no locks. The
576 * network driver that called us should also have no nasty locks.
577 */
578 if ( rx->flush_count == (unsigned short)
579 atomic_read(&tlb_flush_count[vif->domain->processor]) )
580 {
581 perfc_incr(net_rx_tlbflush);
582 flush_tlb_cpu(vif->domain->processor);
583 }
585 perfc_incr(net_rx_delivered);
587 /* Record the delivery so that the domain can be billed for it. */
588 vif->total_packets_received++;
589 vif->total_bytes_received += size;
591 out:
592 make_rx_response(vif, rx->id, size, status, offset);
593 }
595 /**
596 * netif_rx - post buffer to the network code
597 * @skb: buffer to post
598 *
599 * This function receives a packet from a device driver and queues it for
600 * the upper (protocol) levels to process. It always succeeds. The buffer
601 * may be dropped during processing for congestion control or by the
602 * protocol layers.
603 *
604 * return values:
605 * NET_RX_SUCCESS (no congestion)
606 * NET_RX_DROP (packet was dropped)
607 */
609 int netif_rx(struct sk_buff *skb)
610 {
611 int this_cpu = smp_processor_id();
612 struct sk_buff_head *q = &rx_skb_queue[this_cpu];
613 unsigned long flags;
615 /* This oughtn't to happen, really! */
616 if ( unlikely(skb_queue_len(q) > 100) )
617 {
618 perfc_incr(net_rx_congestion_drop);
619 return NET_RX_DROP;
620 }
622 local_irq_save(flags);
623 __skb_queue_tail(q, skb);
624 local_irq_restore(flags);
626 __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
628 return NET_RX_SUCCESS;
629 }
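/*
 * Per-CPU receive softirq handler: drains this CPU's rx_skb_queue, maps each
 * zero-copy skbuff's data frame, rebuilds the header pointers, looks up the
 * destination vif and delivers the packet, then unmaps the frame and frees
 * the skbuff.
 */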
631 static void net_rx_action(struct softirq_action *h)
632 {
633 int offset, this_cpu = smp_processor_id();
634 struct sk_buff_head *q = &rx_skb_queue[this_cpu];
635 struct sk_buff *skb;
637 local_irq_disable();
639 while ( (skb = __skb_dequeue(q)) != NULL )
640 {
641 ASSERT(skb->skb_type == SKB_ZERO_COPY);
643 /*
644 * Offset will include 16 bytes padding from dev_alloc_skb, 14 bytes
645 * for ethernet header, plus any other alignment padding added by the
646 * driver.
647 */
648 offset = (int)skb->data & ~PAGE_MASK;
649 skb->head = (u8 *)map_domain_mem(((skb->pf - frame_table) <<
650 PAGE_SHIFT));
651 skb->data = skb->nh.raw = skb->head + offset;
652 skb->tail = skb->data + skb->len;
653 skb_push(skb, ETH_HLEN);
654 skb->mac.raw = skb->data;
656 netdev_rx_stat[this_cpu].total++;
658 if ( skb->dst_vif == NULL )
659 skb->dst_vif = net_get_target_vif(
660 skb->data, skb->len, skb->src_vif);
662 if ( !VIF_LOCAL(skb->dst_vif) )
663 skb->dst_vif = find_vif_by_id(0);
665 if ( skb->dst_vif != NULL )
666 {
667 deliver_packet(skb, skb->dst_vif);
668 put_vif(skb->dst_vif);
669 }
671 unmap_domain_mem(skb->head);
672 kfree_skb(skb);
673 }
675 local_irq_enable();
676 }
679 /*************************************************************
680 * NEW TRANSMIT SCHEDULER
681 *
682 * NB. We ought also to only send a limited number of bytes to the NIC
683 * for transmission at any one time (to avoid head-of-line blocking).
684 * However, driver rings are small enough that they provide a reasonable
685 * limit.
686 *
687 * eg. 3c905 has 16 descriptors == 8 packets, at 100Mbps
688 * e1000 has 256 descriptors == 128 packets, at 1000Mbps
689 * tg3 has 512 descriptors == 256 packets, at 1000Mbps
690 *
691 * So, worst case is tg3 with 256 1500-byte packets == 375kB.
692 * This would take 3ms, and represents our worst-case HoL blocking cost.
693 *
694 * We think this is reasonable.
695 */
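/* Quick check of the figures above: 256 descriptors * 1500 bytes = 384000
 * bytes (375kB); at 1000Mbps (125MB/s) that drains in roughly 3ms. */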
697 struct list_head net_schedule_list;
698 spinlock_t net_schedule_list_lock;
700 static int __on_net_schedule_list(net_vif_t *vif)
701 {
702 return vif->list.next != NULL;
703 }
705 static void remove_from_net_schedule_list(net_vif_t *vif)
706 {
707 unsigned long flags;
708 spin_lock_irqsave(&net_schedule_list_lock, flags);
709 ASSERT(__on_net_schedule_list(vif));
710 list_del(&vif->list);
711 vif->list.next = NULL;
712 put_vif(vif);
713 spin_unlock_irqrestore(&net_schedule_list_lock, flags);
714 }
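/* NB. The unlocked __on_net_schedule_list() test below is only an
 * optimisation; the result is re-checked under net_schedule_list_lock
 * before the vif is added. */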
716 static void add_to_net_schedule_list_tail(net_vif_t *vif)
717 {
718 unsigned long flags;
719 if ( __on_net_schedule_list(vif) ) return;
720 spin_lock_irqsave(&net_schedule_list_lock, flags);
721 if ( !__on_net_schedule_list(vif) )
722 {
723 list_add_tail(&vif->list, &net_schedule_list);
724 get_vif(vif);
725 }
726 spin_unlock_irqrestore(&net_schedule_list_lock, flags);
727 }
730 static void tx_skb_release(struct sk_buff *skb);
732 static void net_tx_action(unsigned long unused)
733 {
734 struct net_device *dev = the_dev;
735 struct list_head *ent;
736 struct sk_buff *skb, *nskb;
737 net_vif_t *vif;
738 tx_shadow_entry_t *tx;
740 spin_lock(&dev->xmit_lock);
741 while ( !netif_queue_stopped(dev) &&
742 !list_empty(&net_schedule_list) )
743 {
744 /* Get a vif from the list with work to do. */
745 ent = net_schedule_list.next;
746 vif = list_entry(ent, net_vif_t, list);
747 get_vif(vif);
748 remove_from_net_schedule_list(vif);
750 /* Check whether there are packets to be transmitted. */
751 if ( (vif->tx_cons == vif->tx_prod) && !get_tx_bufs(vif) )
752 {
753 put_vif(vif);
754 continue;
755 }
757 add_to_net_schedule_list_tail(vif);
759 if ( (skb = alloc_skb_nodata(GFP_ATOMIC)) == NULL )
760 {
761 printk("Out of memory in net_tx_action()!\n");
762 add_to_net_schedule_list_tail(vif);
763 put_vif(vif);
764 break;
765 }
767 /* Pick an entry from the transmit queue. */
768 tx = &vif->tx_shadow_ring[vif->tx_cons];
769 vif->tx_cons = TX_RING_INC(vif->tx_cons);
771 skb->destructor = tx_skb_release;
773 skb->head = skb->data = tx->header;
774 skb->end = skb->tail = skb->head + PKT_PROT_LEN;
776 skb->dev = the_dev;
777 skb->src_vif = vif;
778 skb->dst_vif = NULL;
779 skb->mac.raw = skb->data;
780 skb->guest_id = tx->id;
782 skb_shinfo(skb)->frags[0].page = frame_table +
783 (tx->payload >> PAGE_SHIFT);
784 skb_shinfo(skb)->frags[0].size = tx->size - PKT_PROT_LEN;
785 skb_shinfo(skb)->frags[0].page_offset = tx->payload & ~PAGE_MASK;
786 skb_shinfo(skb)->nr_frags = 1;
788 skb->data_len = tx->size - PKT_PROT_LEN;
789 skb->len = tx->size;
791 /* Record the transmission so that the domain can be billed for it. */
792 vif->total_packets_sent++;
793 vif->total_bytes_sent += tx->size;
795 /* Is the NIC crap? */
796 if ( !(dev->features & NETIF_F_SG) )
797 {
798 nskb = skb_copy(skb, GFP_KERNEL);
799 kfree_skb(skb);
800 skb = nskb;
801 }
803 /* Transmit should always work, or the queue would be stopped. */
804 if ( dev->hard_start_xmit(skb, dev) != 0 )
805 {
806 printk("Weird failure in hard_start_xmit!\n");
807 kfree_skb(skb);
808 break;
809 }
811 perfc_incr(net_tx_transmitted);
812 }
813 spin_unlock(&dev->xmit_lock);
814 }
816 DECLARE_TASKLET_DISABLED(net_tx_tasklet, net_tx_action, 0);
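/*
 * NB. The barrier below is presumably there to make the caller's ring and
 * schedule-list updates globally visible before the queue state and list are
 * sampled, closing the race with the completion path that restarts the queue.
 */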
818 static inline void maybe_schedule_tx_action(void)
819 {
820 smp_mb();
821 if ( !netif_queue_stopped(the_dev) &&
822 !list_empty(&net_schedule_list) )
823 tasklet_schedule(&net_tx_tasklet);
824 }
827 /* Destructor function for tx skbs. */
828 static void tx_skb_release(struct sk_buff *skb)
829 {
830 int i;
831 net_vif_t *vif = skb->src_vif;
832 unsigned long flags;
834 spin_lock_irqsave(&vif->domain->page_lock, flags);
835 for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
836 put_page_tot(skb_shinfo(skb)->frags[i].page);
837 spin_unlock_irqrestore(&vif->domain->page_lock, flags);
839 if ( skb->skb_type == SKB_NODATA )
840 kmem_cache_free(net_header_cachep, skb->head);
842 skb_shinfo(skb)->nr_frags = 0;
844 spin_lock_irqsave(&vif->tx_lock, flags);
845 __make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
846 spin_unlock_irqrestore(&vif->tx_lock, flags);
848 /*
849 * Checks below must happen after the above response is posted.
850 * This avoids a possible race with a guest OS on another CPU.
851 */
852 smp_mb();
854 if ( (vif->tx_cons == vif->tx_prod) && get_tx_bufs(vif) )
855 {
856 add_to_net_schedule_list_tail(vif);
857 maybe_schedule_tx_action();
858 }
860 put_vif(vif);
861 }
864 /*
865 * We need this ioctl for efficient implementation of the
866 * if_indextoname() function required by the IPv6 API. Without
867 * it, we would have to search all the interfaces to find a
868 * match. --pb
869 */
871 static int dev_ifname(struct ifreq *arg)
872 {
873 struct net_device *dev;
874 struct ifreq ifr;
876 /*
877 * Fetch the caller's info block.
878 */
880 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
881 return -EFAULT;
883 read_lock(&dev_base_lock);
884 dev = __dev_get_by_index(ifr.ifr_ifindex);
885 if (!dev) {
886 read_unlock(&dev_base_lock);
887 return -ENODEV;
888 }
890 strcpy(ifr.ifr_name, dev->name);
891 read_unlock(&dev_base_lock);
893 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
894 return -EFAULT;
895 return 0;
896 }
899 /**
900 * netdev_set_master - set up master/slave pair
901 * @slave: slave device
902 * @master: new master device
903 *
904 * Changes the master device of the slave. Pass %NULL to break the
905 * bonding. The caller must hold the RTNL semaphore. On a failure
906 * a negative errno code is returned. On success the reference counts
907 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
908 * function returns zero.
909 */
911 int netdev_set_master(struct net_device *slave, struct net_device *master)
912 {
913 struct net_device *old = slave->master;
915 if (master) {
916 if (old)
917 return -EBUSY;
918 dev_hold(master);
919 }
921 br_write_lock_bh(BR_NETPROTO_LOCK);
922 slave->master = master;
923 br_write_unlock_bh(BR_NETPROTO_LOCK);
925 if (old)
926 dev_put(old);
928 if (master)
929 slave->flags |= IFF_SLAVE;
930 else
931 slave->flags &= ~IFF_SLAVE;
933 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
934 return 0;
935 }
937 /**
938 * dev_set_promiscuity - update promiscuity count on a device
939 * @dev: device
940 * @inc: modifier
941 *
942 * Add or remove promiscuity from a device. While the count in the device
943 * remains above zero the interface remains promiscuous. Once it hits zero
944 * the device reverts back to normal filtering operation. A negative inc
945 * value is used to drop promiscuity on the device.
946 */
948 void dev_set_promiscuity(struct net_device *dev, int inc)
949 {
950 unsigned short old_flags = dev->flags;
952 dev->flags |= IFF_PROMISC;
953 if ((dev->promiscuity += inc) == 0)
954 dev->flags &= ~IFF_PROMISC;
955 if (dev->flags^old_flags) {
956 #ifdef CONFIG_NET_FASTROUTE
957 if (dev->flags&IFF_PROMISC) {
958 netdev_fastroute_obstacles++;
959 dev_clear_fastroute(dev);
960 } else
961 netdev_fastroute_obstacles--;
962 #endif
963 dev_mc_upload(dev);
964 printk(KERN_INFO "device %s %s promiscuous mode\n",
965 dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left");
966 }
967 }
969 /**
970 * dev_set_allmulti - update allmulti count on a device
971 * @dev: device
972 * @inc: modifier
973 *
974 * Add or remove reception of all multicast frames to a device. While the
975 * count in the device remains above zero the interface remains listening
976 * to all multicast frames. Once it hits zero the device reverts back to normal
977 * filtering operation. A negative @inc value is used to drop the counter
978 * when releasing a resource needing all multicasts.
979 */
981 void dev_set_allmulti(struct net_device *dev, int inc)
982 {
983 unsigned short old_flags = dev->flags;
985 dev->flags |= IFF_ALLMULTI;
986 if ((dev->allmulti += inc) == 0)
987 dev->flags &= ~IFF_ALLMULTI;
988 if (dev->flags^old_flags)
989 dev_mc_upload(dev);
990 }
992 int dev_change_flags(struct net_device *dev, unsigned flags)
993 {
994 int ret;
995 int old_flags = dev->flags;
997 /*
998 * Set the flags on our device.
999 */
1001 dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP|IFF_DYNAMIC|
1002 IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) |
1003 (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI));
1005 /*
1006 * Load in the correct multicast list now the flags have changed.
1007 */
1009 dev_mc_upload(dev);
1011 /*
1012 * Have we downed the interface? We handle IFF_UP ourselves
1013 * according to user attempts to set it, rather than blindly
1014 * setting it.
1015 */
1017 ret = 0;
1018 if ((old_flags^flags)&IFF_UP) /* Bit is different ? */
1020 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
1022 if (ret == 0)
1023 dev_mc_upload(dev);
1026 if (dev->flags&IFF_UP &&
1027 ((old_flags^dev->flags)&
1028 ~(IFF_UP|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE)))
1029 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
1031 if ((flags^dev->gflags)&IFF_PROMISC) {
1032 int inc = (flags&IFF_PROMISC) ? +1 : -1;
1033 dev->gflags ^= IFF_PROMISC;
1034 dev_set_promiscuity(dev, inc);
1037 /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
1038 is important. Some (broken) drivers set IFF_PROMISC when
1039 IFF_ALLMULTI is requested, without asking us and without reporting it.
1040 */
1041 if ((flags^dev->gflags)&IFF_ALLMULTI) {
1042 int inc = (flags&IFF_ALLMULTI) ? +1 : -1;
1043 dev->gflags ^= IFF_ALLMULTI;
1044 dev_set_allmulti(dev, inc);
1047 if (old_flags^dev->flags)
1048 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags^dev->flags);
1050 return ret;
1053 /*
1054 * Perform the SIOCxIFxxx calls.
1055 */
1057 static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
1059 struct net_device *dev;
1060 int err;
1062 if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL)
1063 return -ENODEV;
1065 switch(cmd)
1067 case SIOCGIFFLAGS: /* Get interface flags */
1068 ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI|IFF_RUNNING))
1069 |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI));
1070 if (netif_running(dev) && netif_carrier_ok(dev))
1071 ifr->ifr_flags |= IFF_RUNNING;
1072 return 0;
1074 case SIOCSIFFLAGS: /* Set interface flags */
1075 return dev_change_flags(dev, ifr->ifr_flags);
1077 case SIOCGIFMETRIC: /* Get the metric on the interface */
1078 ifr->ifr_metric = 0;
1079 return 0;
1081 case SIOCSIFMETRIC: /* Set the metric on the interface */
1082 return -EOPNOTSUPP;
1084 case SIOCGIFMTU: /* Get the MTU of a device */
1085 ifr->ifr_mtu = dev->mtu;
1086 return 0;
1088 case SIOCSIFMTU: /* Set the MTU of a device */
1089 if (ifr->ifr_mtu == dev->mtu)
1090 return 0;
1092 /*
1093 * MTU must be positive.
1094 */
1096 if (ifr->ifr_mtu<0)
1097 return -EINVAL;
1099 if (!netif_device_present(dev))
1100 return -ENODEV;
1102 if (dev->change_mtu)
1103 err = dev->change_mtu(dev, ifr->ifr_mtu);
1104 else {
1105 dev->mtu = ifr->ifr_mtu;
1106 err = 0;
1108 if (!err && dev->flags&IFF_UP)
1109 notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev);
1110 return err;
1112 case SIOCGIFHWADDR:
1113 memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN);
1114 ifr->ifr_hwaddr.sa_family=dev->type;
1115 return 0;
1117 case SIOCSIFHWADDR:
1118 if (dev->set_mac_address == NULL)
1119 return -EOPNOTSUPP;
1120 if (ifr->ifr_hwaddr.sa_family!=dev->type)
1121 return -EINVAL;
1122 if (!netif_device_present(dev))
1123 return -ENODEV;
1124 err = dev->set_mac_address(dev, &ifr->ifr_hwaddr);
1125 if (!err)
1126 notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
1127 return err;
1129 case SIOCSIFHWBROADCAST:
1130 if (ifr->ifr_hwaddr.sa_family!=dev->type)
1131 return -EINVAL;
1132 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN);
1133 notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
1134 return 0;
1136 case SIOCGIFMAP:
1137 ifr->ifr_map.mem_start=dev->mem_start;
1138 ifr->ifr_map.mem_end=dev->mem_end;
1139 ifr->ifr_map.base_addr=dev->base_addr;
1140 ifr->ifr_map.irq=dev->irq;
1141 ifr->ifr_map.dma=dev->dma;
1142 ifr->ifr_map.port=dev->if_port;
1143 return 0;
1145 case SIOCSIFMAP:
1146 if (dev->set_config) {
1147 if (!netif_device_present(dev))
1148 return -ENODEV;
1149 return dev->set_config(dev,&ifr->ifr_map);
1151 return -EOPNOTSUPP;
1153 case SIOCADDMULTI:
1154 if (dev->set_multicast_list == NULL ||
1155 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
1156 return -EINVAL;
1157 if (!netif_device_present(dev))
1158 return -ENODEV;
1159 dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1);
1160 return 0;
1162 case SIOCDELMULTI:
1163 if (dev->set_multicast_list == NULL ||
1164 ifr->ifr_hwaddr.sa_family!=AF_UNSPEC)
1165 return -EINVAL;
1166 if (!netif_device_present(dev))
1167 return -ENODEV;
1168 dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1);
1169 return 0;
1171 case SIOCGIFINDEX:
1172 ifr->ifr_ifindex = dev->ifindex;
1173 return 0;
1175 case SIOCSIFNAME:
1176 if (dev->flags&IFF_UP)
1177 return -EBUSY;
1178 if (__dev_get_by_name(ifr->ifr_newname))
1179 return -EEXIST;
1180 memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ);
1181 dev->name[IFNAMSIZ-1] = 0;
1182 notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
1183 return 0;
1185 #ifdef WIRELESS_EXT
1186 case SIOCGIWSTATS:
1187 return dev_iwstats(dev, ifr);
1188 #endif /* WIRELESS_EXT */
1190 /*
1191 * Unknown or private ioctl
1192 */
1194 default:
1195 if ((cmd >= SIOCDEVPRIVATE &&
1196 cmd <= SIOCDEVPRIVATE + 15) ||
1197 cmd == SIOCBONDENSLAVE ||
1198 cmd == SIOCBONDRELEASE ||
1199 cmd == SIOCBONDSETHWADDR ||
1200 cmd == SIOCBONDSLAVEINFOQUERY ||
1201 cmd == SIOCBONDINFOQUERY ||
1202 cmd == SIOCBONDCHANGEACTIVE ||
1203 cmd == SIOCETHTOOL ||
1204 cmd == SIOCGMIIPHY ||
1205 cmd == SIOCGMIIREG ||
1206 cmd == SIOCSMIIREG) {
1207 if (dev->do_ioctl) {
1208 if (!netif_device_present(dev))
1209 return -ENODEV;
1210 return dev->do_ioctl(dev, ifr, cmd);
1212 return -EOPNOTSUPP;
1215 #ifdef WIRELESS_EXT
1216 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
1217 if (dev->do_ioctl) {
1218 if (!netif_device_present(dev))
1219 return -ENODEV;
1220 return dev->do_ioctl(dev, ifr, cmd);
1222 return -EOPNOTSUPP;
1224 #endif /* WIRELESS_EXT */
1227 return -EINVAL;
1230 /*
1231 * This function handles all "interface"-type I/O control requests. The actual
1232 * 'doing' part of this is dev_ifsioc above.
1233 */
1235 /**
1236 * dev_ioctl - network device ioctl
1237 * @cmd: command to issue
1238 * @arg: pointer to a struct ifreq in user space
1240 * Issue ioctl functions to devices. This is normally called by the
1241 * user space syscall interfaces but can sometimes be useful for
1242 * other purposes. The return value is the return from the syscall if
1243 * positive or a negative errno code on error.
1244 */
1246 int dev_ioctl(unsigned int cmd, void *arg)
1248 struct ifreq ifr;
1249 int ret;
1250 char *colon;
1252 /* One special case: SIOCGIFCONF takes ifconf argument
1253 and requires shared lock, because it sleeps writing
1254 to user space.
1255 */
1257 if (cmd == SIOCGIFCONF) {
1258 return -ENOSYS;
1260 if (cmd == SIOCGIFNAME) {
1261 return dev_ifname((struct ifreq *)arg);
1264 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
1265 return -EFAULT;
1267 ifr.ifr_name[IFNAMSIZ-1] = 0;
1269 colon = strchr(ifr.ifr_name, ':');
1270 if (colon)
1271 *colon = 0;
1273 /*
1274 * See which interface the caller is talking about.
1275 */
1277 switch(cmd)
1279 /*
1280 * These ioctl calls:
1281 * - can be done by all.
1282 * - atomic and do not require locking.
1283 * - return a value
1284 */
1286 case SIOCGIFFLAGS:
1287 case SIOCGIFMETRIC:
1288 case SIOCGIFMTU:
1289 case SIOCGIFHWADDR:
1290 case SIOCGIFSLAVE:
1291 case SIOCGIFMAP:
1292 case SIOCGIFINDEX:
1293 dev_load(ifr.ifr_name);
1294 read_lock(&dev_base_lock);
1295 ret = dev_ifsioc(&ifr, cmd);
1296 read_unlock(&dev_base_lock);
1297 if (!ret) {
1298 if (colon)
1299 *colon = ':';
1300 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1301 return -EFAULT;
1303 return ret;
1305 /*
1306 * These ioctl calls:
1307 * - require superuser power.
1308 * - require strict serialization.
1309 * - return a value
1310 */
1312 case SIOCETHTOOL:
1313 case SIOCGMIIPHY:
1314 case SIOCGMIIREG:
1315 if (!capable(CAP_NET_ADMIN))
1316 return -EPERM;
1317 dev_load(ifr.ifr_name);
1318 dev_probe_lock();
1319 rtnl_lock();
1320 ret = dev_ifsioc(&ifr, cmd);
1321 rtnl_unlock();
1322 dev_probe_unlock();
1323 if (!ret) {
1324 if (colon)
1325 *colon = ':';
1326 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1327 return -EFAULT;
1329 return ret;
1331 /*
1332 * These ioctl calls:
1333 * - require superuser power.
1334 * - require strict serialization.
1335 * - do not return a value
1336 */
1338 case SIOCSIFFLAGS:
1339 case SIOCSIFMETRIC:
1340 case SIOCSIFMTU:
1341 case SIOCSIFMAP:
1342 case SIOCSIFHWADDR:
1343 case SIOCSIFSLAVE:
1344 case SIOCADDMULTI:
1345 case SIOCDELMULTI:
1346 case SIOCSIFHWBROADCAST:
1347 case SIOCSIFNAME:
1348 case SIOCSMIIREG:
1349 case SIOCBONDENSLAVE:
1350 case SIOCBONDRELEASE:
1351 case SIOCBONDSETHWADDR:
1352 case SIOCBONDSLAVEINFOQUERY:
1353 case SIOCBONDINFOQUERY:
1354 case SIOCBONDCHANGEACTIVE:
1355 if (!capable(CAP_NET_ADMIN))
1356 return -EPERM;
1357 dev_load(ifr.ifr_name);
1358 dev_probe_lock();
1359 rtnl_lock();
1360 ret = dev_ifsioc(&ifr, cmd);
1361 rtnl_unlock();
1362 dev_probe_unlock();
1363 return ret;
1365 case SIOCGIFMEM:
1366 /* Get the per device memory space. We can add this but currently
1367 do not support it */
1368 case SIOCSIFMEM:
1369 /* Set the per device memory buffer space. */
1370 case SIOCSIFLINK:
1371 return -EINVAL;
1373 /*
1374 * Unknown or private ioctl.
1375 */
1377 default:
1378 if (cmd >= SIOCDEVPRIVATE &&
1379 cmd <= SIOCDEVPRIVATE + 15) {
1380 dev_load(ifr.ifr_name);
1381 dev_probe_lock();
1382 rtnl_lock();
1383 ret = dev_ifsioc(&ifr, cmd);
1384 rtnl_unlock();
1385 dev_probe_unlock();
1386 if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1387 return -EFAULT;
1388 return ret;
1390 #ifdef WIRELESS_EXT
1391 /* Take care of Wireless Extensions */
1392 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
1393 /* If command is `set a parameter', or
1394 * `get the encoding parameters', check if
1395 * the user has the right to do it */
1396 if (IW_IS_SET(cmd) || (cmd == SIOCGIWENCODE)) {
1397 if(!capable(CAP_NET_ADMIN))
1398 return -EPERM;
1400 dev_load(ifr.ifr_name);
1401 rtnl_lock();
1402 ret = dev_ifsioc(&ifr, cmd);
1403 rtnl_unlock();
1404 if (!ret && IW_IS_GET(cmd) &&
1405 copy_to_user(arg, &ifr,
1406 sizeof(struct ifreq)))
1407 return -EFAULT;
1408 return ret;
1410 #endif /* WIRELESS_EXT */
1411 return -EINVAL;
1416 /**
1417 * dev_new_index - allocate an ifindex
1419 * Returns a suitable unique value for a new device interface
1420 * number. The caller must hold the rtnl semaphore or the
1421 * dev_base_lock to be sure it remains unique.
1422 */
1424 int dev_new_index(void)
1426 static int ifindex;
1427 for (;;) {
1428 if (++ifindex <= 0)
1429 ifindex=1;
1430 if (__dev_get_by_index(ifindex) == NULL)
1431 return ifindex;
1435 static int dev_boot_phase = 1;
1437 /**
1438 * register_netdevice - register a network device
1439 * @dev: device to register
1441 * Take a completed network device structure and add it to the kernel
1442 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
1443 * chain. 0 is returned on success. A negative errno code is returned
1444 * on a failure to set up the device, or if the name is a duplicate.
1446 * Callers must hold the rtnl semaphore. See the comment at the
1447 * end of Space.c for details about the locking. You may want
1448 * register_netdev() instead of this.
1450 * BUGS:
1451 * The locking appears insufficient to guarantee two parallel registers
1452 * will not get the same name.
1453 */
1455 int net_dev_init(void);
1457 int register_netdevice(struct net_device *dev)
1459 struct net_device *d, **dp;
1460 #ifdef CONFIG_NET_DIVERT
1461 int ret;
1462 #endif
1464 spin_lock_init(&dev->queue_lock);
1465 spin_lock_init(&dev->xmit_lock);
1466 dev->xmit_lock_owner = -1;
1467 #ifdef CONFIG_NET_FASTROUTE
1468 dev->fastpath_lock=RW_LOCK_UNLOCKED;
1469 #endif
1471 if (dev_boot_phase)
1472 net_dev_init();
1474 #ifdef CONFIG_NET_DIVERT
1475 ret = alloc_divert_blk(dev);
1476 if (ret)
1477 return ret;
1478 #endif /* CONFIG_NET_DIVERT */
1480 dev->iflink = -1;
1482 /* Init, if this function is available */
1483 if (dev->init && dev->init(dev) != 0) {
1484 #ifdef CONFIG_NET_DIVERT
1485 free_divert_blk(dev);
1486 #endif
1487 return -EIO;
1490 dev->ifindex = dev_new_index();
1491 if (dev->iflink == -1)
1492 dev->iflink = dev->ifindex;
1494 /* Check for existence, and append to tail of chain */
1495 for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) {
1496 if (d == dev || strcmp(d->name, dev->name) == 0) {
1497 #ifdef CONFIG_NET_DIVERT
1498 free_divert_blk(dev);
1499 #endif
1500 return -EEXIST;
1503 /*
1504 * Install a nil rebuild_header routine; it should never be
1505 * called and serves only as a bug trap.
1506 */
1508 if (dev->rebuild_header == NULL)
1509 dev->rebuild_header = default_rebuild_header;
1511 /*
1512 * Default initial state at registration is that the
1513 * device is present.
1514 */
1516 set_bit(__LINK_STATE_PRESENT, &dev->state);
1518 dev->next = NULL;
1519 dev_init_scheduler(dev);
1520 write_lock_bh(&dev_base_lock);
1521 *dp = dev;
1522 dev_hold(dev);
1523 dev->deadbeaf = 0;
1524 write_unlock_bh(&dev_base_lock);
1526 /* Notify protocols that a new device has appeared. */
1527 notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
1529 return 0;
1532 /**
1533 * netdev_finish_unregister - complete unregistration
1534 * @dev: device
1536 * Destroy and free a dead device. A value of zero is returned on
1537 * success.
1538 */
1540 int netdev_finish_unregister(struct net_device *dev)
1542 BUG_TRAP(dev->ip_ptr==NULL);
1543 BUG_TRAP(dev->ip6_ptr==NULL);
1544 BUG_TRAP(dev->dn_ptr==NULL);
1546 if (!dev->deadbeaf) {
1547 printk(KERN_ERR "Freeing alive device %p, %s\n",
1548 dev, dev->name);
1549 return 0;
1551 #ifdef NET_REFCNT_DEBUG
1552 printk(KERN_DEBUG "netdev_finish_unregister: %s%s.\n", dev->name,
1553 (dev->features & NETIF_F_DYNALLOC)?"":", old style");
1554 #endif
1555 if (dev->destructor)
1556 dev->destructor(dev);
1557 if (dev->features & NETIF_F_DYNALLOC)
1558 kfree(dev);
1559 return 0;
1562 /**
1563 * unregister_netdevice - remove device from the kernel
1564 * @dev: device
1566 * This function shuts down a device interface and removes it
1567 * from the kernel tables. On success 0 is returned, on a failure
1568 * a negative errno code is returned.
1570 * Callers must hold the rtnl semaphore. See the comment at the
1571 * end of Space.c for details about the locking. You may want
1572 * unregister_netdev() instead of this.
1573 */
1575 int unregister_netdevice(struct net_device *dev)
1577 unsigned long now, warning_time;
1578 struct net_device *d, **dp;
1580 /* If device is running, close it first. */
1581 if (dev->flags & IFF_UP)
1582 dev_close(dev);
1584 BUG_TRAP(dev->deadbeaf==0);
1585 dev->deadbeaf = 1;
1587 /* And unlink it from device chain. */
1588 for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) {
1589 if (d == dev) {
1590 write_lock_bh(&dev_base_lock);
1591 *dp = d->next;
1592 write_unlock_bh(&dev_base_lock);
1593 break;
1596 if (d == NULL) {
1597 printk(KERN_DEBUG "unregister_netdevice: device %s/%p"
1598 " not registered\n", dev->name, dev);
1599 return -ENODEV;
1602 /* Synchronize to net_rx_action. */
1603 br_write_lock_bh(BR_NETPROTO_LOCK);
1604 br_write_unlock_bh(BR_NETPROTO_LOCK);
1606 if (dev_boot_phase == 0) {
1608 /* Shutdown queueing discipline. */
1609 dev_shutdown(dev);
1611 /* Notify protocols that we are about to destroy
1612 this device, so that they can clean up.
1613 */
1614 notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
1616 /*
1617 * Flush the multicast chain
1618 */
1619 dev_mc_discard(dev);
1622 if (dev->uninit)
1623 dev->uninit(dev);
1625 /* Notifier chain MUST detach us from master device. */
1626 BUG_TRAP(dev->master==NULL);
1628 #ifdef CONFIG_NET_DIVERT
1629 free_divert_blk(dev);
1630 #endif
1632 if (dev->features & NETIF_F_DYNALLOC) {
1633 #ifdef NET_REFCNT_DEBUG
1634 if (atomic_read(&dev->refcnt) != 1)
1635 printk(KERN_DEBUG "unregister_netdevice: holding %s refcnt=%d\n",
1636 dev->name, atomic_read(&dev->refcnt)-1);
1637 #endif
1638 dev_put(dev);
1639 return 0;
1642 /* Last reference is our one */
1643 if (atomic_read(&dev->refcnt) == 1) {
1644 dev_put(dev);
1645 return 0;
1648 #ifdef NET_REFCNT_DEBUG
1649 printk("unregister_netdevice: waiting %s refcnt=%d\n",
1650 dev->name, atomic_read(&dev->refcnt));
1651 #endif
1653 /* EXPLANATION. If dev->refcnt is not now 1 (our own reference)
1654 it means that someone in the kernel still has a reference
1655 to this device and we cannot release it.
1657 "New style" devices have destructors, hence we can return from this
1658 function and destructor will do all the work later. As of kernel 2.4.0
1659 there are very few "New Style" devices.
1661 "Old style" devices expect that the device is free of any references
1662 upon exit from this function.
1663 We cannot return from this function until all such references have
1664 fallen away. This is because the caller of this function will probably
1665 immediately kfree(*dev) and then be unloaded via sys_delete_module.
1667 So, we linger until all references fall away. The duration of the
1668 linger is basically unbounded! It is driven by, for example, the
1669 current setting of sysctl_ipfrag_time.
1671 After 1 second, we start to rebroadcast unregister notifications
1672 in hope that careless clients will release the device.
1674 */
1676 now = warning_time = jiffies;
1677 while (atomic_read(&dev->refcnt) != 1) {
1678 if ((jiffies - now) > 1*HZ) {
1679 /* Rebroadcast unregister notification */
1680 notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
1682 mdelay(250);
1683 if ((jiffies - warning_time) > 10*HZ) {
1684 printk(KERN_EMERG "unregister_netdevice: waiting for %s to "
1685 "become free. Usage count = %d\n",
1686 dev->name, atomic_read(&dev->refcnt));
1687 warning_time = jiffies;
1690 dev_put(dev);
1691 return 0;
1695 /*
1696 * Initialize the DEV module. At boot time this walks the device list and
1697 * unhooks any devices that fail to initialise (normally hardware not
1698 * present) and leaves us with a valid list of present and active devices.
1700 */
1702 extern void net_device_init(void);
1703 extern void ip_auto_config(void);
1704 #ifdef CONFIG_NET_DIVERT
1705 extern void dv_init(void);
1706 #endif /* CONFIG_NET_DIVERT */
1709 /*
1710 * Callers must hold the rtnl semaphore. See the comment at the
1711 * end of Space.c for details about the locking.
1712 */
1713 int __init net_dev_init(void)
1715 struct net_device *dev, **dp;
1717 if ( !dev_boot_phase )
1718 return 0;
1720 skb_init();
1722 net_header_cachep = kmem_cache_create(
1723 "net_header_cache",
1724 (PKT_PROT_LEN + sizeof(void *) - 1) & ~(sizeof(void *) - 1),
1725 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1727 spin_lock_init(&net_schedule_list_lock);
1728 INIT_LIST_HEAD(&net_schedule_list);
1730 /*
1731 * Add the devices.
1732 * If the call to dev->init fails, the dev is removed
1733 * from the chain disconnecting the device until the
1734 * next reboot.
1736 * NB At boot phase networking is dead. No locking is required.
1737 * But we still preserve dev_base_lock for sanity.
1738 */
1739 dp = &dev_base;
1740 while ((dev = *dp) != NULL) {
1741 spin_lock_init(&dev->queue_lock);
1742 spin_lock_init(&dev->xmit_lock);
1744 dev->xmit_lock_owner = -1;
1745 dev->iflink = -1;
1746 dev_hold(dev);
1748 /*
1749 * Allocate name. If the init() fails
1750 * the name will be reissued correctly.
1751 */
1752 if (strchr(dev->name, '%'))
1753 dev_alloc_name(dev, dev->name);
1755 if (dev->init && dev->init(dev)) {
1756 /*
1757 * It failed to come up. It will be unhooked later.
1758 * dev_alloc_name can now advance to the next suitable
1759 * name, which will be checked next.
1760 */
1761 dev->deadbeaf = 1;
1762 dp = &dev->next;
1763 } else {
1764 dp = &dev->next;
1765 dev->ifindex = dev_new_index();
1766 if (dev->iflink == -1)
1767 dev->iflink = dev->ifindex;
1768 if (dev->rebuild_header == NULL)
1769 dev->rebuild_header = default_rebuild_header;
1770 dev_init_scheduler(dev);
1771 set_bit(__LINK_STATE_PRESENT, &dev->state);
1775 /*
1776 * Unhook devices that failed to come up
1777 */
1778 dp = &dev_base;
1779 while ((dev = *dp) != NULL) {
1780 if (dev->deadbeaf) {
1781 write_lock_bh(&dev_base_lock);
1782 *dp = dev->next;
1783 write_unlock_bh(&dev_base_lock);
1784 dev_put(dev);
1785 } else {
1786 dp = &dev->next;
1790 dev_boot_phase = 0;
1792 dev_mcast_init();
1794 /*
1795 * Initialise network devices
1796 */
1798 net_device_init();
1800 return 0;
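/*
 * init_tx_header: stamp the physical device's MAC address into the Ethernet
 * source field (and, for ARP, into the sender hardware address at offset 22).
 * Returns the frame's ethertype for IP and well-formed ARP frames, 0 otherwise.
 */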
1803 inline int init_tx_header(u8 *data, unsigned int len, struct net_device *dev)
1805 memcpy(data + ETH_ALEN, dev->dev_addr, ETH_ALEN);
1807 switch ( ntohs(*(unsigned short *)(data + 12)) )
1809 case ETH_P_ARP:
1810 if ( len < 42 ) break;
1811 memcpy(data + 22, dev->dev_addr, ETH_ALEN);
1812 return ETH_P_ARP;
1813 case ETH_P_IP:
1814 return ETH_P_IP;
1816 return 0;
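/*
 * get_tx_bufs: pull new transmit requests from the vif's shared ring. Packets
 * destined for a local vif are copied and delivered immediately via netif_rx();
 * packets bound for the physical network have their headers copied into the
 * shadow ring for later transmission by net_tx_action(). Returns nonzero if
 * any new shadow-ring entries were queued.
 */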
1820 static int get_tx_bufs(net_vif_t *vif)
1822 struct task_struct *p = vif->domain;
1823 net_idx_t *shared_idxs = vif->shared_idxs;
1824 net_ring_t *shared_rings = vif->shared_rings;
1825 net_vif_t *target;
1826 unsigned long buf_pfn;
1827 struct pfn_info *buf_page;
1828 u8 *g_data;
1829 unsigned short protocol;
1830 struct sk_buff *skb;
1831 tx_req_entry_t tx;
1832 int i, j, ret;
1833 unsigned long flags;
1835 if ( vif->tx_req_cons == shared_idxs->tx_req_prod )
1836 return 0;
1838 spin_lock_irqsave(&vif->tx_lock, flags);
1840 j = vif->tx_prod;
1842 /*
1843 * Collect up new transmit buffers. We collect up to the guest OS's new
1844 * producer index, but take care not to catch up with our own consumer
1845 * index.
1846 */
1847 again:
1848 for ( i = vif->tx_req_cons;
1849 (i != shared_idxs->tx_req_prod) &&
1850 (((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1);
1851 i = TX_RING_INC(i) )
1853 tx = shared_rings->tx_ring[i].req;
1854 target = VIF_DROP;
1856 if ( (tx.size < PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) )
1858 DPRINTK("Bad packet size: %d\n", tx.size);
1859 __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1860 continue;
1863 /* The buffer must not cross a page boundary, as the payload must not be fragmented. */
1864 if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE )
1866 DPRINTK("tx.addr: %lx, size: %u, end: %lu\n",
1867 tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size);
1868 __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1869 continue;
1872 buf_pfn = tx.addr >> PAGE_SHIFT;
1873 buf_page = frame_table + buf_pfn;
1874 spin_lock(&p->page_lock);
1875 if ( (buf_pfn >= max_page) ||
1876 ((buf_page->flags & PG_domain_mask) != p->domain) )
1878 DPRINTK("Bad page frame\n");
1879 spin_unlock(&p->page_lock);
1880 __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1881 continue;
1884 g_data = map_domain_mem(tx.addr);
1886 protocol = __constant_htons(
1887 init_tx_header(g_data, tx.size, the_dev));
1888 if ( protocol == 0 )
1890 __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1891 goto tx_unmap_and_continue;
1894 target = net_get_target_vif(g_data, tx.size, vif);
1896 if ( VIF_LOCAL(target) )
1898 /* Local delivery */
1899 if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL )
1901 __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1902 put_vif(target);
1903 goto tx_unmap_and_continue;
1906 skb->src_vif = vif;
1907 skb->dst_vif = target;
1908 skb->protocol = protocol;
1910 /*
1911 * We don't need a well-formed skb as netif_rx will fill these
1912 * fields in as necessary. All we actually need is the right
1913 * page offset in skb->data, and the right length in skb->len.
1914 * Note that the correct address/length *excludes* link header.
1915 */
1916 skb->head = (u8 *)map_domain_mem(
1917 ((skb->pf - frame_table) << PAGE_SHIFT));
1918 skb->data = skb->head + 18;
1919 memcpy(skb->data, g_data, tx.size);
1920 skb->data += ETH_HLEN;
1921 skb->len = tx.size - ETH_HLEN;
1922 unmap_domain_mem(skb->head);
1924 netif_rx(skb);
1926 __make_tx_response(vif, tx.id, RING_STATUS_OK);
1928 else if ( (target == VIF_PHYS) || IS_PRIV(p) )
1930 vif->tx_shadow_ring[j].id = tx.id;
1931 vif->tx_shadow_ring[j].size = tx.size;
1932 vif->tx_shadow_ring[j].header =
1933 kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
1934 if ( vif->tx_shadow_ring[j].header == NULL )
1936 __make_tx_response(vif, tx.id, RING_STATUS_OK);
1937 goto tx_unmap_and_continue;
1940 memcpy(vif->tx_shadow_ring[j].header, g_data, PKT_PROT_LEN);
1941 vif->tx_shadow_ring[j].payload = tx.addr + PKT_PROT_LEN;
1942 get_page_tot(buf_page);
1943 j = TX_RING_INC(j);
1945 else
1947 __make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
1950 tx_unmap_and_continue:
1951 unmap_domain_mem(g_data);
1952 spin_unlock(&p->page_lock);
1955 /*
1956 * Needed as a final check for req_prod updates on another CPU.
1957 * Also ensures that other CPUs see shadow ring updates.
1958 */
1959 smp_mb();
1961 if ( (vif->tx_req_cons = i) != shared_idxs->tx_req_prod )
1962 goto again;
1964 if ( (ret = (vif->tx_prod != j)) )
1965 vif->tx_prod = j;
1967 spin_unlock_irqrestore(&vif->tx_lock, flags);
1969 return ret;
1973 /*
1974 * do_net_update:
1976 * Called from guest OS to notify updates to its transmit and/or receive
1977 * descriptor rings.
1978 */
1980 long do_net_update(void)
1982 net_ring_t *shared_rings;
1983 net_vif_t *vif;
1984 net_idx_t *shared_idxs;
1985 unsigned int i, j, idx;
1986 rx_req_entry_t rx;
1987 unsigned long pte_pfn, buf_pfn;
1988 struct pfn_info *pte_page, *buf_page;
1989 unsigned long *ptep;
1991 perfc_incr(net_hypercalls);
1993 for ( idx = 0; idx < MAX_DOMAIN_VIFS; idx++ )
1995 if ( (vif = current->net_vif_list[idx]) == NULL )
1996 break;
1998 shared_idxs = vif->shared_idxs;
1999 shared_rings = vif->shared_rings;
2001 /*
2002 * PHASE 1 -- TRANSMIT RING
2003 */
2005 if ( get_tx_bufs(vif) )
2007 add_to_net_schedule_list_tail(vif);
2008 maybe_schedule_tx_action();
2011 /*
2012 * PHASE 2 -- RECEIVE RING
2013 */
2015 /*
2016 * Collect up new receive buffers. We collect up to the guest OS's
2017 * new producer index, but take care not to catch up with our own
2018 * consumer index.
2019 */
2020 j = vif->rx_prod;
2021 for ( i = vif->rx_req_cons;
2022 (i != shared_idxs->rx_req_prod) &&
2023 (((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1);
2024 i = RX_RING_INC(i) )
2026 rx = shared_rings->rx_ring[i].req;
2028 pte_pfn = rx.addr >> PAGE_SHIFT;
2029 pte_page = frame_table + pte_pfn;
2031 spin_lock_irq(&current->page_lock);
2032 if ( (pte_pfn >= max_page) ||
2033 ((pte_page->flags & (PG_type_mask | PG_domain_mask)) !=
2034 (PGT_l1_page_table | current->domain)) )
2036 DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n",
2037 current->domain, pte_pfn, max_page, pte_page->flags);
2038 spin_unlock_irq(&current->page_lock);
2039 make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
2040 continue;
2043 ptep = map_domain_mem(rx.addr);
2045 if ( !(*ptep & _PAGE_PRESENT) )
2047 DPRINTK("Invalid PTE passed down (not present)\n");
2048 make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
2049 goto rx_unmap_and_continue;
2052 buf_pfn = *ptep >> PAGE_SHIFT;
2053 buf_page = frame_table + buf_pfn;
2055 if ( ((buf_page->flags & (PG_type_mask | PG_domain_mask)) !=
2056 (PGT_writeable_page | current->domain)) ||
2057 (buf_page->tot_count != 1) )
2059 DPRINTK("Need a mapped-once writeable page (%ld/%ld/%08lx)\n",
2060 buf_page->type_count, buf_page->tot_count, buf_page->flags);
2061 make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
2062 goto rx_unmap_and_continue;
2065 /*
2066 * The pte they passed was good, so take it away from them. We
2067 * also lock down the page-table page, so it doesn't go away.
2068 */
2069 get_page_type(pte_page);
2070 get_page_tot(pte_page);
2071 *ptep &= ~_PAGE_PRESENT;
2072 buf_page->flags = buf_page->type_count = buf_page->tot_count = 0;
2073 list_del(&buf_page->list);
2075 vif->rx_shadow_ring[j].id = rx.id;
2076 vif->rx_shadow_ring[j].pte_ptr = rx.addr;
2077 vif->rx_shadow_ring[j].buf_pfn = buf_pfn;
2078 vif->rx_shadow_ring[j].flush_count = (unsigned short)
2079 atomic_read(&tlb_flush_count[smp_processor_id()]);
2080 j = RX_RING_INC(j);
2082 rx_unmap_and_continue:
2083 unmap_domain_mem(ptep);
2084 spin_unlock_irq(&current->page_lock);
2087 vif->rx_req_cons = i;
2089 if ( vif->rx_prod != j )
2091 smp_mb(); /* Let other CPUs see new descriptors first. */
2092 vif->rx_prod = j;
2096 return 0;
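/*
 * NB. __make_tx_response() relies on the caller holding vif->tx_lock (see
 * get_tx_bufs() and tx_skb_release()), whereas make_rx_response() takes
 * vif->rx_lock itself.
 */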
2100 static void __make_tx_response(net_vif_t *vif,
2101 unsigned short id,
2102 unsigned char st)
2104 unsigned int pos;
2105 tx_resp_entry_t *resp;
2107 /* Place on the response ring for the relevant domain. */
2108 pos = vif->tx_resp_prod;
2109 resp = &vif->shared_rings->tx_ring[pos].resp;
2110 resp->id = id;
2111 resp->status = st;
2112 pos = TX_RING_INC(pos);
2113 vif->tx_resp_prod = vif->shared_idxs->tx_resp_prod = pos;
2114 smp_mb(); /* Update producer before checking event threshold. */
2115 if ( pos == vif->shared_idxs->tx_event )
2117 unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
2118 guest_event_notify(cpu_mask);
2123 static void make_rx_response(net_vif_t *vif,
2124 unsigned short id,
2125 unsigned short size,
2126 unsigned char st,
2127 unsigned char off)
2129 unsigned long flags;
2130 unsigned int pos;
2131 rx_resp_entry_t *resp;
2133 /* Place on the response ring for the relevant domain. */
2134 spin_lock_irqsave(&vif->rx_lock, flags);
2135 pos = vif->rx_resp_prod;
2136 resp = &vif->shared_rings->rx_ring[pos].resp;
2137 resp->id = id;
2138 resp->size = size;
2139 resp->status = st;
2140 resp->offset = off;
2141 pos = RX_RING_INC(pos);
2142 vif->rx_resp_prod = vif->shared_idxs->rx_resp_prod = pos;
2143 smp_mb(); /* Update producer before checking event threshold. */
2144 if ( pos == vif->shared_idxs->rx_event )
2146 unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
2147 guest_event_notify(cpu_mask);
2149 spin_unlock_irqrestore(&vif->rx_lock, flags);
2153 int setup_network_devices(void)
2155 int i, ret;
2156 extern char opt_ifname[];
2158 for ( i = 0; i < smp_num_cpus; i++ )
2159 skb_queue_head_init(&rx_skb_queue[i]);
2161 open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
2162 tasklet_enable(&net_tx_tasklet);
2164 if ( (the_dev = dev_get_by_name(opt_ifname)) == NULL )
2166 printk("Could not find device %s: using dummy device\n", opt_ifname);
2167 strcpy(opt_ifname, "dummy");
2168 if ( (the_dev = dev_get_by_name(opt_ifname)) == NULL )
2170 printk("Failed to find the dummy device!\n");
2171 return 0;
2175 if ( (ret = dev_open(the_dev)) != 0 )
2177 printk("Error opening device %s for use (%d)\n", opt_ifname, ret);
2178 the_dev = NULL;
2179 return 0;
2182 printk("Device %s opened and ready for use.\n", opt_ifname);
2184 return 1;