ia64/xen-unstable

view xen/net/dev.c @ 243:084eaa7c072e

bitkeeper revision 1.101 (3e5ca3886XTHk1fZRjPIuqdNdAtuZQ)

dev.c:
Fix race in net tx code.
author kaf24@labyrinth.cl.cam.ac.uk
date Wed Feb 26 11:22:48 2003 +0000 (2003-02-26)
parents 7b3edf42feae
children ce6eb0deb0c7
line source
1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
10 #include <asm/uaccess.h>
11 #include <asm/system.h>
12 #include <asm/bitops.h>
13 #include <linux/config.h>
14 #include <linux/delay.h>
15 #include <linux/lib.h>
16 #include <linux/types.h>
17 #include <linux/sched.h>
18 #include <linux/mm.h>
19 #include <linux/socket.h>
20 #include <linux/sockios.h>
21 #include <linux/errno.h>
22 #include <linux/interrupt.h>
23 #include <linux/if_ether.h>
24 #include <linux/netdevice.h>
25 #include <linux/etherdevice.h>
26 #include <linux/skbuff.h>
27 #include <linux/brlock.h>
28 #include <linux/init.h>
29 #include <linux/module.h>
31 #include <linux/event.h>
32 #include <asm/domain_page.h>
33 #include <asm/pgalloc.h>
35 #define BUG_TRAP ASSERT
36 #define notifier_call_chain(_a,_b,_c) ((void)0)
37 #define rtmsg_ifinfo(_a,_b,_c) ((void)0)
38 #define rtnl_lock() ((void)0)
39 #define rtnl_unlock() ((void)0)
41 #if 1
42 #define DPRINTK(_f, _a...) printk(_f , ## _a)
43 #else
44 #define DPRINTK(_f, _a...) ((void)0)
45 #endif
47 #define TX_RING_INC(_i) (((_i)+1) & (TX_RING_SIZE-1))
48 #define RX_RING_INC(_i) (((_i)+1) & (RX_RING_SIZE-1))
49 #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
50 #define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
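/*
 * Illustrative note (not in the original source): these macros assume
 * TX_RING_SIZE and RX_RING_SIZE are powers of two, so masking with
 * (SIZE-1) gives cheap modular wraparound. E.g. with a hypothetical
 * TX_RING_SIZE of 256: TX_RING_INC(255) == 0 and TX_RING_ADD(250,10) == 4.
 * A non-power-of-two ring size would silently break this arithmetic.
 */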
52 struct net_device *the_dev = NULL;
54 /*
55 * Transmitted packets are fragmented, so we can copy the important headers
56 * before checking them for validity. Avoids need for page protection.
57 */
58 /* Ethernet + IP headers */
59 #define PKT_PROT_LEN (ETH_HLEN + 20)
60 static kmem_cache_t *net_header_cachep;
62 /**
63 * __dev_get_by_name - find a device by its name
64 * @name: name to find
65 *
66 * Find an interface by name. Must be called under RTNL semaphore
67 * or @dev_base_lock. If the name is found a pointer to the device
68 * is returned. If the name is not found then %NULL is returned. The
69 * reference counters are not incremented so the caller must be
70 * careful with locks.
71 */
74 struct net_device *__dev_get_by_name(const char *name)
75 {
76 struct net_device *dev;
78 for (dev = dev_base; dev != NULL; dev = dev->next) {
79 if (strncmp(dev->name, name, IFNAMSIZ) == 0)
80 return dev;
81 }
82 return NULL;
83 }
85 /**
86 * dev_get_by_name - find a device by its name
87 * @name: name to find
88 *
89 * Find an interface by name. This can be called from any
90 * context and does its own locking. The returned handle has
91 * the usage count incremented and the caller must use dev_put() to
92 * release it when it is no longer needed. %NULL is returned if no
93 * matching device is found.
94 */
96 struct net_device *dev_get_by_name(const char *name)
97 {
98 struct net_device *dev;
100 read_lock(&dev_base_lock);
101 dev = __dev_get_by_name(name);
102 if (dev)
103 dev_hold(dev);
104 read_unlock(&dev_base_lock);
105 return dev;
106 }
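/*
 * Illustrative sketch (hypothetical caller, not part of the original
 * file): the reference-counting contract of dev_get_by_name(). The
 * interface name and function name below are examples only.
 */
#if 0
static int example_query_mtu(void)
{
    struct net_device *dev = dev_get_by_name("eth0");
    int mtu;

    if (dev == NULL)
        return -ENODEV;
    mtu = dev->mtu;   /* safe: dev_get_by_name() took a reference */
    dev_put(dev);     /* drop that reference when finished */
    return mtu;
}
#endif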
108 /**
109 * dev_get - test if a device exists
110 * @name: name to test for
111 *
112 * Test if a name exists. Returns true if the name is found. In order
113 * to be sure the name is not allocated or removed during the test the
114 * caller must hold the rtnl semaphore.
115 *
116 * This function primarily exists for back compatibility with older
117 * drivers.
118 */
120 int dev_get(const char *name)
121 {
122 struct net_device *dev;
124 read_lock(&dev_base_lock);
125 dev = __dev_get_by_name(name);
126 read_unlock(&dev_base_lock);
127 return dev != NULL;
128 }
130 /**
131 * __dev_get_by_index - find a device by its ifindex
132 * @ifindex: index of device
133 *
134 * Search for an interface by index. Returns a pointer to the device,
135 * or %NULL if the device is not found. The device has not
136 * had its reference counter increased so the caller must be careful
137 * about locking. The caller must hold either the RTNL semaphore
138 * or @dev_base_lock.
139 */
141 struct net_device * __dev_get_by_index(int ifindex)
142 {
143 struct net_device *dev;
145 for (dev = dev_base; dev != NULL; dev = dev->next) {
146 if (dev->ifindex == ifindex)
147 return dev;
148 }
149 return NULL;
150 }
153 /**
154 * dev_get_by_index - find a device by its ifindex
155 * @ifindex: index of device
156 *
157 * Search for an interface by index. Returns a pointer to the device,
158 * or NULL if the device is not found. The device returned has
159 * had a reference added and the pointer is safe until the user calls
160 * dev_put to indicate they have finished with it.
161 */
163 struct net_device * dev_get_by_index(int ifindex)
164 {
165 struct net_device *dev;
167 read_lock(&dev_base_lock);
168 dev = __dev_get_by_index(ifindex);
169 if (dev)
170 dev_hold(dev);
171 read_unlock(&dev_base_lock);
172 return dev;
173 }
175 /**
176 * dev_getbyhwaddr - find a device by its hardware address
177 * @type: media type of device
178 * @ha: hardware address
179 *
180 * Search for an interface by MAC address. Returns a pointer to the device,
181 * or NULL if the device is not found. The caller must hold the
182 * rtnl semaphore. The returned device has not had its ref count increased
183 * and the caller must therefore be careful about locking
184 *
185 * BUGS:
186 * If the API was consistent this would be __dev_get_by_hwaddr
187 */
189 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
190 {
191 struct net_device *dev;
193 for (dev = dev_base; dev != NULL; dev = dev->next) {
194 if (dev->type == type &&
195 memcmp(dev->dev_addr, ha, dev->addr_len) == 0)
196 return dev;
197 }
198 return NULL;
199 }
201 /**
202 * dev_alloc_name - allocate a name for a device
203 * @dev: device
204 * @name: name format string
205 *
206 * Passed a format string, eg "lt%d", it will try to find a suitable
207 * id. Not efficient for many devices, not called a lot. The caller
208 * must hold the dev_base or rtnl lock while allocating the name and
209 * adding the device in order to avoid duplicates. Returns the number
210 * of the unit assigned or a negative errno code.
211 */
213 int dev_alloc_name(struct net_device *dev, const char *name)
214 {
215 int i;
216 char buf[32];
217 char *p;
219 /*
220 * Verify the string as this thing may have come from
221 * the user. There must be either one "%d" and no other "%"
222 * characters, or no "%" characters at all.
223 */
224 p = strchr(name, '%');
225 if (p && (p[1] != 'd' || strchr(p+2, '%')))
226 return -EINVAL;
228 /*
229 * If you need over 100 please also fix the algorithm...
230 */
231 for (i = 0; i < 100; i++) {
232 snprintf(buf,sizeof(buf),name,i);
233 if (__dev_get_by_name(buf) == NULL) {
234 strcpy(dev->name, buf);
235 return i;
236 }
237 }
238 return -ENFILE; /* Over 100 of the things .. bail out! */
239 }
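/*
 * Illustrative example (hypothetical names, not from the original file):
 * with "dummy0" and "dummy1" already registered, dev_alloc_name(dev,
 * "dummy%d") sets dev->name to "dummy2" and returns 2, whereas a
 * malformed format such as "dummy%s" returns -EINVAL without touching
 * dev->name.
 */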
241 /**
242 * dev_alloc - allocate a network device and name
243 * @name: name format string
244 * @err: error return pointer
245 *
246 * Passed a format string, eg. "lt%d", it will allocate a network device
247 * and space for the name. %NULL is returned if no memory is available.
248 * If the allocation succeeds then the name is assigned and the
249 * device pointer returned. %NULL is returned if the name allocation
250 * failed. The cause of an error is returned as a negative errno code
251 * in the variable @err points to.
252 *
253 * The caller must hold the @dev_base or RTNL locks when doing this in
254 * order to avoid duplicate name allocations.
255 */
257 struct net_device *dev_alloc(const char *name, int *err)
258 {
259 struct net_device *dev=kmalloc(sizeof(struct net_device), GFP_KERNEL);
260 if (dev == NULL) {
261 *err = -ENOBUFS;
262 return NULL;
263 }
264 memset(dev, 0, sizeof(struct net_device));
265 *err = dev_alloc_name(dev, name);
266 if (*err < 0) {
267 kfree(dev);
268 return NULL;
269 }
270 return dev;
271 }
273 /**
274 * netdev_state_change - device changes state
275 * @dev: device to cause notification
276 *
277 * Called to indicate a device has changed state. This function calls
278 * the notifier chains for netdev_chain and sends a NEWLINK message
279 * to the routing socket.
280 */
282 void netdev_state_change(struct net_device *dev)
283 {
284 if (dev->flags&IFF_UP) {
285 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
286 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
287 }
288 }
291 #ifdef CONFIG_KMOD
293 /**
294 * dev_load - load a network module
295 * @name: name of interface
296 *
297 * If a network interface is not present and the process has suitable
298 * privileges this function loads the module. If module loading is not
299 * available in this kernel then it becomes a nop.
300 */
302 void dev_load(const char *name)
303 {
304 if (!dev_get(name) && capable(CAP_SYS_MODULE))
305 request_module(name);
306 }
308 #else
310 extern inline void dev_load(const char *unused){;}
312 #endif
314 static int default_rebuild_header(struct sk_buff *skb)
315 {
316 printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
317 skb->dev ? skb->dev->name : "NULL!!!");
318 kfree_skb(skb);
319 return 1;
320 }
322 /**
323 * dev_open - prepare an interface for use.
324 * @dev: device to open
325 *
326 * Takes a device from down to up state. The device's private open
327 * function is invoked and then the multicast lists are loaded. Finally
328 * the device is moved into the up state and a %NETDEV_UP message is
329 * sent to the netdev notifier chain.
330 *
331 * Calling this function on an active interface is a nop. On a failure
332 * a negative errno code is returned.
333 */
335 int dev_open(struct net_device *dev)
336 {
337 int ret = 0;
339 /*
340 * Is it already up?
341 */
343 if (dev->flags&IFF_UP)
344 return 0;
346 /*
347 * Is it even present?
348 */
349 if (!netif_device_present(dev))
350 return -ENODEV;
352 /*
353 * Call device private open method
354 */
355 if (try_inc_mod_count(dev->owner)) {
356 if (dev->open) {
357 ret = dev->open(dev);
358 if (ret != 0 && dev->owner)
359 __MOD_DEC_USE_COUNT(dev->owner);
360 }
361 } else {
362 ret = -ENODEV;
363 }
365 /*
366 * If it went open OK then:
367 */
369 if (ret == 0)
370 {
371 /*
372 * Set the flags.
373 */
374 dev->flags |= IFF_UP;
376 set_bit(__LINK_STATE_START, &dev->state);
378 /*
379 * Initialize multicasting status
380 */
381 dev_mc_upload(dev);
383 /*
384 * Wakeup transmit queue engine
385 */
386 dev_activate(dev);
388 /*
389 * ... and announce new interface.
390 */
391 notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
392 }
393 return(ret);
394 }
397 /**
398 * dev_close - shutdown an interface.
399 * @dev: device to shutdown
400 *
401 * This function moves an active device into down state. A
402 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
403 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
404 * chain.
405 */
407 int dev_close(struct net_device *dev)
408 {
409 if (!(dev->flags&IFF_UP))
410 return 0;
412 /*
413 * Tell people we are going down, so that they can
414 * prepare for its death while the device is still operating.
415 */
416 notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
418 dev_deactivate(dev);
420 clear_bit(__LINK_STATE_START, &dev->state);
422 /*
423 * Call the device specific close. This cannot fail.
424 * Only if device is UP
425 *
426 * We allow it to be called even after a DETACH hot-plug
427 * event.
428 */
430 if (dev->stop)
431 dev->stop(dev);
433 /*
434 * Device is now down.
435 */
437 dev->flags &= ~IFF_UP;
439 /*
440 * Tell people we are down
441 */
442 notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
444 /*
445 * Drop the module refcount
446 */
447 if (dev->owner)
448 __MOD_DEC_USE_COUNT(dev->owner);
450 return(0);
451 }
454 #ifdef CONFIG_HIGHMEM
455 /* Actually, we should eliminate this check as soon as we know that:
456 * 1. an IOMMU is present and can map all of the memory.
457 * 2. No high memory really exists on this machine.
458 */
460 static inline int
461 illegal_highdma(struct net_device *dev, struct sk_buff *skb)
462 {
463 int i;
465 if (dev->features&NETIF_F_HIGHDMA)
466 return 0;
468 for (i=0; i<skb_shinfo(skb)->nr_frags; i++)
469 if (skb_shinfo(skb)->frags[i].page >= highmem_start_page)
470 return 1;
472 return 0;
473 }
474 #else
475 #define illegal_highdma(dev, skb) (0)
476 #endif
479 /*=======================================================================
480 Receiver routines
481 =======================================================================*/
483 struct netif_rx_stats netdev_rx_stat[NR_CPUS];
485 void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
486 {
487 net_shadow_ring_t *shadow_ring;
488 rx_shadow_entry_t *rx;
489 unsigned long *g_pte;
490 struct pfn_info *g_pfn, *h_pfn;
491 unsigned int i;
493 memset(skb->mac.ethernet->h_dest, 0, ETH_ALEN);
494 if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
495 memset(skb->nh.raw + 18, 0, ETH_ALEN);
496 shadow_ring = vif->shadow_ring;
498 if ( (i = shadow_ring->rx_idx) == shadow_ring->rx_prod )
499 return;
501 if ( shadow_ring->rx_ring[i].status != RING_STATUS_OK )
502 {
503 DPRINTK("Bad buffer in deliver_packet()\n");
504 goto inc_and_out;
505 }
507 rx = shadow_ring->rx_ring + i;
508 if ( (skb->len + ETH_HLEN) < rx->size )
509 rx->size = skb->len + ETH_HLEN;
511 g_pte = map_domain_mem(rx->addr);
513 g_pfn = frame_table + (*g_pte >> PAGE_SHIFT);
514 h_pfn = skb->pf;
516 h_pfn->tot_count = h_pfn->type_count = 1;
517 g_pfn->tot_count = g_pfn->type_count = 0;
518 h_pfn->flags = g_pfn->flags & (~PG_type_mask);
520 if (*g_pte & _PAGE_RW) h_pfn->flags |= PGT_writeable_page;
521 g_pfn->flags = 0;
523 /* Point the guest at the new machine frame. */
524 machine_to_phys_mapping[h_pfn - frame_table]
525 = machine_to_phys_mapping[g_pfn - frame_table];
526 *g_pte = (*g_pte & ~PAGE_MASK)
527 | (((h_pfn - frame_table) << PAGE_SHIFT) & PAGE_MASK);
528 *g_pte |= _PAGE_PRESENT;
530 unmap_domain_mem(g_pte);
532 /* Our skbuff now points at the guest's old frame. */
533 skb->pf = g_pfn;
535 inc_and_out:
536 shadow_ring->rx_idx = RX_RING_INC(i);
537 }
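/*
 * Summary of the flip above (descriptive note, not in the original
 * source): rather than copying the payload, the machine frame holding
 * the received packet (h_pfn == skb->pf) is handed to the guest by
 * rewriting the guest PTE at rx->addr and updating the machine-to-
 * physical mapping, while the guest's old buffer frame (g_pfn) is taken
 * over by the skbuff for later reuse. No packet data is copied.
 */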
539 /**
540 * netif_rx - post buffer to the network code
541 * @skb: buffer to post
542 *
543 * This function receives a packet from a device driver and queues it for
544 * the upper (protocol) levels to process. It always succeeds. The buffer
545 * may be dropped during processing for congestion control or by the
546 * protocol layers.
547 *
548 * return values:
549 * NET_RX_SUCCESS (no congestion)
550 * NET_RX_DROP (packet was dropped)
551 */
553 int netif_rx(struct sk_buff *skb)
554 {
555 #ifdef CONFIG_SMP
556 unsigned long cpu_mask;
557 #endif
558 int this_cpu = smp_processor_id();
559 unsigned long flags;
560 net_vif_t *vif;
562 local_irq_save(flags);
564 ASSERT(skb->skb_type == SKB_ZERO_COPY);
565 ASSERT((skb->data - skb->head) == (18 + ETH_HLEN));
567 skb->head = (u8 *)map_domain_mem(((skb->pf - frame_table) << PAGE_SHIFT));
568 skb->data = skb->head;
569 skb_reserve(skb,18); /* 18 is the 16 from dev_alloc_skb plus 2 for
570 IP header alignment. */
571 skb->mac.raw = skb->data;
572 skb->data += ETH_HLEN;
573 skb->nh.raw = skb->data;
575 netdev_rx_stat[this_cpu].total++;
577 if ( skb->src_vif == VIF_UNKNOWN_INTERFACE )
578 skb->src_vif = VIF_PHYSICAL_INTERFACE;
580 if ( skb->dst_vif == VIF_UNKNOWN_INTERFACE )
581 skb->dst_vif = __net_get_target_vif(skb->mac.raw,
582 skb->len, skb->src_vif);
584 if ( ((vif = sys_vif_list[skb->dst_vif]) == NULL) ||
585 (skb->dst_vif <= VIF_PHYSICAL_INTERFACE) )
586 {
587 netdev_rx_stat[this_cpu].dropped++;
588 unmap_domain_mem(skb->head);
589 kfree_skb(skb);
590 local_irq_restore(flags);
591 return NET_RX_DROP;
592 }
594 deliver_packet(skb, vif);
595 cpu_mask = mark_hyp_event(vif->domain, _HYP_EVENT_NET_RX);
596 unmap_domain_mem(skb->head);
597 kfree_skb(skb);
598 hyp_event_notify(cpu_mask);
599 local_irq_restore(flags);
600 return NET_RX_SUCCESS;
601 }
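/*
 * Descriptive note (not in the original source): zero-copy skbs arrive
 * here with only a machine frame (skb->pf) attached, so the frame is
 * temporarily mapped with map_domain_mem() for header inspection and
 * unmapped again before kfree_skb(), on both the drop path and the
 * delivery path.
 */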
604 /*************************************************************
605 * NEW TRANSMIT SCHEDULER
606 *
607 * NB. We ought also to only send a limited number of bytes to the NIC
608 * for transmission at any one time (to avoid head-of-line blocking).
609 * However, driver rings are small enough that they provide a reasonable
610 * limit.
611 *
612 * eg. 3c905 has 16 descriptors == 8 packets, at 100Mbps
613 * e1000 has 256 descriptors == 128 packets, at 1000Mbps
614 * tg3 has 512 descriptors == 256 packets, at 1000Mbps
615 *
616 * So, worst case is tg3 with 256 1500-bytes packets == 375kB.
617 * This would take 3ms, and represents our worst-case HoL blocking cost.
618 *
619 * We think this is reasonable.
620 */
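/*
 * Worked example of the figure above: 256 packets * 1500 bytes =
 * 384000 bytes (~375kB); at 1000Mbps (125000000 bytes/s) that drains
 * in roughly 384000 / 125000000 s ~= 3.07ms, the quoted worst-case
 * head-of-line blocking cost.
 */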
622 struct list_head net_schedule_list;
623 spinlock_t net_schedule_list_lock;
625 static int __on_net_schedule_list(net_vif_t *vif)
626 {
627 return vif->list.next != NULL;
628 }
630 static void remove_from_net_schedule_list(net_vif_t *vif)
631 {
632 unsigned long flags;
633 if ( !__on_net_schedule_list(vif) ) return;
634 spin_lock_irqsave(&net_schedule_list_lock, flags);
635 if ( __on_net_schedule_list(vif) )
636 {
637 list_del(&vif->list);
638 vif->list.next = NULL;
639 }
640 spin_unlock_irqrestore(&net_schedule_list_lock, flags);
641 }
643 static void add_to_net_schedule_list_tail(net_vif_t *vif)
644 {
645 unsigned long flags;
646 if ( __on_net_schedule_list(vif) ) return;
647 spin_lock_irqsave(&net_schedule_list_lock, flags);
648 if ( !__on_net_schedule_list(vif) )
649 {
650 list_add_tail(&vif->list, &net_schedule_list);
651 }
652 spin_unlock_irqrestore(&net_schedule_list_lock, flags);
653 }
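/*
 * Descriptive note (not in the original source): both helpers above use
 * a double-checked pattern: __on_net_schedule_list() is tested once
 * without the lock as a cheap fast path and re-tested under
 * net_schedule_list_lock before the list is modified, so concurrent
 * add/remove callers cannot double-insert or double-remove a vif. An
 * unlinked vif is marked by vif->list.next == NULL.
 */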
656 /* Destructor function for tx skbs. */
657 static void tx_skb_release(struct sk_buff *skb)
658 {
659 int i;
660 net_vif_t *vif = sys_vif_list[skb->src_vif];
661 unsigned int idx;
662 tx_shadow_entry_t *tx;
664 for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
665 put_page_tot(skb_shinfo(skb)->frags[i].page);
667 if ( skb->skb_type == SKB_NODATA )
668 kmem_cache_free(net_header_cachep, skb->head);
670 skb_shinfo(skb)->nr_frags = 0;
672 /* This would mean that the guest OS has fiddled with our index. */
673 if ( vif->shadow_ring->tx_cons != vif->net_ring->tx_cons )
674 DPRINTK("Shadow and shared rings out of sync (%d/%d)\n",
675 vif->shadow_ring->tx_cons, vif->net_ring->tx_cons);
677 /*
678 * XXX This assumes that, per vif, SKBs are processed in-order!
679 * Also assumes no concurrency. This is safe because each vif
680 * maps to one NIC. This is executed in NIC interrupt code, so we have
681 * mutual exclusion from do_IRQ().
682 */
684 /* Skip over a sequence of bad descriptors, plus the first good one. */
685 do {
686 idx = vif->shadow_ring->tx_cons;
687 /* There must be at least one good descriptor outstanding. */
688 if ( idx == vif->shadow_ring->tx_idx ) BUG();
689 tx = &vif->shadow_ring->tx_ring[idx];
690 vif->shadow_ring->tx_cons = TX_RING_INC(idx);
691 if ( vif->shadow_ring->tx_cons == vif->net_ring->tx_event )
692 set_bit(_EVENT_NET_TX,
693 &sys_vif_list[skb->src_vif]->domain->shared_info->events);
694 } while ( tx->status != RING_STATUS_OK );
696 /* Now skip over any more bad descriptors, up to the next good one. */
697 do {
698 idx = vif->shadow_ring->tx_cons;
699 tx = &vif->shadow_ring->tx_ring[idx];
700 /* Carry on until we find a good descriptor, or reach scheduler idx. */
701 if ( (idx == vif->shadow_ring->tx_idx) ||
702 (tx->status == RING_STATUS_OK) )
703 break;
704 vif->shadow_ring->tx_cons = TX_RING_INC(idx);
705 if ( vif->shadow_ring->tx_cons == vif->net_ring->tx_event )
706 set_bit(_EVENT_NET_TX,
707 &sys_vif_list[skb->src_vif]->domain->shared_info->events);
708 } while ( 1 );
710 /* Finally, update shared consumer index to the new private value. */
711 vif->net_ring->tx_cons = vif->shadow_ring->tx_cons;
712 }
715 static void net_tx_action(unsigned long unused)
716 {
717 struct net_device *dev = the_dev;
718 struct list_head *ent;
719 struct sk_buff *skb;
720 net_vif_t *vif;
721 tx_shadow_entry_t *tx;
723 spin_lock(&dev->xmit_lock);
724 while ( !netif_queue_stopped(dev) &&
725 !list_empty(&net_schedule_list) )
726 {
727 /* Get a vif from the list with work to do. */
728 ent = net_schedule_list.next;
729 vif = list_entry(ent, net_vif_t, list);
730 remove_from_net_schedule_list(vif);
731 if ( vif->shadow_ring->tx_idx == vif->shadow_ring->tx_prod )
732 continue;
734 /* Pick an entry from the transmit queue. */
735 tx = &vif->shadow_ring->tx_ring[vif->shadow_ring->tx_idx];
736 vif->shadow_ring->tx_idx = TX_RING_INC(vif->shadow_ring->tx_idx);
737 if ( vif->shadow_ring->tx_idx != vif->shadow_ring->tx_prod )
738 add_to_net_schedule_list_tail(vif);
740 /* Check the chosen entry is good. */
741 if ( tx->status != RING_STATUS_OK ) continue;
743 if ( (skb = alloc_skb_nodata(GFP_ATOMIC)) == NULL )
744 {
745 add_to_net_schedule_list_tail(vif);
746 printk("Out of memory in net_tx_action()!\n");
747 break;
748 }
750 skb->destructor = tx_skb_release;
752 skb->head = skb->data = tx->header;
753 skb->end = skb->tail = skb->head + PKT_PROT_LEN;
755 skb->dev = the_dev;
756 skb->src_vif = vif->id;
757 skb->dst_vif = VIF_PHYSICAL_INTERFACE;
758 skb->mac.raw = skb->data;
760 skb_shinfo(skb)->frags[0].page = frame_table +
761 (tx->payload >> PAGE_SHIFT);
762 skb_shinfo(skb)->frags[0].size = tx->size - PKT_PROT_LEN;
763 skb_shinfo(skb)->frags[0].page_offset = tx->payload & ~PAGE_MASK;
764 skb_shinfo(skb)->nr_frags = 1;
766 skb->data_len = tx->size - PKT_PROT_LEN;
767 skb->len = tx->size;
769 /* Transmit should always work, or the queue would be stopped. */
770 if ( dev->hard_start_xmit(skb, dev) != 0 )
771 {
772 add_to_net_schedule_list_tail(vif);
773 printk("Weird failure in hard_start_xmit!\n");
774 break;
775 }
776 }
777 spin_unlock(&dev->xmit_lock);
778 }
780 DECLARE_TASKLET_DISABLED(net_tx_tasklet, net_tx_action, 0);
782 static inline void maybe_schedule_tx_action(void)
783 {
784 smp_mb();
785 if ( !netif_queue_stopped(the_dev) &&
786 !list_empty(&net_schedule_list) )
787 tasklet_schedule(&net_tx_tasklet);
788 }
791 /*
792 * update_shared_ring(void)
793 *
794 * This replaces flush_rx_queue as the guest event handler to move packets
795 * queued in the guest ring up to the guest. Really, the packet is already
796 * there (it was page-flipped in deliver_packet); this just moves the ring
797 * descriptor across from the shadow ring and increments the pointers.
798 */
800 void update_shared_ring(void)
801 {
802 rx_shadow_entry_t *rx;
803 shared_info_t *s = current->shared_info;
804 net_ring_t *net_ring;
805 net_shadow_ring_t *shadow_ring;
806 unsigned int nvif;
808 clear_bit(_HYP_EVENT_NET_RX, &current->hyp_events);
810 for ( nvif = 0; nvif < current->num_net_vifs; nvif++ )
811 {
812 net_ring = current->net_vif_list[nvif]->net_ring;
813 shadow_ring = current->net_vif_list[nvif]->shadow_ring;
815 /* This would mean that the guest OS has fiddled with our index. */
816 if ( shadow_ring->rx_cons != net_ring->rx_cons )
817 DPRINTK("Shadow and shared rings out of sync (%d/%d)\n",
818 shadow_ring->rx_cons, net_ring->rx_cons);
820 while ( shadow_ring->rx_cons != shadow_ring->rx_idx )
821 {
822 rx = shadow_ring->rx_ring + shadow_ring->rx_cons;
823 copy_to_user(net_ring->rx_ring + shadow_ring->rx_cons, rx,
824 sizeof(rx_entry_t));
826 if ( rx->flush_count == tlb_flush_count[smp_processor_id()] )
827 __flush_tlb();
829 shadow_ring->rx_cons = RX_RING_INC(shadow_ring->rx_cons);
831 if ( shadow_ring->rx_cons == net_ring->rx_event )
832 set_bit(_EVENT_NET_RX, &s->events);
833 }
834 net_ring->rx_cons = shadow_ring->rx_cons;
835 }
836 }
839 /*
840 * We need this ioctl for efficient implementation of the
841 * if_indextoname() function required by the IPv6 API. Without
842 * it, we would have to search all the interfaces to find a
843 * match. --pb
844 */
846 static int dev_ifname(struct ifreq *arg)
847 {
848 struct net_device *dev;
849 struct ifreq ifr;
851 /*
852 * Fetch the caller's info block.
853 */
855 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
856 return -EFAULT;
858 read_lock(&dev_base_lock);
859 dev = __dev_get_by_index(ifr.ifr_ifindex);
860 if (!dev) {
861 read_unlock(&dev_base_lock);
862 return -ENODEV;
863 }
865 strcpy(ifr.ifr_name, dev->name);
866 read_unlock(&dev_base_lock);
868 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
869 return -EFAULT;
870 return 0;
871 }
874 /**
875 * netdev_set_master - set up master/slave pair
876 * @slave: slave device
877 * @master: new master device
878 *
879 * Changes the master device of the slave. Pass %NULL to break the
880 * bonding. The caller must hold the RTNL semaphore. On a failure
881 * a negative errno code is returned. On success the reference counts
882 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
883 * function returns zero.
884 */
886 int netdev_set_master(struct net_device *slave, struct net_device *master)
887 {
888 struct net_device *old = slave->master;
890 if (master) {
891 if (old)
892 return -EBUSY;
893 dev_hold(master);
894 }
896 br_write_lock_bh(BR_NETPROTO_LOCK);
897 slave->master = master;
898 br_write_unlock_bh(BR_NETPROTO_LOCK);
900 if (old)
901 dev_put(old);
903 if (master)
904 slave->flags |= IFF_SLAVE;
905 else
906 slave->flags &= ~IFF_SLAVE;
908 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
909 return 0;
910 }
912 /**
913 * dev_set_promiscuity - update promiscuity count on a device
914 * @dev: device
915 * @inc: modifier
916 *
917 * Add or remove promiscuity from a device. While the count in the device
918 * remains above zero the interface remains promiscuous. Once it hits zero
919 * the device reverts back to normal filtering operation. A negative inc
920 * value is used to drop promiscuity on the device.
921 */
923 void dev_set_promiscuity(struct net_device *dev, int inc)
924 {
925 unsigned short old_flags = dev->flags;
927 dev->flags |= IFF_PROMISC;
928 if ((dev->promiscuity += inc) == 0)
929 dev->flags &= ~IFF_PROMISC;
930 if (dev->flags^old_flags) {
931 #ifdef CONFIG_NET_FASTROUTE
932 if (dev->flags&IFF_PROMISC) {
933 netdev_fastroute_obstacles++;
934 dev_clear_fastroute(dev);
935 } else
936 netdev_fastroute_obstacles--;
937 #endif
938 dev_mc_upload(dev);
939 printk(KERN_INFO "device %s %s promiscuous mode\n",
940 dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left");
941 }
942 }
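/*
 * Illustrative sketch (hypothetical caller, not part of the original
 * file): promiscuity is a counter, so callers pair a +1 with a later -1
 * rather than toggling IFF_PROMISC directly.
 */
#if 0
    dev_set_promiscuity(dev, 1);    /* e.g. when a packet tap is attached */
    /* ... */
    dev_set_promiscuity(dev, -1);   /* when the tap is detached again */
#endif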
944 /**
945 * dev_set_allmulti - update allmulti count on a device
946 * @dev: device
947 * @inc: modifier
948 *
949 * Add or remove reception of all multicast frames to a device. While the
950 * count in the device remains above zero the interface remains listening
951 * to all multicast frames. Once it hits zero the device reverts back to normal
952 * filtering operation. A negative @inc value is used to drop the counter
953 * when releasing a resource needing all multicasts.
954 */
956 void dev_set_allmulti(struct net_device *dev, int inc)
957 {
958 unsigned short old_flags = dev->flags;
960 dev->flags |= IFF_ALLMULTI;
961 if ((dev->allmulti += inc) == 0)
962 dev->flags &= ~IFF_ALLMULTI;
963 if (dev->flags^old_flags)
964 dev_mc_upload(dev);
965 }
967 int dev_change_flags(struct net_device *dev, unsigned flags)
968 {
969 int ret;
970 int old_flags = dev->flags;
972 /*
973 * Set the flags on our device.
974 */
976 dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP|IFF_DYNAMIC|
977 IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) |
978 (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI));
980 /*
981 * Load in the correct multicast list now the flags have changed.
982 */
984 dev_mc_upload(dev);
986 /*
987 * Have we downed the interface? We handle IFF_UP ourselves
988 * according to user attempts to set it, rather than blindly
989 * setting it.
990 */
992 ret = 0;
993 if ((old_flags^flags)&IFF_UP) /* Bit is different ? */
994 {
995 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
997 if (ret == 0)
998 dev_mc_upload(dev);
999 }
1001 if (dev->flags&IFF_UP &&
1002 ((old_flags^dev->flags)&
1003 ~(IFF_UP|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE)))
1004 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
1006 if ((flags^dev->gflags)&IFF_PROMISC) {
1007 int inc = (flags&IFF_PROMISC) ? +1 : -1;
1008 dev->gflags ^= IFF_PROMISC;
1009 dev_set_promiscuity(dev, inc);
1012 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
1013 is important. Some (broken) drivers set IFF_PROMISC when
1014 IFF_ALLMULTI is requested, without asking us and without reporting it.
1015 */
1016 if ((flags^dev->gflags)&IFF_ALLMULTI) {
1017 int inc = (flags&IFF_ALLMULTI) ? +1 : -1;
1018 dev->gflags ^= IFF_ALLMULTI;
1019 dev_set_allmulti(dev, inc);
1022 if (old_flags^dev->flags)
1023 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags^dev->flags);
1025 return ret;
1028 /*
1029 * Perform the SIOCxIFxxx calls.
1030 */
1032 static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
1034 struct net_device *dev;
1035 int err;
1037 if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL)
1038 return -ENODEV;
1040 switch(cmd)
1042 case SIOCGIFFLAGS: /* Get interface flags */
1043 ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI|IFF_RUNNING))
1044 |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI));
1045 if (netif_running(dev) && netif_carrier_ok(dev))
1046 ifr->ifr_flags |= IFF_RUNNING;
1047 return 0;
1049 case SIOCSIFFLAGS: /* Set interface flags */
1050 return dev_change_flags(dev, ifr->ifr_flags);
1052 case SIOCGIFMETRIC: /* Get the metric on the interface */
1053 ifr->ifr_metric = 0;
1054 return 0;
1056 case SIOCSIFMETRIC: /* Set the metric on the interface */
1057 return -EOPNOTSUPP;
1059 case SIOCGIFMTU: /* Get the MTU of a device */
1060 ifr->ifr_mtu = dev->mtu;
1061 return 0;
1063 case SIOCSIFMTU: /* Set the MTU of a device */
1064 if (ifr->ifr_mtu == dev->mtu)
1065 return 0;
1067 /*
1068 * MTU must be positive.
1069 */
1071 if (ifr->ifr_mtu<0)
1072 return -EINVAL;
1074 if (!netif_device_present(dev))
1075 return -ENODEV;
1077 if (dev->change_mtu)
1078 err = dev->change_mtu(dev, ifr->ifr_mtu);
1079 else {
1080 dev->mtu = ifr->ifr_mtu;
1081 err = 0;
1083 if (!err && dev->flags&IFF_UP)
1084 notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev);
1085 return err;
1087 case SIOCGIFHWADDR:
1088 memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN);
1089 ifr->ifr_hwaddr.sa_family=dev->type;
1090 return 0;
1092 case SIOCSIFHWADDR:
1093 if (dev->set_mac_address == NULL)
1094 return -EOPNOTSUPP;
1095 if (ifr->ifr_hwaddr.sa_family!=dev->type)
1096 return -EINVAL;
1097 if (!netif_device_present(dev))
1098 return -ENODEV;
1099 err = dev->set_mac_address(dev, &ifr->ifr_hwaddr);
1100 if (!err)
1101 notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
1102 return err;
1104 case SIOCSIFHWBROADCAST:
1105 if (ifr->ifr_hwaddr.sa_family!=dev->type)
1106 return -EINVAL;
1107 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN);
1108 notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
1109 return 0;
1111 case SIOCGIFMAP:
1112 ifr->ifr_map.mem_start=dev->mem_start;
1113 ifr->ifr_map.mem_end=dev->mem_end;
1114 ifr->ifr_map.base_addr=dev->base_addr;
1115 ifr->ifr_map.irq=dev->irq;
1116 ifr->ifr_map.dma=dev->dma;
1117 ifr->ifr_map.port=dev->if_port;
1118 return 0;
1120 case SIOCSIFMAP:
1121 if (dev->set_config) {
1122 if (!netif_device_present(dev))
1123 return -ENODEV;
1124 return dev->set_config(dev,&ifr->ifr_map);
1126 return -EOPNOTSUPP;
1128 case SIOCADDMULTI:
1129 if (dev->set_multicast_list == NULL ||
1130 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
1131 return -EINVAL;
1132 if (!netif_device_present(dev))
1133 return -ENODEV;
1134 dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1);
1135 return 0;
1137 case SIOCDELMULTI:
1138 if (dev->set_multicast_list == NULL ||
1139 ifr->ifr_hwaddr.sa_family!=AF_UNSPEC)
1140 return -EINVAL;
1141 if (!netif_device_present(dev))
1142 return -ENODEV;
1143 dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1);
1144 return 0;
1146 case SIOCGIFINDEX:
1147 ifr->ifr_ifindex = dev->ifindex;
1148 return 0;
1150 case SIOCSIFNAME:
1151 if (dev->flags&IFF_UP)
1152 return -EBUSY;
1153 if (__dev_get_by_name(ifr->ifr_newname))
1154 return -EEXIST;
1155 memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ);
1156 dev->name[IFNAMSIZ-1] = 0;
1157 notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
1158 return 0;
1160 #ifdef WIRELESS_EXT
1161 case SIOCGIWSTATS:
1162 return dev_iwstats(dev, ifr);
1163 #endif /* WIRELESS_EXT */
1165 /*
1166 * Unknown or private ioctl
1167 */
1169 default:
1170 if ((cmd >= SIOCDEVPRIVATE &&
1171 cmd <= SIOCDEVPRIVATE + 15) ||
1172 cmd == SIOCBONDENSLAVE ||
1173 cmd == SIOCBONDRELEASE ||
1174 cmd == SIOCBONDSETHWADDR ||
1175 cmd == SIOCBONDSLAVEINFOQUERY ||
1176 cmd == SIOCBONDINFOQUERY ||
1177 cmd == SIOCBONDCHANGEACTIVE ||
1178 cmd == SIOCETHTOOL ||
1179 cmd == SIOCGMIIPHY ||
1180 cmd == SIOCGMIIREG ||
1181 cmd == SIOCSMIIREG) {
1182 if (dev->do_ioctl) {
1183 if (!netif_device_present(dev))
1184 return -ENODEV;
1185 return dev->do_ioctl(dev, ifr, cmd);
1187 return -EOPNOTSUPP;
1190 #ifdef WIRELESS_EXT
1191 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
1192 if (dev->do_ioctl) {
1193 if (!netif_device_present(dev))
1194 return -ENODEV;
1195 return dev->do_ioctl(dev, ifr, cmd);
1197 return -EOPNOTSUPP;
1199 #endif /* WIRELESS_EXT */
1202 return -EINVAL;
1205 /*
1206 * This function handles all "interface"-type I/O control requests. The actual
1207 * 'doing' part of this is dev_ifsioc above.
1208 */
1210 /**
1211 * dev_ioctl - network device ioctl
1212 * @cmd: command to issue
1213 * @arg: pointer to a struct ifreq in user space
1215 * Issue ioctl functions to devices. This is normally called by the
1216 * user space syscall interfaces but can sometimes be useful for
1217 * other purposes. The return value is the return from the syscall if
1218 * positive or a negative errno code on error.
1219 */
1221 int dev_ioctl(unsigned int cmd, void *arg)
1223 struct ifreq ifr;
1224 int ret;
1225 char *colon;
1227 /* One special case: SIOCGIFCONF takes ifconf argument
1228 and requires shared lock, because it sleeps writing
1229 to user space.
1230 */
1232 if (cmd == SIOCGIFCONF) {
1233 return -ENOSYS;
1235 if (cmd == SIOCGIFNAME) {
1236 return dev_ifname((struct ifreq *)arg);
1239 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
1240 return -EFAULT;
1242 ifr.ifr_name[IFNAMSIZ-1] = 0;
1244 colon = strchr(ifr.ifr_name, ':');
1245 if (colon)
1246 *colon = 0;
1248 /*
1249 * See which interface the caller is talking about.
1250 */
1252 switch(cmd)
1254 /*
1255 * These ioctl calls:
1256 * - can be done by all.
1257 * - atomic and do not require locking.
1258 * - return a value
1259 */
1261 case SIOCGIFFLAGS:
1262 case SIOCGIFMETRIC:
1263 case SIOCGIFMTU:
1264 case SIOCGIFHWADDR:
1265 case SIOCGIFSLAVE:
1266 case SIOCGIFMAP:
1267 case SIOCGIFINDEX:
1268 dev_load(ifr.ifr_name);
1269 read_lock(&dev_base_lock);
1270 ret = dev_ifsioc(&ifr, cmd);
1271 read_unlock(&dev_base_lock);
1272 if (!ret) {
1273 if (colon)
1274 *colon = ':';
1275 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1276 return -EFAULT;
1278 return ret;
1280 /*
1281 * These ioctl calls:
1282 * - require superuser power.
1283 * - require strict serialization.
1284 * - return a value
1285 */
1287 case SIOCETHTOOL:
1288 case SIOCGMIIPHY:
1289 case SIOCGMIIREG:
1290 if (!capable(CAP_NET_ADMIN))
1291 return -EPERM;
1292 dev_load(ifr.ifr_name);
1293 dev_probe_lock();
1294 rtnl_lock();
1295 ret = dev_ifsioc(&ifr, cmd);
1296 rtnl_unlock();
1297 dev_probe_unlock();
1298 if (!ret) {
1299 if (colon)
1300 *colon = ':';
1301 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1302 return -EFAULT;
1304 return ret;
1306 /*
1307 * These ioctl calls:
1308 * - require superuser power.
1309 * - require strict serialization.
1310 * - do not return a value
1311 */
1313 case SIOCSIFFLAGS:
1314 case SIOCSIFMETRIC:
1315 case SIOCSIFMTU:
1316 case SIOCSIFMAP:
1317 case SIOCSIFHWADDR:
1318 case SIOCSIFSLAVE:
1319 case SIOCADDMULTI:
1320 case SIOCDELMULTI:
1321 case SIOCSIFHWBROADCAST:
1322 case SIOCSIFNAME:
1323 case SIOCSMIIREG:
1324 case SIOCBONDENSLAVE:
1325 case SIOCBONDRELEASE:
1326 case SIOCBONDSETHWADDR:
1327 case SIOCBONDSLAVEINFOQUERY:
1328 case SIOCBONDINFOQUERY:
1329 case SIOCBONDCHANGEACTIVE:
1330 if (!capable(CAP_NET_ADMIN))
1331 return -EPERM;
1332 dev_load(ifr.ifr_name);
1333 dev_probe_lock();
1334 rtnl_lock();
1335 ret = dev_ifsioc(&ifr, cmd);
1336 rtnl_unlock();
1337 dev_probe_unlock();
1338 return ret;
1340 case SIOCGIFMEM:
1341 /* Get the per device memory space. We can add this but currently
1342 do not support it */
1343 case SIOCSIFMEM:
1344 /* Set the per device memory buffer space. */
1345 case SIOCSIFLINK:
1346 return -EINVAL;
1348 /*
1349 * Unknown or private ioctl.
1350 */
1352 default:
1353 if (cmd >= SIOCDEVPRIVATE &&
1354 cmd <= SIOCDEVPRIVATE + 15) {
1355 dev_load(ifr.ifr_name);
1356 dev_probe_lock();
1357 rtnl_lock();
1358 ret = dev_ifsioc(&ifr, cmd);
1359 rtnl_unlock();
1360 dev_probe_unlock();
1361 if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1362 return -EFAULT;
1363 return ret;
1365 #ifdef WIRELESS_EXT
1366 /* Take care of Wireless Extensions */
1367 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
1368 /* If command is `set a parameter', or
1369 * `get the encoding parameters', check if
1370 * the user has the right to do it */
1371 if (IW_IS_SET(cmd) || (cmd == SIOCGIWENCODE)) {
1372 if(!capable(CAP_NET_ADMIN))
1373 return -EPERM;
1375 dev_load(ifr.ifr_name);
1376 rtnl_lock();
1377 ret = dev_ifsioc(&ifr, cmd);
1378 rtnl_unlock();
1379 if (!ret && IW_IS_GET(cmd) &&
1380 copy_to_user(arg, &ifr,
1381 sizeof(struct ifreq)))
1382 return -EFAULT;
1383 return ret;
1385 #endif /* WIRELESS_EXT */
1386 return -EINVAL;
1391 /**
1392 * dev_new_index - allocate an ifindex
1394 * Returns a suitable unique value for a new device interface
1395 * number. The caller must hold the rtnl semaphore or the
1396 * dev_base_lock to be sure it remains unique.
1397 */
1399 int dev_new_index(void)
1401 static int ifindex;
1402 for (;;) {
1403 if (++ifindex <= 0)
1404 ifindex=1;
1405 if (__dev_get_by_index(ifindex) == NULL)
1406 return ifindex;
1410 static int dev_boot_phase = 1;
1412 /**
1413 * register_netdevice - register a network device
1414 * @dev: device to register
1416 * Take a completed network device structure and add it to the kernel
1417 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
1418 * chain. 0 is returned on success. A negative errno code is returned
1419 * on a failure to set up the device, or if the name is a duplicate.
1421 * Callers must hold the rtnl semaphore. See the comment at the
1422 * end of Space.c for details about the locking. You may want
1423 * register_netdev() instead of this.
1425 * BUGS:
1426 * The locking appears insufficient to guarantee two parallel registers
1427 * will not get the same name.
1428 */
1430 int net_dev_init(void);
1432 int register_netdevice(struct net_device *dev)
1434 struct net_device *d, **dp;
1435 #ifdef CONFIG_NET_DIVERT
1436 int ret;
1437 #endif
1439 spin_lock_init(&dev->queue_lock);
1440 spin_lock_init(&dev->xmit_lock);
1441 dev->xmit_lock_owner = -1;
1442 #ifdef CONFIG_NET_FASTROUTE
1443 dev->fastpath_lock=RW_LOCK_UNLOCKED;
1444 #endif
1446 if (dev_boot_phase)
1447 net_dev_init();
1449 #ifdef CONFIG_NET_DIVERT
1450 ret = alloc_divert_blk(dev);
1451 if (ret)
1452 return ret;
1453 #endif /* CONFIG_NET_DIVERT */
1455 dev->iflink = -1;
1457 /* Init, if this function is available */
1458 if (dev->init && dev->init(dev) != 0) {
1459 #ifdef CONFIG_NET_DIVERT
1460 free_divert_blk(dev);
1461 #endif
1462 return -EIO;
1465 dev->ifindex = dev_new_index();
1466 if (dev->iflink == -1)
1467 dev->iflink = dev->ifindex;
1469 /* Check for existence, and append to tail of chain */
1470 for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) {
1471 if (d == dev || strcmp(d->name, dev->name) == 0) {
1472 #ifdef CONFIG_NET_DIVERT
1473 free_divert_blk(dev);
1474 #endif
1475 return -EEXIST;
1478 /*
1479 * nil rebuild_header routine,
1480 * which should never be called; it is used just as a bug trap.
1481 */
1483 if (dev->rebuild_header == NULL)
1484 dev->rebuild_header = default_rebuild_header;
1486 /*
1487 * Default initial state at registry is that the
1488 * device is present.
1489 */
1491 set_bit(__LINK_STATE_PRESENT, &dev->state);
1493 dev->next = NULL;
1494 dev_init_scheduler(dev);
1495 write_lock_bh(&dev_base_lock);
1496 *dp = dev;
1497 dev_hold(dev);
1498 dev->deadbeaf = 0;
1499 write_unlock_bh(&dev_base_lock);
1501 /* Notify protocols, that a new device appeared. */
1502 notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
1504 return 0;
1507 /**
1508 * netdev_finish_unregister - complete unregistration
1509 * @dev: device
1511 * Destroy and free a dead device. A value of zero is returned on
1512 * success.
1513 */
1515 int netdev_finish_unregister(struct net_device *dev)
1517 BUG_TRAP(dev->ip_ptr==NULL);
1518 BUG_TRAP(dev->ip6_ptr==NULL);
1519 BUG_TRAP(dev->dn_ptr==NULL);
1521 if (!dev->deadbeaf) {
1522 printk(KERN_ERR "Freeing alive device %p, %s\n",
1523 dev, dev->name);
1524 return 0;
1526 #ifdef NET_REFCNT_DEBUG
1527 printk(KERN_DEBUG "netdev_finish_unregister: %s%s.\n", dev->name,
1528 (dev->features & NETIF_F_DYNALLOC)?"":", old style");
1529 #endif
1530 if (dev->destructor)
1531 dev->destructor(dev);
1532 if (dev->features & NETIF_F_DYNALLOC)
1533 kfree(dev);
1534 return 0;
1537 /**
1538 * unregister_netdevice - remove device from the kernel
1539 * @dev: device
1541 * This function shuts down a device interface and removes it
1542 * from the kernel tables. On success 0 is returned, on a failure
1543 * a negative errno code is returned.
1545 * Callers must hold the rtnl semaphore. See the comment at the
1546 * end of Space.c for details about the locking. You may want
1547 * unregister_netdev() instead of this.
1548 */
1550 int unregister_netdevice(struct net_device *dev)
1552 unsigned long now, warning_time;
1553 struct net_device *d, **dp;
1555 /* If device is running, close it first. */
1556 if (dev->flags & IFF_UP)
1557 dev_close(dev);
1559 BUG_TRAP(dev->deadbeaf==0);
1560 dev->deadbeaf = 1;
1562 /* And unlink it from device chain. */
1563 for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) {
1564 if (d == dev) {
1565 write_lock_bh(&dev_base_lock);
1566 *dp = d->next;
1567 write_unlock_bh(&dev_base_lock);
1568 break;
1571 if (d == NULL) {
1572 printk(KERN_DEBUG "unregister_netdevice: device %s/%p"
1573 " not registered\n", dev->name, dev);
1574 return -ENODEV;
1577 /* Synchronize to net_rx_action. */
1578 br_write_lock_bh(BR_NETPROTO_LOCK);
1579 br_write_unlock_bh(BR_NETPROTO_LOCK);
1581 if (dev_boot_phase == 0) {
1583 /* Shutdown queueing discipline. */
1584 dev_shutdown(dev);
1586 /* Notify protocols, that we are about to destroy
1587 this device. They should clean all the things.
1588 */
1589 notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
1591 /*
1592 * Flush the multicast chain
1593 */
1594 dev_mc_discard(dev);
1597 if (dev->uninit)
1598 dev->uninit(dev);
1600 /* Notifier chain MUST detach us from master device. */
1601 BUG_TRAP(dev->master==NULL);
1603 #ifdef CONFIG_NET_DIVERT
1604 free_divert_blk(dev);
1605 #endif
1607 if (dev->features & NETIF_F_DYNALLOC) {
1608 #ifdef NET_REFCNT_DEBUG
1609 if (atomic_read(&dev->refcnt) != 1)
1610 printk(KERN_DEBUG "unregister_netdevice: holding %s refcnt=%d\n",
1611 dev->name, atomic_read(&dev->refcnt)-1);
1612 #endif
1613 dev_put(dev);
1614 return 0;
1617 /* Last reference is our one */
1618 if (atomic_read(&dev->refcnt) == 1) {
1619 dev_put(dev);
1620 return 0;
1623 #ifdef NET_REFCNT_DEBUG
1624 printk("unregister_netdevice: waiting %s refcnt=%d\n",
1625 dev->name, atomic_read(&dev->refcnt));
1626 #endif
1628 /* EXPLANATION. If dev->refcnt is not now 1 (our own reference)
1629 it means that someone in the kernel still has a reference
1630 to this device and we cannot release it.
1632 "New style" devices have destructors, hence we can return from this
1633 function and destructor will do all the work later. As of kernel 2.4.0
1634 there are very few "New Style" devices.
1636 "Old style" devices expect that the device is free of any references
1637 upon exit from this function.
1638 We cannot return from this function until all such references have
1639 fallen away. This is because the caller of this function will probably
1640 immediately kfree(*dev) and then be unloaded via sys_delete_module.
1642 So, we linger until all references fall away. The duration of the
1643 linger is basically unbounded! It is driven by, for example, the
1644 current setting of sysctl_ipfrag_time.
1646 After 1 second, we start to rebroadcast unregister notifications
1647 in hope that careless clients will release the device.
1649 */
1651 now = warning_time = jiffies;
1652 while (atomic_read(&dev->refcnt) != 1) {
1653 if ((jiffies - now) > 1*HZ) {
1654 /* Rebroadcast unregister notification */
1655 notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
1657 mdelay(250);
1658 if ((jiffies - warning_time) > 10*HZ) {
1659 printk(KERN_EMERG "unregister_netdevice: waiting for %s to "
1660 "become free. Usage count = %d\n",
1661 dev->name, atomic_read(&dev->refcnt));
1662 warning_time = jiffies;
1665 dev_put(dev);
1666 return 0;
1670 /*
1671 * Initialize the DEV module. At boot time this walks the device list and
1672 * unhooks any devices that fail to initialise (normally hardware not
1673 * present) and leaves us with a valid list of present and active devices.
1675 */
1677 extern void net_device_init(void);
1678 extern void ip_auto_config(void);
1679 #ifdef CONFIG_NET_DIVERT
1680 extern void dv_init(void);
1681 #endif /* CONFIG_NET_DIVERT */
1684 /*
1685 * Callers must hold the rtnl semaphore. See the comment at the
1686 * end of Space.c for details about the locking.
1687 */
1688 int __init net_dev_init(void)
1690 struct net_device *dev, **dp;
1692 if ( !dev_boot_phase )
1693 return 0;
1695 skb_init();
1697 net_header_cachep = kmem_cache_create(
1698 "net_header_cache",
1699 (PKT_PROT_LEN + sizeof(void *) - 1) & ~(sizeof(void *) - 1),
1700 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1702 spin_lock_init(&net_schedule_list_lock);
1703 INIT_LIST_HEAD(&net_schedule_list);
1705 /*
1706 * Add the devices.
1707 * If the call to dev->init fails, the dev is removed
1708 * from the chain disconnecting the device until the
1709 * next reboot.
1711 * NB At boot phase networking is dead. No locking is required.
1712 * But we still preserve dev_base_lock for sanity.
1713 */
1714 dp = &dev_base;
1715 while ((dev = *dp) != NULL) {
1716 spin_lock_init(&dev->queue_lock);
1717 spin_lock_init(&dev->xmit_lock);
1719 dev->xmit_lock_owner = -1;
1720 dev->iflink = -1;
1721 dev_hold(dev);
1723 /*
1724 * Allocate name. If the init() fails
1725 * the name will be reissued correctly.
1726 */
1727 if (strchr(dev->name, '%'))
1728 dev_alloc_name(dev, dev->name);
1730 if (dev->init && dev->init(dev)) {
1731 /*
1732 * It failed to come up. It will be unhooked later.
1733 * dev_alloc_name can now advance to next suitable
1734 * name that is checked next.
1735 */
1736 dev->deadbeaf = 1;
1737 dp = &dev->next;
1738 } else {
1739 dp = &dev->next;
1740 dev->ifindex = dev_new_index();
1741 if (dev->iflink == -1)
1742 dev->iflink = dev->ifindex;
1743 if (dev->rebuild_header == NULL)
1744 dev->rebuild_header = default_rebuild_header;
1745 dev_init_scheduler(dev);
1746 set_bit(__LINK_STATE_PRESENT, &dev->state);
1750 /*
1751 * Unhook devices that failed to come up
1752 */
1753 dp = &dev_base;
1754 while ((dev = *dp) != NULL) {
1755 if (dev->deadbeaf) {
1756 write_lock_bh(&dev_base_lock);
1757 *dp = dev->next;
1758 write_unlock_bh(&dev_base_lock);
1759 dev_put(dev);
1760 } else {
1761 dp = &dev->next;
1765 dev_boot_phase = 0;
1767 dev_mcast_init();
1769 /*
1770 * Initialise network devices
1771 */
1773 net_device_init();
1775 return 0;
1778 inline int init_tx_header(u8 *data, unsigned int len, struct net_device *dev)
1780 memcpy(data + ETH_ALEN, dev->dev_addr, ETH_ALEN);
1782 switch ( ntohs(*(unsigned short *)(data + 12)) )
1784 case ETH_P_ARP:
1785 if ( len < 42 ) break;
1786 memcpy(data + 22, dev->dev_addr, 6);
1787 return ETH_P_ARP;
1788 case ETH_P_IP:
1789 return ETH_P_IP;
1791 return 0;
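/*
 * Descriptive note (not in the original source) on the offsets above:
 * data+ETH_ALEN (offset 6) is the Ethernet source MAC, data+12 is the
 * Ethernet type field, and for ARP data+22 is the sender hardware
 * address inside the ARP payload (14-byte Ethernet header plus 8 bytes
 * of ARP fields), hence the 42-byte minimum length check (14 + 28).
 */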
1795 /*
1796 * do_net_update:
1798 * Called from guest OS to notify updates to its transmit and/or receive
1799 * descriptor rings.
1800 */
1802 long do_net_update(void)
1804 net_ring_t *net_ring;
1805 net_shadow_ring_t *shadow_ring;
1806 net_vif_t *current_vif;
1807 unsigned int i, j;
1808 struct sk_buff *skb;
1809 tx_entry_t tx;
1810 rx_shadow_entry_t *rx;
1811 unsigned long pfn;
1812 struct pfn_info *page;
1813 unsigned long *g_pte;
1815 for ( j = 0; j < current->num_net_vifs; j++)
1817 int target;
1818 u8 *g_data;
1819 unsigned short protocol;
1821 current_vif = current->net_vif_list[j];
1822 net_ring = current_vif->net_ring;
1823 shadow_ring = current_vif->shadow_ring;
1825 /*
1826 * PHASE 1 -- TRANSMIT RING
1827 */
1829 /*
1830 * Collect up new transmit buffers. We collect up to the guest OS's
1831 * new producer index, but take care not to catch up with our own
1832 * consumer index.
1833 */
1834 for ( i = shadow_ring->tx_prod;
1835 (i != net_ring->tx_prod) &&
1836 (((shadow_ring->tx_cons-i) & (TX_RING_SIZE-1)) != 1);
1837 i = TX_RING_INC(i) )
1839 if ( copy_from_user(&tx, net_ring->tx_ring+i, sizeof(tx)) )
1841 DPRINTK("Bad copy_from_user for tx net descriptor\n");
1842 shadow_ring->tx_ring[i].status = RING_STATUS_ERR_CFU;
1843 continue;
1846 shadow_ring->tx_ring[i].size = tx.size;
1847 shadow_ring->tx_ring[i].status = RING_STATUS_BAD_PAGE;
1849 if ( tx.size < PKT_PROT_LEN )
1851 DPRINTK("Runt packet %d\n", tx.size);
1852 continue;
1855 if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE )
1857 DPRINTK("tx.addr: %lx, size: %u, end: %lu\n",
1858 tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size);
1859 continue;
1862 pfn = tx.addr >> PAGE_SHIFT;
1863 page = frame_table + pfn;
1864 if ( (pfn >= max_page) ||
1865 ((page->flags & PG_domain_mask) != current->domain) )
1867 DPRINTK("Bad page frame\n");
1868 continue;
1871 g_data = map_domain_mem(tx.addr);
1873 protocol = __constant_htons(
1874 init_tx_header(g_data, tx.size, the_dev));
1875 if ( protocol == 0 )
1876 goto unmap_and_continue;
1878 target = __net_get_target_vif(g_data, tx.size, current_vif->id);
1880 if ( target > VIF_PHYSICAL_INTERFACE )
1882 /* Local delivery */
1883 if ( (skb = dev_alloc_skb(tx.size)) == NULL )
1884 goto unmap_and_continue;
1886 skb->destructor = tx_skb_release;
1888 shadow_ring->tx_ring[i].status = RING_STATUS_OK;
1890 skb->src_vif = current_vif->id;
1891 skb->dst_vif = target;
1892 skb->protocol = protocol;
1894 skb->head = (u8 *)map_domain_mem(
1895 ((skb->pf - frame_table) << PAGE_SHIFT));
1896 skb->data = skb->head + 16;
1897 skb_reserve(skb,2);
1898 memcpy(skb->data, g_data, tx.size);
1899 skb->len = tx.size;
1900 unmap_domain_mem(skb->head);
1901 skb->data += ETH_HLEN;
1902 (void)netif_rx(skb);
1904 else if ( target == VIF_PHYSICAL_INTERFACE )
1906 shadow_ring->tx_ring[i].header =
1907 kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
1908 if ( shadow_ring->tx_ring[i].header == NULL )
1909 goto unmap_and_continue;
1910 memcpy(shadow_ring->tx_ring[i].header, g_data, PKT_PROT_LEN);
1911 shadow_ring->tx_ring[i].payload = tx.addr + PKT_PROT_LEN;
1912 shadow_ring->tx_ring[i].status = RING_STATUS_OK;
1913 get_page_tot(page);
1916 unmap_and_continue:
1917 unmap_domain_mem(g_data);
1920 if ( shadow_ring->tx_prod != i )
1922 smp_mb(); /* Let other CPUs see new descriptors first. */
1923 shadow_ring->tx_prod = i;
1924 add_to_net_schedule_list_tail(current_vif);
1925 maybe_schedule_tx_action();
1928 /*
1929 * PHASE 2 -- RECEIVE RING
1930 */
1932 /*
1933 * Collect up new receive buffers. We collect up to the guest OS's
1934 * new producer index, but take care not to catch up with our own
1935 * consumer index.
1936 */
1937 for ( i = shadow_ring->rx_prod;
1938 (i != net_ring->rx_prod) &&
1939 (((shadow_ring->rx_cons-i) & (RX_RING_SIZE-1)) != 1);
1940 i = RX_RING_INC(i) )
1942 /*
1943 * This copy assumes that rx_shadow_entry_t is an extension of
1944 * rx_net_entry_t; extra fields must be tacked on to the end.
1945 */
1946 if ( copy_from_user( shadow_ring->rx_ring+i, net_ring->rx_ring+i,
1947 sizeof (rx_entry_t) ) )
1949 DPRINTK("Bad copy_from_user for rx ring\n");
1950 shadow_ring->rx_ring[i].status = RING_STATUS_ERR_CFU;
1951 continue;
1954 rx = shadow_ring->rx_ring + i;
1955 pfn = rx->addr >> PAGE_SHIFT;
1956 page = frame_table + pfn;
1958 shadow_ring->rx_ring[i].status = RING_STATUS_BAD_PAGE;
1960 if ( (pfn >= max_page) ||
1961 (page->flags != (PGT_l1_page_table | current->domain)) )
1963 DPRINTK("Bad page frame containing ppte\n");
1964 continue;
1967 g_pte = map_domain_mem(rx->addr);
1969 if (!(*g_pte & _PAGE_PRESENT))
1971 DPRINTK("Inavlid PTE passed down (not present)\n");
1972 unmap_domain_mem(g_pte);
1973 continue;
1976 page = (*g_pte >> PAGE_SHIFT) + frame_table;
1978 if (page->tot_count != 1)
1980 DPRINTK("An rx page must be mapped exactly once\n");
1981 unmap_domain_mem(g_pte);
1982 continue;
1985 /* The pte they passed was good, so take it away from them. */
1986 shadow_ring->rx_ring[i].status = RING_STATUS_OK;
1987 *g_pte &= ~_PAGE_PRESENT;
1988 page->flags = (page->flags & ~PG_type_mask) | PGT_net_rx_buf;
1989 rx->flush_count = tlb_flush_count[smp_processor_id()];
1991 unmap_domain_mem(g_pte);
1994 if ( shadow_ring->rx_prod != i )
1996 smp_mb(); /* Let other CPUs see new descriptors first. */
1997 shadow_ring->rx_prod = i;
2001 return 0;
2005 int setup_network_devices(void)
2007 int ret;
2008 extern char opt_ifname[];
2009 struct net_device *dev = dev_get_by_name(opt_ifname);
2011 if ( dev == NULL )
2013 printk("Could not find device %s\n", opt_ifname);
2014 return 0;
2017 ret = dev_open(dev);
2018 if ( ret != 0 )
2020 printk("Error opening device %s for use (%d)\n", opt_ifname, ret);
2021 return 0;
2023 printk("Device %s opened and ready for use.\n", opt_ifname);
2024 the_dev = dev;
2026 tasklet_enable(&net_tx_tasklet);
2028 return 1;