direct-io.hg

view xen/net/dev.c @ 364:942eb9bcae13

bitkeeper revision 1.170 (3e9c936fXyHEI0NKOWQkP9tHN4sbqw)

Many files:
Finished virtualisation of x86 LDT. Xenolinux now exports this to applications (eg. for use by linuxthreads).
author kaf24@scramble.cl.cam.ac.uk
date Tue Apr 15 23:19:11 2003 +0000 (2003-04-15)
parents 56fce5176629
children ff1bb22c81a1 95c43a4b6685
line source
1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
10 #include <asm/uaccess.h>
11 #include <asm/system.h>
12 #include <asm/bitops.h>
13 #include <linux/config.h>
14 #include <linux/delay.h>
15 #include <linux/lib.h>
16 #include <linux/types.h>
17 #include <linux/sched.h>
18 #include <linux/mm.h>
19 #include <linux/socket.h>
20 #include <linux/sockios.h>
21 #include <linux/errno.h>
22 #include <linux/interrupt.h>
23 #include <linux/if_ether.h>
24 #include <linux/netdevice.h>
25 #include <linux/etherdevice.h>
26 #include <linux/skbuff.h>
27 #include <linux/brlock.h>
28 #include <linux/init.h>
29 #include <linux/module.h>
31 #include <linux/event.h>
32 #include <asm/domain_page.h>
33 #include <asm/pgalloc.h>
35 #define BUG_TRAP ASSERT
36 #define notifier_call_chain(_a,_b,_c) ((void)0)
37 #define rtmsg_ifinfo(_a,_b,_c) ((void)0)
38 #define rtnl_lock() ((void)0)
39 #define rtnl_unlock() ((void)0)
41 #if 0
42 #define DPRINTK(_f, _a...) printk(_f , ## _a)
43 #else
44 #define DPRINTK(_f, _a...) ((void)0)
45 #endif
47 #define TX_RING_INC(_i) (((_i)+1) & (TX_RING_SIZE-1))
48 #define RX_RING_INC(_i) (((_i)+1) & (RX_RING_SIZE-1))
49 #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
50 #define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
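/*
 * Editor's note (illustrative, not part of the original file): these macros
 * assume the ring sizes are powers of two, so masking with (SIZE-1) performs
 * the wrap-around. For example, with TX_RING_SIZE == 256, TX_RING_INC(255)
 * yields 0 and TX_RING_ADD(250, 10) yields 4.
 */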
52 struct net_device *the_dev = NULL;
54 /*
55 * Transmitted packets are fragmented, so we can copy the important headers
56 * before checking them for validity. Avoids need for page protection.
57 */
58 /* Ethernet + IP headers */
59 #define PKT_PROT_LEN (ETH_HLEN + 20)
60 static kmem_cache_t *net_header_cachep;
62 /**
63 * __dev_get_by_name - find a device by its name
64 * @name: name to find
65 *
66 * Find an interface by name. Must be called under RTNL semaphore
67 * or @dev_base_lock. If the name is found a pointer to the device
68 * is returned. If the name is not found then %NULL is returned. The
69 * reference counters are not incremented so the caller must be
70 * careful with locks.
71 */
74 struct net_device *__dev_get_by_name(const char *name)
75 {
76 struct net_device *dev;
78 for (dev = dev_base; dev != NULL; dev = dev->next) {
79 if (strncmp(dev->name, name, IFNAMSIZ) == 0)
80 return dev;
81 }
82 return NULL;
83 }
85 /**
86 * dev_get_by_name - find a device by its name
87 * @name: name to find
88 *
89 * Find an interface by name. This can be called from any
90 * context and does its own locking. The returned handle has
91 * the usage count incremented and the caller must use dev_put() to
92 * release it when it is no longer needed. %NULL is returned if no
93 * matching device is found.
94 */
96 struct net_device *dev_get_by_name(const char *name)
97 {
98 struct net_device *dev;
100 read_lock(&dev_base_lock);
101 dev = __dev_get_by_name(name);
102 if (dev)
103 dev_hold(dev);
104 read_unlock(&dev_base_lock);
105 return dev;
106 }
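/*
 * Editor's sketch (hypothetical usage, not in the original file): the
 * hold/put pairing described in the comment above.
 *
 *     struct net_device *d = dev_get_by_name("eth0");
 *     if ( d != NULL )
 *     {
 *         // ... use d safely: the held reference pins it ...
 *         dev_put(d);    // release the reference taken by dev_get_by_name()
 *     }
 */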
108 /**
109 * dev_get - test if a device exists
110 * @name: name to test for
111 *
112 * Test if a name exists. Returns true if the name is found. In order
113 * to be sure the name is not allocated or removed during the test the
114 * caller must hold the rtnl semaphore.
115 *
116 * This function primarily exists for backwards compatibility with older
117 * drivers.
118 */
120 int dev_get(const char *name)
121 {
122 struct net_device *dev;
124 read_lock(&dev_base_lock);
125 dev = __dev_get_by_name(name);
126 read_unlock(&dev_base_lock);
127 return dev != NULL;
128 }
130 /**
131 * __dev_get_by_index - find a device by its ifindex
132 * @ifindex: index of device
133 *
134 * Search for an interface by index. Returns %NULL if the device
135 * is not found or a pointer to the device. The device has not
136 * had its reference counter increased so the caller must be careful
137 * about locking. The caller must hold either the RTNL semaphore
138 * or @dev_base_lock.
139 */
141 struct net_device * __dev_get_by_index(int ifindex)
142 {
143 struct net_device *dev;
145 for (dev = dev_base; dev != NULL; dev = dev->next) {
146 if (dev->ifindex == ifindex)
147 return dev;
148 }
149 return NULL;
150 }
153 /**
154 * dev_get_by_index - find a device by its ifindex
155 * @ifindex: index of device
156 *
157 * Search for an interface by index. Returns NULL if the device
158 * is not found or a pointer to the device. The device returned has
159 * had a reference added and the pointer is safe until the user calls
160 * dev_put to indicate they have finished with it.
161 */
163 struct net_device * dev_get_by_index(int ifindex)
164 {
165 struct net_device *dev;
167 read_lock(&dev_base_lock);
168 dev = __dev_get_by_index(ifindex);
169 if (dev)
170 dev_hold(dev);
171 read_unlock(&dev_base_lock);
172 return dev;
173 }
175 /**
176 * dev_getbyhwaddr - find a device by its hardware address
177 * @type: media type of device
178 * @ha: hardware address
179 *
180 * Search for an interface by MAC address. Returns NULL if the device
181 * is not found or a pointer to the device. The caller must hold the
182 * rtnl semaphore. The returned device has not had its ref count increased
183 * and the caller must therefore be careful about locking
184 *
185 * BUGS:
186 * If the API was consistent this would be __dev_get_by_hwaddr
187 */
189 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
190 {
191 struct net_device *dev;
193 for (dev = dev_base; dev != NULL; dev = dev->next) {
194 if (dev->type == type &&
195 memcmp(dev->dev_addr, ha, dev->addr_len) == 0)
196 return dev;
197 }
198 return NULL;
199 }
201 /**
202 * dev_alloc_name - allocate a name for a device
203 * @dev: device
204 * @name: name format string
205 *
206 * Passed a format string - eg "lt%d" - it will try and find a suitable
207 * id. Not efficient for many devices, not called a lot. The caller
208 * must hold the dev_base or rtnl lock while allocating the name and
209 * adding the device in order to avoid duplicates. Returns the number
210 * of the unit assigned or a negative errno code.
211 */
213 int dev_alloc_name(struct net_device *dev, const char *name)
214 {
215 int i;
216 char buf[32];
217 char *p;
219 /*
220 * Verify the string as this thing may have come from
221 * the user. There must be either one "%d" and no other "%"
222 * characters, or no "%" characters at all.
223 */
224 p = strchr(name, '%');
225 if (p && (p[1] != 'd' || strchr(p+2, '%')))
226 return -EINVAL;
228 /*
229 * If you need over 100 please also fix the algorithm...
230 */
231 for (i = 0; i < 100; i++) {
232 snprintf(buf,sizeof(buf),name,i);
233 if (__dev_get_by_name(buf) == NULL) {
234 strcpy(dev->name, buf);
235 return i;
236 }
237 }
238 return -ENFILE; /* Over 100 of the things .. bail out! */
239 }
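/*
 * Editor's example (assumed usage, not in the original file): calling
 * dev_alloc_name(dev, "eth%d") scans eth0, eth1, ... and copies the first
 * name not already registered into dev->name, returning the unit number
 * chosen (or -EINVAL for a bad format string, -ENFILE after 100 attempts).
 */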
241 /**
242 * dev_alloc - allocate a network device and name
243 * @name: name format string
244 * @err: error return pointer
245 *
246 * Passed a format string, eg. "lt%d", it will allocate a network device
247 * and space for the name. %NULL is returned if no memory is available.
248 * If the allocation succeeds then the name is assigned and the
249 * device pointer returned. %NULL is returned if the name allocation
250 * failed. The cause of an error is returned as a negative errno code
251 * in the variable @err points to.
252 *
253 * The caller must hold the @dev_base or RTNL locks when doing this in
254 * order to avoid duplicate name allocations.
255 */
257 struct net_device *dev_alloc(const char *name, int *err)
258 {
259 struct net_device *dev=kmalloc(sizeof(struct net_device), GFP_KERNEL);
260 if (dev == NULL) {
261 *err = -ENOBUFS;
262 return NULL;
263 }
264 memset(dev, 0, sizeof(struct net_device));
265 *err = dev_alloc_name(dev, name);
266 if (*err < 0) {
267 kfree(dev);
268 return NULL;
269 }
270 return dev;
271 }
273 /**
274 * netdev_state_change - device changes state
275 * @dev: device to cause notification
276 *
277 * Called to indicate a device has changed state. This function calls
278 * the notifier chains for netdev_chain and sends a NEWLINK message
279 * to the routing socket.
280 */
282 void netdev_state_change(struct net_device *dev)
283 {
284 if (dev->flags&IFF_UP) {
285 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
286 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
287 }
288 }
291 #ifdef CONFIG_KMOD
293 /**
294 * dev_load - load a network module
295 * @name: name of interface
296 *
297 * If a network interface is not present and the process has suitable
298 * privileges this function loads the module. If module loading is not
299 * available in this kernel then it becomes a nop.
300 */
302 void dev_load(const char *name)
303 {
304 if (!dev_get(name) && capable(CAP_SYS_MODULE))
305 request_module(name);
306 }
308 #else
310 extern inline void dev_load(const char *unused){;}
312 #endif
314 static int default_rebuild_header(struct sk_buff *skb)
315 {
316 printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
317 skb->dev ? skb->dev->name : "NULL!!!");
318 kfree_skb(skb);
319 return 1;
320 }
322 /**
323 * dev_open - prepare an interface for use.
324 * @dev: device to open
325 *
326 * Takes a device from down to up state. The device's private open
327 * function is invoked and then the multicast lists are loaded. Finally
328 * the device is moved into the up state and a %NETDEV_UP message is
329 * sent to the netdev notifier chain.
330 *
331 * Calling this function on an active interface is a nop. On a failure
332 * a negative errno code is returned.
333 */
335 int dev_open(struct net_device *dev)
336 {
337 int ret = 0;
339 /*
340 * Is it already up?
341 */
343 if (dev->flags&IFF_UP)
344 return 0;
346 /*
347 * Is it even present?
348 */
349 if (!netif_device_present(dev))
350 return -ENODEV;
352 /*
353 * Call device private open method
354 */
355 if (try_inc_mod_count(dev->owner)) {
356 if (dev->open) {
357 ret = dev->open(dev);
358 if (ret != 0 && dev->owner)
359 __MOD_DEC_USE_COUNT(dev->owner);
360 }
361 } else {
362 ret = -ENODEV;
363 }
365 /*
366 * If it went open OK then:
367 */
369 if (ret == 0)
370 {
371 /*
372 * Set the flags.
373 */
374 dev->flags |= IFF_UP;
376 set_bit(__LINK_STATE_START, &dev->state);
378 /*
379 * Initialize multicasting status
380 */
381 dev_mc_upload(dev);
383 /*
384 * Wakeup transmit queue engine
385 */
386 dev_activate(dev);
388 /*
389 * ... and announce new interface.
390 */
391 notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
392 }
393 return(ret);
394 }
397 /**
398 * dev_close - shutdown an interface.
399 * @dev: device to shutdown
400 *
401 * This function moves an active device into down state. A
402 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
403 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
404 * chain.
405 */
407 int dev_close(struct net_device *dev)
408 {
409 if (!(dev->flags&IFF_UP))
410 return 0;
412 /*
413 * Tell people we are going down, so that they can
414 * prepare for death while the device is still operating.
415 */
416 notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
418 dev_deactivate(dev);
420 clear_bit(__LINK_STATE_START, &dev->state);
422 /*
423 * Call the device specific close. This cannot fail.
424 * Only if device is UP
425 *
426 * We allow it to be called even after a DETACH hot-plug
427 * event.
428 */
430 if (dev->stop)
431 dev->stop(dev);
433 /*
434 * Device is now down.
435 */
437 dev->flags &= ~IFF_UP;
439 /*
440 * Tell people we are down
441 */
442 notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
444 /*
445 * Drop the module refcount
446 */
447 if (dev->owner)
448 __MOD_DEC_USE_COUNT(dev->owner);
450 return(0);
451 }
454 #ifdef CONFIG_HIGHMEM
455 /* Actually, we should eliminate this check as soon as we know that:
456 * 1. An IOMMU is present and can map all of the memory.
457 * 2. No high memory really exists on this machine.
458 */
460 static inline int
461 illegal_highdma(struct net_device *dev, struct sk_buff *skb)
462 {
463 int i;
465 if (dev->features&NETIF_F_HIGHDMA)
466 return 0;
468 for (i=0; i<skb_shinfo(skb)->nr_frags; i++)
469 if (skb_shinfo(skb)->frags[i].page >= highmem_start_page)
470 return 1;
472 return 0;
473 }
474 #else
475 #define illegal_highdma(dev, skb) (0)
476 #endif
479 /*=======================================================================
480 Receiver routines
481 =======================================================================*/
483 struct netif_rx_stats netdev_rx_stat[NR_CPUS];
485 void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
486 {
487 net_shadow_ring_t *shadow_ring;
488 rx_shadow_entry_t *rx;
489 unsigned long *g_pte;
490 struct pfn_info *g_pfn, *h_pfn;
491 unsigned int i;
492 unsigned long flags;
494 memset(skb->mac.ethernet->h_dest, 0, ETH_ALEN);
495 if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
496 memset(skb->nh.raw + 18, 0, ETH_ALEN);
497 shadow_ring = vif->shadow_ring;
499 if ( (i = shadow_ring->rx_idx) == shadow_ring->rx_prod )
500 return;
502 if ( shadow_ring->rx_ring[i].status != RING_STATUS_OK )
503 {
504 DPRINTK("Bad buffer in deliver_packet()\n");
505 goto inc_and_out;
506 }
508 rx = shadow_ring->rx_ring + i;
509 ASSERT(skb->len <= PAGE_SIZE);
510 rx->size = skb->len;
511 rx->offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK);
513 spin_lock_irqsave(&vif->domain->page_lock, flags);
515 g_pte = map_domain_mem(rx->addr);
517 g_pfn = frame_table + (*g_pte >> PAGE_SHIFT);
518 h_pfn = skb->pf;
520 h_pfn->tot_count = h_pfn->type_count = 1;
521 g_pfn->tot_count = g_pfn->type_count = 0;
522 h_pfn->flags = g_pfn->flags & ~PG_type_mask;
524 if (*g_pte & _PAGE_RW) h_pfn->flags |= PGT_writeable_page | PG_need_flush;
525 g_pfn->flags = 0;
527 /* Point the guest at the new machine frame. */
528 machine_to_phys_mapping[h_pfn - frame_table]
529 = machine_to_phys_mapping[g_pfn - frame_table];
530 *g_pte = (*g_pte & ~PAGE_MASK)
531 | (((h_pfn - frame_table) << PAGE_SHIFT) & PAGE_MASK);
532 *g_pte |= _PAGE_PRESENT;
534 unmap_domain_mem(g_pte);
536 spin_unlock_irqrestore(&vif->domain->page_lock, flags);
538 /* Our skbuff now points at the guest's old frame. */
539 skb->pf = g_pfn;
541 inc_and_out:
542 smp_wmb(); /* updates must happen before releasing the descriptor. */
543 shadow_ring->rx_idx = RX_RING_INC(i);
544 }
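/*
 * Editor's summary (sketch of the exchange above, not in the original file):
 * receive delivery is by page flipping rather than copying. With 'h_pfn' the
 * hypervisor-owned frame holding the packet and 'g_pfn' the frame named by
 * the guest's posted RX descriptor, the code above effectively does:
 *
 *     h_pfn->tot_count = 1;    // guest takes ownership of the data frame
 *     g_pfn->tot_count = 0;    // guest surrenders its posted frame
 *     *g_pte = (*g_pte & ~PAGE_MASK) |
 *              ((h_pfn - frame_table) << PAGE_SHIFT) | _PAGE_PRESENT;
 *     skb->pf = g_pfn;         // driver recycles the surrendered frame
 *
 * so the per-packet cost is one PTE rewrite plus two frame-table updates.
 */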
546 /**
547 * netif_rx - post buffer to the network code
548 * @skb: buffer to post
549 *
550 * This function receives a packet from a device driver and queues it for
551 * the upper (protocol) levels to process. It always succeeds. The buffer
552 * may be dropped during processing for congestion control or by the
553 * protocol layers.
554 *
555 * return values:
556 * NET_RX_SUCCESS (no congestion)
557 * NET_RX_DROP (packet was dropped)
558 */
560 int netif_rx(struct sk_buff *skb)
561 {
562 unsigned long cpu_mask;
563 int offset, this_cpu = smp_processor_id();
564 unsigned long flags;
565 net_vif_t *vif;
567 local_irq_save(flags);
569 ASSERT(skb->skb_type == SKB_ZERO_COPY);
571 /*
572 * Offset will include 16 bytes padding from dev_alloc_skb, 14 bytes for
573 * ethernet header, plus any other alignment padding added by the driver.
574 */
575 offset = (int)skb->data & ~PAGE_MASK;
576 skb->head = (u8 *)map_domain_mem(((skb->pf - frame_table) << PAGE_SHIFT));
577 skb->data = skb->nh.raw = skb->head + offset;
578 skb->tail = skb->data + skb->len;
579 skb_push(skb, ETH_HLEN);
580 skb->mac.raw = skb->data;
582 netdev_rx_stat[this_cpu].total++;
584 if ( skb->src_vif == VIF_UNKNOWN_INTERFACE )
585 skb->src_vif = VIF_PHYSICAL_INTERFACE;
587 if ( skb->dst_vif == VIF_UNKNOWN_INTERFACE )
588 skb->dst_vif = __net_get_target_vif(skb->data, skb->len, skb->src_vif);
590 if ( ((vif = sys_vif_list[skb->dst_vif]) == NULL) ||
591 (skb->dst_vif <= VIF_PHYSICAL_INTERFACE) )
592 {
593 netdev_rx_stat[this_cpu].dropped++;
594 unmap_domain_mem(skb->head);
595 kfree_skb(skb);
596 local_irq_restore(flags);
597 return NET_RX_DROP;
598 }
600 deliver_packet(skb, vif);
601 cpu_mask = mark_hyp_event(vif->domain, _HYP_EVENT_NET_RX);
602 unmap_domain_mem(skb->head);
603 kfree_skb(skb);
604 hyp_event_notify(cpu_mask);
605 local_irq_restore(flags);
606 return NET_RX_SUCCESS;
607 }
610 /*************************************************************
611 * NEW TRANSMIT SCHEDULER
612 *
613 * NB. We ought also to only send a limited number of bytes to the NIC
614 * for transmission at any one time (to avoid head-of-line blocking).
615 * However, driver rings are small enough that they provide a reasonable
616 * limit.
617 *
618 * eg. 3c905 has 16 descriptors == 8 packets, at 100Mbps
619 * e1000 has 256 descriptors == 128 packets, at 1000Mbps
620 * tg3 has 512 descriptors == 256 packets, at 1000Mbps
621 *
622 * So, worst case is tg3 with 256 1500-bytes packets == 375kB.
623 * This would take 3ms, and represents our worst-case HoL blocking cost.
624 *
625 * We think this is reasonable.
626 */
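/*
 * Editor's arithmetic check of the figure above (not in the original file):
 * 256 packets x 1500 bytes = 384,000 bytes (~375kB). A 1000Mbps link drains
 * 125,000,000 bytes/s, so 384,000 / 125,000,000 is roughly 3ms of worst-case
 * head-of-line blocking, as stated.
 */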
628 struct list_head net_schedule_list;
629 spinlock_t net_schedule_list_lock;
631 static int __on_net_schedule_list(net_vif_t *vif)
632 {
633 return vif->list.next != NULL;
634 }
636 static void remove_from_net_schedule_list(net_vif_t *vif)
637 {
638 unsigned long flags;
639 if ( !__on_net_schedule_list(vif) ) return;
640 spin_lock_irqsave(&net_schedule_list_lock, flags);
641 if ( __on_net_schedule_list(vif) )
642 {
643 list_del(&vif->list);
644 vif->list.next = NULL;
645 }
646 spin_unlock_irqrestore(&net_schedule_list_lock, flags);
647 }
649 static void add_to_net_schedule_list_tail(net_vif_t *vif)
650 {
651 unsigned long flags;
652 if ( __on_net_schedule_list(vif) ) return;
653 spin_lock_irqsave(&net_schedule_list_lock, flags);
654 if ( !__on_net_schedule_list(vif) )
655 {
656 list_add_tail(&vif->list, &net_schedule_list);
657 }
658 spin_unlock_irqrestore(&net_schedule_list_lock, flags);
659 }
662 /* Destructor function for tx skbs. */
663 static void tx_skb_release(struct sk_buff *skb)
664 {
665 int i, send = 0;
666 net_vif_t *vif = sys_vif_list[skb->src_vif];
667 unsigned int idx;
668 tx_shadow_entry_t *tx;
669 unsigned long cpu_mask, flags;
671 spin_lock_irqsave(&vif->domain->page_lock, flags);
672 for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
673 put_page_tot(skb_shinfo(skb)->frags[i].page);
674 spin_unlock_irqrestore(&vif->domain->page_lock, flags);
676 if ( skb->skb_type == SKB_NODATA )
677 kmem_cache_free(net_header_cachep, skb->head);
679 skb_shinfo(skb)->nr_frags = 0;
681 /* This would mean that the guest OS has fiddled with our index. */
682 if ( vif->shadow_ring->tx_cons != vif->net_ring->tx_cons )
683 DPRINTK("Shadow and shared rings out of sync (%d/%d)\n",
684 vif->shadow_ring->tx_cons, vif->net_ring->tx_cons);
686 /*
687 * XXX This assumes that, per vif, SKBs are processed in-order!
688 * Also assumes no concurrency. This is safe because each vif
689 * maps to one NIC. This is executed in NIC interrupt code, so we have
690 * mutual exclusion from do_IRQ().
691 */
693 smp_wmb(); /* make sure any status updates occur before inc'ing tx_cons. */
695 /* Skip over a sequence of bad descriptors, plus the first good one. */
696 do {
697 idx = vif->shadow_ring->tx_cons;
698 /* There must be at least one good descriptor outstanding. */
699 if ( idx == vif->shadow_ring->tx_idx ) BUG();
700 tx = &vif->shadow_ring->tx_ring[idx];
701 vif->shadow_ring->tx_cons = TX_RING_INC(idx);
702 if ( vif->shadow_ring->tx_cons == vif->net_ring->tx_event ) send = 1;
703 } while ( tx->status != RING_STATUS_OK );
705 /* Now skip over any more bad descriptors, up to the next good one. */
706 do {
707 idx = vif->shadow_ring->tx_cons;
708 tx = &vif->shadow_ring->tx_ring[idx];
709 /* Carry on until we find a good descriptor, or reach scheduler idx. */
710 if ( (idx == vif->shadow_ring->tx_idx) ||
711 (tx->status == RING_STATUS_OK) )
712 break;
713 vif->shadow_ring->tx_cons = TX_RING_INC(idx);
714 if ( vif->shadow_ring->tx_cons == vif->net_ring->tx_event ) send = 1;
715 } while ( 1 );
717 /* Update shared consumer index to the new private value. */
718 vif->net_ring->tx_cons = vif->shadow_ring->tx_cons;
720 /* Send a transmit event if requested. */
721 if ( send )
722 {
723 cpu_mask = mark_guest_event(vif->domain, _EVENT_NET_TX);
724 guest_event_notify(cpu_mask);
725 }
726 }
729 static void net_tx_action(unsigned long unused)
730 {
731 struct net_device *dev = the_dev;
732 struct list_head *ent;
733 struct sk_buff *skb;
734 net_vif_t *vif;
735 tx_shadow_entry_t *tx;
737 spin_lock(&dev->xmit_lock);
738 while ( !netif_queue_stopped(dev) &&
739 !list_empty(&net_schedule_list) )
740 {
741 /* Get a vif from the list with work to do. */
742 ent = net_schedule_list.next;
743 vif = list_entry(ent, net_vif_t, list);
744 remove_from_net_schedule_list(vif);
745 if ( vif->shadow_ring->tx_idx == vif->shadow_ring->tx_prod )
746 continue;
748 /* Pick an entry from the transmit queue. */
749 tx = &vif->shadow_ring->tx_ring[vif->shadow_ring->tx_idx];
750 vif->shadow_ring->tx_idx = TX_RING_INC(vif->shadow_ring->tx_idx);
751 if ( vif->shadow_ring->tx_idx != vif->shadow_ring->tx_prod )
752 add_to_net_schedule_list_tail(vif);
754 /* Check the chosen entry is good. */
755 if ( tx->status != RING_STATUS_OK ) continue;
757 if ( (skb = alloc_skb_nodata(GFP_ATOMIC)) == NULL )
758 {
759 printk("Out of memory in net_tx_action()!\n");
760 tx->status = RING_STATUS_BAD_PAGE;
761 break;
762 }
764 skb->destructor = tx_skb_release;
766 skb->head = skb->data = tx->header;
767 skb->end = skb->tail = skb->head + PKT_PROT_LEN;
769 skb->dev = the_dev;
770 skb->src_vif = vif->id;
771 skb->dst_vif = VIF_PHYSICAL_INTERFACE;
772 skb->mac.raw = skb->data;
774 skb_shinfo(skb)->frags[0].page = frame_table +
775 (tx->payload >> PAGE_SHIFT);
776 skb_shinfo(skb)->frags[0].size = tx->size - PKT_PROT_LEN;
777 skb_shinfo(skb)->frags[0].page_offset = tx->payload & ~PAGE_MASK;
778 skb_shinfo(skb)->nr_frags = 1;
780 skb->data_len = tx->size - PKT_PROT_LEN;
781 skb->len = tx->size;
783 /* Transmit should always work, or the queue would be stopped. */
784 if ( dev->hard_start_xmit(skb, dev) != 0 )
785 {
786 printk("Weird failure in hard_start_xmit!\n");
787 kfree_skb(skb);
788 break;
789 }
790 }
791 spin_unlock(&dev->xmit_lock);
792 }
794 DECLARE_TASKLET_DISABLED(net_tx_tasklet, net_tx_action, 0);
796 static inline void maybe_schedule_tx_action(void)
797 {
798 smp_mb();
799 if ( !netif_queue_stopped(the_dev) &&
800 !list_empty(&net_schedule_list) )
801 tasklet_schedule(&net_tx_tasklet);
802 }
805 /*
806 * update_shared_ring(void)
807 *
808 * This replaces flush_rx_queue as the guest event handler to move packets
809 * queued in the guest ring up to the guest. Really, the packet is already
810 * there (it was page-flipped in deliver_packet), but this moves the ring
811 * descriptor across from the shadow ring and increments the pointers.
812 */
814 void update_shared_ring(void)
815 {
816 rx_shadow_entry_t *rx;
817 shared_info_t *s = current->shared_info;
818 net_ring_t *net_ring;
819 net_shadow_ring_t *shadow_ring;
820 unsigned int nvif;
822 clear_bit(_HYP_EVENT_NET_RX, &current->hyp_events);
824 for ( nvif = 0; nvif < current->num_net_vifs; nvif++ )
825 {
826 net_ring = current->net_vif_list[nvif]->net_ring;
827 shadow_ring = current->net_vif_list[nvif]->shadow_ring;
829 /* This would mean that the guest OS has fiddled with our index. */
830 if ( shadow_ring->rx_cons != net_ring->rx_cons )
831 DPRINTK("Shadow and shared rings out of sync (%d/%d)\n",
832 shadow_ring->rx_cons, net_ring->rx_cons);
834 while ( shadow_ring->rx_cons != shadow_ring->rx_idx )
835 {
836 rx = shadow_ring->rx_ring + shadow_ring->rx_cons;
837 copy_to_user(net_ring->rx_ring + shadow_ring->rx_cons, rx,
838 sizeof(rx_entry_t));
840 if ( rx->flush_count == tlb_flush_count[smp_processor_id()] )
841 __flush_tlb();
843 smp_wmb(); /* copy descriptor before inc'ing rx_cons */
844 shadow_ring->rx_cons = RX_RING_INC(shadow_ring->rx_cons);
846 if ( shadow_ring->rx_cons == net_ring->rx_event )
847 set_bit(_EVENT_NET_RX, &s->events);
848 }
849 net_ring->rx_cons = shadow_ring->rx_cons;
850 }
851 }
854 /*
855 * We need this ioctl for efficient implementation of the
856 * if_indextoname() function required by the IPv6 API. Without
857 * it, we would have to search all the interfaces to find a
858 * match. --pb
859 */
861 static int dev_ifname(struct ifreq *arg)
862 {
863 struct net_device *dev;
864 struct ifreq ifr;
866 /*
867 * Fetch the caller's info block.
868 */
870 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
871 return -EFAULT;
873 read_lock(&dev_base_lock);
874 dev = __dev_get_by_index(ifr.ifr_ifindex);
875 if (!dev) {
876 read_unlock(&dev_base_lock);
877 return -ENODEV;
878 }
880 strcpy(ifr.ifr_name, dev->name);
881 read_unlock(&dev_base_lock);
883 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
884 return -EFAULT;
885 return 0;
886 }
889 /**
890 * netdev_set_master - set up master/slave pair
891 * @slave: slave device
892 * @master: new master device
893 *
894 * Changes the master device of the slave. Pass %NULL to break the
895 * bonding. The caller must hold the RTNL semaphore. On a failure
896 * a negative errno code is returned. On success the reference counts
897 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
898 * function returns zero.
899 */
901 int netdev_set_master(struct net_device *slave, struct net_device *master)
902 {
903 struct net_device *old = slave->master;
905 if (master) {
906 if (old)
907 return -EBUSY;
908 dev_hold(master);
909 }
911 br_write_lock_bh(BR_NETPROTO_LOCK);
912 slave->master = master;
913 br_write_unlock_bh(BR_NETPROTO_LOCK);
915 if (old)
916 dev_put(old);
918 if (master)
919 slave->flags |= IFF_SLAVE;
920 else
921 slave->flags &= ~IFF_SLAVE;
923 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
924 return 0;
925 }
927 /**
928 * dev_set_promiscuity - update promiscuity count on a device
929 * @dev: device
930 * @inc: modifier
931 *
932 * Add or remove promiscuity from a device. While the count in the device
933 * remains above zero the interface remains promiscuous. Once it hits zero
934 * the device reverts back to normal filtering operation. A negative inc
935 * value is used to drop promiscuity on the device.
936 */
938 void dev_set_promiscuity(struct net_device *dev, int inc)
939 {
940 unsigned short old_flags = dev->flags;
942 dev->flags |= IFF_PROMISC;
943 if ((dev->promiscuity += inc) == 0)
944 dev->flags &= ~IFF_PROMISC;
945 if (dev->flags^old_flags) {
946 #ifdef CONFIG_NET_FASTROUTE
947 if (dev->flags&IFF_PROMISC) {
948 netdev_fastroute_obstacles++;
949 dev_clear_fastroute(dev);
950 } else
951 netdev_fastroute_obstacles--;
952 #endif
953 dev_mc_upload(dev);
954 printk(KERN_INFO "device %s %s promiscuous mode\n",
955 dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left");
956 }
957 }
959 /**
960 * dev_set_allmulti - update allmulti count on a device
961 * @dev: device
962 * @inc: modifier
963 *
964 * Add or remove reception of all multicast frames to a device. While the
965 * count in the device remains above zero the interface remains listening
966 * to all multicast frames. Once it hits zero the device reverts back to normal
967 * filtering operation. A negative @inc value is used to drop the counter
968 * when releasing a resource needing all multicasts.
969 */
971 void dev_set_allmulti(struct net_device *dev, int inc)
972 {
973 unsigned short old_flags = dev->flags;
975 dev->flags |= IFF_ALLMULTI;
976 if ((dev->allmulti += inc) == 0)
977 dev->flags &= ~IFF_ALLMULTI;
978 if (dev->flags^old_flags)
979 dev_mc_upload(dev);
980 }
982 int dev_change_flags(struct net_device *dev, unsigned flags)
983 {
984 int ret;
985 int old_flags = dev->flags;
987 /*
988 * Set the flags on our device.
989 */
991 dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP|IFF_DYNAMIC|
992 IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) |
993 (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI));
995 /*
996 * Load in the correct multicast list now the flags have changed.
997 */
999 dev_mc_upload(dev);
1001 /*
1002 * Have we downed the interface? We handle IFF_UP ourselves
1003 * according to user attempts to set it, rather than blindly
1004 * setting it.
1005 */
1007 ret = 0;
1008 if ((old_flags^flags)&IFF_UP) /* Bit is different ? */
1009 {
1010 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
1012 if (ret == 0)
1013 dev_mc_upload(dev);
1014 }
1016 if (dev->flags&IFF_UP &&
1017 ((old_flags^dev->flags)&
1018 ~(IFF_UP|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE)))
1019 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
1021 if ((flags^dev->gflags)&IFF_PROMISC) {
1022 int inc = (flags&IFF_PROMISC) ? +1 : -1;
1023 dev->gflags ^= IFF_PROMISC;
1024 dev_set_promiscuity(dev, inc);
1025 }
1027 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
1028 is important. Some (broken) drivers set IFF_PROMISC when
1029 IFF_ALLMULTI is requested, without asking us and without reporting.
1030 */
1031 if ((flags^dev->gflags)&IFF_ALLMULTI) {
1032 int inc = (flags&IFF_ALLMULTI) ? +1 : -1;
1033 dev->gflags ^= IFF_ALLMULTI;
1034 dev_set_allmulti(dev, inc);
1035 }
1037 if (old_flags^dev->flags)
1038 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags^dev->flags);
1040 return ret;
1041 }
1043 /*
1044 * Perform the SIOCxIFxxx calls.
1045 */
1047 static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
1048 {
1049 struct net_device *dev;
1050 int err;
1052 if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL)
1053 return -ENODEV;
1055 switch(cmd)
1056 {
1057 case SIOCGIFFLAGS: /* Get interface flags */
1058 ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI|IFF_RUNNING))
1059 |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI));
1060 if (netif_running(dev) && netif_carrier_ok(dev))
1061 ifr->ifr_flags |= IFF_RUNNING;
1062 return 0;
1064 case SIOCSIFFLAGS: /* Set interface flags */
1065 return dev_change_flags(dev, ifr->ifr_flags);
1067 case SIOCGIFMETRIC: /* Get the metric on the interface */
1068 ifr->ifr_metric = 0;
1069 return 0;
1071 case SIOCSIFMETRIC: /* Set the metric on the interface */
1072 return -EOPNOTSUPP;
1074 case SIOCGIFMTU: /* Get the MTU of a device */
1075 ifr->ifr_mtu = dev->mtu;
1076 return 0;
1078 case SIOCSIFMTU: /* Set the MTU of a device */
1079 if (ifr->ifr_mtu == dev->mtu)
1080 return 0;
1082 /*
1083 * MTU must be positive.
1084 */
1086 if (ifr->ifr_mtu<0)
1087 return -EINVAL;
1089 if (!netif_device_present(dev))
1090 return -ENODEV;
1092 if (dev->change_mtu)
1093 err = dev->change_mtu(dev, ifr->ifr_mtu);
1094 else {
1095 dev->mtu = ifr->ifr_mtu;
1096 err = 0;
1097 }
1098 if (!err && dev->flags&IFF_UP)
1099 notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev);
1100 return err;
1102 case SIOCGIFHWADDR:
1103 memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN);
1104 ifr->ifr_hwaddr.sa_family=dev->type;
1105 return 0;
1107 case SIOCSIFHWADDR:
1108 if (dev->set_mac_address == NULL)
1109 return -EOPNOTSUPP;
1110 if (ifr->ifr_hwaddr.sa_family!=dev->type)
1111 return -EINVAL;
1112 if (!netif_device_present(dev))
1113 return -ENODEV;
1114 err = dev->set_mac_address(dev, &ifr->ifr_hwaddr);
1115 if (!err)
1116 notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
1117 return err;
1119 case SIOCSIFHWBROADCAST:
1120 if (ifr->ifr_hwaddr.sa_family!=dev->type)
1121 return -EINVAL;
1122 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN);
1123 notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
1124 return 0;
1126 case SIOCGIFMAP:
1127 ifr->ifr_map.mem_start=dev->mem_start;
1128 ifr->ifr_map.mem_end=dev->mem_end;
1129 ifr->ifr_map.base_addr=dev->base_addr;
1130 ifr->ifr_map.irq=dev->irq;
1131 ifr->ifr_map.dma=dev->dma;
1132 ifr->ifr_map.port=dev->if_port;
1133 return 0;
1135 case SIOCSIFMAP:
1136 if (dev->set_config) {
1137 if (!netif_device_present(dev))
1138 return -ENODEV;
1139 return dev->set_config(dev,&ifr->ifr_map);
1140 }
1141 return -EOPNOTSUPP;
1143 case SIOCADDMULTI:
1144 if (dev->set_multicast_list == NULL ||
1145 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
1146 return -EINVAL;
1147 if (!netif_device_present(dev))
1148 return -ENODEV;
1149 dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1);
1150 return 0;
1152 case SIOCDELMULTI:
1153 if (dev->set_multicast_list == NULL ||
1154 ifr->ifr_hwaddr.sa_family!=AF_UNSPEC)
1155 return -EINVAL;
1156 if (!netif_device_present(dev))
1157 return -ENODEV;
1158 dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1);
1159 return 0;
1161 case SIOCGIFINDEX:
1162 ifr->ifr_ifindex = dev->ifindex;
1163 return 0;
1165 case SIOCSIFNAME:
1166 if (dev->flags&IFF_UP)
1167 return -EBUSY;
1168 if (__dev_get_by_name(ifr->ifr_newname))
1169 return -EEXIST;
1170 memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ);
1171 dev->name[IFNAMSIZ-1] = 0;
1172 notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
1173 return 0;
1175 #ifdef WIRELESS_EXT
1176 case SIOCGIWSTATS:
1177 return dev_iwstats(dev, ifr);
1178 #endif /* WIRELESS_EXT */
1180 /*
1181 * Unknown or private ioctl
1182 */
1184 default:
1185 if ((cmd >= SIOCDEVPRIVATE &&
1186 cmd <= SIOCDEVPRIVATE + 15) ||
1187 cmd == SIOCBONDENSLAVE ||
1188 cmd == SIOCBONDRELEASE ||
1189 cmd == SIOCBONDSETHWADDR ||
1190 cmd == SIOCBONDSLAVEINFOQUERY ||
1191 cmd == SIOCBONDINFOQUERY ||
1192 cmd == SIOCBONDCHANGEACTIVE ||
1193 cmd == SIOCETHTOOL ||
1194 cmd == SIOCGMIIPHY ||
1195 cmd == SIOCGMIIREG ||
1196 cmd == SIOCSMIIREG) {
1197 if (dev->do_ioctl) {
1198 if (!netif_device_present(dev))
1199 return -ENODEV;
1200 return dev->do_ioctl(dev, ifr, cmd);
1201 }
1202 return -EOPNOTSUPP;
1203 }
1205 #ifdef WIRELESS_EXT
1206 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
1207 if (dev->do_ioctl) {
1208 if (!netif_device_present(dev))
1209 return -ENODEV;
1210 return dev->do_ioctl(dev, ifr, cmd);
1211 }
1212 return -EOPNOTSUPP;
1213 }
1214 #endif /* WIRELESS_EXT */
1216 }
1217 return -EINVAL;
1218 }
1220 /*
1221 * This function handles all "interface"-type I/O control requests. The actual
1222 * 'doing' part of this is dev_ifsioc above.
1223 */
1225 /**
1226 * dev_ioctl - network device ioctl
1227 * @cmd: command to issue
1228 * @arg: pointer to a struct ifreq in user space
1230 * Issue ioctl functions to devices. This is normally called by the
1231 * user space syscall interfaces but can sometimes be useful for
1232 * other purposes. The return value is the return from the syscall if
1233 * positive or a negative errno code on error.
1234 */
1236 int dev_ioctl(unsigned int cmd, void *arg)
1237 {
1238 struct ifreq ifr;
1239 int ret;
1240 char *colon;
1242 /* One special case: SIOCGIFCONF takes ifconf argument
1243 and requires shared lock, because it sleeps writing
1244 to user space.
1245 */
1247 if (cmd == SIOCGIFCONF) {
1248 return -ENOSYS;
1249 }
1250 if (cmd == SIOCGIFNAME) {
1251 return dev_ifname((struct ifreq *)arg);
1252 }
1254 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
1255 return -EFAULT;
1257 ifr.ifr_name[IFNAMSIZ-1] = 0;
1259 colon = strchr(ifr.ifr_name, ':');
1260 if (colon)
1261 *colon = 0;
1263 /*
1264 * See which interface the caller is talking about.
1265 */
1267 switch(cmd)
1268 {
1269 /*
1270 * These ioctl calls:
1271 * - can be done by all.
1272 * - atomic and do not require locking.
1273 * - return a value
1274 */
1276 case SIOCGIFFLAGS:
1277 case SIOCGIFMETRIC:
1278 case SIOCGIFMTU:
1279 case SIOCGIFHWADDR:
1280 case SIOCGIFSLAVE:
1281 case SIOCGIFMAP:
1282 case SIOCGIFINDEX:
1283 dev_load(ifr.ifr_name);
1284 read_lock(&dev_base_lock);
1285 ret = dev_ifsioc(&ifr, cmd);
1286 read_unlock(&dev_base_lock);
1287 if (!ret) {
1288 if (colon)
1289 *colon = ':';
1290 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1291 return -EFAULT;
1292 }
1293 return ret;
1295 /*
1296 * These ioctl calls:
1297 * - require superuser power.
1298 * - require strict serialization.
1299 * - return a value
1300 */
1302 case SIOCETHTOOL:
1303 case SIOCGMIIPHY:
1304 case SIOCGMIIREG:
1305 if (!capable(CAP_NET_ADMIN))
1306 return -EPERM;
1307 dev_load(ifr.ifr_name);
1308 dev_probe_lock();
1309 rtnl_lock();
1310 ret = dev_ifsioc(&ifr, cmd);
1311 rtnl_unlock();
1312 dev_probe_unlock();
1313 if (!ret) {
1314 if (colon)
1315 *colon = ':';
1316 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1317 return -EFAULT;
1318 }
1319 return ret;
1321 /*
1322 * These ioctl calls:
1323 * - require superuser power.
1324 * - require strict serialization.
1325 * - do not return a value
1326 */
1328 case SIOCSIFFLAGS:
1329 case SIOCSIFMETRIC:
1330 case SIOCSIFMTU:
1331 case SIOCSIFMAP:
1332 case SIOCSIFHWADDR:
1333 case SIOCSIFSLAVE:
1334 case SIOCADDMULTI:
1335 case SIOCDELMULTI:
1336 case SIOCSIFHWBROADCAST:
1337 case SIOCSIFNAME:
1338 case SIOCSMIIREG:
1339 case SIOCBONDENSLAVE:
1340 case SIOCBONDRELEASE:
1341 case SIOCBONDSETHWADDR:
1342 case SIOCBONDSLAVEINFOQUERY:
1343 case SIOCBONDINFOQUERY:
1344 case SIOCBONDCHANGEACTIVE:
1345 if (!capable(CAP_NET_ADMIN))
1346 return -EPERM;
1347 dev_load(ifr.ifr_name);
1348 dev_probe_lock();
1349 rtnl_lock();
1350 ret = dev_ifsioc(&ifr, cmd);
1351 rtnl_unlock();
1352 dev_probe_unlock();
1353 return ret;
1355 case SIOCGIFMEM:
1356 /* Get the per device memory space. We can add this but currently
1357 do not support it */
1358 case SIOCSIFMEM:
1359 /* Set the per device memory buffer space. */
1360 case SIOCSIFLINK:
1361 return -EINVAL;
1363 /*
1364 * Unknown or private ioctl.
1365 */
1367 default:
1368 if (cmd >= SIOCDEVPRIVATE &&
1369 cmd <= SIOCDEVPRIVATE + 15) {
1370 dev_load(ifr.ifr_name);
1371 dev_probe_lock();
1372 rtnl_lock();
1373 ret = dev_ifsioc(&ifr, cmd);
1374 rtnl_unlock();
1375 dev_probe_unlock();
1376 if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1377 return -EFAULT;
1378 return ret;
1379 }
1380 #ifdef WIRELESS_EXT
1381 /* Take care of Wireless Extensions */
1382 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
1383 /* If command is `set a parameter', or
1384 * `get the encoding parameters', check if
1385 * the user has the right to do it */
1386 if (IW_IS_SET(cmd) || (cmd == SIOCGIWENCODE)) {
1387 if(!capable(CAP_NET_ADMIN))
1388 return -EPERM;
1389 }
1390 dev_load(ifr.ifr_name);
1391 rtnl_lock();
1392 ret = dev_ifsioc(&ifr, cmd);
1393 rtnl_unlock();
1394 if (!ret && IW_IS_GET(cmd) &&
1395 copy_to_user(arg, &ifr,
1396 sizeof(struct ifreq)))
1397 return -EFAULT;
1398 return ret;
1399 }
1400 #endif /* WIRELESS_EXT */
1401 return -EINVAL;
1402 }
1403 }
1406 /**
1407 * dev_new_index - allocate an ifindex
1409 * Returns a suitable unique value for a new device interface
1410 * number. The caller must hold the rtnl semaphore or the
1411 * dev_base_lock to be sure it remains unique.
1412 */
1414 int dev_new_index(void)
1415 {
1416 static int ifindex;
1417 for (;;) {
1418 if (++ifindex <= 0)
1419 ifindex=1;
1420 if (__dev_get_by_index(ifindex) == NULL)
1421 return ifindex;
1422 }
1423 }
1425 static int dev_boot_phase = 1;
1427 /**
1428 * register_netdevice - register a network device
1429 * @dev: device to register
1431 * Take a completed network device structure and add it to the kernel
1432 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
1433 * chain. 0 is returned on success. A negative errno code is returned
1434 * on a failure to set up the device, or if the name is a duplicate.
1436 * Callers must hold the rtnl semaphore. See the comment at the
1437 * end of Space.c for details about the locking. You may want
1438 * register_netdev() instead of this.
1440 * BUGS:
1441 * The locking appears insufficient to guarantee two parallel registers
1442 * will not get the same name.
1443 */
1445 int net_dev_init(void);
1447 int register_netdevice(struct net_device *dev)
1448 {
1449 struct net_device *d, **dp;
1450 #ifdef CONFIG_NET_DIVERT
1451 int ret;
1452 #endif
1454 spin_lock_init(&dev->queue_lock);
1455 spin_lock_init(&dev->xmit_lock);
1456 dev->xmit_lock_owner = -1;
1457 #ifdef CONFIG_NET_FASTROUTE
1458 dev->fastpath_lock=RW_LOCK_UNLOCKED;
1459 #endif
1461 if (dev_boot_phase)
1462 net_dev_init();
1464 #ifdef CONFIG_NET_DIVERT
1465 ret = alloc_divert_blk(dev);
1466 if (ret)
1467 return ret;
1468 #endif /* CONFIG_NET_DIVERT */
1470 dev->iflink = -1;
1472 /* Init, if this function is available */
1473 if (dev->init && dev->init(dev) != 0) {
1474 #ifdef CONFIG_NET_DIVERT
1475 free_divert_blk(dev);
1476 #endif
1477 return -EIO;
1478 }
1480 dev->ifindex = dev_new_index();
1481 if (dev->iflink == -1)
1482 dev->iflink = dev->ifindex;
1484 /* Check for existence, and append to tail of chain */
1485 for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) {
1486 if (d == dev || strcmp(d->name, dev->name) == 0) {
1487 #ifdef CONFIG_NET_DIVERT
1488 free_divert_blk(dev);
1489 #endif
1490 return -EEXIST;
1491 }
1492 }
1493 /*
1494 * A nil rebuild_header routine: it should never be called and is
1495 * used purely as a bug trap.
1496 */
1498 if (dev->rebuild_header == NULL)
1499 dev->rebuild_header = default_rebuild_header;
1501 /*
1502 * Default initial state at registry is that the
1503 * device is present.
1504 */
1506 set_bit(__LINK_STATE_PRESENT, &dev->state);
1508 dev->next = NULL;
1509 dev_init_scheduler(dev);
1510 write_lock_bh(&dev_base_lock);
1511 *dp = dev;
1512 dev_hold(dev);
1513 dev->deadbeaf = 0;
1514 write_unlock_bh(&dev_base_lock);
1516 /* Notify protocols, that a new device appeared. */
1517 notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
1519 return 0;
1520 }
1522 /**
1523 * netdev_finish_unregister - complete unregistration
1524 * @dev: device
1526 * Destroy and free a dead device. A value of zero is returned on
1527 * success.
1528 */
1530 int netdev_finish_unregister(struct net_device *dev)
1531 {
1532 BUG_TRAP(dev->ip_ptr==NULL);
1533 BUG_TRAP(dev->ip6_ptr==NULL);
1534 BUG_TRAP(dev->dn_ptr==NULL);
1536 if (!dev->deadbeaf) {
1537 printk(KERN_ERR "Freeing alive device %p, %s\n",
1538 dev, dev->name);
1539 return 0;
1540 }
1541 #ifdef NET_REFCNT_DEBUG
1542 printk(KERN_DEBUG "netdev_finish_unregister: %s%s.\n", dev->name,
1543 (dev->features & NETIF_F_DYNALLOC)?"":", old style");
1544 #endif
1545 if (dev->destructor)
1546 dev->destructor(dev);
1547 if (dev->features & NETIF_F_DYNALLOC)
1548 kfree(dev);
1549 return 0;
1550 }
1552 /**
1553 * unregister_netdevice - remove device from the kernel
1554 * @dev: device
1556 * This function shuts down a device interface and removes it
1557 * from the kernel tables. On success 0 is returned, on a failure
1558 * a negative errno code is returned.
1560 * Callers must hold the rtnl semaphore. See the comment at the
1561 * end of Space.c for details about the locking. You may want
1562 * unregister_netdev() instead of this.
1563 */
1565 int unregister_netdevice(struct net_device *dev)
1566 {
1567 unsigned long now, warning_time;
1568 struct net_device *d, **dp;
1570 /* If device is running, close it first. */
1571 if (dev->flags & IFF_UP)
1572 dev_close(dev);
1574 BUG_TRAP(dev->deadbeaf==0);
1575 dev->deadbeaf = 1;
1577 /* And unlink it from device chain. */
1578 for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) {
1579 if (d == dev) {
1580 write_lock_bh(&dev_base_lock);
1581 *dp = d->next;
1582 write_unlock_bh(&dev_base_lock);
1583 break;
1584 }
1585 }
1586 if (d == NULL) {
1587 printk(KERN_DEBUG "unregister_netdevice: device %s/%p"
1588 " not registered\n", dev->name, dev);
1589 return -ENODEV;
1590 }
1592 /* Synchronize to net_rx_action. */
1593 br_write_lock_bh(BR_NETPROTO_LOCK);
1594 br_write_unlock_bh(BR_NETPROTO_LOCK);
1596 if (dev_boot_phase == 0) {
1598 /* Shutdown queueing discipline. */
1599 dev_shutdown(dev);
1601 /* Notify protocols, that we are about to destroy
1602 this device. They should clean all the things.
1603 */
1604 notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
1606 /*
1607 * Flush the multicast chain
1608 */
1609 dev_mc_discard(dev);
1610 }
1612 if (dev->uninit)
1613 dev->uninit(dev);
1615 /* Notifier chain MUST detach us from master device. */
1616 BUG_TRAP(dev->master==NULL);
1618 #ifdef CONFIG_NET_DIVERT
1619 free_divert_blk(dev);
1620 #endif
1622 if (dev->features & NETIF_F_DYNALLOC) {
1623 #ifdef NET_REFCNT_DEBUG
1624 if (atomic_read(&dev->refcnt) != 1)
1625 printk(KERN_DEBUG "unregister_netdevice: holding %s refcnt=%d\n",
1626 dev->name, atomic_read(&dev->refcnt)-1);
1627 #endif
1628 dev_put(dev);
1629 return 0;
1630 }
1632 /* Last reference is our one */
1633 if (atomic_read(&dev->refcnt) == 1) {
1634 dev_put(dev);
1635 return 0;
1636 }
1638 #ifdef NET_REFCNT_DEBUG
1639 printk("unregister_netdevice: waiting %s refcnt=%d\n",
1640 dev->name, atomic_read(&dev->refcnt));
1641 #endif
1643 /* EXPLANATION. If dev->refcnt is not now 1 (our own reference)
1644 it means that someone in the kernel still has a reference
1645 to this device and we cannot release it.
1647 "New style" devices have destructors, hence we can return from this
1648 function and destructor will do all the work later. As of kernel 2.4.0
1649 there are very few "New Style" devices.
1651 "Old style" devices expect that the device is free of any references
1652 upon exit from this function.
1653 We cannot return from this function until all such references have
1654 fallen away. This is because the caller of this function will probably
1655 immediately kfree(*dev) and then be unloaded via sys_delete_module.
1657 So, we linger until all references fall away. The duration of the
1658 linger is basically unbounded! It is driven by, for example, the
1659 current setting of sysctl_ipfrag_time.
1661 After 1 second, we start to rebroadcast unregister notifications
1662 in hope that careless clients will release the device.
1664 */
1666 now = warning_time = jiffies;
1667 while (atomic_read(&dev->refcnt) != 1) {
1668 if ((jiffies - now) > 1*HZ) {
1669 /* Rebroadcast unregister notification */
1670 notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
1671 }
1672 mdelay(250);
1673 if ((jiffies - warning_time) > 10*HZ) {
1674 printk(KERN_EMERG "unregister_netdevice: waiting for %s to "
1675 "become free. Usage count = %d\n",
1676 dev->name, atomic_read(&dev->refcnt));
1677 warning_time = jiffies;
1678 }
1679 }
1680 dev_put(dev);
1681 return 0;
1682 }
1685 /*
1686 * Initialize the DEV module. At boot time this walks the device list and
1687 * unhooks any devices that fail to initialise (normally hardware not
1688 * present) and leaves us with a valid list of present and active devices.
1690 */
1692 extern void net_device_init(void);
1693 extern void ip_auto_config(void);
1694 #ifdef CONFIG_NET_DIVERT
1695 extern void dv_init(void);
1696 #endif /* CONFIG_NET_DIVERT */
1699 /*
1700 * Callers must hold the rtnl semaphore. See the comment at the
1701 * end of Space.c for details about the locking.
1702 */
1703 int __init net_dev_init(void)
1704 {
1705 struct net_device *dev, **dp;
1707 if ( !dev_boot_phase )
1708 return 0;
1710 skb_init();
1712 net_header_cachep = kmem_cache_create(
1713 "net_header_cache",
1714 (PKT_PROT_LEN + sizeof(void *) - 1) & ~(sizeof(void *) - 1),
1715 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1717 spin_lock_init(&net_schedule_list_lock);
1718 INIT_LIST_HEAD(&net_schedule_list);
1720 /*
1721 * Add the devices.
1722 * If the call to dev->init fails, the dev is removed
1723 * from the chain disconnecting the device until the
1724 * next reboot.
1726 * NB At boot phase networking is dead. No locking is required.
1727 * But we still preserve dev_base_lock for sanity.
1728 */
1729 dp = &dev_base;
1730 while ((dev = *dp) != NULL) {
1731 spin_lock_init(&dev->queue_lock);
1732 spin_lock_init(&dev->xmit_lock);
1734 dev->xmit_lock_owner = -1;
1735 dev->iflink = -1;
1736 dev_hold(dev);
1738 /*
1739 * Allocate name. If the init() fails
1740 * the name will be reissued correctly.
1741 */
1742 if (strchr(dev->name, '%'))
1743 dev_alloc_name(dev, dev->name);
1745 if (dev->init && dev->init(dev)) {
1746 /*
1747 * It failed to come up. It will be unhooked later.
1748 * dev_alloc_name can now advance to the next suitable
1749 * name, which will be checked next.
1750 */
1751 dev->deadbeaf = 1;
1752 dp = &dev->next;
1753 } else {
1754 dp = &dev->next;
1755 dev->ifindex = dev_new_index();
1756 if (dev->iflink == -1)
1757 dev->iflink = dev->ifindex;
1758 if (dev->rebuild_header == NULL)
1759 dev->rebuild_header = default_rebuild_header;
1760 dev_init_scheduler(dev);
1761 set_bit(__LINK_STATE_PRESENT, &dev->state);
1762 }
1763 }
1765 /*
1766 * Unhook devices that failed to come up
1767 */
1768 dp = &dev_base;
1769 while ((dev = *dp) != NULL) {
1770 if (dev->deadbeaf) {
1771 write_lock_bh(&dev_base_lock);
1772 *dp = dev->next;
1773 write_unlock_bh(&dev_base_lock);
1774 dev_put(dev);
1775 } else {
1776 dp = &dev->next;
1777 }
1778 }
1780 dev_boot_phase = 0;
1782 dev_mcast_init();
1784 /*
1785 * Initialise network devices
1786 */
1788 net_device_init();
1790 return 0;
1791 }
1793 inline int init_tx_header(u8 *data, unsigned int len, struct net_device *dev)
1794 {
1795 memcpy(data + ETH_ALEN, dev->dev_addr, ETH_ALEN);
1797 switch ( ntohs(*(unsigned short *)(data + 12)) )
1798 {
1799 case ETH_P_ARP:
1800 if ( len < 42 ) break;
1801 memcpy(data + 22, dev->dev_addr, 6);
1802 return ETH_P_ARP;
1803 case ETH_P_IP:
1804 return ETH_P_IP;
1805 }
1806 return 0;
1807 }
1810 /*
1811 * do_net_update:
1813 * Called from guest OS to notify updates to its transmit and/or receive
1814 * descriptor rings.
1815 */
1817 long do_net_update(void)
1818 {
1819 net_ring_t *net_ring;
1820 net_shadow_ring_t *shadow_ring;
1821 net_vif_t *current_vif;
1822 unsigned int i, j;
1823 struct sk_buff *skb;
1824 tx_entry_t tx;
1825 rx_shadow_entry_t *rx;
1826 unsigned long pfn;
1827 struct pfn_info *page;
1828 unsigned long *g_pte;
1830 for ( j = 0; j < current->num_net_vifs; j++)
1831 {
1832 int target;
1833 u8 *g_data;
1834 unsigned short protocol;
1836 current_vif = current->net_vif_list[j];
1837 net_ring = current_vif->net_ring;
1838 shadow_ring = current_vif->shadow_ring;
1840 /*
1841 * PHASE 1 -- TRANSMIT RING
1842 */
1844 /*
1845 * Collect up new transmit buffers. We collect up to the guest OS's
1846 * new producer index, but take care not to catch up with our own
1847 * consumer index.
1848 */
1849 for ( i = shadow_ring->tx_prod;
1850 (i != net_ring->tx_prod) &&
1851 (((shadow_ring->tx_cons-i) & (TX_RING_SIZE-1)) != 1);
1852 i = TX_RING_INC(i) )
1853 {
1854 if ( copy_from_user(&tx, net_ring->tx_ring+i, sizeof(tx)) )
1855 {
1856 DPRINTK("Bad copy_from_user for tx net descriptor\n");
1857 shadow_ring->tx_ring[i].status = RING_STATUS_ERR_CFU;
1858 continue;
1859 }
1861 shadow_ring->tx_ring[i].size = tx.size;
1862 shadow_ring->tx_ring[i].status = RING_STATUS_BAD_PAGE;
1864 if ( tx.size < PKT_PROT_LEN )
1865 {
1866 DPRINTK("Runt packet %d\n", tx.size);
1867 continue;
1868 }
1870 if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE )
1871 {
1872 DPRINTK("tx.addr: %lx, size: %u, end: %lu\n",
1873 tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size);
1874 continue;
1875 }
1877 pfn = tx.addr >> PAGE_SHIFT;
1878 page = frame_table + pfn;
1879 spin_lock_irq(&current->page_lock);
1880 if ( (pfn >= max_page) ||
1881 ((page->flags & PG_domain_mask) != current->domain) )
1882 {
1883 DPRINTK("Bad page frame\n");
1884 spin_unlock_irq(&current->page_lock);
1885 continue;
1886 }
1888 g_data = map_domain_mem(tx.addr);
1890 protocol = __constant_htons(
1891 init_tx_header(g_data, tx.size, the_dev));
1892 if ( protocol == 0 )
1893 goto tx_unmap_and_continue;
1895 target = __net_get_target_vif(g_data, tx.size, current_vif->id);
1897 if ( target > VIF_PHYSICAL_INTERFACE )
1898 {
1899 /* Local delivery */
1900 if ( (skb = dev_alloc_skb(tx.size)) == NULL )
1901 goto tx_unmap_and_continue;
1903 skb->destructor = tx_skb_release;
1905 shadow_ring->tx_ring[i].status = RING_STATUS_OK;
1907 skb->src_vif = current_vif->id;
1908 skb->dst_vif = target;
1909 skb->protocol = protocol;
1911 skb->head = (u8 *)map_domain_mem(
1912 ((skb->pf - frame_table) << PAGE_SHIFT));
1913 skb->data = skb->head + 16;
1914 skb_reserve(skb,2);
1915 memcpy(skb->data, g_data, tx.size);
1916 skb->len = tx.size;
1917 unmap_domain_mem(skb->head);
1918 skb->data += ETH_HLEN;
1919 (void)netif_rx(skb);
1920 }
1921 else if ( target == VIF_PHYSICAL_INTERFACE )
1922 {
1923 shadow_ring->tx_ring[i].header =
1924 kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
1925 if ( shadow_ring->tx_ring[i].header == NULL )
1926 goto tx_unmap_and_continue;
1927 memcpy(shadow_ring->tx_ring[i].header, g_data, PKT_PROT_LEN);
1928 shadow_ring->tx_ring[i].payload = tx.addr + PKT_PROT_LEN;
1929 shadow_ring->tx_ring[i].status = RING_STATUS_OK;
1930 get_page_tot(page);
1931 }
1933 tx_unmap_and_continue:
1934 unmap_domain_mem(g_data);
1935 spin_unlock_irq(&current->page_lock);
1936 }
1938 if ( shadow_ring->tx_prod != i )
1939 {
1940 smp_mb(); /* Let other CPUs see new descriptors first. */
1941 shadow_ring->tx_prod = i;
1942 add_to_net_schedule_list_tail(current_vif);
1943 maybe_schedule_tx_action();
1944 }
1946 /*
1947 * PHASE 2 -- RECEIVE RING
1948 */
1950 /*
1951 * Collect up new receive buffers. We collect up to the guest OS's
1952 * new producer index, but take care not to catch up with our own
1953 * consumer index.
1954 */
1955 for ( i = shadow_ring->rx_prod;
1956 (i != net_ring->rx_prod) &&
1957 (((shadow_ring->rx_cons-i) & (RX_RING_SIZE-1)) != 1);
1958 i = RX_RING_INC(i) )
1959 {
1960 /*
1961 * This copy assumes that rx_shadow_entry_t is an extension of
1962 * rx_net_entry_t; extra fields must be tacked on to the end.
1963 */
1964 if ( copy_from_user(shadow_ring->rx_ring+i, net_ring->rx_ring+i,
1965 sizeof (rx_entry_t) ) )
1966 {
1967 DPRINTK("Bad copy_from_user for rx ring\n");
1968 shadow_ring->rx_ring[i].status = RING_STATUS_ERR_CFU;
1969 continue;
1970 }
1972 rx = shadow_ring->rx_ring + i;
1973 pfn = rx->addr >> PAGE_SHIFT;
1974 page = frame_table + pfn;
1976 shadow_ring->rx_ring[i].status = RING_STATUS_BAD_PAGE;
1978 spin_lock_irq(&current->page_lock);
1979 if ( (pfn >= max_page) ||
1980 (page->flags != (PGT_l1_page_table | current->domain)) )
1981 {
1982 DPRINTK("Bad page frame containing ppte\n");
1983 spin_unlock_irq(&current->page_lock);
1984 continue;
1985 }
1987 g_pte = map_domain_mem(rx->addr);
1989 if (!(*g_pte & _PAGE_PRESENT))
1990 {
1991 DPRINTK("Invalid PTE passed down (not present)\n");
1992 goto rx_unmap_and_continue;
1993 }
1995 page = (*g_pte >> PAGE_SHIFT) + frame_table;
1997 if (page->tot_count != 1)
1998 {
1999 DPRINTK("RX page mapped multiple times (%d/%d/%08x)\n",
2000 page->type_count, page->tot_count, page->flags);
2002 goto rx_unmap_and_continue;
2003 }
2005 /* The pte they passed was good, so take it away from them. */
2006 shadow_ring->rx_ring[i].status = RING_STATUS_OK;
2007 *g_pte &= ~_PAGE_PRESENT;
2008 page->flags = (page->flags & ~PG_type_mask) | PGT_net_rx_buf;
2009 rx->flush_count = tlb_flush_count[smp_processor_id()];
2011 rx_unmap_and_continue:
2012 unmap_domain_mem(g_pte);
2013 spin_unlock_irq(&current->page_lock);
2014 }
2016 if ( shadow_ring->rx_prod != i )
2017 {
2018 smp_mb(); /* Let other CPUs see new descriptors first. */
2019 shadow_ring->rx_prod = i;
2020 }
2022 }
2023 return 0;
2024 }
2027 int setup_network_devices(void)
2028 {
2029 int ret;
2030 extern char opt_ifname[];
2031 struct net_device *dev = dev_get_by_name(opt_ifname);
2033 if ( dev == NULL )
2034 {
2035 printk("Could not find device %s\n", opt_ifname);
2036 return 0;
2037 }
2039 ret = dev_open(dev);
2040 if ( ret != 0 )
2041 {
2042 printk("Error opening device %s for use (%d)\n", opt_ifname, ret);
2043 return 0;
2044 }
2045 printk("Device %s opened and ready for use.\n", opt_ifname);
2046 the_dev = dev;
2048 tasklet_enable(&net_tx_tasklet);
2050 return 1;
2051 }