direct-io.hg

view xen/net/dev.c @ 690:cf5bddb3603e

bitkeeper revision 1.404 (3f57607dkEm0-zZkL-qD7DTt8vKceQ)

apic.c:
Another bug fix for no-APIC systems.
author kaf24@scramble.cl.cam.ac.uk
date Thu Sep 04 15:55:41 2003 +0000 (2003-09-04)
parents 4862078df9ac
children 1f4e3a24e225
line source
1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
10 #include <asm/uaccess.h>
11 #include <asm/system.h>
12 #include <asm/bitops.h>
13 #include <linux/config.h>
14 #include <linux/delay.h>
15 #include <linux/lib.h>
16 #include <linux/types.h>
17 #include <linux/sched.h>
18 #include <linux/mm.h>
19 #include <linux/socket.h>
20 #include <linux/sockios.h>
21 #include <linux/errno.h>
22 #include <linux/interrupt.h>
23 #include <linux/if_ether.h>
24 #include <linux/netdevice.h>
25 #include <linux/etherdevice.h>
26 #include <linux/skbuff.h>
27 #include <linux/brlock.h>
28 #include <linux/init.h>
29 #include <linux/module.h>
31 #include <linux/event.h>
32 #include <asm/domain_page.h>
33 #include <asm/pgalloc.h>
35 #include <xeno/perfc.h>
37 #define BUG_TRAP ASSERT
38 #define notifier_call_chain(_a,_b,_c) ((void)0)
39 #define rtmsg_ifinfo(_a,_b,_c) ((void)0)
40 #define rtnl_lock() ((void)0)
41 #define rtnl_unlock() ((void)0)
43 #if 0
44 #define DPRINTK(_f, _a...) printk(_f , ## _a)
45 #else
46 #define DPRINTK(_f, _a...) ((void)0)
47 #endif
49 #define TX_RING_INC(_i) (((_i)+1) & (TX_RING_SIZE-1))
50 #define RX_RING_INC(_i) (((_i)+1) & (RX_RING_SIZE-1))
51 #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
52 #define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
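/*
 * NB. These macros rely on TX_RING_SIZE and RX_RING_SIZE being powers of
 * two: masking with (SIZE-1) is then equivalent to taking the index modulo
 * the ring size, giving cheap wrap-around arithmetic.
 */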
54 static struct sk_buff_head rx_skb_queue[NR_CPUS] __cacheline_aligned;
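/*
 * Per-CPU backlog of received skbs: netif_rx() appends to the local CPU's
 * queue with interrupts disabled and raises NET_RX_SOFTIRQ; net_rx_action()
 * drains the queue later in softirq context.
 */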
56 static int get_tx_bufs(net_vif_t *vif);
58 static void __make_tx_response(net_vif_t *vif,
59 unsigned short id,
60 unsigned char st);
61 static void make_rx_response(net_vif_t *vif,
62 unsigned short id,
63 unsigned short size,
64 unsigned char st,
65 unsigned char off);
67 struct net_device *the_dev = NULL;
69 /*
70 * Transmitted packets are fragmented, so we can copy the important headers
71 * before checking them for validity. Avoids need for page protection.
72 */
73 /* Ethernet + IP headers */
74 #define PKT_PROT_LEN (ETH_HLEN + 20)
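/*
 * ETH_HLEN (14 bytes) plus 20 bytes covers the Ethernet header and a
 * minimal, option-less IPv4 header -- the portion of each transmitted
 * packet that is copied out of the guest buffer and inspected.
 */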
75 static kmem_cache_t *net_header_cachep;
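/*
 * Slab cache for those copied headers: get_tx_bufs() allocates one buffer
 * for each packet queued for the physical NIC, and tx_skb_release() returns
 * it to the cache once the skb has been freed.
 */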
77 /**
78 * __dev_get_by_name - find a device by its name
79 * @name: name to find
80 *
81 * Find an interface by name. Must be called under RTNL semaphore
82 * or @dev_base_lock. If the name is found a pointer to the device
83 * is returned. If the name is not found then %NULL is returned. The
84 * reference counters are not incremented so the caller must be
85 * careful with locks.
86 */
89 struct net_device *__dev_get_by_name(const char *name)
90 {
91 struct net_device *dev;
93 for (dev = dev_base; dev != NULL; dev = dev->next) {
94 if (strncmp(dev->name, name, IFNAMSIZ) == 0)
95 return dev;
96 }
97 return NULL;
98 }
100 /**
101 * dev_get_by_name - find a device by its name
102 * @name: name to find
103 *
104 * Find an interface by name. This can be called from any
105 * context and does its own locking. The returned handle has
106 * the usage count incremented and the caller must use dev_put() to
107 * release it when it is no longer needed. %NULL is returned if no
108 * matching device is found.
109 */
111 struct net_device *dev_get_by_name(const char *name)
112 {
113 struct net_device *dev;
115 read_lock(&dev_base_lock);
116 dev = __dev_get_by_name(name);
117 if (dev)
118 dev_hold(dev);
119 read_unlock(&dev_base_lock);
120 return dev;
121 }
123 /**
124 * dev_get - test if a device exists
125 * @name: name to test for
126 *
127 * Test if a name exists. Returns true if the name is found. In order
128 * to be sure the name is not allocated or removed during the test the
129 * caller must hold the rtnl semaphore.
130 *
131 * This function primarily exists for back compatibility with older
132 * drivers.
133 */
135 int dev_get(const char *name)
136 {
137 struct net_device *dev;
139 read_lock(&dev_base_lock);
140 dev = __dev_get_by_name(name);
141 read_unlock(&dev_base_lock);
142 return dev != NULL;
143 }
145 /**
146 * __dev_get_by_index - find a device by its ifindex
147 * @ifindex: index of device
148 *
149 * Search for an interface by index. Returns a pointer to the device,
150 * or %NULL if it is not found. The device has not
151 * had its reference counter increased so the caller must be careful
152 * about locking. The caller must hold either the RTNL semaphore
153 * or @dev_base_lock.
154 */
156 struct net_device * __dev_get_by_index(int ifindex)
157 {
158 struct net_device *dev;
160 for (dev = dev_base; dev != NULL; dev = dev->next) {
161 if (dev->ifindex == ifindex)
162 return dev;
163 }
164 return NULL;
165 }
168 /**
169 * dev_get_by_index - find a device by its ifindex
170 * @ifindex: index of device
171 *
172 * Search for an interface by index. Returns a pointer to the device,
173 * or NULL if it is not found. The device returned has
174 * had a reference added and the pointer is safe until the user calls
175 * dev_put to indicate they have finished with it.
176 */
178 struct net_device * dev_get_by_index(int ifindex)
179 {
180 struct net_device *dev;
182 read_lock(&dev_base_lock);
183 dev = __dev_get_by_index(ifindex);
184 if (dev)
185 dev_hold(dev);
186 read_unlock(&dev_base_lock);
187 return dev;
188 }
190 /**
191 * dev_getbyhwaddr - find a device by its hardware address
192 * @type: media type of device
193 * @ha: hardware address
194 *
195 * Search for an interface by MAC address. Returns a pointer to the device,
196 * or NULL if it is not found. The caller must hold the
197 * rtnl semaphore. The returned device has not had its ref count increased
198 * and the caller must therefore be careful about locking
199 *
200 * BUGS:
201 * If the API was consistent this would be __dev_get_by_hwaddr
202 */
204 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
205 {
206 struct net_device *dev;
208 for (dev = dev_base; dev != NULL; dev = dev->next) {
209 if (dev->type == type &&
210 memcmp(dev->dev_addr, ha, dev->addr_len) == 0)
211 return dev;
212 }
213 return NULL;
214 }
216 /**
217 * dev_alloc_name - allocate a name for a device
218 * @dev: device
219 * @name: name format string
220 *
221 * Passed a format string - eg "lt%d" - it will try and find a suitable
222 * id. Not efficient for many devices, not called a lot. The caller
223 * must hold the dev_base or rtnl lock while allocating the name and
224 * adding the device in order to avoid duplicates. Returns the number
225 * of the unit assigned or a negative errno code.
226 */
228 int dev_alloc_name(struct net_device *dev, const char *name)
229 {
230 int i;
231 char buf[32];
232 char *p;
234 /*
235 * Verify the string as this thing may have come from
236 * the user. There must be either one "%d" and no other "%"
237 * characters, or no "%" characters at all.
238 */
239 p = strchr(name, '%');
240 if (p && (p[1] != 'd' || strchr(p+2, '%')))
241 return -EINVAL;
243 /*
244 * If you need over 100 please also fix the algorithm...
245 */
246 for (i = 0; i < 100; i++) {
247 snprintf(buf,sizeof(buf),name,i);
248 if (__dev_get_by_name(buf) == NULL) {
249 strcpy(dev->name, buf);
250 return i;
251 }
252 }
253 return -ENFILE; /* Over 100 of the things .. bail out! */
254 }
256 /**
257 * dev_alloc - allocate a network device and name
258 * @name: name format string
259 * @err: error return pointer
260 *
261 * Passed a format string, eg. "lt%d", it will allocate a network device
262 * and space for the name. %NULL is returned if no memory is available.
263 * If the allocation succeeds then the name is assigned and the
264 * device pointer returned. %NULL is returned if the name allocation
265 * failed. The cause of an error is returned as a negative errno code
266 * in the variable @err points to.
267 *
268 * The caller must hold the @dev_base or RTNL locks when doing this in
269 * order to avoid duplicate name allocations.
270 */
272 struct net_device *dev_alloc(const char *name, int *err)
273 {
274 struct net_device *dev=kmalloc(sizeof(struct net_device), GFP_KERNEL);
275 if (dev == NULL) {
276 *err = -ENOBUFS;
277 return NULL;
278 }
279 memset(dev, 0, sizeof(struct net_device));
280 *err = dev_alloc_name(dev, name);
281 if (*err < 0) {
282 kfree(dev);
283 return NULL;
284 }
285 return dev;
286 }
288 /**
289 * netdev_state_change - device changes state
290 * @dev: device to cause notification
291 *
292 * Called to indicate a device has changed state. This function calls
293 * the notifier chains for netdev_chain and sends a NEWLINK message
294 * to the routing socket.
295 */
297 void netdev_state_change(struct net_device *dev)
298 {
299 if (dev->flags&IFF_UP) {
300 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
301 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
302 }
303 }
306 #ifdef CONFIG_KMOD
308 /**
309 * dev_load - load a network module
310 * @name: name of interface
311 *
312 * If a network interface is not present and the process has suitable
313 * privileges this function loads the module. If module loading is not
314 * available in this kernel then it becomes a nop.
315 */
317 void dev_load(const char *name)
318 {
319 if (!dev_get(name) && capable(CAP_SYS_MODULE))
320 request_module(name);
321 }
323 #else
325 extern inline void dev_load(const char *unused){;}
327 #endif
329 static int default_rebuild_header(struct sk_buff *skb)
330 {
331 printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
332 skb->dev ? skb->dev->name : "NULL!!!");
333 kfree_skb(skb);
334 return 1;
335 }
337 /**
338 * dev_open - prepare an interface for use.
339 * @dev: device to open
340 *
341 * Takes a device from down to up state. The device's private open
342 * function is invoked and then the multicast lists are loaded. Finally
343 * the device is moved into the up state and a %NETDEV_UP message is
344 * sent to the netdev notifier chain.
345 *
346 * Calling this function on an active interface is a nop. On a failure
347 * a negative errno code is returned.
348 */
350 int dev_open(struct net_device *dev)
351 {
352 int ret = 0;
354 /*
355 * Is it already up?
356 */
358 if (dev->flags&IFF_UP)
359 return 0;
361 /*
362 * Is it even present?
363 */
364 if (!netif_device_present(dev))
365 return -ENODEV;
367 /*
368 * Call device private open method
369 */
370 if (try_inc_mod_count(dev->owner)) {
371 if (dev->open) {
372 ret = dev->open(dev);
373 if (ret != 0 && dev->owner)
374 __MOD_DEC_USE_COUNT(dev->owner);
375 }
376 } else {
377 ret = -ENODEV;
378 }
380 /*
381 * If it went open OK then:
382 */
384 if (ret == 0)
385 {
386 /*
387 * Set the flags.
388 */
389 dev->flags |= IFF_UP;
391 set_bit(__LINK_STATE_START, &dev->state);
393 /*
394 * Initialize multicasting status
395 */
396 dev_mc_upload(dev);
398 /*
399 * Wakeup transmit queue engine
400 */
401 dev_activate(dev);
403 /*
404 * ... and announce new interface.
405 */
406 notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
407 }
408 return(ret);
409 }
412 /**
413 * dev_close - shutdown an interface.
414 * @dev: device to shutdown
415 *
416 * This function moves an active device into down state. A
417 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
418 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
419 * chain.
420 */
422 int dev_close(struct net_device *dev)
423 {
424 if (!(dev->flags&IFF_UP))
425 return 0;
427 /*
428 * Tell people we are going down, so that they can
429 * prepare for it while the device is still operating.
430 */
431 notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
433 dev_deactivate(dev);
435 clear_bit(__LINK_STATE_START, &dev->state);
437 /*
438 * Call the device specific close. This cannot fail.
439 * Only if device is UP
440 *
441 * We allow it to be called even after a DETACH hot-plug
442 * event.
443 */
445 if (dev->stop)
446 dev->stop(dev);
448 /*
449 * Device is now down.
450 */
452 dev->flags &= ~IFF_UP;
454 /*
455 * Tell people we are down
456 */
457 notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
459 /*
460 * Drop the module refcount
461 */
462 if (dev->owner)
463 __MOD_DEC_USE_COUNT(dev->owner);
465 return(0);
466 }
469 #ifdef CONFIG_HIGHMEM
470 /* Actually, we should eliminate this check as soon as we know that:
471 * 1. An IOMMU is present and allows all of memory to be mapped.
472 * 2. No high memory really exists on this machine.
473 */
475 static inline int
476 illegal_highdma(struct net_device *dev, struct sk_buff *skb)
477 {
478 int i;
480 if (dev->features&NETIF_F_HIGHDMA)
481 return 0;
483 for (i=0; i<skb_shinfo(skb)->nr_frags; i++)
484 if (skb_shinfo(skb)->frags[i].page >= highmem_start_page)
485 return 1;
487 return 0;
488 }
489 #else
490 #define illegal_highdma(dev, skb) (0)
491 #endif
494 /*=======================================================================
495 Receiver routines
496 =======================================================================*/
498 struct netif_rx_stats netdev_rx_stat[NR_CPUS];
500 void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
501 {
502 rx_shadow_entry_t *rx;
503 unsigned long *ptep;
504 struct pfn_info *old_page, *new_page, *pte_page;
505 unsigned int i;
506 unsigned short size;
507 unsigned char offset, status = RING_STATUS_OK;
509 memcpy(skb->mac.ethernet->h_dest, vif->vmac, ETH_ALEN);
510 if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
511 memcpy(skb->nh.raw + 18, vif->vmac, ETH_ALEN);
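/*
 * Delivery is zero-copy by page flipping: the frame currently holding the
 * packet (skb->pf) is mapped into the guest in place of the buffer page it
 * posted on its rx ring, and the guest's original buffer page is taken
 * over by the skbuff instead.
 */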
513 /*
514 * Slightly gross: we need the page_lock so that we can do PTE checking.
515 * However, we take it slightly early so that it can protect the update
516 * of rx_cons. This saves us from grabbing two locks.
517 */
518 spin_lock(&vif->domain->page_lock);
520 if ( (i = vif->rx_cons) == vif->rx_prod )
521 {
522 spin_unlock(&vif->domain->page_lock);
523 perfc_incr(net_rx_capacity_drop);
524 return;
525 }
526 rx = vif->rx_shadow_ring + i;
527 vif->rx_cons = RX_RING_INC(i);
529 size = (unsigned short)skb->len;
530 offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK);
532 /* Release the page-table page. */
533 pte_page = frame_table + (rx->pte_ptr >> PAGE_SHIFT);
534 put_page_type(pte_page);
535 put_page_tot(pte_page);
537 old_page = frame_table + rx->buf_pfn;
538 new_page = skb->pf;
540 ptep = map_domain_mem(rx->pte_ptr);
542 if ( (*ptep & _PAGE_PRESENT) )
543 {
544 /* Bail out if the PTE has been reused under our feet. */
545 list_add(&old_page->list, &vif->domain->pg_head);
546 old_page->flags = vif->domain->domain;
547 unmap_domain_mem(ptep);
548 spin_unlock(&vif->domain->page_lock);
549 status = RING_STATUS_BAD_PAGE;
550 goto out;
551 }
553 /* Give the new page to the domain, marking it writeable. */
554 new_page->tot_count = new_page->type_count = 1;
555 new_page->flags = vif->domain->domain | PGT_writeable_page | PG_need_flush;
556 list_add(&new_page->list, &vif->domain->pg_head);
558 /* Patch the PTE to map the new page as writeable. */
559 machine_to_phys_mapping[new_page - frame_table]
560 = machine_to_phys_mapping[old_page - frame_table];
561 *ptep = (*ptep & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
562 (((new_page - frame_table) << PAGE_SHIFT) & PAGE_MASK);
564 unmap_domain_mem(ptep);
566 spin_unlock(&vif->domain->page_lock);
568 /* Our skbuff now points at the guest's old frame. */
569 skb->pf = old_page;
571 /* Updates must happen before releasing the descriptor. */
572 smp_wmb();
574 /*
575 * NB. The remote flush here should be safe, as we hold no locks. The
576 * network driver that called us should also have no nasty locks.
577 */
578 if ( rx->flush_count == (unsigned short)
579 atomic_read(&tlb_flush_count[vif->domain->processor]) )
580 {
581 perfc_incr(net_rx_tlbflush);
582 flush_tlb_cpu(vif->domain->processor);
583 }
585 perfc_incr(net_rx_delivered);
587 /* record this so they can be billed */
588 vif->total_packets_received++;
589 vif->total_bytes_received += size;
591 out:
592 make_rx_response(vif, rx->id, size, status, offset);
593 }
595 /**
596 * netif_rx - post buffer to the network code
597 * @skb: buffer to post
598 *
599 * This function receives a packet from a device driver and queues it for
600 * the upper (protocol) levels to process. It always succeeds. The buffer
601 * may be dropped during processing for congestion control or by the
602 * protocol layers.
603 *
604 * return values:
605 * NET_RX_SUCCESS (no congestion)
606 * NET_RX_DROP (packet was dropped)
607 */
609 int netif_rx(struct sk_buff *skb)
610 {
611 int this_cpu = smp_processor_id();
612 struct sk_buff_head *q = &rx_skb_queue[this_cpu];
613 unsigned long flags;
615 /* This oughtn't to happen, really! */
616 if ( unlikely(skb_queue_len(q) > 100) )
617 {
618 perfc_incr(net_rx_congestion_drop);
619 return NET_RX_DROP;
620 }
622 local_irq_save(flags);
623 __skb_queue_tail(q, skb);
624 local_irq_restore(flags);
626 __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
628 return NET_RX_SUCCESS;
629 }
631 static void net_rx_action(struct softirq_action *h)
632 {
633 int offset, this_cpu = smp_processor_id();
634 struct sk_buff_head *q = &rx_skb_queue[this_cpu];
635 struct sk_buff *skb;
637 local_irq_disable();
639 while ( (skb = __skb_dequeue(q)) != NULL )
640 {
641 ASSERT(skb->skb_type == SKB_ZERO_COPY);
643 /*
644 * Offset will include 16 bytes padding from dev_alloc_skb, 14 bytes
645 * for ethernet header, plus any other alignment padding added by the
646 * driver.
647 */
648 offset = (int)skb->data & ~PAGE_MASK;
649 skb->head = (u8 *)map_domain_mem(((skb->pf - frame_table) <<
650 PAGE_SHIFT));
651 skb->data = skb->nh.raw = skb->head + offset;
652 skb->tail = skb->data + skb->len;
653 skb_push(skb, ETH_HLEN);
654 skb->mac.raw = skb->data;
656 netdev_rx_stat[this_cpu].total++;
658 if ( skb->dst_vif == NULL )
659 skb->dst_vif = net_get_target_vif(
660 skb->data, skb->len, skb->src_vif);
662 if ( !VIF_LOCAL(skb->dst_vif) )
663 skb->dst_vif = find_vif_by_id(0);
665 deliver_packet(skb, skb->dst_vif);
666 put_vif(skb->dst_vif);
668 unmap_domain_mem(skb->head);
669 kfree_skb(skb);
670 }
672 local_irq_enable();
673 }
676 /*************************************************************
677 * NEW TRANSMIT SCHEDULER
678 *
679 * NB. We ought also to only send a limited number of bytes to the NIC
680 * for transmission at any one time (to avoid head-of-line blocking).
681 * However, driver rings are small enough that they provide a reasonable
682 * limit.
683 *
684 * eg. 3c905 has 16 descriptors == 8 packets, at 100Mbps
685 * e1000 has 256 descriptors == 128 packets, at 1000Mbps
686 * tg3 has 512 descriptors == 256 packets, at 1000Mbps
687 *
688 * So, worst case is tg3 with 256 1500-byte packets == 375kB.
689 * This would take 3ms, and represents our worst-case HoL blocking cost.
690 *
691 * We think this is reasonable.
692 */
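/* (Worked through: 256 packets * 1500 bytes = 384,000 bytes ~= 375kB, and
 * at 1000Mbps that is 384,000 * 8 / 10^9 ~= 3.1ms -- the 3ms quoted above.) */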
694 struct list_head net_schedule_list;
695 spinlock_t net_schedule_list_lock;
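/*
 * List of vifs with pending transmit work, serviced round-robin by
 * net_tx_action(). A vif is off the list iff vif->list.next is NULL:
 * that is what __on_net_schedule_list() tests and what
 * remove_from_net_schedule_list() restores.
 */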
697 static int __on_net_schedule_list(net_vif_t *vif)
698 {
699 return vif->list.next != NULL;
700 }
702 static void remove_from_net_schedule_list(net_vif_t *vif)
703 {
704 unsigned long flags;
705 spin_lock_irqsave(&net_schedule_list_lock, flags);
706 ASSERT(__on_net_schedule_list(vif));
707 list_del(&vif->list);
708 vif->list.next = NULL;
709 put_vif(vif);
710 spin_unlock_irqrestore(&net_schedule_list_lock, flags);
711 }
713 static void add_to_net_schedule_list_tail(net_vif_t *vif)
714 {
715 unsigned long flags;
716 if ( __on_net_schedule_list(vif) ) return;
717 spin_lock_irqsave(&net_schedule_list_lock, flags);
718 if ( !__on_net_schedule_list(vif) )
719 {
720 list_add_tail(&vif->list, &net_schedule_list);
721 get_vif(vif);
722 }
723 spin_unlock_irqrestore(&net_schedule_list_lock, flags);
724 }
727 static void tx_skb_release(struct sk_buff *skb);
729 static void net_tx_action(unsigned long unused)
730 {
731 struct net_device *dev = the_dev;
732 struct list_head *ent;
733 struct sk_buff *skb;
734 net_vif_t *vif;
735 tx_shadow_entry_t *tx;
737 spin_lock(&dev->xmit_lock);
738 while ( !netif_queue_stopped(dev) &&
739 !list_empty(&net_schedule_list) )
740 {
741 /* Get a vif from the list with work to do. */
742 ent = net_schedule_list.next;
743 vif = list_entry(ent, net_vif_t, list);
744 get_vif(vif);
745 remove_from_net_schedule_list(vif);
747 /* Check whether there are packets to be transmitted. */
748 if ( (vif->tx_cons == vif->tx_prod) && !get_tx_bufs(vif) )
749 {
750 put_vif(vif);
751 continue;
752 }
754 add_to_net_schedule_list_tail(vif);
756 if ( (skb = alloc_skb_nodata(GFP_ATOMIC)) == NULL )
757 {
758 printk("Out of memory in net_tx_action()!\n");
759 add_to_net_schedule_list_tail(vif);
760 put_vif(vif);
761 break;
762 }
764 /* Pick an entry from the transmit queue. */
765 tx = &vif->tx_shadow_ring[vif->tx_cons];
766 vif->tx_cons = TX_RING_INC(vif->tx_cons);
768 skb->destructor = tx_skb_release;
770 skb->head = skb->data = tx->header;
771 skb->end = skb->tail = skb->head + PKT_PROT_LEN;
773 skb->dev = the_dev;
774 skb->src_vif = vif;
775 skb->dst_vif = NULL;
776 skb->mac.raw = skb->data;
777 skb->guest_id = tx->id;
779 skb_shinfo(skb)->frags[0].page = frame_table +
780 (tx->payload >> PAGE_SHIFT);
781 skb_shinfo(skb)->frags[0].size = tx->size - PKT_PROT_LEN;
782 skb_shinfo(skb)->frags[0].page_offset = tx->payload & ~PAGE_MASK;
783 skb_shinfo(skb)->nr_frags = 1;
785 skb->data_len = tx->size - PKT_PROT_LEN;
786 skb->len = tx->size;
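/*
 * The skb's linear area holds only the PKT_PROT_LEN header copied by
 * get_tx_bufs(); the remainder of the guest's payload is attached as a
 * single page fragment still sitting in the guest's own frame (it is only
 * linearised below if the NIC cannot handle scatter-gather).
 */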
788 /* record the transmission so they can be billed */
789 vif->total_packets_sent++;
790 vif->total_bytes_sent += tx->size;
792 /* Is the NIC crap? */
793 if ( !(dev->features & NETIF_F_SG) )
794 skb_linearize(skb, GFP_KERNEL);
796 /* Transmit should always work, or the queue would be stopped. */
797 if ( dev->hard_start_xmit(skb, dev) != 0 )
798 {
799 printk("Weird failure in hard_start_xmit!\n");
800 kfree_skb(skb);
801 break;
802 }
804 perfc_incr(net_tx_transmitted);
805 }
806 spin_unlock(&dev->xmit_lock);
807 }
809 DECLARE_TASKLET_DISABLED(net_tx_tasklet, net_tx_action, 0);
811 static inline void maybe_schedule_tx_action(void)
812 {
813 smp_mb();
814 if ( !netif_queue_stopped(the_dev) &&
815 !list_empty(&net_schedule_list) )
816 tasklet_schedule(&net_tx_tasklet);
817 }
820 /* Destructor function for tx skbs. */
821 static void tx_skb_release(struct sk_buff *skb)
822 {
823 int i;
824 net_vif_t *vif = skb->src_vif;
825 unsigned long flags;
827 spin_lock_irqsave(&vif->domain->page_lock, flags);
828 for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
829 put_page_tot(skb_shinfo(skb)->frags[i].page);
830 spin_unlock_irqrestore(&vif->domain->page_lock, flags);
832 if ( skb->skb_type == SKB_NODATA )
833 kmem_cache_free(net_header_cachep, skb->head);
835 skb_shinfo(skb)->nr_frags = 0;
837 spin_lock_irqsave(&vif->tx_lock, flags);
838 __make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
839 spin_unlock_irqrestore(&vif->tx_lock, flags);
841 /*
842 * Checks below must happen after the above response is posted.
843 * This avoids a possible race with a guest OS on another CPU.
844 */
845 smp_mb();
847 if ( (vif->tx_cons == vif->tx_prod) && get_tx_bufs(vif) )
848 {
849 add_to_net_schedule_list_tail(vif);
850 maybe_schedule_tx_action();
851 }
853 put_vif(vif);
854 }
857 /*
858 * We need this ioctl for efficient implementation of the
859 * if_indextoname() function required by the IPv6 API. Without
860 * it, we would have to search all the interfaces to find a
861 * match. --pb
862 */
864 static int dev_ifname(struct ifreq *arg)
865 {
866 struct net_device *dev;
867 struct ifreq ifr;
869 /*
870 * Fetch the caller's info block.
871 */
873 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
874 return -EFAULT;
876 read_lock(&dev_base_lock);
877 dev = __dev_get_by_index(ifr.ifr_ifindex);
878 if (!dev) {
879 read_unlock(&dev_base_lock);
880 return -ENODEV;
881 }
883 strcpy(ifr.ifr_name, dev->name);
884 read_unlock(&dev_base_lock);
886 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
887 return -EFAULT;
888 return 0;
889 }
892 /**
893 * netdev_set_master - set up master/slave pair
894 * @slave: slave device
895 * @master: new master device
896 *
897 * Changes the master device of the slave. Pass %NULL to break the
898 * bonding. The caller must hold the RTNL semaphore. On a failure
899 * a negative errno code is returned. On success the reference counts
900 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
901 * function returns zero.
902 */
904 int netdev_set_master(struct net_device *slave, struct net_device *master)
905 {
906 struct net_device *old = slave->master;
908 if (master) {
909 if (old)
910 return -EBUSY;
911 dev_hold(master);
912 }
914 br_write_lock_bh(BR_NETPROTO_LOCK);
915 slave->master = master;
916 br_write_unlock_bh(BR_NETPROTO_LOCK);
918 if (old)
919 dev_put(old);
921 if (master)
922 slave->flags |= IFF_SLAVE;
923 else
924 slave->flags &= ~IFF_SLAVE;
926 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
927 return 0;
928 }
930 /**
931 * dev_set_promiscuity - update promiscuity count on a device
932 * @dev: device
933 * @inc: modifier
934 *
935 * Add or remove promiscuity from a device. While the count in the device
936 * remains above zero the interface remains promiscuous. Once it hits zero
937 * the device reverts back to normal filtering operation. A negative inc
938 * value is used to drop promiscuity on the device.
939 */
941 void dev_set_promiscuity(struct net_device *dev, int inc)
942 {
943 unsigned short old_flags = dev->flags;
945 dev->flags |= IFF_PROMISC;
946 if ((dev->promiscuity += inc) == 0)
947 dev->flags &= ~IFF_PROMISC;
948 if (dev->flags^old_flags) {
949 #ifdef CONFIG_NET_FASTROUTE
950 if (dev->flags&IFF_PROMISC) {
951 netdev_fastroute_obstacles++;
952 dev_clear_fastroute(dev);
953 } else
954 netdev_fastroute_obstacles--;
955 #endif
956 dev_mc_upload(dev);
957 printk(KERN_INFO "device %s %s promiscuous mode\n",
958 dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left");
959 }
960 }
962 /**
963 * dev_set_allmulti - update allmulti count on a device
964 * @dev: device
965 * @inc: modifier
966 *
967 * Add or remove reception of all multicast frames to a device. While the
968 * count in the device remains above zero the interface remains listening
969 * to all multicast frames. Once it hits zero the device reverts back to normal
970 * filtering operation. A negative @inc value is used to drop the counter
971 * when releasing a resource needing all multicasts.
972 */
974 void dev_set_allmulti(struct net_device *dev, int inc)
975 {
976 unsigned short old_flags = dev->flags;
978 dev->flags |= IFF_ALLMULTI;
979 if ((dev->allmulti += inc) == 0)
980 dev->flags &= ~IFF_ALLMULTI;
981 if (dev->flags^old_flags)
982 dev_mc_upload(dev);
983 }
985 int dev_change_flags(struct net_device *dev, unsigned flags)
986 {
987 int ret;
988 int old_flags = dev->flags;
990 /*
991 * Set the flags on our device.
992 */
994 dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP|IFF_DYNAMIC|
995 IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) |
996 (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI));
998 /*
999 * Load in the correct multicast list now the flags have changed.
1000 */
1002 dev_mc_upload(dev);
1004 /*
1005 * Have we downed the interface. We handle IFF_UP ourselves
1006 * according to user attempts to set it, rather than blindly
1007 * setting it.
1008 */
1010 ret = 0;
1011 if ((old_flags^flags)&IFF_UP) /* Bit is different ? */
1013 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
1015 if (ret == 0)
1016 dev_mc_upload(dev);
1019 if (dev->flags&IFF_UP &&
1020 ((old_flags^dev->flags)&
1021 ~(IFF_UP|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE)))
1022 notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
1024 if ((flags^dev->gflags)&IFF_PROMISC) {
1025 int inc = (flags&IFF_PROMISC) ? +1 : -1;
1026 dev->gflags ^= IFF_PROMISC;
1027 dev_set_promiscuity(dev, inc);
1030 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
1031 is important. Some (broken) drivers set IFF_PROMISC when
1032 IFF_ALLMULTI is requested, without asking us and without reporting it.
1033 */
1034 if ((flags^dev->gflags)&IFF_ALLMULTI) {
1035 int inc = (flags&IFF_ALLMULTI) ? +1 : -1;
1036 dev->gflags ^= IFF_ALLMULTI;
1037 dev_set_allmulti(dev, inc);
1040 if (old_flags^dev->flags)
1041 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags^dev->flags);
1043 return ret;
1046 /*
1047 * Perform the SIOCxIFxxx calls.
1048 */
1050 static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
1052 struct net_device *dev;
1053 int err;
1055 if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL)
1056 return -ENODEV;
1058 switch(cmd)
1060 case SIOCGIFFLAGS: /* Get interface flags */
1061 ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI|IFF_RUNNING))
1062 |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI));
1063 if (netif_running(dev) && netif_carrier_ok(dev))
1064 ifr->ifr_flags |= IFF_RUNNING;
1065 return 0;
1067 case SIOCSIFFLAGS: /* Set interface flags */
1068 return dev_change_flags(dev, ifr->ifr_flags);
1070 case SIOCGIFMETRIC: /* Get the metric on the interface */
1071 ifr->ifr_metric = 0;
1072 return 0;
1074 case SIOCSIFMETRIC: /* Set the metric on the interface */
1075 return -EOPNOTSUPP;
1077 case SIOCGIFMTU: /* Get the MTU of a device */
1078 ifr->ifr_mtu = dev->mtu;
1079 return 0;
1081 case SIOCSIFMTU: /* Set the MTU of a device */
1082 if (ifr->ifr_mtu == dev->mtu)
1083 return 0;
1085 /*
1086 * MTU must be positive.
1087 */
1089 if (ifr->ifr_mtu<0)
1090 return -EINVAL;
1092 if (!netif_device_present(dev))
1093 return -ENODEV;
1095 if (dev->change_mtu)
1096 err = dev->change_mtu(dev, ifr->ifr_mtu);
1097 else {
1098 dev->mtu = ifr->ifr_mtu;
1099 err = 0;
1101 if (!err && dev->flags&IFF_UP)
1102 notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev);
1103 return err;
1105 case SIOCGIFHWADDR:
1106 memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN);
1107 ifr->ifr_hwaddr.sa_family=dev->type;
1108 return 0;
1110 case SIOCSIFHWADDR:
1111 if (dev->set_mac_address == NULL)
1112 return -EOPNOTSUPP;
1113 if (ifr->ifr_hwaddr.sa_family!=dev->type)
1114 return -EINVAL;
1115 if (!netif_device_present(dev))
1116 return -ENODEV;
1117 err = dev->set_mac_address(dev, &ifr->ifr_hwaddr);
1118 if (!err)
1119 notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
1120 return err;
1122 case SIOCSIFHWBROADCAST:
1123 if (ifr->ifr_hwaddr.sa_family!=dev->type)
1124 return -EINVAL;
1125 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN);
1126 notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
1127 return 0;
1129 case SIOCGIFMAP:
1130 ifr->ifr_map.mem_start=dev->mem_start;
1131 ifr->ifr_map.mem_end=dev->mem_end;
1132 ifr->ifr_map.base_addr=dev->base_addr;
1133 ifr->ifr_map.irq=dev->irq;
1134 ifr->ifr_map.dma=dev->dma;
1135 ifr->ifr_map.port=dev->if_port;
1136 return 0;
1138 case SIOCSIFMAP:
1139 if (dev->set_config) {
1140 if (!netif_device_present(dev))
1141 return -ENODEV;
1142 return dev->set_config(dev,&ifr->ifr_map);
1144 return -EOPNOTSUPP;
1146 case SIOCADDMULTI:
1147 if (dev->set_multicast_list == NULL ||
1148 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
1149 return -EINVAL;
1150 if (!netif_device_present(dev))
1151 return -ENODEV;
1152 dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1);
1153 return 0;
1155 case SIOCDELMULTI:
1156 if (dev->set_multicast_list == NULL ||
1157 ifr->ifr_hwaddr.sa_family!=AF_UNSPEC)
1158 return -EINVAL;
1159 if (!netif_device_present(dev))
1160 return -ENODEV;
1161 dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1);
1162 return 0;
1164 case SIOCGIFINDEX:
1165 ifr->ifr_ifindex = dev->ifindex;
1166 return 0;
1168 case SIOCSIFNAME:
1169 if (dev->flags&IFF_UP)
1170 return -EBUSY;
1171 if (__dev_get_by_name(ifr->ifr_newname))
1172 return -EEXIST;
1173 memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ);
1174 dev->name[IFNAMSIZ-1] = 0;
1175 notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
1176 return 0;
1178 #ifdef WIRELESS_EXT
1179 case SIOCGIWSTATS:
1180 return dev_iwstats(dev, ifr);
1181 #endif /* WIRELESS_EXT */
1183 /*
1184 * Unknown or private ioctl
1185 */
1187 default:
1188 if ((cmd >= SIOCDEVPRIVATE &&
1189 cmd <= SIOCDEVPRIVATE + 15) ||
1190 cmd == SIOCBONDENSLAVE ||
1191 cmd == SIOCBONDRELEASE ||
1192 cmd == SIOCBONDSETHWADDR ||
1193 cmd == SIOCBONDSLAVEINFOQUERY ||
1194 cmd == SIOCBONDINFOQUERY ||
1195 cmd == SIOCBONDCHANGEACTIVE ||
1196 cmd == SIOCETHTOOL ||
1197 cmd == SIOCGMIIPHY ||
1198 cmd == SIOCGMIIREG ||
1199 cmd == SIOCSMIIREG) {
1200 if (dev->do_ioctl) {
1201 if (!netif_device_present(dev))
1202 return -ENODEV;
1203 return dev->do_ioctl(dev, ifr, cmd);
1205 return -EOPNOTSUPP;
1208 #ifdef WIRELESS_EXT
1209 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
1210 if (dev->do_ioctl) {
1211 if (!netif_device_present(dev))
1212 return -ENODEV;
1213 return dev->do_ioctl(dev, ifr, cmd);
1215 return -EOPNOTSUPP;
1217 #endif /* WIRELESS_EXT */
1220 return -EINVAL;
1223 /*
1224 * This function handles all "interface"-type I/O control requests. The actual
1225 * 'doing' part of this is dev_ifsioc above.
1226 */
1228 /**
1229 * dev_ioctl - network device ioctl
1230 * @cmd: command to issue
1231 * @arg: pointer to a struct ifreq in user space
1233 * Issue ioctl functions to devices. This is normally called by the
1234 * user space syscall interfaces but can sometimes be useful for
1235 * other purposes. The return value is the return from the syscall if
1236 * positive or a negative errno code on error.
1237 */
1239 int dev_ioctl(unsigned int cmd, void *arg)
1241 struct ifreq ifr;
1242 int ret;
1243 char *colon;
1245 /* One special case: SIOCGIFCONF takes ifconf argument
1246 and requires shared lock, because it sleeps writing
1247 to user space.
1248 */
1250 if (cmd == SIOCGIFCONF) {
1251 return -ENOSYS;
1253 if (cmd == SIOCGIFNAME) {
1254 return dev_ifname((struct ifreq *)arg);
1257 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
1258 return -EFAULT;
1260 ifr.ifr_name[IFNAMSIZ-1] = 0;
1262 colon = strchr(ifr.ifr_name, ':');
1263 if (colon)
1264 *colon = 0;
1266 /*
1267 * See which interface the caller is talking about.
1268 */
1270 switch(cmd)
1272 /*
1273 * These ioctl calls:
1274 * - can be done by all.
1275 * - atomic and do not require locking.
1276 * - return a value
1277 */
1279 case SIOCGIFFLAGS:
1280 case SIOCGIFMETRIC:
1281 case SIOCGIFMTU:
1282 case SIOCGIFHWADDR:
1283 case SIOCGIFSLAVE:
1284 case SIOCGIFMAP:
1285 case SIOCGIFINDEX:
1286 dev_load(ifr.ifr_name);
1287 read_lock(&dev_base_lock);
1288 ret = dev_ifsioc(&ifr, cmd);
1289 read_unlock(&dev_base_lock);
1290 if (!ret) {
1291 if (colon)
1292 *colon = ':';
1293 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1294 return -EFAULT;
1296 return ret;
1298 /*
1299 * These ioctl calls:
1300 * - require superuser power.
1301 * - require strict serialization.
1302 * - return a value
1303 */
1305 case SIOCETHTOOL:
1306 case SIOCGMIIPHY:
1307 case SIOCGMIIREG:
1308 if (!capable(CAP_NET_ADMIN))
1309 return -EPERM;
1310 dev_load(ifr.ifr_name);
1311 dev_probe_lock();
1312 rtnl_lock();
1313 ret = dev_ifsioc(&ifr, cmd);
1314 rtnl_unlock();
1315 dev_probe_unlock();
1316 if (!ret) {
1317 if (colon)
1318 *colon = ':';
1319 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1320 return -EFAULT;
1322 return ret;
1324 /*
1325 * These ioctl calls:
1326 * - require superuser power.
1327 * - require strict serialization.
1328 * - do not return a value
1329 */
1331 case SIOCSIFFLAGS:
1332 case SIOCSIFMETRIC:
1333 case SIOCSIFMTU:
1334 case SIOCSIFMAP:
1335 case SIOCSIFHWADDR:
1336 case SIOCSIFSLAVE:
1337 case SIOCADDMULTI:
1338 case SIOCDELMULTI:
1339 case SIOCSIFHWBROADCAST:
1340 case SIOCSIFNAME:
1341 case SIOCSMIIREG:
1342 case SIOCBONDENSLAVE:
1343 case SIOCBONDRELEASE:
1344 case SIOCBONDSETHWADDR:
1345 case SIOCBONDSLAVEINFOQUERY:
1346 case SIOCBONDINFOQUERY:
1347 case SIOCBONDCHANGEACTIVE:
1348 if (!capable(CAP_NET_ADMIN))
1349 return -EPERM;
1350 dev_load(ifr.ifr_name);
1351 dev_probe_lock();
1352 rtnl_lock();
1353 ret = dev_ifsioc(&ifr, cmd);
1354 rtnl_unlock();
1355 dev_probe_unlock();
1356 return ret;
1358 case SIOCGIFMEM:
1359 /* Get the per device memory space. We can add this but currently
1360 do not support it */
1361 case SIOCSIFMEM:
1362 /* Set the per device memory buffer space. */
1363 case SIOCSIFLINK:
1364 return -EINVAL;
1366 /*
1367 * Unknown or private ioctl.
1368 */
1370 default:
1371 if (cmd >= SIOCDEVPRIVATE &&
1372 cmd <= SIOCDEVPRIVATE + 15) {
1373 dev_load(ifr.ifr_name);
1374 dev_probe_lock();
1375 rtnl_lock();
1376 ret = dev_ifsioc(&ifr, cmd);
1377 rtnl_unlock();
1378 dev_probe_unlock();
1379 if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1380 return -EFAULT;
1381 return ret;
1383 #ifdef WIRELESS_EXT
1384 /* Take care of Wireless Extensions */
1385 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
1386 /* If command is `set a parameter', or
1387 * `get the encoding parameters', check if
1388 * the user has the right to do it */
1389 if (IW_IS_SET(cmd) || (cmd == SIOCGIWENCODE)) {
1390 if(!capable(CAP_NET_ADMIN))
1391 return -EPERM;
1393 dev_load(ifr.ifr_name);
1394 rtnl_lock();
1395 ret = dev_ifsioc(&ifr, cmd);
1396 rtnl_unlock();
1397 if (!ret && IW_IS_GET(cmd) &&
1398 copy_to_user(arg, &ifr,
1399 sizeof(struct ifreq)))
1400 return -EFAULT;
1401 return ret;
1403 #endif /* WIRELESS_EXT */
1404 return -EINVAL;
1409 /**
1410 * dev_new_index - allocate an ifindex
1412 * Returns a suitable unique value for a new device interface
1413 * number. The caller must hold the rtnl semaphore or the
1414 * dev_base_lock to be sure it remains unique.
1415 */
1417 int dev_new_index(void)
1419 static int ifindex;
1420 for (;;) {
1421 if (++ifindex <= 0)
1422 ifindex=1;
1423 if (__dev_get_by_index(ifindex) == NULL)
1424 return ifindex;
1428 static int dev_boot_phase = 1;
1430 /**
1431 * register_netdevice - register a network device
1432 * @dev: device to register
1434 * Take a completed network device structure and add it to the kernel
1435 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
1436 * chain. 0 is returned on success. A negative errno code is returned
1437 * on a failure to set up the device, or if the name is a duplicate.
1439 * Callers must hold the rtnl semaphore. See the comment at the
1440 * end of Space.c for details about the locking. You may want
1441 * register_netdev() instead of this.
1443 * BUGS:
1444 * The locking appears insufficient to guarantee two parallel registers
1445 * will not get the same name.
1446 */
1448 int net_dev_init(void);
1450 int register_netdevice(struct net_device *dev)
1452 struct net_device *d, **dp;
1453 #ifdef CONFIG_NET_DIVERT
1454 int ret;
1455 #endif
1457 spin_lock_init(&dev->queue_lock);
1458 spin_lock_init(&dev->xmit_lock);
1459 dev->xmit_lock_owner = -1;
1460 #ifdef CONFIG_NET_FASTROUTE
1461 dev->fastpath_lock=RW_LOCK_UNLOCKED;
1462 #endif
1464 if (dev_boot_phase)
1465 net_dev_init();
1467 #ifdef CONFIG_NET_DIVERT
1468 ret = alloc_divert_blk(dev);
1469 if (ret)
1470 return ret;
1471 #endif /* CONFIG_NET_DIVERT */
1473 dev->iflink = -1;
1475 /* Init, if this function is available */
1476 if (dev->init && dev->init(dev) != 0) {
1477 #ifdef CONFIG_NET_DIVERT
1478 free_divert_blk(dev);
1479 #endif
1480 return -EIO;
1483 dev->ifindex = dev_new_index();
1484 if (dev->iflink == -1)
1485 dev->iflink = dev->ifindex;
1487 /* Check for existence, and append to tail of chain */
1488 for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) {
1489 if (d == dev || strcmp(d->name, dev->name) == 0) {
1490 #ifdef CONFIG_NET_DIVERT
1491 free_divert_blk(dev);
1492 #endif
1493 return -EEXIST;
1496 /*
1497 * A nil rebuild_header routine that should never be called;
1498 * it is used just as a bug trap.
1499 */
1501 if (dev->rebuild_header == NULL)
1502 dev->rebuild_header = default_rebuild_header;
1504 /*
1505 * Default initial state at registration is that the
1506 * device is present.
1507 */
1509 set_bit(__LINK_STATE_PRESENT, &dev->state);
1511 dev->next = NULL;
1512 dev_init_scheduler(dev);
1513 write_lock_bh(&dev_base_lock);
1514 *dp = dev;
1515 dev_hold(dev);
1516 dev->deadbeaf = 0;
1517 write_unlock_bh(&dev_base_lock);
1519 /* Notify protocols, that a new device appeared. */
1520 notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
1522 return 0;
1525 /**
1526 * netdev_finish_unregister - complete unregistration
1527 * @dev: device
1529 * Destroy and free a dead device. A value of zero is returned on
1530 * success.
1531 */
1533 int netdev_finish_unregister(struct net_device *dev)
1535 BUG_TRAP(dev->ip_ptr==NULL);
1536 BUG_TRAP(dev->ip6_ptr==NULL);
1537 BUG_TRAP(dev->dn_ptr==NULL);
1539 if (!dev->deadbeaf) {
1540 printk(KERN_ERR "Freeing alive device %p, %s\n",
1541 dev, dev->name);
1542 return 0;
1544 #ifdef NET_REFCNT_DEBUG
1545 printk(KERN_DEBUG "netdev_finish_unregister: %s%s.\n", dev->name,
1546 (dev->features & NETIF_F_DYNALLOC)?"":", old style");
1547 #endif
1548 if (dev->destructor)
1549 dev->destructor(dev);
1550 if (dev->features & NETIF_F_DYNALLOC)
1551 kfree(dev);
1552 return 0;
1555 /**
1556 * unregister_netdevice - remove device from the kernel
1557 * @dev: device
1559 * This function shuts down a device interface and removes it
1560 * from the kernel tables. On success 0 is returned, on a failure
1561 * a negative errno code is returned.
1563 * Callers must hold the rtnl semaphore. See the comment at the
1564 * end of Space.c for details about the locking. You may want
1565 * unregister_netdev() instead of this.
1566 */
1568 int unregister_netdevice(struct net_device *dev)
1570 unsigned long now, warning_time;
1571 struct net_device *d, **dp;
1573 /* If device is running, close it first. */
1574 if (dev->flags & IFF_UP)
1575 dev_close(dev);
1577 BUG_TRAP(dev->deadbeaf==0);
1578 dev->deadbeaf = 1;
1580 /* And unlink it from device chain. */
1581 for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) {
1582 if (d == dev) {
1583 write_lock_bh(&dev_base_lock);
1584 *dp = d->next;
1585 write_unlock_bh(&dev_base_lock);
1586 break;
1589 if (d == NULL) {
1590 printk(KERN_DEBUG "unregister_netdevice: device %s/%p"
1591 " not registered\n", dev->name, dev);
1592 return -ENODEV;
1595 /* Synchronize to net_rx_action. */
1596 br_write_lock_bh(BR_NETPROTO_LOCK);
1597 br_write_unlock_bh(BR_NETPROTO_LOCK);
1599 if (dev_boot_phase == 0) {
1601 /* Shutdown queueing discipline. */
1602 dev_shutdown(dev);
1604 /* Notify protocols that we are about to destroy
1605 this device. They should clean up all of their state for it.
1606 */
1607 notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
1609 /*
1610 * Flush the multicast chain
1611 */
1612 dev_mc_discard(dev);
1615 if (dev->uninit)
1616 dev->uninit(dev);
1618 /* Notifier chain MUST detach us from master device. */
1619 BUG_TRAP(dev->master==NULL);
1621 #ifdef CONFIG_NET_DIVERT
1622 free_divert_blk(dev);
1623 #endif
1625 if (dev->features & NETIF_F_DYNALLOC) {
1626 #ifdef NET_REFCNT_DEBUG
1627 if (atomic_read(&dev->refcnt) != 1)
1628 printk(KERN_DEBUG "unregister_netdevice: holding %s refcnt=%d\n",
1629 dev->name, atomic_read(&dev->refcnt)-1);
1630 #endif
1631 dev_put(dev);
1632 return 0;
1635 /* Last reference is our one */
1636 if (atomic_read(&dev->refcnt) == 1) {
1637 dev_put(dev);
1638 return 0;
1641 #ifdef NET_REFCNT_DEBUG
1642 printk("unregister_netdevice: waiting %s refcnt=%d\n",
1643 dev->name, atomic_read(&dev->refcnt));
1644 #endif
1646 /* EXPLANATION. If dev->refcnt is not now 1 (our own reference)
1647 it means that someone in the kernel still has a reference
1648 to this device and we cannot release it.
1650 "New style" devices have destructors, hence we can return from this
1651 function and destructor will do all the work later. As of kernel 2.4.0
1652 there are very few "New Style" devices.
1654 "Old style" devices expect that the device is free of any references
1655 upon exit from this function.
1656 We cannot return from this function until all such references have
1657 fallen away. This is because the caller of this function will probably
1658 immediately kfree(*dev) and then be unloaded via sys_delete_module.
1660 So, we linger until all references fall away. The duration of the
1661 linger is basically unbounded! It is driven by, for example, the
1662 current setting of sysctl_ipfrag_time.
1664 After 1 second, we start to rebroadcast unregister notifications
1665 in hope that careless clients will release the device.
1667 */
1669 now = warning_time = jiffies;
1670 while (atomic_read(&dev->refcnt) != 1) {
1671 if ((jiffies - now) > 1*HZ) {
1672 /* Rebroadcast unregister notification */
1673 notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
1675 mdelay(250);
1676 if ((jiffies - warning_time) > 10*HZ) {
1677 printk(KERN_EMERG "unregister_netdevice: waiting for %s to "
1678 "become free. Usage count = %d\n",
1679 dev->name, atomic_read(&dev->refcnt));
1680 warning_time = jiffies;
1683 dev_put(dev);
1684 return 0;
1688 /*
1689 * Initialize the DEV module. At boot time this walks the device list and
1690 * unhooks any devices that fail to initialise (normally hardware not
1691 * present) and leaves us with a valid list of present and active devices.
1693 */
1695 extern void net_device_init(void);
1696 extern void ip_auto_config(void);
1697 #ifdef CONFIG_NET_DIVERT
1698 extern void dv_init(void);
1699 #endif /* CONFIG_NET_DIVERT */
1702 /*
1703 * Callers must hold the rtnl semaphore. See the comment at the
1704 * end of Space.c for details about the locking.
1705 */
1706 int __init net_dev_init(void)
1708 struct net_device *dev, **dp;
1710 if ( !dev_boot_phase )
1711 return 0;
1713 skb_init();
1715 net_header_cachep = kmem_cache_create(
1716 "net_header_cache",
1717 (PKT_PROT_LEN + sizeof(void *) - 1) & ~(sizeof(void *) - 1),
1718 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
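/* The object size is PKT_PROT_LEN rounded up to a multiple of
 * sizeof(void *), so each copied header buffer stays pointer-aligned. */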
1720 spin_lock_init(&net_schedule_list_lock);
1721 INIT_LIST_HEAD(&net_schedule_list);
1723 /*
1724 * Add the devices.
1725 * If the call to dev->init fails, the dev is removed
1726 * from the chain disconnecting the device until the
1727 * next reboot.
1729 * NB At boot phase networking is dead. No locking is required.
1730 * But we still preserve dev_base_lock for sanity.
1731 */
1732 dp = &dev_base;
1733 while ((dev = *dp) != NULL) {
1734 spin_lock_init(&dev->queue_lock);
1735 spin_lock_init(&dev->xmit_lock);
1737 dev->xmit_lock_owner = -1;
1738 dev->iflink = -1;
1739 dev_hold(dev);
1741 /*
1742 * Allocate name. If the init() fails
1743 * the name will be reissued correctly.
1744 */
1745 if (strchr(dev->name, '%'))
1746 dev_alloc_name(dev, dev->name);
1748 if (dev->init && dev->init(dev)) {
1749 /*
1750 * It failed to come up. It will be unhooked later.
1751 * dev_alloc_name can now advance to the next suitable
1752 * name, which is checked next.
1753 */
1754 dev->deadbeaf = 1;
1755 dp = &dev->next;
1756 } else {
1757 dp = &dev->next;
1758 dev->ifindex = dev_new_index();
1759 if (dev->iflink == -1)
1760 dev->iflink = dev->ifindex;
1761 if (dev->rebuild_header == NULL)
1762 dev->rebuild_header = default_rebuild_header;
1763 dev_init_scheduler(dev);
1764 set_bit(__LINK_STATE_PRESENT, &dev->state);
1768 /*
1769 * Unhook devices that failed to come up
1770 */
1771 dp = &dev_base;
1772 while ((dev = *dp) != NULL) {
1773 if (dev->deadbeaf) {
1774 write_lock_bh(&dev_base_lock);
1775 *dp = dev->next;
1776 write_unlock_bh(&dev_base_lock);
1777 dev_put(dev);
1778 } else {
1779 dp = &dev->next;
1783 dev_boot_phase = 0;
1785 dev_mcast_init();
1787 /*
1788 * Initialise network devices
1789 */
1791 net_device_init();
1793 return 0;
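/*
 * init_tx_header: stamp the physical NIC's MAC address into an outgoing
 * frame (the Ethernet source address, plus the ARP sender hardware address
 * for ARP packets) and return the recognised ethertype (ETH_P_ARP or
 * ETH_P_IP), or 0 for anything else, which the caller treats as an error.
 */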
1796 inline int init_tx_header(u8 *data, unsigned int len, struct net_device *dev)
1798 memcpy(data + ETH_ALEN, dev->dev_addr, ETH_ALEN);
1800 switch ( ntohs(*(unsigned short *)(data + 12)) )
1802 case ETH_P_ARP:
1803 if ( len < 42 ) break;
1804 memcpy(data + 22, dev->dev_addr, ETH_ALEN);
1805 return ETH_P_ARP;
1806 case ETH_P_IP:
1807 return ETH_P_IP;
1809 return 0;
1813 static int get_tx_bufs(net_vif_t *vif)
1815 struct task_struct *p = vif->domain;
1816 net_idx_t *shared_idxs = vif->shared_idxs;
1817 net_ring_t *shared_rings = vif->shared_rings;
1818 net_vif_t *target;
1819 unsigned long buf_pfn;
1820 struct pfn_info *buf_page;
1821 u8 *g_data;
1822 unsigned short protocol;
1823 struct sk_buff *skb;
1824 tx_req_entry_t tx;
1825 int i, j, ret;
1826 unsigned long flags;
1828 if ( vif->tx_req_cons == shared_idxs->tx_req_prod )
1829 return 0;
1831 spin_lock_irqsave(&vif->tx_lock, flags);
1833 j = vif->tx_prod;
1835 /*
1836 * Collect up new transmit buffers. We collect up to the guest OS's new
1837 * producer index, but take care not to catch up with our own consumer
1838 * index.
1839 */
1840 again:
1841 for ( i = vif->tx_req_cons;
1842 (i != shared_idxs->tx_req_prod) &&
1843 (((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1);
1844 i = TX_RING_INC(i) )
1846 tx = shared_rings->tx_ring[i].req;
1847 target = VIF_DROP;
1849 if ( (tx.size < PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) )
1851 DPRINTK("Bad packet size: %d\n", tx.size);
1852 __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1853 continue;
1856 /* No crossing a page boundary as the payload mustn't fragment. */
1857 if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE )
1859 DPRINTK("tx.addr: %lx, size: %u, end: %lu\n",
1860 tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size);
1861 __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1862 continue;
1865 buf_pfn = tx.addr >> PAGE_SHIFT;
1866 buf_page = frame_table + buf_pfn;
1867 spin_lock(&p->page_lock);
1868 if ( (buf_pfn >= max_page) ||
1869 ((buf_page->flags & PG_domain_mask) != p->domain) )
1871 DPRINTK("Bad page frame\n");
1872 spin_unlock(&p->page_lock);
1873 __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1874 continue;
1877 g_data = map_domain_mem(tx.addr);
1879 protocol = __constant_htons(
1880 init_tx_header(g_data, tx.size, the_dev));
1881 if ( protocol == 0 )
1883 __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1884 goto tx_unmap_and_continue;
1887 target = net_get_target_vif(g_data, tx.size, vif);
1889 if ( VIF_LOCAL(target) )
1891 /* Local delivery */
1892 if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL )
1894 __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
1895 put_vif(target);
1896 goto tx_unmap_and_continue;
1899 skb->src_vif = vif;
1900 skb->dst_vif = target;
1901 skb->protocol = protocol;
1903 /*
1904 * We don't need a well-formed skb as netif_rx will fill these
1905 * fields in as necessary. All we actually need is the right
1906 * page offset in skb->data, and the right length in skb->len.
1907 * Note that the correct address/length *excludes* link header.
1908 */
1909 skb->head = (u8 *)map_domain_mem(
1910 ((skb->pf - frame_table) << PAGE_SHIFT));
1911 skb->data = skb->head + 18;
1912 memcpy(skb->data, g_data, tx.size);
1913 skb->data += ETH_HLEN;
1914 skb->len = tx.size - ETH_HLEN;
1915 unmap_domain_mem(skb->head);
1917 netif_rx(skb);
1919 __make_tx_response(vif, tx.id, RING_STATUS_OK);
1921 else if ( (target == VIF_PHYS) || IS_PRIV(p) )
1923 vif->tx_shadow_ring[j].id = tx.id;
1924 vif->tx_shadow_ring[j].size = tx.size;
1925 vif->tx_shadow_ring[j].header =
1926 kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
1927 if ( vif->tx_shadow_ring[j].header == NULL )
1929 __make_tx_response(vif, tx.id, RING_STATUS_OK);
1930 goto tx_unmap_and_continue;
1933 memcpy(vif->tx_shadow_ring[j].header, g_data, PKT_PROT_LEN);
1934 vif->tx_shadow_ring[j].payload = tx.addr + PKT_PROT_LEN;
1935 get_page_tot(buf_page);
1936 j = TX_RING_INC(j);
1938 else
1940 __make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
1943 tx_unmap_and_continue:
1944 unmap_domain_mem(g_data);
1945 spin_unlock(&p->page_lock);
1948 /*
1949 * Needed as a final check for req_prod updates on another CPU.
1950 * Also ensures that other CPUs see shadow ring updates.
1951 */
1952 smp_mb();
1954 if ( (vif->tx_req_cons = i) != shared_idxs->tx_req_prod )
1955 goto again;
1957 if ( (ret = (vif->tx_prod != j)) )
1958 vif->tx_prod = j;
1960 spin_unlock_irqrestore(&vif->tx_lock, flags);
1962 return ret;
1966 /*
1967 * do_net_update:
1969 * Called from guest OS to notify updates to its transmit and/or receive
1970 * descriptor rings.
1971 */
1973 long do_net_update(void)
1975 net_ring_t *shared_rings;
1976 net_vif_t *vif;
1977 net_idx_t *shared_idxs;
1978 unsigned int i, j, idx;
1979 rx_req_entry_t rx;
1980 unsigned long pte_pfn, buf_pfn;
1981 struct pfn_info *pte_page, *buf_page;
1982 unsigned long *ptep;
1984 perfc_incr(net_hypercalls);
1986 for ( idx = 0; idx < MAX_DOMAIN_VIFS; idx++ )
1988 if ( (vif = current->net_vif_list[idx]) == NULL )
1989 break;
1991 shared_idxs = vif->shared_idxs;
1992 shared_rings = vif->shared_rings;
1994 /*
1995 * PHASE 1 -- TRANSMIT RING
1996 */
1998 if ( get_tx_bufs(vif) )
2000 add_to_net_schedule_list_tail(vif);
2001 maybe_schedule_tx_action();
2004 /*
2005 * PHASE 2 -- RECEIVE RING
2006 */
2008 /*
2009 * Collect up new receive buffers. We collect up to the guest OS's
2010 * new producer index, but take care not to catch up with our own
2011 * consumer index.
2012 */
2013 j = vif->rx_prod;
2014 for ( i = vif->rx_req_cons;
2015 (i != shared_idxs->rx_req_prod) &&
2016 (((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1);
2017 i = RX_RING_INC(i) )
2019 rx = shared_rings->rx_ring[i].req;
2021 pte_pfn = rx.addr >> PAGE_SHIFT;
2022 pte_page = frame_table + pte_pfn;
2024 spin_lock_irq(&current->page_lock);
2025 if ( (pte_pfn >= max_page) ||
2026 ((pte_page->flags & (PG_type_mask | PG_domain_mask)) !=
2027 (PGT_l1_page_table | current->domain)) )
2029 DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n",
2030 current->domain, pte_pfn, max_page, pte_page->flags);
2031 spin_unlock_irq(&current->page_lock);
2032 make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
2033 continue;
2036 ptep = map_domain_mem(rx.addr);
2038 if ( !(*ptep & _PAGE_PRESENT) )
2040 DPRINTK("Invalid PTE passed down (not present)\n");
2041 make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
2042 goto rx_unmap_and_continue;
2045 buf_pfn = *ptep >> PAGE_SHIFT;
2046 buf_page = frame_table + buf_pfn;
2048 if ( ((buf_page->flags & (PG_type_mask | PG_domain_mask)) !=
2049 (PGT_writeable_page | current->domain)) ||
2050 (buf_page->tot_count != 1) )
2052 DPRINTK("Need a mapped-once writeable page (%ld/%ld/%08lx)\n",
2053 buf_page->type_count, buf_page->tot_count, buf_page->flags);
2054 make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
2055 goto rx_unmap_and_continue;
2058 /*
2059 * The pte they passed was good, so take it away from them. We
2060 * also lock down the page-table page, so it doesn't go away.
2061 */
2062 get_page_type(pte_page);
2063 get_page_tot(pte_page);
2064 *ptep &= ~_PAGE_PRESENT;
2065 buf_page->flags = buf_page->type_count = buf_page->tot_count = 0;
2066 list_del(&buf_page->list);
2068 vif->rx_shadow_ring[j].id = rx.id;
2069 vif->rx_shadow_ring[j].pte_ptr = rx.addr;
2070 vif->rx_shadow_ring[j].buf_pfn = buf_pfn;
2071 vif->rx_shadow_ring[j].flush_count = (unsigned short)
2072 atomic_read(&tlb_flush_count[smp_processor_id()]);
2073 j = RX_RING_INC(j);
2075 rx_unmap_and_continue:
2076 unmap_domain_mem(ptep);
2077 spin_unlock_irq(&current->page_lock);
2080 vif->rx_req_cons = i;
2082 if ( vif->rx_prod != j )
2084 smp_mb(); /* Let other CPUs see new descriptors first. */
2085 vif->rx_prod = j;
2089 return 0;
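/*
 * Response posting: the two helpers below write a completion entry into
 * the vif's shared ring, advance the response producer index and, if that
 * index has reached the event index requested by the guest, mark a network
 * event and notify the guest.
 */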
2093 static void __make_tx_response(net_vif_t *vif,
2094 unsigned short id,
2095 unsigned char st)
2097 unsigned int pos;
2098 tx_resp_entry_t *resp;
2100 /* Place on the response ring for the relevant domain. */
2101 pos = vif->tx_resp_prod;
2102 resp = &vif->shared_rings->tx_ring[pos].resp;
2103 resp->id = id;
2104 resp->status = st;
2105 pos = TX_RING_INC(pos);
2106 vif->tx_resp_prod = vif->shared_idxs->tx_resp_prod = pos;
2107 if ( pos == vif->shared_idxs->tx_event )
2109 unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
2110 guest_event_notify(cpu_mask);
2115 static void make_rx_response(net_vif_t *vif,
2116 unsigned short id,
2117 unsigned short size,
2118 unsigned char st,
2119 unsigned char off)
2121 unsigned long flags;
2122 unsigned int pos;
2123 rx_resp_entry_t *resp;
2125 /* Place on the response ring for the relevant domain. */
2126 spin_lock_irqsave(&vif->rx_lock, flags);
2127 pos = vif->rx_resp_prod;
2128 resp = &vif->shared_rings->rx_ring[pos].resp;
2129 resp->id = id;
2130 resp->size = size;
2131 resp->status = st;
2132 resp->offset = off;
2133 pos = RX_RING_INC(pos);
2134 vif->rx_resp_prod = vif->shared_idxs->rx_resp_prod = pos;
2135 if ( pos == vif->shared_idxs->rx_event )
2137 unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
2138 guest_event_notify(cpu_mask);
2140 spin_unlock_irqrestore(&vif->rx_lock, flags);
2144 int setup_network_devices(void)
2146 int i, ret;
2147 extern char opt_ifname[];
2148 struct net_device *dev;
2150 if ( (dev = dev_get_by_name(opt_ifname)) == NULL )
2152 printk("Could not find device %s: using dummy device\n", opt_ifname);
2153 strcpy(opt_ifname, "dummy");
2154 if ( (dev = dev_get_by_name(opt_ifname)) == NULL )
2156 printk("Failed to find the dummy device!\n");
2157 return 0;
2161 if ( (ret = dev_open(dev)) != 0 )
2163 printk("Error opening device %s for use (%d)\n", opt_ifname, ret);
2164 return 0;
2167 printk("Device %s opened and ready for use.\n", opt_ifname);
2168 the_dev = dev;
2170 for ( i = 0; i < smp_num_cpus; i++ )
2171 skb_queue_head_init(&rx_skb_queue[i]);
2173 open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
2174 tasklet_enable(&net_tx_tasklet);
2176 return 1;