ia64/xen-unstable

view linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c @ 3559:80651f13b9c0

bitkeeper revision 1.1159.233.1 (41f910e9AmctVZRfu78RAYbPnw3W8g)

Merge scramble.cl.cam.ac.uk:/local/scratch/kaf24/xen-2.0-testing.bk
into scramble.cl.cam.ac.uk:/local/scratch/kaf24/xen-unstable.bk
author kaf24@scramble.cl.cam.ac.uk
date Thu Jan 27 16:03:53 2005 +0000 (2005-01-27)
parents 7413468a8d01 77732aef762e
children 30ee9c427a5b bc0fbb38cb25
line source
1 /******************************************************************************
2 * Virtual network driver for conversing with remote driver backends.
3 *
4 * Copyright (c) 2002-2004, K A Fraser
5 *
6 * This file may be distributed separately from the Linux kernel, or
7 * incorporated into other software packages, subject to the following license:
8 *
9 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this source file (the "Software"), to deal in the Software without
11 * restriction, including without limitation the rights to use, copy, modify,
12 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
13 * and to permit persons to whom the Software is furnished to do so, subject to
14 * the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included in
17 * all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * IN THE SOFTWARE.
26 */
28 #include <linux/config.h>
29 #include <linux/module.h>
30 #include <linux/version.h>
31 #include <linux/kernel.h>
32 #include <linux/sched.h>
33 #include <linux/slab.h>
34 #include <linux/string.h>
35 #include <linux/errno.h>
36 #include <linux/netdevice.h>
37 #include <linux/inetdevice.h>
38 #include <linux/etherdevice.h>
39 #include <linux/skbuff.h>
40 #include <linux/init.h>
41 #include <linux/bitops.h>
42 #include <net/sock.h>
43 #include <net/pkt_sched.h>
44 #include <asm/io.h>
45 #include <asm-xen/evtchn.h>
46 #include <asm-xen/ctrl_if.h>
47 #include <asm-xen/xen-public/io/netif.h>
48 #include <asm-xen/balloon.h>
49 #include <asm/page.h>
51 #include <net/arp.h>
52 #include <net/route.h>
54 #define DEBUG 0
56 #ifndef __GFP_NOWARN
57 #define __GFP_NOWARN 0
58 #endif
59 #define alloc_xen_skb(_l) __dev_alloc_skb((_l), GFP_ATOMIC|__GFP_NOWARN)
61 #define init_skb_shinfo(_skb) \
62 do { \
63 atomic_set(&(skb_shinfo(_skb)->dataref), 1); \
64 skb_shinfo(_skb)->nr_frags = 0; \
65 skb_shinfo(_skb)->frag_list = NULL; \
66 } while ( 0 )
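/* NB. The shared-info area sits at skb->end; once a receive page has been
 * flipped to the backend and handed back, that area is stale, which is why
 * netif_poll() reruns init_skb_shinfo() before passing the skb up the stack. */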
68 /* Allow headroom on each rx pkt for Ethernet header, alignment padding, ... */
69 #define RX_HEADROOM 200
71 /*
72 * If the backend driver is pipelining transmit requests then we can be very
73 * aggressive in avoiding new-packet notifications -- only need to send a
74 * notification if there are no outstanding unreceived responses.
75 * If the backend may be buffering our transmit buffers for any reason then we
76 * are rather more conservative.
77 */
78 #ifdef CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER
79 #define TX_TEST_IDX resp_prod /* aggressive: any outstanding responses? */
80 #else
81 #define TX_TEST_IDX req_cons /* conservative: not seen all our requests? */
82 #endif
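The macro is consumed near the end of network_start_xmit() further down in this file; a minimal sketch of the idiom it enables (np and i as used there):

    /* Sketch only -- the real test is at the tail of network_start_xmit(). */
    mb();                             /* make our new request globally visible */
    if ( np->tx->TX_TEST_IDX == i )   /* expands to resp_prod or req_cons */
        notify_via_evtchn(np->evtchn);  /* kick the backend only if it may be idle */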
84 static void network_tx_buf_gc(struct net_device *dev);
85 static void network_alloc_rx_buffers(struct net_device *dev);
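/* Scratch space for batched receive-ring work: one slot per RX ring entry,
 * plus one spare multicall slot for the trailing batched hypercall. */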
87 static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE];
88 static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1];
89 static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE];
91 static struct list_head dev_list;
93 struct net_private
94 {
95 struct list_head list;
96 struct net_device *dev;
98 struct net_device_stats stats;
99 NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
100 unsigned int tx_full;
102 netif_tx_interface_t *tx;
103 netif_rx_interface_t *rx;
105 spinlock_t tx_lock;
106 spinlock_t rx_lock;
108 unsigned int handle;
109 unsigned int evtchn;
110 unsigned int irq;
112 /* What is the status of our connection to the remote backend? */
113 #define BEST_CLOSED 0
114 #define BEST_DISCONNECTED 1
115 #define BEST_CONNECTED 2
116 unsigned int backend_state;
118 /* Is this interface open or closed (down or up)? */
119 #define UST_CLOSED 0
120 #define UST_OPEN 1
121 unsigned int user_state;
123 /* Receive-ring batched refills. */
124 #define RX_MIN_TARGET 8
125 #define RX_MAX_TARGET NETIF_RX_RING_SIZE
126 int rx_target;
127 struct sk_buff_head rx_batch;
129 /*
130 * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
131 * array is an index into a chain of free entries.
132 */
133 struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
134 struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
135 };
137 /* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */
138 #define ADD_ID_TO_FREELIST(_list, _id) \
139 (_list)[(_id)] = (_list)[0]; \
140 (_list)[0] = (void *)(unsigned long)(_id);
141 #define GET_ID_FROM_FREELIST(_list) \
142 ({ unsigned long _id = (unsigned long)(_list)[0]; \
143 (_list)[0] = (_list)[_id]; \
144 (unsigned short)_id; })
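The freelist trick above stores the chain of free ring ids inside the otherwise-unused pointer slots: entry 0 is the head of the chain and each free entry holds the index of the next free one. A self-contained userspace illustration of the same idiom (names and sizes here are illustrative only, not part of the driver):

    #include <stdio.h>

    #define RING_SIZE 4
    static void *slots[RING_SIZE + 1];

    #define ADD_ID_TO_FREELIST(_list, _id)                 \
        (_list)[(_id)] = (_list)[0];                       \
        (_list)[0] = (void *)(unsigned long)(_id);
    #define GET_ID_FROM_FREELIST(_list)                    \
        ({ unsigned long _id = (unsigned long)(_list)[0];  \
           (_list)[0] = (_list)[_id];                      \
           (unsigned short)_id; })

    int main(void)
    {
        int i;
        /* Chain every entry onto the freelist, as create_netdev() does. */
        for ( i = 0; i <= RING_SIZE; i++ )
            slots[i] = (void *)(unsigned long)(i + 1);
        unsigned short a = GET_ID_FROM_FREELIST(slots);   /* a == 1 */
        unsigned short b = GET_ID_FROM_FREELIST(slots);   /* b == 2 */
        ADD_ID_TO_FREELIST(slots, a);                      /* id 1 is free again */
        printf("%d %d %d\n", a, b, GET_ID_FROM_FREELIST(slots));  /* prints: 1 2 1 */
        return 0;
    }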
146 static char *status_name[] = {
147 [NETIF_INTERFACE_STATUS_CLOSED] = "closed",
148 [NETIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
149 [NETIF_INTERFACE_STATUS_CONNECTED] = "connected",
150 [NETIF_INTERFACE_STATUS_CHANGED] = "changed",
151 };
153 static char *be_state_name[] = {
154 [BEST_CLOSED] = "closed",
155 [BEST_DISCONNECTED] = "disconnected",
156 [BEST_CONNECTED] = "connected",
157 };
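/* Backend state transitions, as driven by netif_interface_status() below:
 *   CLOSED msg:        any state              -> vif_close()      -> BEST_CLOSED
 *   DISCONNECTED msg:  BEST_CLOSED            -> vif_disconnect() -> BEST_DISCONNECTED
 *                      DISCONNECTED/CONNECTED -> vif_reset()      -> BEST_DISCONNECTED
 *   CONNECTED msg:     BEST_DISCONNECTED      -> vif_connect()    -> BEST_CONNECTED
 *                      BEST_CLOSED            -> warn, then vif_disconnect() + vif_connect()
 */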
159 #if DEBUG
160 #define DPRINTK(fmt, args...) \
161 printk(KERN_ALERT "xen_net (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args)
162 #else
163 #define DPRINTK(fmt, args...) ((void)0)
164 #endif
165 #define IPRINTK(fmt, args...) \
166 printk(KERN_INFO "xen_net: " fmt, ##args)
167 #define WPRINTK(fmt, args...) \
168 printk(KERN_WARNING "xen_net: " fmt, ##args)
170 static struct net_device *find_dev_by_handle(unsigned int handle)
171 {
172 struct list_head *ent;
173 struct net_private *np;
174 list_for_each ( ent, &dev_list )
175 {
176 np = list_entry(ent, struct net_private, list);
177 if ( np->handle == handle )
178 return np->dev;
179 }
180 return NULL;
181 }
183 /** Network interface info. */
184 struct netif_ctrl {
185 /** Number of interfaces. */
186 int interface_n;
187 /** Number of connected interfaces. */
188 int connected_n;
189 /** Error code. */
190 int err;
191 int up;
192 };
194 static struct netif_ctrl netctrl;
196 static void netctrl_init(void)
197 {
198 memset(&netctrl, 0, sizeof(netctrl));
199 netctrl.up = NETIF_DRIVER_STATUS_DOWN;
200 }
202 /** Get or set a network interface error.
203 */
204 static int netctrl_err(int err)
205 {
206 if ( (err < 0) && !netctrl.err )
207 netctrl.err = err;
208 return netctrl.err;
209 }
211 /** Test if all network interfaces are connected.
212 *
213 * @return 1 if all connected, 0 if not, negative error code otherwise
214 */
215 static int netctrl_connected(void)
216 {
217 int ok;
219 if ( netctrl.err )
220 ok = netctrl.err;
221 else if ( netctrl.up == NETIF_DRIVER_STATUS_UP )
222 ok = (netctrl.connected_n == netctrl.interface_n);
223 else
224 ok = 0;
226 return ok;
227 }
229 /** Count the connected network interfaces.
230 *
231 * @return connected count
232 */
233 static int netctrl_connected_count(void)
234 {
236 struct list_head *ent;
237 struct net_private *np;
238 unsigned int connected;
240 connected = 0;
242 list_for_each(ent, &dev_list) {
243 np = list_entry(ent, struct net_private, list);
244 if (np->backend_state == BEST_CONNECTED)
245 connected++;
246 }
248 netctrl.connected_n = connected;
249 DPRINTK("> connected_n=%d interface_n=%d\n",
250 netctrl.connected_n, netctrl.interface_n);
251 return connected;
252 }
254 /** Send a packet on a net device to encourage switches to learn the
255 * MAC. We send a fake ARP request.
256 *
257 * @param dev device
258 * @return 0 on success, error code otherwise
259 */
260 static int send_fake_arp(struct net_device *dev)
261 {
262 struct sk_buff *skb;
263 u32 src_ip, dst_ip;
265 dst_ip = INADDR_BROADCAST;
266 src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
268 /* No IP? Then nothing to do. */
269 if ( src_ip == 0 )
270 return 0;
272 skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
273 dst_ip, dev, src_ip,
274 /*dst_hw*/ NULL, /*src_hw*/ NULL,
275 /*target_hw*/ dev->dev_addr);
276 if ( skb == NULL )
277 return -ENOMEM;
279 return dev_queue_xmit(skb);
280 }
282 static int network_open(struct net_device *dev)
283 {
284 struct net_private *np = netdev_priv(dev);
286 memset(&np->stats, 0, sizeof(np->stats));
288 np->user_state = UST_OPEN;
290 network_alloc_rx_buffers(dev);
291 np->rx->event = np->rx_resp_cons + 1;
293 netif_start_queue(dev);
295 return 0;
296 }
298 static void network_tx_buf_gc(struct net_device *dev)
299 {
300 NETIF_RING_IDX i, prod;
301 unsigned short id;
302 struct net_private *np = netdev_priv(dev);
303 struct sk_buff *skb;
305 if ( np->backend_state != BEST_CONNECTED )
306 return;
308 do {
309 prod = np->tx->resp_prod;
310 rmb(); /* Ensure we see responses up to 'rp'. */
312 for ( i = np->tx_resp_cons; i != prod; i++ )
313 {
314 id = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id;
315 skb = np->tx_skbs[id];
316 ADD_ID_TO_FREELIST(np->tx_skbs, id);
317 dev_kfree_skb_irq(skb);
318 }
320 np->tx_resp_cons = prod;
322 /*
323 * Set a new event, then check for race with update of tx_cons. Note
324 * that it is essential to schedule a callback, no matter how few
325 * buffers are pending. Even if there is space in the transmit ring,
326 * higher layers may be blocked because too much data is outstanding:
327 * in such cases notification from Xen is likely to be the only kick
328 * that we'll get.
329 */
330 np->tx->event =
331 prod + ((np->tx->req_prod - prod) >> 1) + 1;
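/* Illustrative numbers: with resp_prod == prod == 10 and req_prod == 14 the
 * event is set to 13, i.e. about half-way through the outstanding requests,
 * so we get another notification before the backend has drained them all. */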
332 mb();
333 }
334 while ( prod != np->tx->resp_prod );
336 if ( np->tx_full &&
337 ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) )
338 {
339 np->tx_full = 0;
340 if ( np->user_state == UST_OPEN )
341 netif_wake_queue(dev);
342 }
343 }
346 static void network_alloc_rx_buffers(struct net_device *dev)
347 {
348 unsigned short id;
349 struct net_private *np = netdev_priv(dev);
350 struct sk_buff *skb;
351 int i, batch_target;
352 NETIF_RING_IDX req_prod = np->rx->req_prod;
354 if ( unlikely(np->backend_state != BEST_CONNECTED) )
355 return;
357 /*
358 * Allocate skbuffs greedily, even though we batch updates to the
359 * receive ring. This creates a less bursty demand on the memory allocator,
360 * so should reduce the chance of failed allocation requests both for
361 ourselves and for other kernel subsystems.
362 */
363 batch_target = np->rx_target - (req_prod - np->rx_resp_cons);
364 for ( i = skb_queue_len(&np->rx_batch); i < batch_target; i++ )
365 {
366 if ( unlikely((skb = alloc_xen_skb(dev->mtu + RX_HEADROOM)) == NULL) )
367 break;
368 __skb_queue_tail(&np->rx_batch, skb);
369 }
371 /* Is the batch large enough to be worthwhile? */
372 if ( i < (np->rx_target/2) )
373 return;
375 for ( i = 0; ; i++ )
376 {
377 if ( (skb = __skb_dequeue(&np->rx_batch)) == NULL )
378 break;
380 skb->dev = dev;
382 id = GET_ID_FROM_FREELIST(np->rx_skbs);
384 np->rx_skbs[id] = skb;
386 np->rx->ring[MASK_NETIF_RX_IDX(req_prod + i)].req.id = id;
388 rx_pfn_array[i] = virt_to_machine(skb->head) >> PAGE_SHIFT;
390 /* Remove this page from pseudo phys map before passing back to Xen. */
391 phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT]
392 = INVALID_P2M_ENTRY;
394 rx_mcl[i].op = __HYPERVISOR_update_va_mapping;
395 rx_mcl[i].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
396 rx_mcl[i].args[1] = 0;
397 rx_mcl[i].args[2] = 0;
398 }
400 /*
401 * We may have allocated buffers which have entries outstanding in the page
402 * update queue -- make sure we flush those first!
403 */
404 flush_page_update_queue();
406 /* After all PTEs have been zapped we blow away stale TLB entries. */
407 rx_mcl[i-1].args[2] = UVMF_FLUSH_TLB;
409 /* Give away a batch of pages. */
410 rx_mcl[i].op = __HYPERVISOR_dom_mem_op;
411 rx_mcl[i].args[0] = MEMOP_decrease_reservation;
412 rx_mcl[i].args[1] = (unsigned long)rx_pfn_array;
413 rx_mcl[i].args[2] = (unsigned long)i;
414 rx_mcl[i].args[3] = 0;
415 rx_mcl[i].args[4] = DOMID_SELF;
417 /* Tell the balloon driver what is going on. */
418 balloon_update_driver_allowance(i);
420 /* Zap PTEs and give away pages in one big multicall. */
421 (void)HYPERVISOR_multicall(rx_mcl, i+1);
423 /* Check return status of HYPERVISOR_dom_mem_op(). */
424 if ( unlikely(rx_mcl[i].args[5] != i) )
425 panic("Unable to reduce memory reservation\n");
427 /* Above is a suitable barrier to ensure backend will see requests. */
428 np->rx->req_prod = req_prod + i;
430 /* Adjust our floating fill target if we risked running out of buffers. */
431 if ( ((req_prod - np->rx->resp_prod) < (np->rx_target / 4)) &&
432 ((np->rx_target *= 2) > RX_MAX_TARGET) )
433 np->rx_target = RX_MAX_TARGET;
434 }
437 static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
438 {
439 unsigned short id;
440 struct net_private *np = netdev_priv(dev);
441 netif_tx_request_t *tx;
442 NETIF_RING_IDX i;
444 if ( unlikely(np->tx_full) )
445 {
446 printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
447 netif_stop_queue(dev);
448 goto drop;
449 }
451 if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
452 PAGE_SIZE) )
453 {
454 struct sk_buff *nskb;
455 if ( unlikely((nskb = alloc_xen_skb(skb->len)) == NULL) )
456 goto drop;
457 skb_put(nskb, skb->len);
458 memcpy(nskb->data, skb->data, skb->len);
459 nskb->dev = skb->dev;
460 dev_kfree_skb(skb);
461 skb = nskb;
462 }
464 spin_lock_irq(&np->tx_lock);
466 if ( np->backend_state != BEST_CONNECTED )
467 {
468 spin_unlock_irq(&np->tx_lock);
469 goto drop;
470 }
472 i = np->tx->req_prod;
474 id = GET_ID_FROM_FREELIST(np->tx_skbs);
475 np->tx_skbs[id] = skb;
477 tx = &np->tx->ring[MASK_NETIF_TX_IDX(i)].req;
479 tx->id = id;
480 tx->addr = virt_to_machine(skb->data);
481 tx->size = skb->len;
483 wmb(); /* Ensure that backend will see the request. */
484 np->tx->req_prod = i + 1;
486 network_tx_buf_gc(dev);
488 if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) )
489 {
490 np->tx_full = 1;
491 netif_stop_queue(dev);
492 }
494 spin_unlock_irq(&np->tx_lock);
496 np->stats.tx_bytes += skb->len;
497 np->stats.tx_packets++;
499 /* Only notify Xen if we really have to. */
500 mb();
501 if ( np->tx->TX_TEST_IDX == i )
502 notify_via_evtchn(np->evtchn);
504 return 0;
506 drop:
507 np->stats.tx_dropped++;
508 dev_kfree_skb(skb);
509 return 0;
510 }
513 static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
514 {
515 struct net_device *dev = dev_id;
516 struct net_private *np = netdev_priv(dev);
517 unsigned long flags;
519 spin_lock_irqsave(&np->tx_lock, flags);
520 network_tx_buf_gc(dev);
521 spin_unlock_irqrestore(&np->tx_lock, flags);
523 if ( (np->rx_resp_cons != np->rx->resp_prod) &&
524 (np->user_state == UST_OPEN) )
525 netif_rx_schedule(dev);
527 return IRQ_HANDLED;
528 }
531 static int netif_poll(struct net_device *dev, int *pbudget)
532 {
533 struct net_private *np = netdev_priv(dev);
534 struct sk_buff *skb, *nskb;
535 netif_rx_response_t *rx;
536 NETIF_RING_IDX i, rp;
537 mmu_update_t *mmu = rx_mmu;
538 multicall_entry_t *mcl = rx_mcl;
539 int work_done, budget, more_to_do = 1;
540 struct sk_buff_head rxq;
541 unsigned long flags;
543 spin_lock(&np->rx_lock);
545 if ( np->backend_state != BEST_CONNECTED )
546 {
547 spin_unlock(&np->rx_lock);
548 return 0;
549 }
551 skb_queue_head_init(&rxq);
553 if ( (budget = *pbudget) > dev->quota )
554 budget = dev->quota;
556 rp = np->rx->resp_prod;
557 rmb(); /* Ensure we see queued responses up to 'rp'. */
559 for ( i = np->rx_resp_cons, work_done = 0;
560 (i != rp) && (work_done < budget);
561 i++, work_done++ )
562 {
563 rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
565 /*
566 * An error here is very odd. Usually indicates a backend bug,
567 * low-memory condition, or that we didn't have reservation headroom.
568 */
569 if ( unlikely(rx->status <= 0) )
570 {
571 if ( net_ratelimit() )
572 printk(KERN_WARNING "Bad rx buffer (memory squeeze?).\n");
573 np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)].req.id = rx->id;
574 wmb();
575 np->rx->req_prod++;
576 work_done--;
577 continue;
578 }
580 skb = np->rx_skbs[rx->id];
581 ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
583 /* NB. We handle skb overflow later. */
584 skb->data = skb->head + (rx->addr & ~PAGE_MASK);
585 skb->len = rx->status;
586 skb->tail = skb->data + skb->len;
588 np->stats.rx_packets++;
589 np->stats.rx_bytes += rx->status;
591 /* Remap the page. */
592 mmu->ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
593 mmu->val = __pa(skb->head) >> PAGE_SHIFT;
594 mmu++;
595 mcl->op = __HYPERVISOR_update_va_mapping;
596 mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
597 mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
598 mcl->args[2] = 0;
599 mcl++;
601 phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] =
602 rx->addr >> PAGE_SHIFT;
604 __skb_queue_tail(&rxq, skb);
605 }
607 /* Some pages are no longer absent... */
608 balloon_update_driver_allowance(-work_done);
610 /* Do all the remapping work, and M->P updates, in one big hypercall. */
611 if ( likely((mcl - rx_mcl) != 0) )
612 {
613 mcl->op = __HYPERVISOR_mmu_update;
614 mcl->args[0] = (unsigned long)rx_mmu;
615 mcl->args[1] = mmu - rx_mmu;
616 mcl->args[2] = 0;
617 mcl++;
618 (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
619 }
621 while ( (skb = __skb_dequeue(&rxq)) != NULL )
622 {
623 /*
624 * Enough room in skbuff for the data we were passed? Also, Linux
625 * expects at least 16 bytes headroom in each receive buffer.
626 */
627 if ( unlikely(skb->tail > skb->end) ||
628 unlikely((skb->data - skb->head) < 16) )
629 {
630 nskb = NULL;
632 /* Only copy the packet if it fits in the current MTU. */
633 if ( skb->len <= (dev->mtu + ETH_HLEN) )
634 {
635 if ( (skb->tail > skb->end) && net_ratelimit() )
636 printk(KERN_INFO "Received packet needs %d bytes more "
637 "headroom.\n", skb->tail - skb->end);
639 if ( (nskb = alloc_xen_skb(skb->len + 2)) != NULL )
640 {
641 skb_reserve(nskb, 2);
642 skb_put(nskb, skb->len);
643 memcpy(nskb->data, skb->data, skb->len);
644 nskb->dev = skb->dev;
645 }
646 }
647 else if ( net_ratelimit() )
648 printk(KERN_INFO "Received packet too big for MTU "
649 "(%d > %d)\n", skb->len - ETH_HLEN, dev->mtu);
651 /* Reinitialise and then destroy the old skbuff. */
652 skb->len = 0;
653 skb->tail = skb->data;
654 init_skb_shinfo(skb);
655 dev_kfree_skb(skb);
657 /* Switch old for new, if we copied the buffer. */
658 if ( (skb = nskb) == NULL )
659 continue;
660 }
662 /* Set the shared-info area, which is hidden behind the real data. */
663 init_skb_shinfo(skb);
665 /* Ethernet-specific work. Delayed to here as it peeks the header. */
666 skb->protocol = eth_type_trans(skb, dev);
668 /* Pass it up. */
669 netif_receive_skb(skb);
670 dev->last_rx = jiffies;
671 }
673 np->rx_resp_cons = i;
675 /* If we get a callback with very few responses, reduce fill target. */
676 /* NB. Note exponential increase, linear decrease. */
677 if ( ((np->rx->req_prod - np->rx->resp_prod) > ((3*np->rx_target) / 4)) &&
678 (--np->rx_target < RX_MIN_TARGET) )
679 np->rx_target = RX_MIN_TARGET;
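/* Together with the doubling in network_alloc_rx_buffers() this gives the
 * floating fill target its hysteresis: double when fewer than a quarter of
 * rx_target buffers remain posted, decay by one on a mostly-quiet poll,
 * always staying within [RX_MIN_TARGET, RX_MAX_TARGET]. */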
681 network_alloc_rx_buffers(dev);
683 *pbudget -= work_done;
684 dev->quota -= work_done;
686 if ( work_done < budget )
687 {
688 local_irq_save(flags);
690 np->rx->event = i + 1;
692 /* Deal with hypervisor racing our resetting of rx_event. */
693 mb();
694 if ( np->rx->resp_prod == i )
695 {
696 __netif_rx_complete(dev);
697 more_to_do = 0;
698 }
700 local_irq_restore(flags);
701 }
703 spin_unlock(&np->rx_lock);
705 return more_to_do;
706 }
709 static int network_close(struct net_device *dev)
710 {
711 struct net_private *np = netdev_priv(dev);
712 np->user_state = UST_CLOSED;
713 netif_stop_queue(np->dev);
714 return 0;
715 }
718 static struct net_device_stats *network_get_stats(struct net_device *dev)
719 {
720 struct net_private *np = netdev_priv(dev);
721 return &np->stats;
722 }
725 static void network_connect(struct net_device *dev,
726 netif_fe_interface_status_t *status)
727 {
728 struct net_private *np;
729 int i, requeue_idx;
730 netif_tx_request_t *tx;
732 np = netdev_priv(dev);
733 spin_lock_irq(&np->tx_lock);
734 spin_lock(&np->rx_lock);
736 /* Recovery procedure: */
738 /* Step 1: Reinitialise variables. */
739 np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
740 np->rx->event = np->tx->event = 1;
742 /* Step 2: Rebuild the RX and TX ring contents.
743 * NB. We could just free the queued TX packets now but we hope
744 * that sending them out might do some good. We have to rebuild
745 * the RX ring because some of our pages are currently flipped out
746 * so we can't just free the RX skbs.
747 * NB2. Freelist index entries are always going to be less than
748 * __PAGE_OFFSET, whereas pointers to skbs will always be equal or
749 * greater than __PAGE_OFFSET: we use this property to distinguish
750 * them.
751 */
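/* Concretely: a free-chain entry is a small index (at most RING_SIZE+1),
 * whereas a stored skb pointer is a kernel virtual address at or above
 * __PAGE_OFFSET (typically 0xC0000000 on i386), so the comparisons below
 * cleanly tell the two apart. */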
753 /* Rebuild the TX buffer freelist and the TX ring itself.
754 * NB. This reorders packets. We could keep more private state
755 * to avoid this but maybe it doesn't matter so much given the
756 * interface has been down.
757 */
758 for ( requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++ )
759 {
760 if ( (unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET )
761 {
762 struct sk_buff *skb = np->tx_skbs[i];
764 tx = &np->tx->ring[requeue_idx++].req;
766 tx->id = i;
767 tx->addr = virt_to_machine(skb->data);
768 tx->size = skb->len;
770 np->stats.tx_bytes += skb->len;
771 np->stats.tx_packets++;
772 }
773 }
774 wmb();
775 np->tx->req_prod = requeue_idx;
777 /* Rebuild the RX buffer freelist and the RX ring itself. */
778 for ( requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++ )
779 if ( (unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET )
780 np->rx->ring[requeue_idx++].req.id = i;
781 wmb();
782 np->rx->req_prod = requeue_idx;
784 /* Step 3: All public and private state should now be sane. Get
785 * ready to start sending and receiving packets and give the driver
786 * domain a kick because we've probably just requeued some
787 * packets.
788 */
789 np->backend_state = BEST_CONNECTED;
790 wmb();
791 notify_via_evtchn(status->evtchn);
792 network_tx_buf_gc(dev);
794 if ( np->user_state == UST_OPEN )
795 netif_start_queue(dev);
797 spin_unlock(&np->rx_lock);
798 spin_unlock_irq(&np->tx_lock);
799 }
801 static void vif_show(struct net_private *np)
802 {
803 #if DEBUG
804 if (np) {
805 IPRINTK("<vif handle=%u %s(%s) evtchn=%u irq=%u tx=%p rx=%p>\n",
806 np->handle,
807 be_state_name[np->backend_state],
808 np->user_state ? "open" : "closed",
809 np->evtchn,
810 np->irq,
811 np->tx,
812 np->rx);
813 } else {
814 IPRINTK("<vif NULL>\n");
815 }
816 #endif
817 }
819 /* Send a connect message to xend to tell it to bring up the interface. */
820 static void send_interface_connect(struct net_private *np)
821 {
822 ctrl_msg_t cmsg = {
823 .type = CMSG_NETIF_FE,
824 .subtype = CMSG_NETIF_FE_INTERFACE_CONNECT,
825 .length = sizeof(netif_fe_interface_connect_t),
826 };
827 netif_fe_interface_connect_t *msg = (void*)cmsg.msg;
829 msg->handle = np->handle;
830 msg->tx_shmem_frame = (virt_to_machine(np->tx) >> PAGE_SHIFT);
831 msg->rx_shmem_frame = (virt_to_machine(np->rx) >> PAGE_SHIFT);
833 ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
834 }
836 /* Send a driver status notification to the domain controller. */
837 static int send_driver_status(int ok)
838 {
839 int err = 0;
840 ctrl_msg_t cmsg = {
841 .type = CMSG_NETIF_FE,
842 .subtype = CMSG_NETIF_FE_DRIVER_STATUS,
843 .length = sizeof(netif_fe_driver_status_t),
844 };
845 netif_fe_driver_status_t *msg = (void*)cmsg.msg;
847 msg->status = (ok ? NETIF_DRIVER_STATUS_UP : NETIF_DRIVER_STATUS_DOWN);
848 err = ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
849 return err;
850 }
852 /* Stop network device and free tx/rx queues and irq.
853 */
854 static void vif_release(struct net_private *np)
855 {
856 /* Stop old i/f to prevent errors whilst we rebuild the state. */
857 spin_lock_irq(&np->tx_lock);
858 spin_lock(&np->rx_lock);
859 netif_stop_queue(np->dev);
860 /* np->backend_state = BEST_DISCONNECTED; */
861 spin_unlock(&np->rx_lock);
862 spin_unlock_irq(&np->tx_lock);
864 /* Free resources. */
865 if(np->tx != NULL){
866 free_irq(np->irq, np->dev);
867 unbind_evtchn_from_irq(np->evtchn);
868 free_page((unsigned long)np->tx);
869 free_page((unsigned long)np->rx);
870 np->irq = 0;
871 np->evtchn = 0;
872 np->tx = NULL;
873 np->rx = NULL;
874 }
875 }
877 /* Release vif resources and close it down completely.
878 */
879 static void vif_close(struct net_private *np)
880 {
881 WPRINTK("Unexpected netif-CLOSED message in state %s\n",
882 be_state_name[np->backend_state]);
883 vif_release(np);
884 np->backend_state = BEST_CLOSED;
885 /* todo: take dev down and free. */
886 vif_show(np);
887 }
889 /* Move the vif into disconnected state.
890 * Allocates tx/rx pages.
891 * Sends connect message to xend.
892 */
893 static void vif_disconnect(struct net_private *np)
894 {
895 if(np->tx) free_page((unsigned long)np->tx);
896 if(np->rx) free_page((unsigned long)np->rx);
897 // Before this np->tx and np->rx had better be null.
898 np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
899 np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
900 memset(np->tx, 0, PAGE_SIZE);
901 memset(np->rx, 0, PAGE_SIZE);
902 np->backend_state = BEST_DISCONNECTED;
903 send_interface_connect(np);
904 vif_show(np);
905 }
907 /* Begin interface recovery.
908 *
909 * NB. Whilst we're recovering, we turn the carrier state off. We
910 * take measures to ensure that this device isn't used for
911 * anything. We also stop the queue for this device. Various
912 * different approaches (e.g. continuing to buffer packets) have
913 * been tested but don't appear to improve the overall impact on
914 * TCP connections.
915 *
916 * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery
917 * is initiated by a special "RESET" message - disconnect could
918 * just mean we're not allowed to use this interface any more.
919 */
920 static void
921 vif_reset(
922 struct net_private *np)
923 {
924 IPRINTK("Attempting to reconnect network interface: handle=%u\n",
925 np->handle);
926 vif_release(np);
927 vif_disconnect(np);
928 vif_show(np);
929 }
931 /* Move the vif into connected state.
932 * Sets the mac and event channel from the message.
933 * Binds the irq to the event channel.
934 */
935 static void
936 vif_connect(
937 struct net_private *np, netif_fe_interface_status_t *status)
938 {
939 struct net_device *dev = np->dev;
940 memcpy(dev->dev_addr, status->mac, ETH_ALEN);
941 network_connect(dev, status);
942 np->evtchn = status->evtchn;
943 np->irq = bind_evtchn_to_irq(np->evtchn);
944 (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, dev->name, dev);
945 netctrl_connected_count();
946 (void)send_fake_arp(dev);
947 vif_show(np);
948 }
951 /** Create a network device.
952 * @param handle device handle
953 * @param val return parameter for created device
954 * @return 0 on success, error code otherwise
955 */
956 static int create_netdev(int handle, struct net_device **val)
957 {
958 int i, err = 0;
959 struct net_device *dev = NULL;
960 struct net_private *np = NULL;
962 if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL )
963 {
964 printk(KERN_WARNING "%s> alloc_etherdev failed.\n", __FUNCTION__);
965 err = -ENOMEM;
966 goto exit;
967 }
969 np = netdev_priv(dev);
970 np->backend_state = BEST_CLOSED;
971 np->user_state = UST_CLOSED;
972 np->handle = handle;
974 spin_lock_init(&np->tx_lock);
975 spin_lock_init(&np->rx_lock);
977 skb_queue_head_init(&np->rx_batch);
978 np->rx_target = RX_MIN_TARGET;
980 /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
981 for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ )
982 np->tx_skbs[i] = (void *)(i+1);
983 for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ )
984 np->rx_skbs[i] = (void *)(i+1);
986 dev->open = network_open;
987 dev->hard_start_xmit = network_start_xmit;
988 dev->stop = network_close;
989 dev->get_stats = network_get_stats;
990 dev->poll = netif_poll;
991 dev->weight = 64;
993 if ( (err = register_netdev(dev)) != 0 )
994 {
995 printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err);
996 goto exit;
997 }
998 np->dev = dev;
999 list_add(&np->list, &dev_list);
1001 exit:
1002 if ( (err != 0) && (dev != NULL ) )
1003 kfree(dev);
1004 else if ( val != NULL )
1005 *val = dev;
1006 return err;
1007 }
1009 /* Get the target interface for a status message.
1010 * Creates the interface when it makes sense.
1011 * The returned interface may be null when there is no error.
1013 * @param status status message
1014 * @param np return parameter for interface state
1015 * @return 0 on success, error code otherwise
1016 */
1017 static int
1018 target_vif(
1019 netif_fe_interface_status_t *status, struct net_private **np)
1020 {
1021 int err = 0;
1022 struct net_device *dev;
1024 DPRINTK("> handle=%d\n", status->handle);
1025 if ( status->handle < 0 )
1026 {
1027 err = -EINVAL;
1028 goto exit;
1029 }
1031 if ( (dev = find_dev_by_handle(status->handle)) != NULL )
1032 goto exit;
1034 if ( status->status == NETIF_INTERFACE_STATUS_CLOSED )
1035 goto exit;
1036 if ( status->status == NETIF_INTERFACE_STATUS_CHANGED )
1037 goto exit;
1039 /* It's a new interface in a good state - create it. */
1040 DPRINTK("> create device...\n");
1041 if ( (err = create_netdev(status->handle, &dev)) != 0 )
1042 goto exit;
1044 netctrl.interface_n++;
1046 exit:
1047 if ( np != NULL )
1048 *np = ((dev && !err) ? netdev_priv(dev) : NULL);
1049 DPRINTK("< err=%d\n", err);
1050 return err;
1051 }
1053 /* Handle an interface status message. */
1054 static void netif_interface_status(netif_fe_interface_status_t *status)
1055 {
1056 int err = 0;
1057 struct net_private *np = NULL;
1059 DPRINTK("> status=%s handle=%d\n",
1060 status_name[status->status], status->handle);
1062 if ( (err = target_vif(status, &np)) != 0 )
1063 {
1064 WPRINTK("Invalid netif: handle=%u\n", status->handle);
1065 return;
1066 }
1068 if ( np == NULL )
1069 {
1070 DPRINTK("> no vif\n");
1071 return;
1072 }
1074 switch ( status->status )
1075 {
1076 case NETIF_INTERFACE_STATUS_CLOSED:
1077 switch ( np->backend_state )
1078 {
1079 case BEST_CLOSED:
1080 case BEST_DISCONNECTED:
1081 case BEST_CONNECTED:
1082 vif_close(np);
1083 break;
1084 }
1085 break;
1087 case NETIF_INTERFACE_STATUS_DISCONNECTED:
1088 switch ( np->backend_state )
1089 {
1090 case BEST_CLOSED:
1091 vif_disconnect(np);
1092 break;
1093 case BEST_DISCONNECTED:
1094 case BEST_CONNECTED:
1095 vif_reset(np);
1096 break;
1097 }
1098 break;
1100 case NETIF_INTERFACE_STATUS_CONNECTED:
1101 switch ( np->backend_state )
1102 {
1103 case BEST_CLOSED:
1104 WPRINTK("Unexpected netif status %s in state %s\n",
1105 status_name[status->status],
1106 be_state_name[np->backend_state]);
1107 vif_disconnect(np);
1108 vif_connect(np, status);
1109 break;
1110 case BEST_DISCONNECTED:
1111 vif_connect(np, status);
1112 break;
1113 }
1114 break;
1116 case NETIF_INTERFACE_STATUS_CHANGED:
1117 /*
1118 * The domain controller is notifying us that a device has been
1119 * added or removed.
1120 */
1121 break;
1123 default:
1124 WPRINTK("Invalid netif status code %d\n", status->status);
1125 break;
1126 }
1128 vif_show(np);
1129 }
1131 /*
1132 * Initialize the network control interface.
1133 */
1134 static void netif_driver_status(netif_fe_driver_status_t *status)
1135 {
1136 netctrl.up = status->status;
1137 netctrl_connected_count();
1138 }
1140 /* Receive handler for control messages. */
1141 static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
1142 {
1144 switch ( msg->subtype )
1145 {
1146 case CMSG_NETIF_FE_INTERFACE_STATUS:
1147 if ( msg->length != sizeof(netif_fe_interface_status_t) )
1148 goto error;
1149 netif_interface_status((netif_fe_interface_status_t *)
1150 &msg->msg[0]);
1151 break;
1153 case CMSG_NETIF_FE_DRIVER_STATUS:
1154 if ( msg->length != sizeof(netif_fe_driver_status_t) )
1155 goto error;
1156 netif_driver_status((netif_fe_driver_status_t *)
1157 &msg->msg[0]);
1158 break;
1160 error:
1161 default:
1162 msg->length = 0;
1163 break;
1164 }
1166 ctrl_if_send_response(msg);
1167 }
1170 #if 1
1171 /* Wait for all interfaces to be connected.
1173 * This works OK, but we'd like to use the probing mode (see below).
1174 */
1175 static int probe_interfaces(void)
1176 {
1177 int err = 0, conn = 0;
1178 int wait_i, wait_n = 100;
1180 DPRINTK(">\n");
1182 for ( wait_i = 0; wait_i < wait_n; wait_i++)
1183 {
1184 DPRINTK("> wait_i=%d\n", wait_i);
1185 conn = netctrl_connected();
1186 if(conn) break;
1187 DPRINTK("> schedule_timeout...\n");
1188 set_current_state(TASK_INTERRUPTIBLE);
1189 schedule_timeout(10);
1190 }
1192 DPRINTK("> wait finished...\n");
1193 if ( conn <= 0 )
1194 {
1195 err = netctrl_err(-ENETDOWN);
1196 WPRINTK("Failed to connect all virtual interfaces: err=%d\n", err);
1197 }
1199 DPRINTK("< err=%d\n", err);
1201 return err;
1202 }
1203 #else
1204 /* Probe for interfaces until no more are found.
1206 * This is the mode we'd like to use, but at the moment it panics the kernel.
1207 */
1208 static int probe_interfaces(void)
1209 {
1210 int err = 0;
1211 int wait_i, wait_n = 100;
1212 ctrl_msg_t cmsg = {
1213 .type = CMSG_NETIF_FE,
1214 .subtype = CMSG_NETIF_FE_INTERFACE_STATUS,
1215 .length = sizeof(netif_fe_interface_status_t),
1216 };
1217 netif_fe_interface_status_t msg = {};
1218 ctrl_msg_t rmsg = {};
1219 netif_fe_interface_status_t *reply = (void*)rmsg.msg;
1220 int state = TASK_UNINTERRUPTIBLE;
1221 u32 query = -1;
1223 DPRINTK(">\n");
1225 netctrl.interface_n = 0;
1226 for ( wait_i = 0; wait_i < wait_n; wait_i++ )
1227 {
1228 DPRINTK("> wait_i=%d query=%d\n", wait_i, query);
1229 msg.handle = query;
1230 memcpy(cmsg.msg, &msg, sizeof(msg));
1231 DPRINTK("> set_current_state...\n");
1232 set_current_state(state);
1233 DPRINTK("> rmsg=%p msg=%p, reply=%p\n", &rmsg, rmsg.msg, reply);
1234 DPRINTK("> sending...\n");
1235 err = ctrl_if_send_message_and_get_response(&cmsg, &rmsg, state);
1236 DPRINTK("> err=%d\n", err);
1237 if(err) goto exit;
1238 DPRINTK("> rmsg=%p msg=%p, reply=%p\n", &rmsg, rmsg.msg, reply);
1239 if((int)reply->handle < 0){
1240 // No more interfaces.
1241 break;
1242 }
1243 query = -reply->handle - 2;
1244 DPRINTK(">netif_interface_status ...\n");
1245 netif_interface_status(reply);
1246 }
1248 exit:
1249 if ( err )
1250 {
1251 err = netctrl_err(-ENETDOWN);
1252 WPRINTK("Connecting virtual network interfaces failed: err=%d\n", err);
1253 }
1255 DPRINTK("< err=%d\n", err);
1256 return err;
1257 }
1259 #endif
1261 /*
1262 * We use this notifier to send out a fake ARP reply to reset switches and
1263 * router ARP caches when an IP interface is brought up on a VIF.
1264 */
1265 static int inetdev_notify(struct notifier_block *this,
1266 unsigned long event,
1267 void *ptr)
1268 {
1269 struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
1270 struct net_device *dev = ifa->ifa_dev->dev;
1271 struct list_head *ent;
1272 struct net_private *np;
1274 if ( event != NETDEV_UP )
1275 goto out;
1277 list_for_each ( ent, &dev_list )
1278 {
1279 np = list_entry(ent, struct net_private, list);
1280 if ( np->dev == dev )
1281 (void)send_fake_arp(dev);
1282 }
1284 out:
1285 return NOTIFY_DONE;
1286 }
1288 static struct notifier_block notifier_inetdev = {
1289 .notifier_call = inetdev_notify,
1290 .next = NULL,
1291 .priority = 0
1292 };
1294 static int __init netif_init(void)
1295 {
1296 int err = 0;
1298 if ( xen_start_info.flags & SIF_INITDOMAIN )
1299 return 0;
1301 IPRINTK("Initialising virtual ethernet driver.\n");
1302 INIT_LIST_HEAD(&dev_list);
1303 (void)register_inetaddr_notifier(&notifier_inetdev);
1304 netctrl_init();
1305 (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
1306 CALLBACK_IN_BLOCKING_CONTEXT);
1307 send_driver_status(1);
1308 err = probe_interfaces();
1309 if ( err )
1310 ctrl_if_unregister_receiver(CMSG_NETIF_FE, netif_ctrlif_rx);
1312 DPRINTK("< err=%d\n", err);
1313 return err;
1314 }
1316 static void vif_suspend(struct net_private *np)
1317 {
1318 /* Avoid having tx/rx stuff happen until we're ready. */
1319 free_irq(np->irq, np->dev);
1320 unbind_evtchn_from_irq(np->evtchn);
1321 }
1323 static void vif_resume(struct net_private *np)
1324 {
1325 /*
1326 * Connect regardless of whether IFF_UP flag set.
1327 * Stop bad things from happening until we're back up.
1328 */
1329 np->backend_state = BEST_DISCONNECTED;
1330 memset(np->tx, 0, PAGE_SIZE);
1331 memset(np->rx, 0, PAGE_SIZE);
1333 send_interface_connect(np);
1334 }
1336 void netif_suspend(void)
1337 {
1338 struct list_head *ent;
1339 struct net_private *np;
1341 list_for_each ( ent, &dev_list )
1342 {
1343 np = list_entry(ent, struct net_private, list);
1344 vif_suspend(np);
1345 }
1346 }
1348 void netif_resume(void)
1349 {
1350 struct list_head *ent;
1351 struct net_private *np;
1353 list_for_each ( ent, &dev_list )
1354 {
1355 np = list_entry(ent, struct net_private, list);
1356 vif_resume(np);
1357 }
1358 }
1361 module_init(netif_init);