xenbits.xensource.com Git - people/ssmith/netchannel2-pvops.git/commitdiff
Forklift in the netchannel2 driver from the 2.6.18 tree.
author     Steven Smith <ssmith@weybridge.uk.xensource.com>
           Wed, 15 Apr 2009 13:26:58 +0000 (14:26 +0100)
committer  Steven Smith <ssmith@weybridge.uk.xensource.com>
           Tue, 19 May 2009 14:02:39 +0000 (15:02 +0100)
I'll tidy it up shortly.

32 files changed:
drivers/net/Kconfig
drivers/net/Makefile
drivers/net/xen-netchannel2/Makefile [new file with mode: 0644]
drivers/net/xen-netchannel2/autobypass.c [new file with mode: 0644]
drivers/net/xen-netchannel2/bypass.c [new file with mode: 0644]
drivers/net/xen-netchannel2/bypassee.c [new file with mode: 0644]
drivers/net/xen-netchannel2/chan.c [new file with mode: 0644]
drivers/net/xen-netchannel2/hvm_guest_dummy.c [new file with mode: 0644]
drivers/net/xen-netchannel2/limiter.c [new file with mode: 0644]
drivers/net/xen-netchannel2/netback2.c [new file with mode: 0644]
drivers/net/xen-netchannel2/netchan2.c [new file with mode: 0644]
drivers/net/xen-netchannel2/netchannel2_core.h [new file with mode: 0644]
drivers/net/xen-netchannel2/netchannel2_endpoint.h [new file with mode: 0644]
drivers/net/xen-netchannel2/netchannel2_uspace.h [new file with mode: 0644]
drivers/net/xen-netchannel2/netfront2.c [new file with mode: 0644]
drivers/net/xen-netchannel2/offload.c [new file with mode: 0644]
drivers/net/xen-netchannel2/overrides.mk [new file with mode: 0644]
drivers/net/xen-netchannel2/poll.c [new file with mode: 0644]
drivers/net/xen-netchannel2/posted_buffers.c [new file with mode: 0644]
drivers/net/xen-netchannel2/receiver_map.c [new file with mode: 0644]
drivers/net/xen-netchannel2/recv_packet.c [new file with mode: 0644]
drivers/net/xen-netchannel2/rscb.c [new file with mode: 0644]
drivers/net/xen-netchannel2/sysfs.c [new file with mode: 0644]
drivers/net/xen-netchannel2/tools/destroy_bypass.c [new file with mode: 0644]
drivers/net/xen-netchannel2/tools/establish_bypass.c [new file with mode: 0644]
drivers/net/xen-netchannel2/util.c [new file with mode: 0644]
drivers/net/xen-netchannel2/vmq.c [new file with mode: 0644]
drivers/net/xen-netchannel2/vmq.h [new file with mode: 0644]
drivers/net/xen-netchannel2/vmq_def.h [new file with mode: 0644]
drivers/net/xen-netchannel2/xmit_packet.c [new file with mode: 0644]
include/xen/interface/io/netchannel2.h [new file with mode: 0644]
include/xen/interface/io/uring.h [new file with mode: 0644]

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index e08fc224a2838b308650dfb9f73405b24440f03d..211c2fadc3fee98879e2295017969adac98a13e1 100644
@@ -2727,6 +2727,14 @@ config XEN_NETDEV_FRONTEND
          if you are compiling a kernel for a Xen guest, you almost
          certainly want to enable this.
 
+config XEN_NETCHANNEL2
+       tristate "Xen netchannel2 support"
+       depends on XEN
+       select XEN_XENBUS_FRONTEND
+       default y
+       help
+         Support for version 2 Xen network devices (netchannel2).
+
 config ISERIES_VETH
        tristate "iSeries Virtual Ethernet driver support"
        depends on PPC_ISERIES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 3665633ba184448a8cb2d931a1d84a4d53f935be..088ac0a1b6cddf4d85dd0ae946fcc6dafa585664 100644
@@ -150,6 +150,7 @@ obj-$(CONFIG_SLIP) += slip.o
 obj-$(CONFIG_SLHC) += slhc.o
 
 obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
+obj-$(CONFIG_XEN_NETCHANNEL2) += xen-netchannel2/
 
 obj-$(CONFIG_DUMMY) += dummy.o
 obj-$(CONFIG_IFB) += ifb.o
diff --git a/drivers/net/xen-netchannel2/Makefile b/drivers/net/xen-netchannel2/Makefile
new file mode 100644
index 0000000..69d5102
--- /dev/null
@@ -0,0 +1,39 @@
+sinclude $(M)/overrides.mk
+
+obj-$(CONFIG_XEN_NETCHANNEL2) += netchannel2.o
+
+netchannel2-objs := chan.o netchan2.o rscb.o util.o \
+       posted_buffers.o limiter.o xmit_packet.o offload.o recv_packet.o \
+       poll.o
+
+ifeq ($(CONFIG_XEN_NETDEV2_BYPASSABLE),y)
+netchannel2-objs += bypassee.o
+endif
+
+ifeq ($(CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT),y)
+netchannel2-objs += bypass.o
+endif
+
+ifeq ($(CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS),y)
+netchannel2-objs += autobypass.o
+endif
+
+ifeq ($(XEN_HVM_GUEST),y)
+netchannel2-objs += hvm_guest_dummy.o
+else ifeq ($(CONFIG_PARAVIRT),y)
+netchannel2-objs += hvm_guest_dummy.o
+else
+netchannel2-objs += receiver_map.o
+endif
+
+ifeq ($(CONFIG_XEN_NETDEV2_BACKEND),y)
+netchannel2-objs += netback2.o
+endif
+
+ifeq ($(CONFIG_XEN_NETDEV2_FRONTEND),y)
+netchannel2-objs += netfront2.o
+endif
+
+ifeq ($(CONFIG_XEN_NETDEV2_VMQ),y)
+netchannel2-objs += vmq.o
+endif
diff --git a/drivers/net/xen-netchannel2/autobypass.c b/drivers/net/xen-netchannel2/autobypass.c
new file mode 100644
index 0000000..f98d8bc
--- /dev/null
@@ -0,0 +1,316 @@
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include "netchannel2_core.h"
+
+/* The state machine works like this:
+
+   -- We start in state NORMAL.  In this state, we count how many
+      bypass and non-bypass packets we receive, and don't do anything
+      else.
+
+   -- After receiving AUTOBYPASS_PERIOD bypass-candidate packets, we
+      look at the candidate to non-candidate ratio.  If the number
+      of non-bypass packets exceeds the number of bypass packets by
+      more than a factor of AUTOBYPASS_RATIO, reset the counters and
+      go back to state NORMAL.  Otherwise, go to state CONSIDERING.
+      We also reset and go back to normal if it took more than
+      AUTOBYPASS_MAX_PERIOD_JIFFIES jiffies to get here.
+
+   -- In state CONSIDERING, continue to count up the bypass and
+      non-bypass packets.  In addition, whenever we get a bypass
+      packet, pull the source MAC address out of the header and
+      compare it to the hot list.  If it's in the hot list, increment
+      that entry's count.
+
+   -- After another AUTOBYPASS_PERIOD, check the packet counts again.
+      Provided the total bypass ratio is good enough (see the NORMAL
+      exit criteria), walk the hot list, and if any entry accounts for
+      more than 1/AUTOBYPASS_RATIO2 of the total traffic, suggest to
+      dom0 that it create a new bypass for us.  Then go to DEBOUNCE.
+
+   -- In DEBOUNCE, wait until we've received at least
+      AUTOBYPASS_DEBOUNCE_PERIOD bypass packets, then go to NORMAL.
+
+   So, we suggest a bypass if each window of PERIOD bypass-candidate
+   packets arrives within MAX_PERIOD jiffies, accompanied by no more
+   than RATIO times as many non-candidate packets, for two windows in
+   a row, and if, in the second window, a single peer accounts for
+   more than 1/RATIO2 of the traffic.
+*/
+
+/* If you increase this past 65536, consider changing the type of
+   auto_bypass.hot_macs[...].count, to avoid overflow. */
+#define AUTOBYPASS_PERIOD 1024
+#define AUTOBYPASS_RATIO 2
+#define AUTOBYPASS_RATIO2 4
+#define AUTOBYPASS_DEBOUNCE_PERIOD 1024
+#define AUTOBYPASS_MAX_PERIOD_JIFFIES (HZ/2)
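
To make these thresholds concrete, here is a small stand-alone sketch (not part of the driver) that prints the rates the constants above are intended to express, per the state-machine comment; the HZ value is an assumption for illustration only:

    /* Hypothetical illustration of the autobypass thresholds; HZ is assumed. */
    #include <stdio.h>

    #define HZ 250                          /* assumption for the example */
    #define AUTOBYPASS_PERIOD 1024
    #define AUTOBYPASS_RATIO 2
    #define AUTOBYPASS_RATIO2 4
    #define AUTOBYPASS_MAX_PERIOD_JIFFIES (HZ / 2)

    int main(void)
    {
            /* A window may last at most HZ/2 jiffies, i.e. half a second. */
            double window_secs = (double)AUTOBYPASS_MAX_PERIOD_JIFFIES / HZ;

            /* The window only counts if it contains PERIOD bypass-candidate
               packets, so the candidate rate must be at least: */
            printf("min candidate rate: %.0f packets/sec\n",
                   AUTOBYPASS_PERIOD / window_secs);             /* 2048 */

            /* ...with at most RATIO times as many non-candidate packets: */
            printf("max non-candidates per window: %d\n",
                   AUTOBYPASS_PERIOD * AUTOBYPASS_RATIO);        /* 2048 */

            /* In the second (CONSIDERING) window, one source MAC must account
               for more than 1/RATIO2 of all packets to get a suggestion. */
            printf("per-peer share needed: more than 1/%d of the traffic\n",
                   AUTOBYPASS_RATIO2);
            return 0;
    }

This has to hold for two consecutive windows (2*AUTOBYPASS_PERIOD candidate packets in total) before a SUGGEST_BYPASS message is actually queued.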
+
+
+#define TEARDOWN_PERIOD_JIFFIES (HZ*5)
+#define TEARDOWN_MIN_PACKETS (256*TEARDOWN_PERIOD_JIFFIES)
+
+static void autoteardown_timer_fn(unsigned long ignore);
+
+static DEFINE_SPINLOCK(autoteardown_lock);
+static LIST_HEAD(autoteardown_list);
+static DEFINE_TIMER(autoteardown_timer, autoteardown_timer_fn, 0, 0);
+
+static void autoteardown_timer_fn(unsigned long ignore)
+{
+        struct nc2_alternate_ring *nar;
+
+        spin_lock(&autoteardown_lock);
+        list_for_each_entry(nar, &autoteardown_list,
+                            autoteardown.autoteardown_list) {
+                if (nar->autoteardown.seen_count < 2) {
+                        /* Give it at least two periods to get started,
+                           to avoid flapping. */
+                        /* One period isn't enough, because we reset
+                           the seen_count without holding the teardown
+                           lock from
+                           nc2_aux_ring_start_disable_sequence, and
+                           there's a risk that we'll see it non-zero
+                           when it should be zero.  However, the
+                           chances of that happening twice in a row
+                           are so small that we can ignore them.  Even
+                           if it does go wrong twice, the worst case
+                           is that we drop a few packets by forcing a
+                           detach when the remote is behaving
+                           correctly. */
+                        nar->autoteardown.seen_count++;
+                        continue;
+                }
+                switch (nar->state) {
+                case nc2_alt_ring_frontend_sent_ready:
+                        /* Interesting.  We're ready to go, but the
+                           backend isn't.  Furthermore, this isn't the
+                           first time we've seen this interface, so
+                           we've been trying to establish it for at
+                           least TEARDOWN_PERIOD_JIFFIES.  Conclude
+                           that the backend is misbehaving and start a
+                           disable sequence. */
+                        nc2_aux_ring_start_disable_sequence(nar);
+                        break;
+                case nc2_alt_ring_ready:
+                        if (nar->autoteardown.nr_packets <
+                            TEARDOWN_MIN_PACKETS) {
+                                /* This interface isn't busy enough ->
+                                   needs to be torn down. */
+                                nc2_aux_ring_start_disable_sequence(nar);
+                        }
+                        nar->autoteardown.nr_packets = 0;
+                        break;
+                case nc2_alt_ring_disabling:
+                        /* We seem to have gotten stuck trying to
+                           disable the ring, probably because the
+                           remote isn't sending FINISH messages fast
+                           enough.  Be a bit more aggressive. */
+                        nc2_aux_ring_start_detach_sequence(nar);
+                        break;
+                default:
+                        /* Other states are waiting either for the
+                           local operating system to complete work
+                           items, or for the upstream interface to
+                           process messages.  Upstream is always
+                           trusted, so just assume that this'll fix
+                           itself sooner or later. */
+                        break;
+                }
+        }
+        if (!list_empty(&autoteardown_list)) {
+                mod_timer(&autoteardown_timer,
+                          jiffies + TEARDOWN_PERIOD_JIFFIES);
+        }
+        spin_unlock(&autoteardown_lock);
+}
+
+void nc2_register_bypass_for_autoteardown(struct nc2_alternate_ring *nar)
+{
+        spin_lock_bh(&autoteardown_lock);
+        if (list_empty(&autoteardown_list))
+                mod_timer(&autoteardown_timer,
+                          jiffies + TEARDOWN_PERIOD_JIFFIES);
+        list_move(&nar->autoteardown.autoteardown_list, &autoteardown_list);
+        spin_unlock_bh(&autoteardown_lock);
+}
+
+void nc2_unregister_bypass_for_autoteardown(struct nc2_alternate_ring *nar)
+{
+        spin_lock_bh(&autoteardown_lock);
+        list_del_init(&nar->autoteardown.autoteardown_list);
+        if (list_empty(&autoteardown_list))
+                del_timer(&autoteardown_timer);
+        spin_unlock_bh(&autoteardown_lock);
+}
+
+static int busy_enough_for_bypass(struct netchannel2 *nc)
+{
+        uint64_t nr_non_bypass;
+        unsigned long start_jiffies;
+
+        nr_non_bypass = nc->auto_bypass.nr_non_bypass_packets;
+        start_jiffies = nc->auto_bypass.start_jiffies;
+        nc->auto_bypass.nr_non_bypass_packets = 0;
+        nc->auto_bypass.nr_bypass_packets = 0;
+        if (nr_non_bypass > AUTOBYPASS_PERIOD * AUTOBYPASS_RATIO ||
+            jiffies - start_jiffies > AUTOBYPASS_MAX_PERIOD_JIFFIES) {
+                /* Either took too long to collect the bypass
+                   packets, or too many non-bypass relative to
+                   number of bypasses.  Either way, not a good
+                   time to consider doing bypasses. */
+                nc->auto_bypass.start_jiffies = jiffies;
+                return 0;
+        } else {
+                return 1;
+        }
+}
+
+static void record_source_mac(struct netchannel2 *nc, struct sk_buff *skb)
+{
+        struct ethhdr *eh;
+        unsigned x;
+
+        if (skb_headlen(skb) < sizeof(struct ethhdr))
+                return;
+        eh = (struct ethhdr *)skb->data;
+        for (x = 0; x < nc->auto_bypass.nr_hot_macs; x++) {
+                if (!memcmp(eh->h_source, nc->auto_bypass.hot_macs[x].mac,
+                            sizeof(eh->h_source))) {
+                        nc->auto_bypass.hot_macs[x].count++;
+                        return;
+                }
+        }
+        if (x == AUTOBYPASS_MAX_HOT_MACS) {
+                /* Communicating with too many bypass candidates ->
+                   can't keep track of them all -> just don't track
+                   this one. */
+                return;
+        }
+        nc->auto_bypass.hot_macs[x].count = 1;
+        memcpy(nc->auto_bypass.hot_macs[x].mac,
+               eh->h_source,
+               sizeof(eh->h_source));
+        nc->auto_bypass.nr_hot_macs++;
+}
+
+static void queue_suggested_bypass(struct netchannel2 *nc,
+                                   const char *mac)
+{
+        int ind;
+
+        ind = nc->auto_bypass.suggestion_head % AUTOBYPASS_SUGG_QUEUE_SIZE;
+        if (nc->auto_bypass.suggestion_head ==
+            nc->auto_bypass.suggestion_tail + AUTOBYPASS_SUGG_QUEUE_SIZE) {
+                /* We've overflowed the suggestion queue.  That means
+                   that, even though we're receiving a massive number
+                   of packets, we've never had enough free ring space
+                   to actually send a suggestion message.  I'm not
+                   convinced that's actually possible, but it's
+                   trivial to handle, so we might as well. */
+                /* Drop the oldest pending suggestion, since it's the
+                   most likely to be out of date and therefore
+                   useless. */
+                nc->auto_bypass.suggestion_tail++;
+        }
+        nc->auto_bypass.suggestion_head++;
+        memcpy(&nc->auto_bypass.suggestions[ind],
+               mac,
+               ETH_ALEN);
+}
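
The suggestion queue never wraps its counters explicitly: suggestion_head and suggestion_tail are free-running, only the slot index is reduced modulo AUTOBYPASS_SUGG_QUEUE_SIZE, and "full" is detected as head == tail + size. A minimal stand-alone sketch of that scheme, using hypothetical names rather than the driver's structures:

    /* Hypothetical free-running head/tail queue, mirroring the overflow
       policy above: when full, drop the oldest pending suggestion. */
    #include <stdio.h>
    #include <string.h>

    #define QUEUE_SIZE 4            /* stands in for AUTOBYPASS_SUGG_QUEUE_SIZE */
    #define MAC_LEN 6

    struct mac_queue {
            unsigned head;          /* next slot to write; only increases */
            unsigned tail;          /* next slot to drain; only increases */
            unsigned char slots[QUEUE_SIZE][MAC_LEN];
    };

    static void push_mac(struct mac_queue *q, const unsigned char *mac)
    {
            if (q->head == q->tail + QUEUE_SIZE)
                    q->tail++;      /* full: discard the oldest entry */
            memcpy(q->slots[q->head % QUEUE_SIZE], mac, MAC_LEN);
            q->head++;
    }

    static int pop_mac(struct mac_queue *q, unsigned char *mac)
    {
            if (q->head == q->tail)
                    return 0;       /* empty */
            memcpy(mac, q->slots[q->tail % QUEUE_SIZE], MAC_LEN);
            q->tail++;
            return 1;
    }

    int main(void)
    {
            struct mac_queue q = { .head = 0, .tail = 0 };
            unsigned char mac[MAC_LEN] = { 0x00, 0x16, 0x3e, 0x00, 0x00, 0x01 };
            unsigned char out[MAC_LEN];

            push_mac(&q, mac);
            while (pop_mac(&q, out))
                    printf("suggest bypass for %02x:%02x:%02x:%02x:%02x:%02x\n",
                           out[0], out[1], out[2], out[3], out[4], out[5]);
            return 0;
    }

The same invariant is what the BUG_ON in _nc2_autobypass_make_suggestions() below checks: head can never get more than AUTOBYPASS_SUGG_QUEUE_SIZE ahead of tail.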
+
+static void suggest_bypasses(struct netchannel2 *nc)
+{
+        unsigned x;
+        unsigned threshold;
+
+        BUG_ON(nc->auto_bypass.nr_hot_macs == 0);
+        threshold =
+                (nc->auto_bypass.nr_non_bypass_packets +
+                 nc->auto_bypass.nr_bypass_packets) / AUTOBYPASS_RATIO2;
+        for (x = 0; x < nc->auto_bypass.nr_hot_macs; x++) {
+                if (nc->auto_bypass.hot_macs[x].count > threshold) {
+                        queue_suggested_bypass(
+                                nc,
+                                nc->auto_bypass.hot_macs[x].mac);
+                }
+        }
+}
+
+/* Called under the master ring lock whenever we receive a packet with
+   NC2_PACKET_FLAG_bypass_candidate set. */
+void nc2_received_bypass_candidate_packet(struct netchannel2 *nc,
+                                          struct sk_buff *skb)
+{
+        nc->auto_bypass.nr_bypass_packets++;
+        switch (nc->auto_bypass.state) {
+        case autobypass_state_normal:
+                if (nc->auto_bypass.nr_bypass_packets != AUTOBYPASS_PERIOD)
+                        return;
+                if (!busy_enough_for_bypass(nc))
+                        return;
+                nc->auto_bypass.nr_hot_macs = 0;
+                nc->auto_bypass.state = autobypass_state_considering;
+                break;
+        case autobypass_state_considering:
+                record_source_mac(nc, skb);
+                if (nc->auto_bypass.nr_bypass_packets != AUTOBYPASS_PERIOD)
+                        return;
+                if (busy_enough_for_bypass(nc))
+                        suggest_bypasses(nc);
+                nc->auto_bypass.state = autobypass_state_debounce;
+                break;
+        case autobypass_state_debounce:
+                if (nc->auto_bypass.nr_bypass_packets == AUTOBYPASS_PERIOD) {
+                        nc->auto_bypass.state = autobypass_state_normal;
+                        nc->auto_bypass.nr_non_bypass_packets = 0;
+                        nc->auto_bypass.nr_bypass_packets = 0;
+                        nc->auto_bypass.start_jiffies = jiffies;
+                }
+                break;
+        }
+}
+
+static int send_suggestion(struct netchannel2_ring_pair *ncrp,
+                           const char *mac)
+{
+        struct netchannel2_msg_suggest_bypass msg;
+
+        if (!nc2_can_send_payload_bytes(&ncrp->prod_ring, sizeof(msg)))
+                return 0;
+
+        memset(&msg, 0, sizeof(msg));
+        memcpy(msg.mac, mac, ETH_ALEN);
+        nc2_send_message(&ncrp->prod_ring,
+                         NETCHANNEL2_MSG_SUGGEST_BYPASS,
+                         0,
+                         &msg,
+                         sizeof(msg));
+        ncrp->pending_time_sensitive_messages = 1;
+        return 1;
+}
+
+void _nc2_autobypass_make_suggestions(struct netchannel2 *nc)
+{
+        struct nc2_auto_bypass *nab = &nc->auto_bypass;
+        struct netchannel2_ring_pair *ncrp = &nc->rings;
+        unsigned ind;
+
+        while (nab->suggestion_tail != nab->suggestion_head) {
+                BUG_ON(nab->suggestion_head - nab->suggestion_tail >
+                       AUTOBYPASS_SUGG_QUEUE_SIZE);
+                ind = nab->suggestion_tail % AUTOBYPASS_SUGG_QUEUE_SIZE;
+                if (!send_suggestion(ncrp, nab->suggestions[ind].mac))
+                        break;
+                nab->suggestion_tail++;
+        }
+}
+
+void nc2_shutdown_autoteardown(void)
+{
+        /* There shouldn't be any interfaces at all, so there
+           certainly won't be any bypasses, and we don't have to worry
+           about the timer getting requeued.  Make sure it's finished
+           and then get out. */
+        del_timer_sync(&autoteardown_timer);
+}
diff --git a/drivers/net/xen-netchannel2/bypass.c b/drivers/net/xen-netchannel2/bypass.c
new file mode 100644
index 0000000..582c8f2
--- /dev/null
@@ -0,0 +1,830 @@
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <xen/evtchn.h>
+#include <xen/driver_util.h>
+#include "netchannel2_core.h"
+
+/* Can we send this packet on this bypass?  True if the destination
+   MAC address matches. */
+static int can_bypass_packet(struct nc2_alternate_ring *ncr,
+                             struct sk_buff *skb)
+{
+        struct ethhdr *eh;
+
+        if (skb_headlen(skb) < sizeof(*eh))
+                return 0;
+        eh = (struct ethhdr *)skb->data;
+        if (memcmp(eh->h_dest, ncr->rings.remote_mac, ETH_ALEN))
+                return 0;
+        else
+                return 1;
+}
+
+/* Called from the netdev start_xmit method.  We're holding the master
+   nc ring lock, but not the bypass ring lock. */
+int bypass_xmit_packet(struct netchannel2 *nc,
+                       struct nc2_alternate_ring *ncr,
+                       struct sk_buff *skb)
+{
+        struct netchannel2_ring_pair *rings = &ncr->rings;
+        struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
+        size_t msg_size;
+        enum transmit_policy policy;
+        int r;
+
+        if (!can_bypass_packet(ncr, skb))
+                return 0;
+
+        spin_lock(&rings->lock);
+        if (ncr->state != nc2_alt_ring_ready) {
+                spin_unlock(&rings->lock);
+                return 0;
+        }
+        /* We're now committed to either transmitting this packet on
+           this ring or dropping it outright. */
+        if (skb->len <= PACKET_PREFIX_SIZE && !skb_is_nonlinear(skb)) {
+                r = prepare_xmit_allocate_small(rings, skb);
+                policy = transmit_policy_small;
+        } else {
+                r = prepare_xmit_allocate_grant(rings, skb, 1);
+                policy = transmit_policy_grant;
+        }
+        if (r < 0) {
+                printk("<0>Drop no prepare.\n");
+                spin_unlock(&rings->lock);
+                dev_kfree_skb(skb);
+                return 1;
+        }
+
+        skb_co->policy = policy;
+        msg_size = get_transmitted_packet_msg_size(skb);
+        if (!nc2_reserve_payload_bytes(&rings->prod_ring, msg_size)) {
+                /* Uh oh. */
+                printk("<0>Drop no reserve.\n");
+                release_tx_packet(rings, skb);
+                spin_unlock(&rings->lock);
+                return 1;
+        }
+
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+        ncr->autoteardown.nr_packets++;
+#endif
+
+        queue_packet_to_interface(skb, rings);
+
+        spin_unlock(&rings->lock);
+
+        return 1;
+}
+
+void nc2_aux_ring_start_disable_sequence(struct nc2_alternate_ring *nar)
+{
+        spin_lock(&nar->rings.lock);
+        if (nar->state < nc2_alt_ring_disabling) {
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+                /* We should really hold the autoteardown lock for
+                   this, but see the big comment in
+                   autoteardown_timer_fn() */
+                nar->autoteardown.seen_count = 0;
+#endif
+                nar->state = nc2_alt_ring_disabling;
+                nc2_kick(&nar->rings);
+        }
+        spin_unlock(&nar->rings.lock);
+}
+
+static void start_detach_worker(void *data)
+{
+        struct nc2_alternate_ring *ncr = data;
+
+        ENTER();
+
+        /* Detach from the ring.  Note that it may still be running at
+           this point.  In that case, we need to stop it and then go
+           and discard any outstanding messages on it. */
+
+        /* Stop the IRQ and change state.  This will prevent us from
+           being added to the schedule list again, but we may still be
+           on it for other reasons, so we need to get back into the
+           worker thread to finish up. */
+
+        /* We defer actually unmapping the rings to
+           nc2_advertise_rings(), since that's on the worker thread
+           and we therefore know we're not going to race anything
+           doing it there. */
+
+        if (ncr->rings.irq >= 0)
+                unbind_from_irqhandler(ncr->rings.irq, &ncr->rings);
+        ncr->rings.irq = -1;
+
+        nc2_unregister_bypass_for_autoteardown(ncr);
+
+        spin_lock_bh(&ncr->rings.lock);
+        ncr->state = nc2_alt_ring_detached_pending;
+        ncr->rings.interface->need_aux_ring_state_machine = 1;
+        nc2_kick(&ncr->rings.interface->rings);
+        spin_unlock_bh(&ncr->rings.lock);
+
+        EXIT();
+}
+
+void nc2_aux_ring_start_detach_sequence(struct nc2_alternate_ring *nar)
+{
+        spin_lock(&nar->rings.lock);
+        if (nar->state >= nc2_alt_ring_detaching) {
+                spin_unlock(&nar->rings.lock);
+                return;
+        }
+        nar->state = nc2_alt_ring_detaching;
+        spin_unlock(&nar->rings.lock);
+
+        /* We can't do unbind_from_irqhandler() from a tasklet, so
+           punt it to a workitem. */
+        INIT_WORK(&nar->detach_work_item,
+                  start_detach_worker,
+                  nar);
+        schedule_work(&nar->detach_work_item);
+}
+
+/* Crank through the auxiliary ring state machine.  Called holding the
+ * master ring lock. */
+void _nc2_crank_aux_ring_state_machine(struct netchannel2 *nc)
+{
+        struct nc2_alternate_ring *nar;
+        struct nc2_alternate_ring *next_nar;
+        struct netchannel2_msg_bypass_disabled disabled_msg;
+        struct netchannel2_msg_bypass_detached detached_msg;
+        struct netchannel2_msg_bypass_frontend_ready frontend_ready_msg;
+
+        memset(&disabled_msg, 0, sizeof(disabled_msg));
+        memset(&detached_msg, 0, sizeof(detached_msg));
+        memset(&frontend_ready_msg, 0, sizeof(frontend_ready_msg));
+
+        if (nc->pending_bypass_error) {
+                if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring,
+                                                sizeof(frontend_ready_msg)))
+                        return;
+                frontend_ready_msg.port = -1;
+                nc2_send_message(&nc->rings.prod_ring,
+                                 NETCHANNEL2_MSG_BYPASS_FRONTEND_READY,
+                                 0,
+                                 &frontend_ready_msg,
+                                 sizeof(frontend_ready_msg));
+                nc->rings.pending_time_sensitive_messages = 1;
+                nc->pending_bypass_error = 0;
+        }
+
+        list_for_each_entry_safe(nar, next_nar, &nc->alternate_rings,
+                                 rings_by_interface) {
+
+                spin_lock(&nar->rings.lock);
+                if (nar->state == nc2_alt_ring_frontend_send_ready_pending) {
+                        if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring,
+                                                        sizeof(frontend_ready_msg))) {
+                                spin_unlock(&nar->rings.lock);
+                                return;
+                        }
+                        frontend_ready_msg.port =
+                                irq_to_evtchn_port(nar->rings.irq);
+                        nc2_send_message(&nc->rings.prod_ring,
+                                         NETCHANNEL2_MSG_BYPASS_FRONTEND_READY,
+                                         0,
+                                         &frontend_ready_msg,
+                                         sizeof(frontend_ready_msg));
+                        nar->state = nc2_alt_ring_frontend_sent_ready;
+                        nc->rings.pending_time_sensitive_messages = 1;
+                }
+                if (nar->state == nc2_alt_ring_disabled_pending) {
+                        if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring,
+                                                        sizeof(disabled_msg))){
+                                spin_unlock(&nar->rings.lock);
+                                return;
+                        }
+                        disabled_msg.handle = nar->handle;
+                        nc2_send_message(&nc->rings.prod_ring,
+                                         NETCHANNEL2_MSG_BYPASS_DISABLED,
+                                         0,
+                                         &disabled_msg,
+                                         sizeof(disabled_msg));
+                        nar->state = nc2_alt_ring_disabled;
+                        nc->rings.pending_time_sensitive_messages = 1;
+                }
+                if (nar->state == nc2_alt_ring_detached_pending) {
+                        if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring,
+                                                        sizeof(detached_msg))){
+                                spin_unlock(&nar->rings.lock);
+                                return;
+                        }
+
+                        /* If we get here then we know that nobody
+                           else is going to touch the ring, because
+                           that's what detached_pending means. */
+                        /* Deferred from start_detach_worker() */
+                        nc2_unmap_grants(&nar->prod_mapper);
+                        nc2_unmap_grants(&nar->cons_mapper);
+                        nc2_unmap_grants(&nar->control_mapper);
+
+                        detached_msg.handle = nar->handle;
+                        nc2_send_message(&nc->rings.prod_ring,
+                                         NETCHANNEL2_MSG_BYPASS_DETACHED,
+                                         0,
+                                         &detached_msg,
+                                         sizeof(detached_msg));
+                        nc->rings.pending_time_sensitive_messages = 1;
+
+                        list_del(&nar->rings_by_interface);
+
+                        spin_unlock(&nar->rings.lock);
+
+                        kfree(nar);
+                } else {
+                        spin_unlock(&nar->rings.lock);
+                }
+        }
+        nc->need_aux_ring_state_machine = 0;
+}
+
+static int map_rings_common(struct nc2_alternate_ring *ncr,
+                            struct netchannel2_msg_bypass_common *msg)
+{
+        int err;
+
+        err = nc2_map_grants(&ncr->prod_mapper,
+                             ncr->prod_grefs,
+                             msg->ring_pages,
+                             msg->ring_domid);
+        if (err < 0) {
+                printk(KERN_ERR "%d mapping producer ring", err);
+                return err;
+        }
+
+        err = nc2_map_grants(&ncr->cons_mapper,
+                             ncr->cons_grefs,
+                             msg->ring_pages,
+                             msg->ring_domid);
+        if (err < 0) {
+                printk(KERN_ERR "%d mapping consumer ring", err);
+                return err;
+        }
+
+        err = nc2_map_grants(&ncr->control_mapper,
+                             &msg->control_gref,
+                             1,
+                             msg->ring_domid);
+        if (err < 0)
+                printk(KERN_ERR "%d mapping control ring", err);
+        return err;
+}
+
+static int map_rings_frontend(struct nc2_alternate_ring *ncr)
+{
+        struct netchannel2_frontend_shared *nfs;
+        struct netchannel2_sring_prod *prod_sring;
+        struct netchannel2_sring_cons *cons_sring;
+        int err;
+
+        err = map_rings_common(ncr, &ncr->frontend_setup_msg.common);
+        if (err < 0)
+                return err;
+
+        nfs = ncr->control_mapper.mapping->addr;
+        cons_sring = &nfs->cons;
+        prod_sring = &nfs->prod;
+        _nc2_attach_rings(&ncr->rings,
+                          cons_sring,
+                          ncr->cons_mapper.mapping->addr,
+                          ncr->frontend_setup_msg.common.ring_pages * PAGE_SIZE,
+                          prod_sring,
+                          ncr->prod_mapper.mapping->addr,
+                          ncr->frontend_setup_msg.common.ring_pages * PAGE_SIZE,
+                          ncr->frontend_setup_msg.common.peer_domid);
+
+        return 0;
+}
+
+static int map_rings_backend(struct nc2_alternate_ring *ncr)
+{
+        struct netchannel2_backend_shared *nbs;
+        struct netchannel2_sring_prod *prod_sring;
+        struct netchannel2_sring_cons *cons_sring;
+        int err;
+
+        err = map_rings_common(ncr, &ncr->backend_setup_msg.common);
+        if (err < 0)
+                return err;
+
+        nbs = ncr->control_mapper.mapping->addr;
+        cons_sring = &nbs->cons;
+        prod_sring = &nbs->prod;
+        _nc2_attach_rings(&ncr->rings,
+                          cons_sring,
+                          ncr->cons_mapper.mapping->addr,
+                          ncr->backend_setup_msg.common.ring_pages * PAGE_SIZE,
+                          prod_sring,
+                          ncr->prod_mapper.mapping->addr,
+                          ncr->backend_setup_msg.common.ring_pages * PAGE_SIZE,
+                          ncr->backend_setup_msg.common.peer_domid);
+
+        return 0;
+}
+
+static void send_ready_message(struct nc2_alternate_ring *ncr)
+{
+        struct netchannel2_msg_bypass_ready msg;
+
+        ENTER();
+
+        memset(&msg, 0, sizeof(msg));
+        if (nc2_can_send_payload_bytes(&ncr->rings.prod_ring, sizeof(msg))) {
+                nc2_send_message(&ncr->rings.prod_ring,
+                                 NETCHANNEL2_MSG_BYPASS_READY,
+                                 0, &msg, sizeof(msg));
+                if (nc2_flush_ring(&ncr->rings.prod_ring))
+                        notify_remote_via_irq(ncr->rings.irq);
+        } else {
+                /* This shouldn't happen, because the producer ring
+                   should be essentially empty at this stage.  If it
+                   does, it probably means the other end is playing
+                   silly buggers with the ring indexes.  Drop the
+                   message. */
+                printk(KERN_WARNING "Failed to send bypass ring ready message.\n");
+        }
+        EXIT();
+}
+
+void nc2_handle_bypass_ready(struct netchannel2 *nc,
+                             struct netchannel2_ring_pair *ncrp,
+                             struct netchannel2_msg_hdr *hdr)
+{
+        struct nc2_alternate_ring *ncr;
+
+        if (ncrp == &nc->rings) {
+                pr_debug("bypass ready on principal interface?\n");
+                return;
+        }
+        ncr = container_of(ncrp, struct nc2_alternate_ring, rings);
+        /* We're now allowed to start sending packets over this
+         * ring. */
+        if (ncr->state == nc2_alt_ring_frontend_sent_ready) {
+                ncr->state = nc2_alt_ring_ready;
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+                ncr->autoteardown.seen_count = 0;
+#endif
+        }
+        DEBUGMSG("Bypass ready.");
+}
+
+/* Called holding the aux ring lock. */
+void _nc2_alternate_ring_disable_finish(struct nc2_alternate_ring *ncr)
+{
+        /* No more packets will ever come out of this ring -> it is
+           now disabled. */
+        ncr->state = nc2_alt_ring_disabled_pending;
+        ncr->rings.interface->need_aux_ring_state_machine = 1;
+        nc2_kick(&ncr->rings.interface->rings);
+}
+
+static void initialise_bypass_frontend_work_item(void *data)
+{
+        struct nc2_alternate_ring *ncr = data;
+        struct netchannel2 *interface = ncr->rings.interface;
+        int err;
+
+        memcpy(&ncr->rings.remote_mac,
+               ncr->frontend_setup_msg.common.remote_mac, 6);
+        err = map_rings_frontend(ncr);
+        if (err < 0)
+                goto err;
+
+        BUG_ON(ncr->rings.cons_ring.sring == NULL);
+
+        err = bind_listening_port_to_irqhandler(ncr->rings.otherend_id,
+                                                nc2_int,
+                                                0,
+                                                "netchannel2_bypass",
+                                                &ncr->rings);
+        if (err < 0)
+                goto err;
+        ncr->rings.irq = err;
+
+        /* Get it going. */
+        nc2_kick(&ncr->rings);
+
+        /* And get the master ring to send a FRONTEND_READY message */
+        ncr->state = nc2_alt_ring_frontend_send_ready_pending;
+        spin_lock_bh(&interface->rings.lock);
+        interface->need_aux_ring_state_machine = 1;
+        nc2_kick(&interface->rings);
+        spin_unlock_bh(&interface->rings.lock);
+
+        nc2_register_bypass_for_autoteardown(ncr);
+
+        return;
+
+err:
+        printk(KERN_ERR "Error %d setting up bypass ring!\n", err);
+
+        spin_lock_bh(&interface->rings.lock);
+        interface->pending_bypass_error = 1;
+        interface->need_aux_ring_state_machine = 1;
+        nc2_kick(&interface->rings);
+        list_del(&ncr->rings_by_interface);
+        spin_unlock_bh(&interface->rings.lock);
+
+        nc2_unmap_grants(&ncr->prod_mapper);
+        nc2_unmap_grants(&ncr->cons_mapper);
+        nc2_unmap_grants(&ncr->control_mapper);
+        kfree(ncr);
+        return;
+}
+
+static void initialise_bypass_backend_work_item(void *data)
+{
+        struct nc2_alternate_ring *ncr = data;
+        struct netchannel2 *interface = ncr->rings.interface;
+        int err;
+
+        memcpy(&ncr->rings.remote_mac,
+               ncr->backend_setup_msg.common.remote_mac, 6);
+        err = map_rings_backend(ncr);
+        if (err < 0)
+                goto err;
+
+        err = bind_interdomain_evtchn_to_irqhandler(ncr->rings.otherend_id,
+                                                    ncr->backend_setup_msg.port,
+                                                    nc2_int,
+                                                    0,
+                                                    "netchannel2_bypass",
+                                                    &ncr->rings);
+        if (err < 0)
+                goto err;
+        ncr->rings.irq = err;
+
+        send_ready_message(ncr);
+
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+        ncr->autoteardown.seen_count = 0;
+#endif
+
+        spin_lock_bh(&ncr->rings.lock);
+        ncr->state = nc2_alt_ring_ready;
+        spin_unlock_bh(&ncr->rings.lock);
+
+        nc2_kick(&ncr->rings);
+
+        nc2_register_bypass_for_autoteardown(ncr);
+
+        return;
+
+err:
+        printk(KERN_ERR "Error %d setting up bypass ring!\n", err);
+
+        spin_lock_bh(&interface->rings.lock);
+        list_del(&ncr->rings_by_interface);
+        spin_unlock_bh(&interface->rings.lock);
+
+        nc2_unmap_grants(&ncr->prod_mapper);
+        nc2_unmap_grants(&ncr->cons_mapper);
+        nc2_unmap_grants(&ncr->control_mapper);
+        kfree(ncr);
+        return;
+}
+
+void nc2_handle_bypass_frontend(struct netchannel2 *nc,
+                                struct netchannel2_ring_pair *ncrp,
+                                struct netchannel2_msg_hdr *hdr)
+{
+        struct nc2_alternate_ring *work;
+
+        if (hdr->size < sizeof(work->frontend_setup_msg)) {
+                pr_debug("Bypass message had strange size %d\n", hdr->size);
+                return;
+        }
+        if (ncrp != &nc->rings) {
+                pr_debug("Bypass message on ancillary ring!\n");
+                return;
+        }
+        if (!nc->remote_trusted) {
+                pr_debug("Untrusted domain tried to set up a bypass.\n");
+                return;
+        }
+        if (nc->pending_bypass_error) {
+                pr_debug("Remote tried to establish a bypass when we already had a pending error\n");
+                return;
+        }
+        work = kzalloc(sizeof(*work), GFP_ATOMIC);
+        if (!work) {
+                printk(KERN_WARNING "no memory for alternative ring pair!\n");
+                nc->pending_bypass_error = 1;
+                nc->need_aux_ring_state_machine = 1;
+                return;
+        }
+        nc2_copy_from_ring(&nc->rings.cons_ring, &work->frontend_setup_msg,
+                           sizeof(work->frontend_setup_msg));
+        if (hdr->size != sizeof(work->frontend_setup_msg) +
+                                  sizeof(uint32_t) * 2 *
+                                  work->frontend_setup_msg.common.ring_pages) {
+                printk(KERN_WARNING "inconsistent bypass message size (%d for %d pages)\n",
+                       hdr->size, work->frontend_setup_msg.common.ring_pages);
+                goto err;
+        }
+        if (work->frontend_setup_msg.common.ring_pages >
+            MAX_BYPASS_RING_PAGES_MAPPABLE) {
+                printk(KERN_WARNING "too many ring pages: %d > %d\n",
+                       work->frontend_setup_msg.common.ring_pages,
+                       MAX_BYPASS_RING_PAGES_MAPPABLE);
+        err:
+                kfree(work);
+                nc->pending_bypass_error = 1;
+                nc->need_aux_ring_state_machine = 1;
+                return;
+        }
+        nc2_copy_from_ring_off(&ncrp->cons_ring,
+                               &work->prod_grefs,
+                               sizeof(uint32_t) *
+                                   work->frontend_setup_msg.common.ring_pages,
+                               sizeof(work->frontend_setup_msg));
+        nc2_copy_from_ring_off(&ncrp->cons_ring,
+                               &work->cons_grefs,
+                               sizeof(uint32_t) *
+                                   work->frontend_setup_msg.common.ring_pages,
+                               sizeof(work->frontend_setup_msg) +
+                                   sizeof(uint32_t) *
+                                   work->frontend_setup_msg.common.ring_pages);
+
+        work->state = nc2_alt_ring_frontend_preparing;
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+        INIT_LIST_HEAD(&work->autoteardown.autoteardown_list);
+#endif
+        init_waitqueue_head(&work->eventq);
+        work->handle = work->frontend_setup_msg.common.handle;
+        INIT_WORK(&work->work_item, initialise_bypass_frontend_work_item,
+                  work);
+        init_ring_pair(&work->rings);
+        work->rings.filter_mac = 1;
+        work->rings.interface = nc;
+
+        list_add(&work->rings_by_interface, &nc->alternate_rings);
+        schedule_work(&work->work_item);
+}
+
+void nc2_handle_bypass_backend(struct netchannel2 *nc,
+                               struct netchannel2_ring_pair *ncrp,
+                               struct netchannel2_msg_hdr *hdr)
+{
+        struct nc2_alternate_ring *work;
+
+        if (hdr->size < sizeof(work->backend_setup_msg)) {
+                pr_debug("Bypass message had strange size %d\n", hdr->size);
+                return;
+        }
+        if (ncrp != &nc->rings) {
+                pr_debug("Bypass message on ancillary ring!\n");
+                return;
+        }
+        if (!nc->remote_trusted) {
+                pr_debug("Untrusted domain tried to set up a bypass.\n");
+                return;
+        }
+        work = kzalloc(sizeof(*work), GFP_ATOMIC);
+        if (!work) {
+                printk(KERN_WARNING "no memory for alternative ring pair!\n");
+                return;
+        }
+        nc2_copy_from_ring(&nc->rings.cons_ring, &work->backend_setup_msg,
+                           sizeof(work->backend_setup_msg));
+        if (hdr->size != sizeof(work->backend_setup_msg) +
+                                  sizeof(uint32_t) * 2 *
+                                  work->backend_setup_msg.common.ring_pages) {
+                printk(KERN_WARNING "inconsistent bypass message size (%d for %d pages)\n",
+                       hdr->size, work->backend_setup_msg.common.ring_pages);
+                goto err;
+        }
+        if (work->backend_setup_msg.common.ring_pages >
+            MAX_BYPASS_RING_PAGES_MAPPABLE) {
+                printk(KERN_WARNING "too many ring pages: %d > %d\n",
+                       work->backend_setup_msg.common.ring_pages,
+                       MAX_BYPASS_RING_PAGES_MAPPABLE);
+        err:
+                kfree(work);
+                return;
+        }
+        nc2_copy_from_ring_off(&ncrp->cons_ring,
+                               &work->prod_grefs,
+                               sizeof(uint32_t) *
+                                   work->backend_setup_msg.common.ring_pages,
+                               sizeof(work->backend_setup_msg));
+        nc2_copy_from_ring_off(&ncrp->cons_ring,
+                               &work->cons_grefs,
+                               sizeof(uint32_t) *
+                                   work->backend_setup_msg.common.ring_pages,
+                               sizeof(work->backend_setup_msg) +
+                                   sizeof(uint32_t) *
+                                   work->backend_setup_msg.common.ring_pages);
+
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+        INIT_LIST_HEAD(&work->autoteardown.autoteardown_list);
+#endif
+        work->state = nc2_alt_ring_backend_preparing;
+        init_waitqueue_head(&work->eventq);
+        work->handle = work->backend_setup_msg.common.handle;
+        INIT_WORK(&work->work_item, initialise_bypass_backend_work_item,
+                  work);
+        init_ring_pair(&work->rings);
+        work->rings.filter_mac = 1;
+        work->rings.interface = nc;
+
+        list_add(&work->rings_by_interface, &nc->alternate_rings);
+        schedule_work(&work->work_item);
+}
+
+/* Called under the nc master ring. */
+static struct nc2_alternate_ring *find_ring_by_handle(struct netchannel2 *nc,
+                                                      uint32_t handle)
+{
+        struct nc2_alternate_ring *nar;
+        list_for_each_entry(nar, &nc->alternate_rings, rings_by_interface) {
+                if (nar->handle == handle)
+                        return nar;
+        }
+        return NULL;
+}
+
+void nc2_handle_bypass_disable(struct netchannel2 *nc,
+                               struct netchannel2_ring_pair *ncrp,
+                               struct netchannel2_msg_hdr *hdr)
+{
+        struct netchannel2_msg_bypass_disable msg;
+        struct nc2_alternate_ring *nar;
+
+        if (ncrp != &nc->rings) {
+                pr_debug("Bypass disable on ancillary ring!\n");
+                return;
+        }
+        if (!nc->remote_trusted) {
+                pr_debug("Untrusted remote requested bypass disable.\n");
+                return;
+        }
+        if (hdr->size != sizeof(msg)) {
+                printk(KERN_WARNING "Strange size bypass disable message; %d != %zd.\n",
+                       hdr->size, sizeof(msg));
+                return;
+        }
+        nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg));
+        nar = find_ring_by_handle(nc, msg.handle);
+        if (nar == NULL) {
+                printk(KERN_WARNING "Request to disable unknown alternate ring %d.\n",
+                       msg.handle);
+                return;
+        }
+        nc2_aux_ring_start_disable_sequence(nar);
+}
+
+/* We've received a BYPASS_DETACH message on the master ring.  Do
+   what's needed to process it. */
+/* Called from the tasklet holding the master ring lock. */
+void nc2_handle_bypass_detach(struct netchannel2 *nc,
+                              struct netchannel2_ring_pair *ncrp,
+                              struct netchannel2_msg_hdr *hdr)
+{
+        struct netchannel2_msg_bypass_detach msg;
+        struct nc2_alternate_ring *nar;
+
+        if (ncrp != &nc->rings) {
+                pr_debug("Bypass detach on wrong ring.\n");
+                return;
+        }
+        if (!nc->remote_trusted) {
+                pr_debug("Detach request from untrusted peer.\n");
+                return;
+        }
+        if (hdr->size != sizeof(msg)) {
+                printk(KERN_WARNING "Strange size bypass detach message; %d != %zd.\n",
+                       hdr->size, sizeof(msg));
+                return;
+        }
+        nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg));
+        nar = find_ring_by_handle(nc, msg.handle);
+        if (nar == NULL) {
+                printk(KERN_WARNING "Request to detach from unknown alternate ring %d.\n",
+                       msg.handle);
+                return;
+        }
+
+        nc2_aux_ring_start_detach_sequence(nar);
+}
+
+/* This is only called once the irqs have been stopped and the
+   interfaces have been de-pended, so it shouldn't have to worry about
+   any async activity. */
+static void release_alt_ring(struct nc2_alternate_ring *nar)
+{
+        flush_scheduled_work();
+
+        nc2_unmap_grants(&nar->prod_mapper);
+        nc2_unmap_grants(&nar->cons_mapper);
+        nc2_unmap_grants(&nar->control_mapper);
+
+        cleanup_ring_pair(&nar->rings);
+}
+
+void nc2_release_alt_rings(struct netchannel2 *nc)
+{
+        struct nc2_alternate_ring *nar, *next_nar;
+
+        list_for_each_entry_safe(nar, next_nar, &nc->alternate_rings,
+                                 rings_by_interface) {
+                release_alt_ring(nar);
+        }
+}
+
+/* This is called from a suspend callback just before the VM goes down
+   for suspend/resume.  When it returns, we must have unmapped all
+   bypass rings.  There is no possibility of failing. */
+void detach_all_bypasses(struct netchannel2 *nc)
+{
+        struct nc2_alternate_ring *nar;
+
+        int cntr;
+
+        spin_lock_bh(&nc->rings.lock);
+        cntr = 0;
+        while (!list_empty(&nc->alternate_rings) && cntr < 500) {
+                list_for_each_entry(nar, &nc->alternate_rings,
+                                    rings_by_interface) {
+                        spin_lock(&nar->rings.lock);
+                        /* If we're currently in an operating state,
+                           pretend we received a DISABLE message, so
+                           we eventually generate a DISABLED message.
+                           The peer will then start the detach state
+                           machine, which will eventually destroy the
+                           bypass. */
+                        /* nc2_alt_ring_frontend_sent_ready is a bit
+                           odd.  We are frontend-like, and we've told
+                           the backend who we are, but we haven't yet
+                           received a READY from the backend.  We
+                           don't necessarily trust the backend, so we
+                           can't wait for it.  The best we can do is
+                           to tell the peer that we've disabled, and
+                           let it drive the backend into shutdown. */
+                        if (nar->state == nc2_alt_ring_frontend_sent_ready ||
+                            nar->state == nc2_alt_ring_ready) {
+                                nar->state = nc2_alt_ring_disabling;
+                                nc2_kick(&nar->rings);
+                        }
+                        spin_unlock(&nar->rings.lock);
+                }
+                spin_unlock_bh(&nc->rings.lock);
+                /* Bit of a hack... */
+                msleep(10);
+                cntr++;
+                spin_lock_bh(&nc->rings.lock);
+        }
+        spin_unlock_bh(&nc->rings.lock);
+
+        if (cntr < 500)
+                return;
+
+        /* Okay, doing it the nice way didn't work.  This can happen
+           if the domain at the other end of the bypass isn't picking
+           up messages, so we can't flush through all of our pending
+           packets and disable ourselves cleanly.  Force it through
+           instead, by pretending that we've received a DETACH message
+           from the parent. */
+        printk(KERN_WARNING "timed out trying to disable a bypass nicely, being more forceful\n");
+        spin_lock_bh(&nc->rings.lock);
+        cntr = 0;
+        while (!list_empty(&nc->alternate_rings)) {
+                list_for_each_entry(nar, &nc->alternate_rings,
+                                    rings_by_interface) {
+                        spin_lock(&nar->rings.lock);
+                        if (nar->state >= nc2_alt_ring_detaching) {
+                                /* Okay, we're already detaching, and
+                                   we're waiting either for our work
+                                   item to run or for an opportunity
+                                   to tell the parent that we're
+                                   detached.  The parent is trusted,
+                                   so just wait for whatever it is
+                                   that we're waiting for to
+                                   happen. */
+                                spin_unlock(&nar->rings.lock);
+                                continue;
+                        }
+                        nar->state = nc2_alt_ring_detaching;
+                        spin_unlock(&nar->rings.lock);
+                        INIT_WORK(&nar->detach_work_item,
+                                  start_detach_worker,
+                                  nar);
+                        schedule_work(&nar->detach_work_item);
+                }
+                spin_unlock_bh(&nc->rings.lock);
+                msleep(10);
+                cntr++;
+                if (cntr % 100 == 0)
+                        printk(KERN_WARNING "taking a long time to detach from bypasses (%d)\n", cntr);
+                spin_lock_bh(&nc->rings.lock);
+        }
+        spin_unlock_bh(&nc->rings.lock);
+}
diff --git a/drivers/net/xen-netchannel2/bypassee.c b/drivers/net/xen-netchannel2/bypassee.c
new file mode 100644
index 0000000..a166e56
--- /dev/null
@@ -0,0 +1,807 @@
+/* All the bits which allow a domain to be bypassed. */
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include "netchannel2_core.h"
+
+/* Bypass disable is a bit tricky.  Enable is relatively easy:
+
+   1) We decide to establish a bypass between two interfaces.
+   2) We allocate the pages for the rings and grant them to
+      the relevant domains.
+   3) We nominate one endpoint as the ``backend''.
+   4) We send both endpoints BYPASS messages.
+   5) As far as we're concerned, the bypass is now ready.  The
+      endpoints will do the rest of the negotiation without any help
+      from us.
+
+   Disable is harder.  Each bypass endpoint can be in one of three
+   states:
+
+   -- Running normally.
+   -- Disabled.
+   -- Detached.
+
+   A disabled endpoint won't generate any new operations (which means
+   that it can't send packets, but can send FINISHED_PACKET messages
+      and so forth).  A detached endpoint is one which no longer has
+      the ring pages mapped, so it can neither send nor receive.  There
+   is no provision for transitioning ``backwards'' i.e. from Disabled
+   to Running, Detached to Running, or Detached to Disabled.  There
+   are a couple of messages relevant to changing state:
+
+   -- DISABLE -- go to state Disabled if we're in Running.  Ignored in
+      other states (we won't even get an ACK).  We send this to the
+      endpoint.
+   -- DISABLED -- endpoint has transitioned to Disabled, whether of
+      its own accord or due to a DISABLE message.  We receive this
+      from the endpoint.
+   -- DETACH -- go to state Detached if we're in Running or Disabled.
+      Ignored in other states (without an ACK).  Sent to the endpoint.
+   -- DETACHED -- endpoint has transitioned to DETACHED.  Received
+      from the endpoint.
+
+   A bypass in which both endpoints are Detached can be safely
+   destroyed.
+
+   Once either endpoint has transitioned out of Running, the bypass is
+   pretty useless, so we try to push things so that we go to
+   Detached/Detached as quickly as possible.  In particular:
+
+   A state            B state              Action
+   Running            Disabled             Send A a DISABLE
+   Running            Detached             Send A a DETACH
+   Disabled           Disabled             Send both endpoints DETACH
+   Disabled           Detached             Send A a DETACH
+   Detached           Detached             Destroy the interface
+
+   (And the obvious mirror images)
+
+   There's some filtering so that we never send a given endpoint more
+   than one DISABLE message or more than one DETACH message.  If we
+   want to tear the bypass down from this end, we send both endpoints
+   DISABLE messages and let the state machine take things from
+   there.
+
+   The core state machine is implemented in
+   crank_bypass_state_machine().
+*/
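
A minimal stand-alone sketch of the action table above, with hypothetical types and names; the driver's real implementation is crank_bypass_state_machine() further down, which additionally suppresses duplicate messages via the need_disable/disable_sent and need_detach/detach_sent flags:

    /* Hypothetical sketch of the disable/detach decision table; each
       endpoint is summarised by its "disabled" and "detached" flags. */
    #include <stdio.h>

    struct ep {
            int disabled;
            int detached;
    };

    static void crank(const struct ep *a, const struct ep *b)
    {
            /* One end has stopped but the other is still Running:
               ask the running end to disable as well. */
            if (a->disabled != b->disabled)
                    printf("send DISABLE to the still-running endpoint\n");
            /* Both ends Disabled: the bypass is useless, push both on
               towards Detached. */
            if (a->disabled && b->disabled)
                    printf("send DETACH to both endpoints\n");
            /* One end already Detached: the other can no longer reach it. */
            if (a->detached != b->detached)
                    printf("send DETACH to the still-attached endpoint\n");
            /* Both Detached: nothing maps the ring pages any more. */
            if (a->detached && b->detached)
                    printf("destroy the bypass\n");
    }

    int main(void)
    {
            /* The "Disabled / Running" row of the table. */
            struct ep a = { .disabled = 1, .detached = 0 };
            struct ep b = { .disabled = 0, .detached = 0 };
            crank(&a, &b);
            return 0;
    }

Tearing a bypass down from this end then amounts to sending DISABLE to both endpoints and letting the table drive the pair to Detached/Detached.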
+
+/* A list of all currently-live nc2_bypass interfaces.  Only touched
+   from the worker thread. */
+static LIST_HEAD(all_bypasses);
+
+/* Bottom-half safe lock protecting pretty much all of the bypass
+   state, across all interfaces.  The pending_list_lock is sometimes
+   acquired while this is held.  It is acquired while holding the ring
+   lock. */
+static DEFINE_SPINLOCK(bypasses_lock);
+
+/* Encourage the endpoint to detach as soon as possible. */
+/* Called under the bypass lock. */
+static void schedule_detach(struct nc2_bypass_endpoint *ep)
+{
+        if (!ep->detached && !ep->need_detach && !ep->detach_sent) {
+                BUG_ON(ep->nc2 == NULL);
+                ep->need_detach = 1;
+                ep->nc2->need_advertise_bypasses = 1;
+                nc2_kick(&ep->nc2->rings);
+        }
+}
+
+/* Encourage the endpoint to disable as soon as possible. */
+/* Called under the bypass lock. */
+static void schedule_disable(struct nc2_bypass_endpoint *ep)
+{
+        if (!ep->disabled && !ep->need_disable && !ep->disable_sent) {
+                BUG_ON(ep->detached);
+                BUG_ON(ep->nc2 == NULL);
+                ep->need_disable = 1;
+                ep->nc2->need_advertise_bypasses = 1;
+                nc2_kick(&ep->nc2->rings);
+        }
+}
+
+static void grant_end(grant_ref_t *gref)
+{
+        if (*gref && gnttab_end_foreign_access_ref(*gref)) {
+                gnttab_free_grant_reference(*gref);
+                *gref = 0;
+        }
+}
+
+/* Release all resources associated with the bypass.  It is assumed
+   that the caller has ensured that nobody else is going to access it
+   any more. */
+static void release_bypass(struct nc2_bypass *bypass)
+{
+        int i;
+
+        BUG_ON(atomic_read(&bypass->refcnt) != 0);
+
+        for (i = 0; i < bypass->nr_ring_pages; i++) {
+                grant_end(&bypass->ep_a.incoming_grefs[i]);
+                grant_end(&bypass->ep_b.incoming_grefs[i]);
+                grant_end(&bypass->ep_a.outgoing_grefs[i]);
+                grant_end(&bypass->ep_b.outgoing_grefs[i]);
+                if (bypass->ep_a.incoming_pages[i] &&
+                    !bypass->ep_a.incoming_grefs[i] &&
+                    !bypass->ep_b.outgoing_grefs[i])
+                        free_page(bypass->ep_a.incoming_pages[i]);
+                if (bypass->ep_b.incoming_pages[i] &&
+                    !bypass->ep_b.incoming_grefs[i] &&
+                    !bypass->ep_a.outgoing_grefs[i])
+                        free_page(bypass->ep_b.incoming_pages[i]);
+        }
+        grant_end(&bypass->ep_a.control_gref);
+        grant_end(&bypass->ep_b.control_gref);
+        if (bypass->control_page &&
+            !bypass->ep_a.control_gref &&
+            !bypass->ep_b.control_gref)
+                free_page(bypass->control_page);
+
+        kfree(bypass);
+}
+
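+/* Drop a reference to @bypass, releasing it once the last reference
+   has gone. */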
+static void put_bypass(struct nc2_bypass *bypass)
+{
+        if (atomic_dec_and_test(&bypass->refcnt))
+                release_bypass(bypass);
+}
+
+/* The state of one of the bypass endpoints has changed.  Crank
+   through the state machine, scheduling any messages which are
+   needed.  Tear the bypass down if both ends have detached. */
+/* Called under the bypass lock. */
+static void crank_bypass_state_machine(struct nc2_bypass *bypass)
+{
+        if (bypass->ep_a.disabled != bypass->ep_b.disabled) {
+                schedule_disable(&bypass->ep_a);
+                schedule_disable(&bypass->ep_b);
+        }
+        if (bypass->ep_a.disabled && bypass->ep_b.disabled) {
+                schedule_detach(&bypass->ep_b);
+                schedule_detach(&bypass->ep_a);
+        }
+        if (bypass->ep_a.detached != bypass->ep_b.detached) {
+                schedule_detach(&bypass->ep_b);
+                schedule_detach(&bypass->ep_a);
+        }
+        if (bypass->ep_a.detached && bypass->ep_b.detached) {
+                /* Okay, neither endpoint knows about the bypass any
+                   more.  It is therefore dead. */
+                /* XXX: Should there be a concept of zombie bypasses?
+                 * i.e. keep the bypass around until userspace
+                 * explicitly reaps it, so as to avoid the usual ID
+                 * reuse races. */
+                list_del_init(&bypass->list);
+                wake_up_all(&bypass->detach_waitq);
+                put_bypass(bypass);
+        }
+}
+
+/* A bypass disabled message has been received on @ncrp (which should
+   be the main ring for @nc, or someone's misbehaving). */
+/* Called from the tasklet. */
+void nc2_handle_bypass_disabled(struct netchannel2 *nc,
+                                struct netchannel2_ring_pair *ncrp,
+                                struct netchannel2_msg_hdr *hdr)
+{
+        struct netchannel2_msg_bypass_disabled msg;
+        struct nc2_bypass *bypass;
+
+        if (hdr->size != sizeof(msg)) {
+                pr_debug("Strange size bypass disabled message; %d != %zd.\n",
+                         hdr->size, sizeof(msg));
+                return;
+        }
+        if (ncrp != &nc->rings) {
+                pr_debug("bypass_disabled on wrong ring.\n");
+                return;
+        }
+        nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg));
+        spin_lock(&bypasses_lock);
+        list_for_each_entry(bypass, &nc->bypasses_a, ep_a.list) {
+                if (bypass->handle == msg.handle) {
+                        bypass->ep_a.disabled = 1;
+                        crank_bypass_state_machine(bypass);
+                        spin_unlock(&bypasses_lock);
+                        return;
+                }
+        }
+        list_for_each_entry(bypass, &nc->bypasses_b, ep_b.list) {
+                if (bypass->handle == msg.handle) {
+                        bypass->ep_b.disabled = 1;
+                        crank_bypass_state_machine(bypass);
+                        spin_unlock(&bypasses_lock);
+                        return;
+                }
+        }
+        spin_unlock(&bypasses_lock);
+
+        pr_debug("Disabled message was on the wrong ring (%d)?\n",
+                 msg.handle);
+        return;
+}
+
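+/* Mark an endpoint as detached and remove it from its interface's
+   bypass list.  Called under the bypass lock. */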
+static void detach(struct nc2_bypass_endpoint *ep)
+{
+        if (ep->detached)
+                return;
+        list_del_init(&ep->list);
+        ep->disabled = ep->detached = 1;
+        ep->nc2->extant_bypasses--;
+        ep->nc2 = NULL;
+}
+
+/* One of our peers has sent us a bypass detached message, i.e. it was
+   previously bypassing us and it isn't any more.  Do the appropriate
+   thing. */
+void nc2_handle_bypass_detached(struct netchannel2 *nc,
+                                struct netchannel2_ring_pair *ncrp,
+                                struct netchannel2_msg_hdr *hdr)
+{
+        struct netchannel2_msg_bypass_detached msg;
+        struct nc2_bypass *bypass;
+
+        if (hdr->size != sizeof(msg)) {
+                pr_debug("Strange size bypass detached message; %d != %zd.\n",
+                         hdr->size, sizeof(msg));
+                return;
+        }
+        if (ncrp != &nc->rings) {
+                pr_debug("bypass_disabled on wrong ring.\n");
+                return;
+        }
+        nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg));
+        spin_lock(&bypasses_lock);
+        list_for_each_entry(bypass, &nc->bypasses_a, ep_a.list) {
+                if (bypass->handle == msg.handle) {
+                        detach(&bypass->ep_a);
+                        crank_bypass_state_machine(bypass);
+                        spin_unlock(&bypasses_lock);
+                        return;
+                }
+        }
+        list_for_each_entry(bypass, &nc->bypasses_b, ep_b.list) {
+                if (bypass->handle == msg.handle) {
+                        detach(&bypass->ep_b);
+                        crank_bypass_state_machine(bypass);
+                        spin_unlock(&bypasses_lock);
+                        return;
+                }
+        }
+        spin_unlock(&bypasses_lock);
+        pr_debug("Detached message was on the wrong ring (%d)?\n",
+                 msg.handle);
+}
+
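+/* Work item which drains the incoming bypass suggestion queue,
+   calling nb2_handle_suggested_bypass() for each queued MAC address.
+   The queue lock is dropped while each suggestion is processed. */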
+static void process_suggestion_queue_workitem(void *ctxt)
+{
+        struct netchannel2 *nc = ctxt;
+        struct nc2_incoming_bypass_suggestions *sugg =
+                &nc->incoming_bypass_suggestions;
+        unsigned ind;
+        unsigned char mac[ETH_ALEN];
+
+        spin_lock_bh(&sugg->lock);
+        while (sugg->tail != sugg->head) {
+                ind = sugg->tail % NC2_BYPASS_SUGG_QUEUE_SIZE;
+                memcpy(mac, sugg->queue[ind].mac, ETH_ALEN);
+                sugg->tail++;
+                spin_unlock_bh(&sugg->lock);
+
+                nb2_handle_suggested_bypass(nc, mac);
+
+                spin_lock_bh(&sugg->lock);
+        }
+        spin_unlock_bh(&sugg->lock);
+}
+
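+/* A suggest bypass message has been received on @nc's main ring.
+   Queue the suggested MAC address for the worker; the oldest entry
+   is dropped if the queue overflows. */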
+void nc2_handle_suggest_bypass(struct netchannel2 *nc,
+                               struct netchannel2_ring_pair *ncrp,
+                               struct netchannel2_msg_hdr *hdr)
+{
+        struct nc2_incoming_bypass_suggestions *sugg =
+                &nc->incoming_bypass_suggestions;
+        struct netchannel2_msg_suggest_bypass msg;
+        unsigned ind;
+
+        if (hdr->size != sizeof(msg)) {
+                pr_debug("strange size suggest bypass message; %d != %zd\n",
+                         hdr->size, sizeof(msg));
+                return;
+        }
+        if (ncrp != &nc->rings) {
+                pr_debug("suggest bypass on bypass ring?\n");
+                return;
+        }
+        nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg));
+
+        spin_lock(&sugg->lock);
+        ind = sugg->head % NC2_BYPASS_SUGG_QUEUE_SIZE;
+        /* Drop the oldest entry if we've overflowed the queue */
+        if (sugg->head == sugg->tail + NC2_BYPASS_SUGG_QUEUE_SIZE)
+                sugg->tail++;
+        memcpy(&sugg->queue[ind].mac, msg.mac, ETH_ALEN);
+        if (sugg->head == sugg->tail)
+                schedule_work(&sugg->workitem);
+        sugg->head++;
+        spin_unlock(&sugg->lock);
+}
+
+
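+/* Send a BYPASS_DISABLE message for @bypass on @nc's main ring.
+   Returns non-zero if there wasn't enough space on the ring, in
+   which case the caller should retry later. */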
+static int send_disable_bypass_msg(struct netchannel2 *nc,
+                                   struct nc2_bypass *bypass)
+{
+        struct netchannel2_msg_bypass_disable msg = {
+                .handle = bypass->handle
+        };
+
+        if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring, sizeof(msg)))
+                return 1;
+        nc2_send_message(&nc->rings.prod_ring, NETCHANNEL2_MSG_BYPASS_DISABLE,
+                         0, &msg, sizeof(msg));
+        nc->rings.pending_time_sensitive_messages = 1;
+        return 0;
+}
+
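+/* Send a BYPASS_DETACH message for @bypass on @nc's main ring.
+   Returns non-zero if there wasn't enough space on the ring, in
+   which case the caller should retry later. */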
+static int send_detach_bypass_msg(struct netchannel2 *nc,
+                                  struct nc2_bypass *bypass)
+{
+        struct netchannel2_msg_bypass_detach msg = {
+                .handle = bypass->handle
+        };
+
+        if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring, sizeof(msg)))
+                return 1;
+        nc2_send_message(&nc->rings.prod_ring, NETCHANNEL2_MSG_BYPASS_DETACH,
+                         0, &msg, sizeof(msg));
+        nc->rings.pending_time_sensitive_messages = 1;
+        return 0;
+}
+
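+/* Fill in the fields which are common to the BYPASS_FRONTEND and
+   BYPASS_BACKEND messages: the control grant, the ring geometry, and
+   the identity of the other endpoint. */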
+static void init_bypass_msg_common(struct netchannel2_msg_bypass_common *msg,
+                                   struct nc2_bypass_endpoint *this_ep,
+                                   struct netchannel2 *remote,
+                                   struct nc2_bypass *bypass)
+{
+        msg->control_gref = this_ep->control_gref;
+
+        /* XXX FIXME This may not always be domain 0 */
+        printk(KERN_WARNING "Assuming we're bypassing domain 0.\n");
+        msg->ring_domid = 0;
+        msg->ring_pages = bypass->nr_ring_pages;
+        msg->peer_domid = remote->rings.otherend_id;
+        msg->peer_trusted = remote->remote_trusted;
+        msg->handle = bypass->handle;
+        memcpy(msg->remote_mac, remote->rings.remote_mac, ETH_ALEN);
+}
+
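+/* Advertise the bypass to endpoint A with a BYPASS_FRONTEND message
+   carrying the grant references for its rings.  Returns non-zero if
+   the message couldn't be sent and needs to be retried. */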
+static int advertise_bypass_frontend(struct netchannel2 *nc,
+                                     struct nc2_bypass *bypass)
+{
+        struct netchannel2_msg_bypass_frontend msg;
+        unsigned msg_size;
+
+        BUG_ON(nc != bypass->ep_a.nc2);
+
+        msg_size = sizeof(msg) + bypass->nr_ring_pages * 2 * sizeof(uint32_t);
+        if (!nc->current_bypass_frontend &&
+            !nc2_can_send_payload_bytes(&nc->rings.prod_ring, msg_size))
+                return 1;
+
+        memset(&msg, 0, sizeof(msg));
+
+        init_bypass_msg_common(&msg.common, &bypass->ep_a, bypass->ep_b.nc2,
+                               bypass);
+
+        nc->current_bypass_frontend = bypass;
+
+        /* Send the message.  nc2_send_message doesn't support the
+           right kind of scatter gather, so do it by hand. */
+        __nc2_avoid_ring_wrap(&nc->rings.prod_ring, msg_size);
+        msg.hdr.type = NETCHANNEL2_MSG_BYPASS_FRONTEND;
+        msg.hdr.size = msg_size;
+        nc2_copy_to_ring(&nc->rings.prod_ring, &msg, sizeof(msg));
+        nc2_copy_to_ring_off(&nc->rings.prod_ring,
+                             bypass->ep_a.outgoing_grefs,
+                             sizeof(uint32_t) * bypass->nr_ring_pages,
+                             sizeof(msg));
+        nc2_copy_to_ring_off(&nc->rings.prod_ring,
+                             bypass->ep_a.incoming_grefs,
+                             sizeof(uint32_t) * bypass->nr_ring_pages,
+                             sizeof(msg) + sizeof(uint32_t) * bypass->nr_ring_pages);
+        nc->rings.prod_ring.prod_pvt += msg_size;
+        nc->rings.prod_ring.bytes_available -= msg_size;
+        nc->rings.pending_time_sensitive_messages = 1;
+        return 0;
+}
+
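+/* Advertise the bypass to endpoint B with a BYPASS_BACKEND message,
+   which also carries the event channel port allocated by the
+   frontend.  Returns non-zero if the message couldn't be sent and
+   needs to be retried. */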
+static int advertise_bypass_backend(struct netchannel2 *nc,
+                                     struct nc2_bypass *bypass)
+{
+        struct netchannel2_msg_bypass_backend msg;
+        unsigned msg_size;
+
+        BUG_ON(nc != bypass->ep_b.nc2);
+
+        msg_size = sizeof(msg) + bypass->nr_ring_pages * 2 * sizeof(uint32_t);
+        if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring, msg_size))
+                return 1;
+
+        memset(&msg, 0, sizeof(msg));
+
+        init_bypass_msg_common(&msg.common, &bypass->ep_b, bypass->ep_a.nc2,
+                               bypass);
+
+        BUG_ON(bypass->evtchn_port == 0);
+        msg.port = bypass->evtchn_port;
+        msg.hdr.type = NETCHANNEL2_MSG_BYPASS_BACKEND;
+        msg.hdr.size = msg_size;
+        nc2_copy_to_ring(&nc->rings.prod_ring, &msg, sizeof(msg));
+        nc2_copy_to_ring_off(&nc->rings.prod_ring,
+                             bypass->ep_b.outgoing_grefs,
+                             sizeof(uint32_t) * bypass->nr_ring_pages,
+                             sizeof(msg));
+        nc2_copy_to_ring_off(&nc->rings.prod_ring,
+                             bypass->ep_b.incoming_grefs,
+                             sizeof(uint32_t) * bypass->nr_ring_pages,
+                             sizeof(msg) + sizeof(uint32_t) * bypass->nr_ring_pages);
+        nc->rings.prod_ring.prod_pvt += msg_size;
+        nc->rings.prod_ring.bytes_available -= msg_size;
+        nc->rings.pending_time_sensitive_messages = 1;
+        return 0;
+}
+
+/* Called from the tasklet, holding the ring lock for nc and the
+   bypass lock. */
+static int advertise_bypass(struct netchannel2 *nc, struct nc2_bypass *bypass)
+{
+        if (nc == bypass->ep_a.nc2)
+                return advertise_bypass_frontend(nc, bypass);
+        else
+                return advertise_bypass_backend(nc, bypass);
+}
+
+/* Called from the tasklet holding the ring and bypass locks. */
+static int nc2_do_bypass_advertise_work(struct nc2_bypass_endpoint *ep,
+                                        struct netchannel2 *nc,
+                                        struct nc2_bypass *bypass)
+{
+        if (ep->need_advertise) {
+                if (advertise_bypass(nc, bypass))
+                        return 0;
+                ep->need_advertise = 0;
+        }
+        if (ep->need_disable) {
+                if (send_disable_bypass_msg(nc, bypass))
+                        return 0;
+                ep->need_disable = 0;
+                ep->disable_sent = 1;
+        }
+        if (ep->need_detach) {
+                if (send_detach_bypass_msg(nc, bypass))
+                        return 0;
+                ep->need_detach = 0;
+                ep->detach_sent = 1;
+        }
+        return 1;
+}
+
+/* Called from the tasklet holding the ring lock. */
+void _nc2_advertise_bypasses(struct netchannel2 *nc)
+{
+        struct nc2_bypass *bypass;
+        int success;
+
+        spin_lock(&bypasses_lock);
+        success = 1;
+        list_for_each_entry(bypass, &nc->bypasses_a, ep_a.list) {
+                success &= nc2_do_bypass_advertise_work(&bypass->ep_a,
+                                                        nc,
+                                                        bypass);
+        }
+        list_for_each_entry(bypass, &nc->bypasses_b, ep_b.list) {
+                success &= nc2_do_bypass_advertise_work(&bypass->ep_b,
+                                                        nc,
+                                                        bypass);
+        }
+        if (success)
+                nc->need_advertise_bypasses = 0;
+        spin_unlock(&bypasses_lock);
+}
+
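+/* The frontend has mapped the bypass rings and allocated an event
+   channel.  Pass the port on to the backend, or tear the bypass down
+   if the frontend reported an error. */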
+void nc2_handle_bypass_frontend_ready(struct netchannel2 *nc,
+                                      struct netchannel2_ring_pair *ncrp,
+                                      struct netchannel2_msg_hdr *hdr)
+{
+        struct netchannel2_msg_bypass_frontend_ready msg;
+        struct nc2_bypass *bypass;
+
+        if (hdr->size != sizeof(msg) || ncrp != &nc->rings ||
+            !nc->current_bypass_frontend)
+                return;
+        bypass = nc->current_bypass_frontend;
+        nc->current_bypass_frontend = NULL;
+        nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg));
+        spin_lock(&bypasses_lock);
+        if (msg.port <= 0) {
+                printk(KERN_WARNING "%d from frontend trying to establish bypass\n",
+                       msg.port);
+                detach(&bypass->ep_a);
+                detach(&bypass->ep_b);
+                crank_bypass_state_machine(bypass);
+                spin_unlock(&bypasses_lock);
+                return;
+        }
+
+        bypass->evtchn_port = msg.port;
+        bypass->ep_b.need_advertise = 1;
+        bypass->ep_b.nc2->need_advertise_bypasses = 1;
+        nc2_kick(&bypass->ep_b.nc2->rings);
+        spin_unlock(&bypasses_lock);
+}
+
+/* Called from an ioctl not holding any locks. */
+static int build_bypass_page(int *gref_pool,
+                             int *grefp_a,
+                             int *grefp_b,
+                             domid_t domid_a,
+                             domid_t domid_b,
+                             unsigned long *pagep)
+{
+        int gref_a, gref_b;
+        unsigned long page;
+
+        page = get_zeroed_page(GFP_ATOMIC);
+        if (page == 0)
+                return -ENOMEM;
+        gref_a = gnttab_claim_grant_reference(gref_pool);
+        gref_b = gnttab_claim_grant_reference(gref_pool);
+        BUG_ON(gref_a < 0);
+        BUG_ON(gref_b < 0);
+        gnttab_grant_foreign_access_ref(gref_a, domid_a, virt_to_mfn(page), 0);
+        gnttab_grant_foreign_access_ref(gref_b, domid_b, virt_to_mfn(page), 0);
+
+        *pagep = page;
+        *grefp_a = gref_a;
+        *grefp_b = gref_b;
+        return 0;
+}
+
+/* Called from an ioctl or work queue item not holding any locks. */
+int nc2_establish_bypass(struct netchannel2 *a, struct netchannel2 *b)
+{
+        struct nc2_bypass *work;
+        struct nc2_bypass *other_bypass;
+        int err;
+        grant_ref_t gref_pool;
+        int i;
+        static atomic_t next_handle;
+        int handle;
+        unsigned nr_pages;
+
+        /* Can't establish a bypass unless we're trusted by both of
+           the remote endpoints. */
+        if (!a->local_trusted || !b->local_trusted)
+                return -EPERM;
+
+        /* Can't establish a bypass unless it's allowed by both
+         * endpoints. */
+        if (!a->bypass_max_pages || !b->bypass_max_pages)
+                return -EOPNOTSUPP;
+
+        if (a->extant_bypasses >= a->max_bypasses ||
+            b->extant_bypasses >= b->max_bypasses)
+                return -EMFILE;
+
+        nr_pages = a->bypass_max_pages;
+        if (nr_pages > b->bypass_max_pages)
+                nr_pages = b->bypass_max_pages;
+        if (nr_pages > MAX_BYPASS_RING_PAGES_GRANTABLE)
+                nr_pages = MAX_BYPASS_RING_PAGES_GRANTABLE;
+        if (nr_pages == 0) {
+                printk(KERN_WARNING "tried to establish a null bypass ring?\n");
+                return -EINVAL;
+        }
+
+        work = kzalloc(sizeof(*work), GFP_ATOMIC);
+        if (!work)
+                return -ENOMEM;
+        atomic_set(&work->refcnt, 1);
+        init_waitqueue_head(&work->detach_waitq);
+
+        work->nr_ring_pages = nr_pages;
+
+        work->ep_a.nc2 = a;
+        work->ep_b.nc2 = b;
+
+        work->ep_a.need_advertise = 1;
+
+        handle = atomic_inc_return(&next_handle);
+        work->handle = handle;
+
+        /* XXX For now, the rings are allocated out of dom0 memory.  A
+           real implementation will need to get balancing transfers
+           from the other domains, because this approach leaks if you
+           have uncooperative domUs. */
+
+        err = gnttab_alloc_grant_references(work->nr_ring_pages * 4 + 2,
+                                            &gref_pool);
+        if (err < 0)
+                goto err;
+
+        err = -ENOMEM;
+        for (i = 0; i < work->nr_ring_pages; i++) {
+                err = build_bypass_page(&gref_pool,
+                                        &work->ep_a.incoming_grefs[i],
+                                        &work->ep_b.outgoing_grefs[i],
+                                        a->rings.otherend_id,
+                                        b->rings.otherend_id,
+                                        &work->ep_a.incoming_pages[i]);
+                if (err < 0)
+                        goto err;
+                err = build_bypass_page(&gref_pool,
+                                        &work->ep_b.incoming_grefs[i],
+                                        &work->ep_a.outgoing_grefs[i],
+                                        b->rings.otherend_id,
+                                        a->rings.otherend_id,
+                                        &work->ep_b.incoming_pages[i]);
+                if (err < 0)
+                        goto err;
+        }
+        err = build_bypass_page(&gref_pool,
+                                &work->ep_a.control_gref,
+                                &work->ep_b.control_gref,
+                                a->rings.otherend_id,
+                                b->rings.otherend_id,
+                                &work->control_page);
+        if (err < 0)
+                goto err;
+
+        spin_lock_bh(&bypasses_lock);
+
+        if (work->ep_a.nc2->current_bypass_frontend) {
+                /* We can't establish another bypass until this one
+                   has finished (which might be forever, if the remote
+                   domain is misbehaving, but that's not a
+                   problem). */
+                err = -EBUSY;
+                spin_unlock_bh(&bypasses_lock);
+                goto err;
+        }
+
+        /* Don't allow redundant bypasses, because they'll never be used.
+           This doesn't actually matter all that much, because in order
+           to establish a redundant bypass, either:
+
+           -- The user explicitly requested one, in which case they
+              get what they deserve, or
+           -- They're using the autobypasser, in which case it'll detect
+              that the bypass isn't being used within a few seconds
+              and tear it down.
+
+           Still, it's better to avoid it (if only so the user gets a
+           sensible error message), and so we do a quick check here.
+        */
+        list_for_each_entry(other_bypass, &a->bypasses_a, ep_a.list) {
+                BUG_ON(other_bypass->ep_a.nc2 != a);
+                if (other_bypass->ep_b.nc2 == b) {
+                        err = -EEXIST;
+                        spin_unlock_bh(&bypasses_lock);
+                        goto err;
+                }
+        }
+        list_for_each_entry(other_bypass, &a->bypasses_b, ep_b.list) {
+                BUG_ON(other_bypass->ep_b.nc2 != a);
+                if (other_bypass->ep_a.nc2 == b) {
+                        err = -EEXIST;
+                        spin_unlock_bh(&bypasses_lock);
+                        goto err;
+                }
+        }
+
+        list_add(&work->ep_a.list, &a->bypasses_a);
+        INIT_LIST_HEAD(&work->ep_b.list);
+        a->need_advertise_bypasses = 1;
+        list_add(&work->ep_b.list, &b->bypasses_b);
+        list_add_tail(&work->list, &all_bypasses);
+
+        a->extant_bypasses++;
+        b->extant_bypasses++;
+
+        spin_unlock_bh(&bypasses_lock);
+
+        nc2_kick(&a->rings);
+
+        return handle;
+
+err:
+        gnttab_free_grant_references(gref_pool);
+        put_bypass(work);
+        return err;
+}
+
+/* Called from an ioctl holding the bypass lock. */
+static struct nc2_bypass *get_bypass(uint32_t handle)
+{
+        struct nc2_bypass *bypass;
+
+        list_for_each_entry(bypass, &all_bypasses, list) {
+                if (bypass->handle == handle) {
+                        atomic_inc(&bypass->refcnt);
+                        return bypass;
+                }
+        }
+        return NULL;
+}
+
+static int bypass_fully_detached(struct nc2_bypass *bypass)
+{
+        int res;
+        spin_lock_bh(&bypasses_lock);
+        res = bypass->ep_a.detached && bypass->ep_b.detached;
+        spin_unlock_bh(&bypasses_lock);
+        return res;
+}
+
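+/* Tear down the bypass identified by @handle: ask both endpoints to
+   disable and wait briefly for them to detach.  Called from an ioctl
+   not holding any locks. */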
+int nc2_destroy_bypass(int handle)
+{
+        struct nc2_bypass *bypass;
+        int r;
+
+        spin_lock_bh(&bypasses_lock);
+        bypass = get_bypass(handle);
+        if (bypass == NULL) {
+                spin_unlock_bh(&bypasses_lock);
+                return -ESRCH;
+        }
+        schedule_disable(&bypass->ep_a);
+        schedule_disable(&bypass->ep_b);
+        spin_unlock_bh(&bypasses_lock);
+
+        r = wait_event_interruptible_timeout(bypass->detach_waitq,
+                                             bypass_fully_detached(bypass),
+                                             5 * HZ);
+        put_bypass(bypass);
+        if (r < 0) {
+                printk(KERN_WARNING "Failed to destroy a bypass (%d).\n",
+                       r);
+        }
+        return r;
+}
+
+/* We're guaranteed to be the only thing accessing @nc at this point,
+   but we don't know what's happening to the other endpoints of any
+   bypasses which it might have attached. */
+void release_bypasses(struct netchannel2 *nc)
+{
+        struct nc2_bypass *bypass, *next_bypass;
+
+        spin_lock(&bypasses_lock);
+        list_for_each_entry_safe(bypass, next_bypass, &nc->bypasses_a,
+                                 ep_a.list) {
+                detach(&bypass->ep_a);
+                crank_bypass_state_machine(bypass);
+        }
+        list_for_each_entry_safe(bypass, next_bypass, &nc->bypasses_b,
+                                 ep_b.list) {
+                detach(&bypass->ep_b);
+                crank_bypass_state_machine(bypass);
+        }
+        spin_unlock(&bypasses_lock);
+
+        BUG_ON(!list_empty(&nc->bypasses_a));
+        BUG_ON(!list_empty(&nc->bypasses_b));
+
+        flush_scheduled_work();
+}
+
+void nc2_init_incoming_bypass_suggestions(
+        struct netchannel2 *nc2,
+        struct nc2_incoming_bypass_suggestions *nibs)
+{
+        spin_lock_init(&nibs->lock);
+        INIT_WORK(&nibs->workitem, process_suggestion_queue_workitem, nc2);
+}
diff --git a/drivers/net/xen-netchannel2/chan.c b/drivers/net/xen-netchannel2/chan.c
new file mode 100644 (file)
index 0000000..90fca1e
--- /dev/null
@@ -0,0 +1,1117 @@
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/gfp.h>
+#include <linux/etherdevice.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/version.h>
+#ifdef CONFIG_PARAVIRT
+#include <xen/interface/xen.h>
+#include <xen/events.h>
+#else
+#include <xen/evtchn.h>
+#endif
+#include <xen/xenbus.h>
+
+#include "netchannel2_endpoint.h"
+#include "netchannel2_core.h"
+
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+#include "vmq.h"
+#endif
+
+
+static void nc2_action(unsigned long ignore);
+static DECLARE_TASKLET(nc2_worker_tasklet, nc2_action, 0);
+/* A list of all ring_pairs which have pending incoming messages.  You
+   must disable_irq() when adding an interface to this list, and
+   enable_irq() when removing it. */
+static LIST_HEAD(pending_interfaces);
+/* Nests inside the per-ring locks. */
+static DEFINE_SPINLOCK(pending_interfaces_lock);
+
+/* Likewise */
+struct hypercall_batcher pending_rx_hypercalls;
+
+static void nc2_disable_irq_nosync(struct netchannel2_ring_pair *ncrp)
+{
+        disable_irq_nosync(ncrp->irq);
+#ifdef DEBUG
+        ncrp->irq_disable_count++;
+#endif
+}
+
+static void nc2_disable_irq(struct netchannel2_ring_pair *ncrp)
+{
+        disable_irq(ncrp->irq);
+#ifdef DEBUG
+        ncrp->irq_disable_count++;
+#endif
+}
+
+static void nc2_enable_irq(struct netchannel2_ring_pair *ncrp)
+{
+        enable_irq(ncrp->irq);
+#ifdef DEBUG
+        ncrp->irq_disable_count--;
+#endif
+}
+
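+/* Interrupt handler.  Record that an event has arrived and, if there
+   is work to do, put the ring on the pending list so that the
+   tasklet processes it. */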
+irqreturn_t nc2_int(int irq, void *dev_id, struct pt_regs *ptregs)
+{
+        struct netchannel2_ring_pair *ncr = dev_id;
+
+        if (ncr->irq == -1)
+                return IRQ_HANDLED;
+        ncr->last_event = jiffies;
+        if (ncr->cons_ring.sring->prod != ncr->cons_ring.cons_pvt ||
+           ncr->interface->is_stopped) {
+                spin_lock(&pending_interfaces_lock);
+                if (!ncr->is_pending) {
+                        list_add_tail(&ncr->pending_interfaces,
+                                      &pending_interfaces);
+                        ncr->is_pending = 1;
+                        nc2_disable_irq_nosync(ncr);
+                        tasklet_schedule(&nc2_worker_tasklet);
+                }
+                spin_unlock(&pending_interfaces_lock);
+        }
+        return IRQ_HANDLED;
+}
+
+/* Process all incoming messages.  The ring is not on the pending
+   list.  The function is given an IRQ-disabled reference for the
+   interface, and must dispose of it (either by enabling the IRQ or
+   re-introducing it to the pending list).  Alternatively, the
+   function can stop the ring being processed again by leaking the
+   reference (e.g. when the remote endpoint is misbehaving). */
+static void process_messages(struct netchannel2_ring_pair *ncrp)
+{
+        unsigned ring_bytes_consumed;
+        struct netchannel2_msg_hdr hdr;
+        RING_IDX prod;
+        unsigned long flags;
+        struct netchannel2 *nc = ncrp->interface;
+
+        ENTER();
+
+        ring_bytes_consumed = 0;
+
+retry:
+        prod = ncrp->cons_ring.sring->prod;
+        rmb();
+        while (prod != ncrp->cons_ring.cons_pvt) {
+                /* Ask the rate limiter if we're allowed to process
+                   this message. */
+                if (!nc2_rate_limiter_debit(&ncrp->limiter, 1)) {
+                        /* Rate limiter said no.  The limiter will
+                           automatically re-add the ring to the
+                           pending list when we're allowed to access
+                           it again.  For now, just drop it on the
+                           floor. */
+                        DEBUGMSG("Hit the rate limiter on ring %p!\n",
+                                 ncrp);
+                        /* We're no longer on the pending list, and so
+                           we need to re-enable the IRQ.  However, the
+                           limiter stop() method will have done an
+                           additional disable_irq(), and so the IRQ
+                           won't really get re-enabled. */
+                        nc2_enable_irq(ncrp);
+                        return;
+                }
+                nc2_copy_from_ring(&ncrp->cons_ring, &hdr, sizeof(hdr));
+                if (hdr.size < sizeof(hdr)) {
+                        printk(KERN_WARNING "Other end sent too-small message (%d)\n",
+                               hdr.size);
+                        EXIT();
+                        return;
+                }
+                if (hdr.size >
+                    ncrp->cons_ring.payload_bytes) {
+                        /* This one message is bigger than the whole
+                           ring -> other end is clearly misbehaving.
+                           We won't take any more messages from this
+                           ring. */
+                        printk(KERN_WARNING "Other end sent enormous message (%d > %zd)\n",
+                               hdr.size,
+                               ncrp->cons_ring.payload_bytes);
+                        EXIT();
+                        return;
+                }
+                if (ring_bytes_consumed + hdr.size >
+                    ncrp->cons_ring.payload_bytes) {
+                        /* If we consume this message, we'll
+                           have eaten more than a whole ring
+                           this time around.  That isn't
+                           always an error, but it's probably
+                           a good idea to get out and let some
+                           other interfaces do something. */
+                        DEBUGMSG("Ring %p overly busy", ncrp);
+                        spin_lock_irqsave(&pending_interfaces_lock, flags);
+                        if (!ncrp->is_pending) {
+                                list_add_tail(&ncrp->pending_interfaces,
+                                              &pending_interfaces);
+                                ncrp->is_pending = 1;
+                                /* We skip a disable_irq() here, and
+                                   effectively transfer the
+                                   IRQ-disabled reference which the
+                                   caller gave us back to the
+                                   pending_interfaces list. */
+                        }
+                        spin_unlock_irqrestore(&pending_interfaces_lock,
+                                               flags);
+                        EXIT();
+                        return;
+                }
+
+                DEBUGMSG("Message type %d.", hdr.type);
+                switch (hdr.type) {
+                case NETCHANNEL2_MSG_SET_MAX_PACKETS:
+                        nc2_handle_set_max_packets_msg(ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_PACKET:
+                        nc2_handle_packet_msg(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_FINISH_PACKET:
+                        nc2_handle_finish_packet_msg(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_SET_OFFLOAD:
+                        nc2_handle_set_offload(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_POST_BUFFER:
+                        nc2_handle_post_buffer(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_RETURN_POSTED_BUFFER:
+                        nc2_handle_return_posted_buffer(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_SET_NR_POSTED_BUFFERS:
+                        nc2_handle_set_nr_posted_buffers(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_BYPASS_FRONTEND:
+                        nc2_handle_bypass_frontend(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_BYPASS_BACKEND:
+                        nc2_handle_bypass_backend(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_BYPASS_FRONTEND_READY:
+                        nc2_handle_bypass_frontend_ready(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_BYPASS_DISABLE:
+                        nc2_handle_bypass_disable(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_BYPASS_DISABLED:
+                        nc2_handle_bypass_disabled(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_BYPASS_DETACH:
+                        nc2_handle_bypass_detach(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_BYPASS_DETACHED:
+                        nc2_handle_bypass_detached(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_BYPASS_READY:
+                        nc2_handle_bypass_ready(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_SUGGEST_BYPASS:
+                        nc2_handle_suggest_bypass(nc, ncrp, &hdr);
+                        break;
+                case NETCHANNEL2_MSG_PAD:
+                        break;
+                default:
+                        /* Drop bad messages.  We should arguably stop
+                           processing the ring at this point, because
+                           the ring is probably corrupt.  However, if
+                           it is corrupt then one of the other checks
+                           will hit soon enough, and doing it this way
+                           should make it a bit easier to add new
+                           message types in future. */
+                        pr_debug("Bad message type %d from peer!\n",
+                                 hdr.type);
+                        break;
+                }
+                hdr.size = (hdr.size + 7) & ~7;
+                if (hdr.size == 0) {
+                        printk(KERN_WARNING "Other end sent empty message?\n");
+                        EXIT();
+                        /* Leak the IRQ-disable reference: the other
+                           end is misbehaving, so there's no point
+                           taking more interrupts from it. */
+                        return;
+                }
+                ncrp->cons_ring.cons_pvt += hdr.size;
+                ring_bytes_consumed += hdr.size;
+        }
+
+        if (unlikely(prod != ncrp->cons_ring.sring->prod))
+                goto retry;
+
+        /* Dispose of our IRQ-disable reference. */
+        nc2_enable_irq(ncrp);
+
+        if (nc2_final_check_for_messages(&ncrp->cons_ring,
+                                         prod)) {
+                DEBUGMSG("more stuff added to ring %p while poll method running",
+                         ncrp);
+                nc->rx.nr_ring_race++;
+                /* More work to do still.  Add ourselves back on the
+                   tail of the ring. */
+                nc2_kick(ncrp);
+        }
+
+        EXIT();
+}
+
+/* Flush out all pending metadata messages on ring @ncrp, and then
+   update the ring pointers to indicate that we've done so.  Fire the
+   event channel if necessary. */
+static void flush_rings(struct netchannel2_ring_pair *ncrp)
+{
+        struct netchannel2 *nc = ncrp->interface;
+        int need_kick;
+
+        ENTER();
+
+        flush_hypercall_batcher(&pending_rx_hypercalls,
+                                nc2_rscb_on_gntcopy_fail);
+        send_finish_packet_messages(ncrp);
+        if (ncrp->need_advertise_max_packets)
+                advertise_max_packets(ncrp);
+
+        if (ncrp == &nc->rings) {
+                nc2_replenish_rx_buffers(nc);
+                nc2_return_pending_posted_buffers(nc);
+                if (nc->need_advertise_offloads)
+                        advertise_offloads(nc);
+                if (nc->need_advertise_tx_buffers)
+                        nc2_advertise_tx_buffers(nc);
+                nc2_advertise_bypasses(nc);
+                nc2_crank_aux_ring_state_machine(nc);
+                nc2_autobypass_make_suggestions(nc);
+        } else {
+                nc2_alternate_ring_disable_finish(ncrp);
+        }
+
+        need_kick = 0;
+        if (nc2_finish_messages(&ncrp->cons_ring)) {
+                need_kick = 1;
+                /* If we need an event on the consumer ring, we always
+                   need to notify the other end, even if we don't have
+                   any messages which would normally be considered
+                   urgent. */
+                ncrp->pending_time_sensitive_messages = 1;
+        }
+        if (nc2_flush_ring(&ncrp->prod_ring))
+                need_kick = 1;
+        if (need_kick ||
+            (ncrp->delayed_kick && ncrp->pending_time_sensitive_messages)) {
+                if (ncrp->pending_time_sensitive_messages) {
+                        notify_remote_via_irq(ncrp->irq);
+                        ncrp->delayed_kick = 0;
+                } else {
+                        ncrp->delayed_kick = 1;
+                }
+                ncrp->pending_time_sensitive_messages = 0;
+        }
+        EXIT();
+}
+
+/* Process incoming messages, and then flush outgoing metadata
+ * messages.  We also try to unjam the xmit queue if any of the
+ * incoming messages would give us permission to send more stuff. */
+/* This is given an IRQ-disable reference, and must dispose of it. */
+static void nc2_poll(struct netchannel2_ring_pair *ncrp)
+{
+        ENTER();
+
+        if (!ncrp->is_attached) {
+                DEBUGMSG("Poll when not attached?");
+                nc2_enable_irq(ncrp);
+                EXIT();
+                return;
+        }
+
+        process_messages(ncrp);
+
+        flush_rings(ncrp);
+
+        EXIT();
+}
+
+/* Like skb_queue_purge(), but use release_tx_packet() rather than
+   kfree_skb() */
+static void nc2_queue_purge(struct netchannel2_ring_pair *ncrp,
+                            struct sk_buff_head *queue)
+{
+        struct sk_buff *skb;
+
+        while (!skb_queue_empty(queue)) {
+                skb = skb_dequeue(queue);
+                release_tx_packet(ncrp, skb);
+        }
+}
+
+/* struct net_device stop() method. */
+/* XXX this needs more attention. */
+static int nc2_stop(struct net_device *nd)
+{
+        struct netchannel2 *nc = netdev_priv(nd);
+
+        spin_lock_bh(&nc->rings.lock);
+        nc->stats.tx_dropped += skb_queue_len(&nc->pending_skbs);
+        nc2_queue_purge(&nc->rings, &nc->pending_skbs);
+        spin_unlock_bh(&nc->rings.lock);
+
+        return 0;
+}
+
+/* Kick a netchannel2 interface so that the poll() method runs
+ * soon. */
+/* This has semi release-like semantics, so you can set flags
+   lock-free and be guaranteed that the poll() method will eventually
+   run and see the flag set, without doing any explicit locking. */
+void nc2_kick(struct netchannel2_ring_pair *ncrp)
+{
+        unsigned long flags;
+        ENTER();
+        /* We're putting the interface on the pending list -> must
+           disable the irq first. */
+        /* XXX could this be _nosync? */
+        nc2_disable_irq(ncrp);
+        spin_lock_irqsave(&pending_interfaces_lock, flags);
+        if (!ncrp->is_pending) {
+                DEBUGMSG("Need to add %p to pending list.", ncrp);
+                list_add_tail(&ncrp->pending_interfaces,
+                              &pending_interfaces);
+                ncrp->is_pending = 1;
+                tasklet_schedule(&nc2_worker_tasklet);
+        } else {
+                DEBUGMSG("%p already pending.", ncrp);
+                nc2_enable_irq(ncrp);
+        }
+        spin_unlock_irqrestore(&pending_interfaces_lock, flags);
+        EXIT();
+}
+
+/* Like nc2_kick(), but arrange that the ring is put on the front of
+   the pending list. */
+void nc2_kick_fast(struct netchannel2_ring_pair *ncrp)
+{
+        unsigned long flags;
+        ENTER();
+        nc2_disable_irq(ncrp);
+        spin_lock_irqsave(&pending_interfaces_lock, flags);
+        if (!ncrp->is_pending) {
+                DEBUGMSG("Need to add %p to pending list.", ncrp);
+                list_add(&ncrp->pending_interfaces,
+                         &pending_interfaces);
+                ncrp->is_pending = 1;
+                tasklet_schedule(&nc2_worker_tasklet);
+        } else {
+                DEBUGMSG("%p already pending.", ncrp);
+                /* Bump it to the head of the queue */
+                list_move(&ncrp->pending_interfaces,
+                          &pending_interfaces);
+                nc2_enable_irq(ncrp);
+        }
+        spin_unlock_irqrestore(&pending_interfaces_lock, flags);
+        EXIT();
+}
+
+/* struct net_device open method. */
+static int nc2_open(struct net_device *nd)
+{
+        struct netchannel2 *nc = netdev_priv(nd);
+
+        nc2_kick(&nc->rings);
+        return 0;
+}
+
+/* Read a MAC address from xenstore at @prefix/@node.  Call not
+ * holding locks.  Returns 0 on success or <0 on error. */
+static int read_mac_address(const char *prefix, const char *node,
+                            unsigned char *addr)
+{
+        int err;
+        unsigned mac[6];
+        int i;
+
+        err = xenbus_scanf(XBT_NIL, prefix, node,
+                           "%x:%x:%x:%x:%x:%x",
+                           &mac[0],
+                           &mac[1],
+                           &mac[2],
+                           &mac[3],
+                           &mac[4],
+                           &mac[5]);
+        if (err < 0)
+                return err;
+        if (err != 6)
+                return -EINVAL;
+        for (i = 0; i < 6; i++) {
+                if (mac[i] >= 0x100)
+                        return -EINVAL;
+                addr[i] = mac[i];
+        }
+        return 0;
+}
+
+/* Release resources associated with a ring pair.  It is assumed that
+   the ring pair has already been detached (which stops the IRQ and
+   rate limiter, and un-pends the ring). */
+void cleanup_ring_pair(struct netchannel2_ring_pair *ncrp)
+{
+        BUG_ON(ncrp->is_pending);
+        BUG_ON(ncrp->prod_ring.sring);
+        BUG_ON(ncrp->cons_ring.sring);
+        BUG_ON(!list_empty(&ncrp->waitq.task_list));
+
+        drop_pending_tx_packets(ncrp);
+        nc2_queue_purge(ncrp, &ncrp->release_on_flush_batcher);
+}
+
+/* Stop and start functions for the ring rate/asymmetry limiter.  This
+   is basically nc2_kick() split in half. */
+/* This takes an IRQ-disable reference which will be dropped by the
+   start() method later. */
+static void rate_limiter_stop_ring(void *ctxt)
+{
+        struct netchannel2_ring_pair *ncrp = ctxt;
+        unsigned long flags;
+
+        DEBUGMSG("rate limiter stopping ring");
+        nc2_disable_irq(ncrp);
+        spin_lock_irqsave(&pending_interfaces_lock, flags);
+        if (ncrp->is_pending) {
+                list_del(&ncrp->pending_interfaces);
+                ncrp->is_pending = 0;
+
+                /* If it's on the pending list, the interrupt will be
+                   disabled.  We need to turn the interrupt back on
+                   because we've removed the thing from the pending
+                   list, but it'll remain disabled because of the
+                   disable_irq() above. */
+#ifdef DEBUG
+                BUG_ON(ncrp->irq_disable_count <= 1);
+#endif
+                nc2_enable_irq(ncrp);
+        }
+        BUG_ON(ncrp->rlimit_disabled);
+        ncrp->rlimit_disabled = 1;
+        spin_unlock_irqrestore(&pending_interfaces_lock, flags);
+}
+static void rate_limiter_start_ring(void *ctxt)
+{
+        struct netchannel2_ring_pair *ncrp = ctxt;
+        unsigned long flags;
+
+        DEBUGMSG("rate limiter starting ring");
+        spin_lock_irqsave(&pending_interfaces_lock, flags);
+        if (ncrp->is_attached && !ncrp->is_pending) {
+                list_add_tail(&ncrp->pending_interfaces,
+                              &pending_interfaces);
+                ncrp->is_pending = 1;
+                tasklet_schedule(&nc2_worker_tasklet);
+
+                /* The IRQ disable reference is transferred to the
+                   pending list, so don't need to enable_irq()
+                   here. */
+        } else {
+                nc2_enable_irq(ncrp);
+        }
+        BUG_ON(!ncrp->rlimit_disabled);
+        ncrp->rlimit_disabled = 0;
+        spin_unlock_irqrestore(&pending_interfaces_lock, flags);
+}
+
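+/* Initialise the parts of a ring pair which don't depend on the
+   shared rings: the lock, the free list of TX packets, the pending
+   queues, the rate limiter, and the poller. */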
+void init_ring_pair(struct netchannel2_ring_pair *ncrp)
+{
+        unsigned x;
+
+        spin_lock_init(&ncrp->lock);
+        ncrp->irq = -1;
+
+        for (x = 0; x < NR_TX_PACKETS - 1; x++)
+                txp_set_next_free(ncrp->tx_packets + x, x + 1);
+        txp_set_next_free(ncrp->tx_packets + x, INVALID_TXP_INDEX);
+        ncrp->head_free_tx_packet = 0;
+
+        skb_queue_head_init(&ncrp->pending_tx_queue);
+        skb_queue_head_init(&ncrp->release_on_flush_batcher);
+
+        init_waitqueue_head(&ncrp->waitq);
+        nc2_init_rate_limiter(&ncrp->limiter,
+                              fls(50000/HZ),
+                              20000,
+                              1000,
+                              rate_limiter_stop_ring,
+                              rate_limiter_start_ring,
+                              ncrp);
+        nc2_init_poller(ncrp);
+}
+
+/* Create a new netchannel2 structure.  Call with no locks held.
+   Returns NULL on error.  The xenbus device must remain valid for as
+   long as the netchannel2 structure does.  The core does not take out
+   any kind of reference count on it, but will refer to it throughout
+   the returned netchannel2's life. */
+struct netchannel2 *nc2_new(struct xenbus_device *xd)
+{
+        struct net_device *netdev;
+        struct netchannel2 *nc;
+        int err;
+        int local_trusted;
+        int remote_trusted;
+        int filter_mac;
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+        int max_bypasses;
+#endif
+
+        if (!gnttab_subpage_grants_available()) {
+                printk(KERN_ERR "netchannel2 needs version 2 grant tables\n");
+                return NULL;
+        }
+
+        if (xenbus_scanf(XBT_NIL, xd->nodename, "local-trusted",
+                         "%d", &local_trusted) != 1) {
+                printk(KERN_WARNING "Can't tell whether local endpoint is trusted; assuming it is.\n");
+                local_trusted = 1;
+        }
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+        max_bypasses = 0;
+        if (local_trusted) {
+                if (xenbus_scanf(XBT_NIL, xd->nodename, "max-bypasses",
+                                 "%d", &max_bypasses) != 1) {
+                        printk(KERN_WARNING "Can't get maximum bypass count; assuming 0.\n");
+                        max_bypasses = 0;
+                }
+        }
+#endif
+
+        if (xenbus_scanf(XBT_NIL, xd->nodename, "remote-trusted",
+                         "%d", &remote_trusted) != 1) {
+                printk(KERN_WARNING "Can't tell whether local endpoint is trusted; assuming it isn't.\n");
+                remote_trusted = 0;
+        }
+
+        if (xenbus_scanf(XBT_NIL, xd->nodename, "filter-mac",
+                         "%d", &filter_mac) != 1) {
+                if (remote_trusted) {
+                        printk(KERN_WARNING "Can't tell whether to filter MAC addresses from remote domain; filtering off.\n");
+                        filter_mac = 0;
+                } else {
+                        printk(KERN_WARNING "Can't tell whether to filter MAC addresses from remote domain; filtering on.\n");
+                        filter_mac = 1;
+                }
+        }
+
+        netdev = alloc_etherdev(sizeof(*nc));
+        if (netdev == NULL)
+                return NULL;
+
+        nc = netdev_priv(netdev);
+        memset(nc, 0, sizeof(*nc));
+        nc->magic = NETCHANNEL2_MAGIC;
+        nc->net_device = netdev;
+        nc->xenbus_device = xd;
+
+        nc->remote_trusted = remote_trusted;
+        nc->local_trusted = local_trusted;
+        nc->rings.filter_mac = filter_mac;
+
+        /* Default to RX csum and LRO on. */
+        nc->use_rx_csum = 1;
+        nc->use_lro = 1;
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+        INIT_LIST_HEAD(&nc->bypasses_a);
+        INIT_LIST_HEAD(&nc->bypasses_b);
+        nc2_init_incoming_bypass_suggestions(nc,
+                                             &nc->incoming_bypass_suggestions);
+        nc->max_bypasses = max_bypasses;
+#endif
+#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT
+        INIT_LIST_HEAD(&nc->alternate_rings);
+#endif
+        skb_queue_head_init(&nc->pending_skbs);
+        init_ring_pair(&nc->rings);
+        nc->rings.interface = nc;
+        INIT_LIST_HEAD(&nc->rx_buffers);
+        INIT_LIST_HEAD(&nc->unused_rx_buffers);
+        INIT_LIST_HEAD(&nc->unposted_rx_buffers);
+        INIT_LIST_HEAD(&nc->avail_tx_buffers);
+        nc->nr_avail_tx_buffers = 0;
+        INIT_LIST_HEAD(&nc->unused_tx_buffer_slots);
+        INIT_LIST_HEAD(&nc->pending_tx_buffer_return);
+
+        if (local_trusted) {
+                if (init_receive_map_mode() < 0) {
+                        nc2_release(nc);
+                        return NULL;
+                }
+        }
+
+        netdev->open = nc2_open;
+        netdev->stop = nc2_stop;
+        netdev->hard_start_xmit = nc2_start_xmit;
+        netdev->get_stats = nc2_get_stats;
+        netdev->change_mtu = nc2_change_mtu;
+
+        /* We need to hold the ring lock in order to send messages
+           anyway, so there's no point in Linux doing additional
+           synchronisation. */
+        netdev->features = NETIF_F_LLTX;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
+        SET_MODULE_OWNER(netdev);
+#endif
+        SET_NETDEV_DEV(netdev, &xd->dev);
+        SET_ETHTOOL_OPS(netdev, &nc2_ethtool_ops);
+
+        err = read_mac_address(xd->nodename, "remote-mac",
+                               nc->rings.remote_mac);
+        if (err == 0)
+                err = read_mac_address(xd->nodename, "mac", netdev->dev_addr);
+        if (err == 0)
+                err = register_netdev(netdev);
+
+        if (err != 0) {
+                nc2_release(nc);
+                return NULL;
+        }
+
+        return nc;
+}
+
+/* Release a netchannel2 structure previously allocated with
+ * nc2_new().  Call with no locks held.  The rings will be
+ * automatically detached if necessary. */
+void nc2_release(struct netchannel2 *nc)
+{
+        netif_carrier_off(nc->net_device);
+
+        unregister_netdev(nc->net_device);
+
+        nc2_detach_rings(nc);
+
+        /* Unregistering the net device stops any netdev methods from
+           running, and detaching the rings unhooks us from the
+           pending list, so we're now the only thing accessing this
+           netchannel2 structure and we can tear it down with
+           impunity. */
+
+        nc2_release_alt_rings(nc);
+
+        cleanup_ring_pair(&nc->rings);
+
+        nc2_queue_purge(&nc->rings, &nc->pending_skbs);
+
+        /* Should have been released when we detached. */
+        BUG_ON(nc->rx_buffer_structs);
+
+        release_bypasses(nc);
+
+        unprepare_tx_buffers(nc);
+
+        free_netdev(nc->net_device);
+}
+
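+/* Attach the shared rings to @ncrp.  The caller is expected to hold
+   the ring lock and, as with nc2_attach_rings(), to set up an event
+   channel before using the interface. */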
+void _nc2_attach_rings(struct netchannel2_ring_pair *ncrp,
+                       struct netchannel2_sring_cons *cons_sring,
+                       const volatile void *cons_payload,
+                       size_t cons_size,
+                       struct netchannel2_sring_prod *prod_sring,
+                       void *prod_payload,
+                       size_t prod_size,
+                       domid_t otherend_id)
+{
+        BUG_ON(prod_sring == NULL);
+        BUG_ON(cons_sring == NULL);
+
+        ncrp->prod_ring.sring = prod_sring;
+        ncrp->prod_ring.payload_bytes = prod_size;
+        ncrp->prod_ring.prod_pvt = 0;
+        ncrp->prod_ring.payload = prod_payload;
+
+        ncrp->cons_ring.sring = cons_sring;
+        ncrp->cons_ring.payload_bytes = cons_size;
+        ncrp->cons_ring.sring->prod_event = ncrp->cons_ring.sring->prod + 1;
+        ncrp->cons_ring.cons_pvt = 0;
+        ncrp->cons_ring.payload = cons_payload;
+
+        ncrp->otherend_id = otherend_id;
+
+        ncrp->is_attached = 1;
+
+        ncrp->need_advertise_max_packets = 1;
+}
+
+/* Attach a netchannel2 structure to a ring pair.  The endpoint is
+   also expected to set up an event channel after calling this before
+   using the interface.  Returns 0 on success or <0 on error. */
+int nc2_attach_rings(struct netchannel2 *nc,
+                     struct netchannel2_sring_cons *cons_sring,
+                     const volatile void *cons_payload,
+                     size_t cons_size,
+                     struct netchannel2_sring_prod *prod_sring,
+                     void *prod_payload,
+                     size_t prod_size,
+                     domid_t otherend_id)
+{
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+        int feature_bypass;
+        int max_bypass_pages;
+
+        if (xenbus_scanf(XBT_NIL, nc->xenbus_device->otherend,
+                         "feature-bypass", "%d", &feature_bypass) < 0)
+                feature_bypass = 0;
+        if (feature_bypass) {
+                if (xenbus_scanf(XBT_NIL, nc->xenbus_device->otherend,
+                                 "feature-bypass-max-pages", "%d",
+                                 &max_bypass_pages) < 0) {
+                        printk(KERN_WARNING "other end claimed to support bypasses, but didn't expose max-pages?\n");
+                        /* Bypasses disabled for this ring. */
+                        nc->max_bypasses = 0;
+                } else {
+                        nc->bypass_max_pages = max_bypass_pages;
+                }
+        } else {
+                nc->max_bypasses = 0;
+        }
+#endif
+
+        spin_lock_bh(&nc->rings.lock);
+        _nc2_attach_rings(&nc->rings, cons_sring, cons_payload, cons_size,
+                          prod_sring, prod_payload, prod_size, otherend_id);
+
+        nc->need_advertise_offloads = 1;
+
+        spin_unlock_bh(&nc->rings.lock);
+
+        resume_receive_map_mode();
+
+        netif_carrier_on(nc->net_device);
+
+        /* Kick it to get it going. */
+        nc2_kick(&nc->rings);
+
+        return 0;
+}
+
+/* Detach from the rings.  This includes unmapping them, stopping the
+   interrupt, and disabling the rate limiter. */
+/* Careful: the netdev methods may still be running at this point. */
+/* This is not allowed to wait for the other end, because it might
+   have gone away (e.g. over suspend/resume). */
+static void nc2_detach_ring(struct netchannel2_ring_pair *ncrp)
+{
+        nc2_stop_polling(ncrp);
+
+        spin_lock_bh(&ncrp->lock);
+        ncrp->detach_pending = 1;
+        spin_unlock_bh(&ncrp->lock);
+        nc2_kick(ncrp);
+        wait_event(ncrp->waitq,
+                   ({
+                           int r;
+                           spin_lock_bh(&ncrp->lock);
+                           r = ncrp->detach_pending;
+                           spin_unlock_bh(&ncrp->lock);
+                           r;
+                   }) == 0);
+
+        /* _detach makes sure that the ring won't be touched again by
+           the tasklet, and in particular the limiter won't be used
+           again.  This is therefore a good time to clean up the
+           limiter. */
+        nc2_cleanup_rate_limiter(&ncrp->limiter);
+}
+
+/* Trivial wrapper around nc2_detach_ring().  Make the ring no longer
+   used. */
+/* Careful: the netdev can still be running. */
+void nc2_detach_rings(struct netchannel2 *nc)
+{
+        nc2_detach_ring(&nc->rings);
+
+        /* Okay, all async access to the ring is stopped.  Kill the
+           irqhandlers.  (It might be better to do this from the
+           _detach_ring() functions, but you're not allowed to
+           free_irq() from interrupt context, and tasklets are close
+           enough to cause problems). */
+
+        if (nc->rings.irq >= 0)
+                unbind_from_irqhandler(nc->rings.irq, &nc->rings);
+        nc->rings.irq = -1;
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT
+        {
+                struct nc2_alternate_ring *ncr;
+
+                list_for_each_entry(ncr, &nc->alternate_rings,
+                                    rings_by_interface) {
+                        if (ncr->rings.irq >= 0) {
+                                unbind_from_irqhandler(ncr->rings.irq,
+                                                       &ncr->rings);
+                                ncr->rings.irq = -1;
+                        }
+                }
+        }
+#endif
+
+        /* XXX De-pend the interfaces */
+
+        /* Disable all offloads */
+        nc->net_device->features &= ~NETIF_F_IP_CSUM;
+        nc->allow_tx_csum_offload = 0;
+}
+
+/* This is the worker thread bit of nc2_detach_rings. */
+static void _detach_rings(struct netchannel2_ring_pair *ncrp)
+{
+        unsigned long flags;
+
+        if (ncrp == &ncrp->interface->rings)
+                nc2_posted_buffer_rx_forget(ncrp->interface);
+
+        spin_lock_bh(&ncrp->lock);
+        /* We need to release all of the pending transmission packets,
+           because they're never going to complete now that we've lost
+           the ring. */
+        drop_pending_tx_packets(ncrp);
+
+        disable_irq(ncrp->irq);
+
+        BUG_ON(ncrp->nr_tx_packets_outstanding);
+        ncrp->max_tx_packets_outstanding = 0;
+
+        /* No way of sending pending finish messages now; drop
+         * them. */
+        ncrp->pending_finish.prod = 0;
+        ncrp->pending_finish.cons = 0;
+
+        ncrp->cons_ring.sring = NULL;
+        ncrp->prod_ring.sring = NULL;
+        ncrp->is_attached = 0;
+
+        spin_unlock_bh(&ncrp->lock);
+
+        /* Remove ourselves from the pending interfaces list.  If we
+           get re-attached, the reattacher will kick() and we'll be
+           fine.  If we don't then this avoids teardown races. */
+        spin_lock_irqsave(&pending_interfaces_lock, flags);
+        if (ncrp->is_pending)
+                list_del(&ncrp->pending_interfaces);
+        ncrp->is_pending = 0;
+        spin_unlock_irqrestore(&pending_interfaces_lock, flags);
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT
+        {
+                struct nc2_alternate_ring *nar;
+
+                /* Walk the alternate rings list and detach all of
+                   them as well.  This is recursive, but it's only
+                   ever going to recur one deep, so it's okay. */
+                /* Don't need to worry about synchronisation because
+                   the interface has been stopped. */
+                if (ncrp == &ncrp->interface->rings) {
+                        list_for_each_entry(nar,
+                                            &ncrp->interface->alternate_rings,
+                                            rings_by_interface)
+                                _detach_rings(&nar->rings);
+                }
+        }
+#endif
+
+        spin_lock_bh(&ncrp->lock);
+        ncrp->detach_pending = 0;
+        wake_up(&ncrp->waitq);
+        spin_unlock_bh(&ncrp->lock);
+}
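+
+/* Note on the detach handshake: nc2_detach_ring() sets detach_pending
+   and kicks the tasklet; once process_ring() has drained its work it
+   notices the flag and calls _detach_rings(), which tears the ring
+   state down, clears the flag and wakes the waiter in
+   nc2_detach_ring(). */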
+
+#if defined(CONFIG_XEN_NETDEV2_BACKEND)
+/* Connect to an event channel port in a remote domain.  Returns 0 on
+   success or <0 on error.  The port is automatically disconnected
+   when the channel is released or if the rings are detached.  This
+   should not be called if the port is already open. */
+int nc2_connect_evtchn(struct netchannel2 *nc, domid_t domid,
+                       int evtchn)
+{
+        int err;
+
+        BUG_ON(nc->rings.irq >= 0);
+
+        err = bind_interdomain_evtchn_to_irqhandler(domid,
+                                                    evtchn,
+                                                    nc2_int,
+                                                    IRQF_SAMPLE_RANDOM,
+                                                    "netchannel2",
+                                                    &nc->rings);
+        if (err >= 0) {
+                nc->rings.irq = err;
+                nc->rings.evtchn = irq_to_evtchn_port(err);
+                return 0;
+        } else {
+                return err;
+        }
+}
+#endif
+
+#if defined(CONFIG_XEN_NETDEV2_FRONTEND)
+/* Listen for incoming event channel connections from domain domid.
+   Similar semantics to nc2_connect_evtchn(). */
+#ifdef CONFIG_PARAVIRT
+int nc2_listen_evtchn(struct netchannel2 *nc, domid_t domid)
+{
+        int err;
+
+        BUG_ON(nc->rings.irq >= 0);
+        BUG_ON(nc->rings.evtchn > 0);
+
+        err = xen_alloc_evtchn(domid);
+        if (err < 0)
+                return err;
+        nc->rings.evtchn = err;
+        err = bind_evtchn_to_irqhandler(nc->rings.evtchn,
+                                        nc2_int, IRQF_SAMPLE_RANDOM,
+                                        "netchannel2", &nc->rings);
+        BUG_ON(err < 0);
+        nc->rings.irq = err;
+        return 0;
+}
+#else
+int nc2_listen_evtchn(struct netchannel2 *nc, domid_t domid)
+{
+        int err;
+
+        BUG_ON(nc->rings.irq >= 0);
+
+        err = bind_listening_port_to_irqhandler(domid,
+                                                nc2_int,
+                                                IRQF_SAMPLE_RANDOM,
+                                                "netchannel2",
+                                                &nc->rings);
+        if (err >= 0) {
+                nc->rings.irq = err;
+                nc->rings.evtchn = irq_to_evtchn_port(err);
+                return 0;
+        } else {
+                return err;
+        }
+}
+#endif
+#endif
+
+/* Find the local event channel port which was allocated by
+ * nc2_listen_evtchn() or nc2_connect_evtchn().  It is an error to
+ * call this when there is no event channel connected. */
+int nc2_get_evtchn_port(struct netchannel2 *nc)
+{
+        BUG_ON(nc->rings.irq < 0);
+        return nc->rings.evtchn;
+}
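+
+/* Roughly: nc2_connect_evtchn() is the backend half (it binds to a
+   port which the frontend has already allocated), nc2_listen_evtchn()
+   is the frontend half (it allocates a local port for the backend to
+   connect to), and nc2_get_evtchn_port() reports that local port so it
+   can be advertised in xenstore; the backend reads it back as
+   "event-channel" in attach_to_frontend(). */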
+
+/* XXX */
+void nc2_suspend(struct netchannel2 *nc)
+{
+        detach_all_bypasses(nc);
+        suspend_receive_map_mode();
+}
+
+/* @ncrp has been recently nc2_kick()ed.  Do all of the necessary
+   stuff. */
+static void process_ring(struct netchannel2_ring_pair *ncrp)
+{
+        struct netchannel2 *nc = ncrp->interface;
+        struct sk_buff *skb;
+
+        spin_lock(&ncrp->lock);
+
+        nc->tx.nr_tasklet_action++;
+
+        DEBUGMSG("Poll %p.", ncrp);
+        /* Pick up incoming messages. */
+        nc2_poll(ncrp);
+
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+        do_vmq_work(nc);
+#endif
+        /* Transmit pending packets. */
+        if (!skb_queue_empty(&ncrp->pending_tx_queue)) {
+                skb = __skb_dequeue(&ncrp->pending_tx_queue);
+                do {
+                        nc2_really_start_xmit(ncrp, skb);
+                        skb = __skb_dequeue(&ncrp->pending_tx_queue);
+                } while (skb != NULL);
+
+                /* If we've transmitted on the main ring then we may
+                   have made use of the hypercall batcher.  Flush it.
+                   This must happen before we flush the rings, since
+                   that's when the PACKET messages will be made
+                   visible to the other end. */
+                if (ncrp == &nc->rings)
+                        flush_hypercall_batcher(&nc->batcher,
+                                                nc2_posted_on_gntcopy_fail);
+
+                flush_rings(ncrp);
+
+                while ((skb = __skb_dequeue(&ncrp->release_on_flush_batcher))){
+                        release_tx_packet(ncrp, skb);
+                }
+        }
+
+        if (ncrp == &nc->rings && nc->is_stopped) {
+                /* If the other end has processed some messages, there
+                   may be space on the ring for a delayed send from
+                   earlier.  Process it now. */
+                while (1) {
+                        skb = skb_peek_tail(&nc->pending_skbs);
+                        if (!skb)
+                                break;
+                        if (prepare_xmit_allocate_resources(nc, skb) < 0) {
+                                /* Still stuck */
+                                break;
+                        }
+                        __skb_unlink(skb, &nc->pending_skbs);
+                        queue_packet_to_interface(skb, ncrp);
+                }
+                if (skb_queue_empty(&nc->pending_skbs)) {
+                        nc->is_stopped = 0;
+                        netif_wake_queue(nc->net_device);
+                }
+        }
+
+        spin_unlock(&ncrp->lock);
+
+        if (ncrp->detach_pending)
+                _detach_rings(ncrp);
+}
+
+static void nc2_action(unsigned long ignore)
+{
+        struct netchannel2_ring_pair *ncrp;
+        unsigned long flags;
+
+        ENTER();
+
+        spin_lock_irqsave(&pending_interfaces_lock, flags);
+        while (!list_empty(&pending_interfaces)) {
+                ncrp = list_entry(pending_interfaces.next,
+                                  struct netchannel2_ring_pair,
+                                  pending_interfaces);
+                list_del(&ncrp->pending_interfaces);
+                ncrp->is_pending = 0;
+                spin_unlock_irqrestore(&pending_interfaces_lock,
+                                       flags);
+
+                process_ring(ncrp);
+
+                spin_lock_irqsave(&pending_interfaces_lock, flags);
+        }
+        spin_unlock_irqrestore(&pending_interfaces_lock,
+                               flags);
+        receive_pending_skbs();
+        EXIT();
+}
diff --git a/drivers/net/xen-netchannel2/hvm_guest_dummy.c b/drivers/net/xen-netchannel2/hvm_guest_dummy.c
new file mode 100644 (file)
index 0000000..431c315
--- /dev/null
@@ -0,0 +1,31 @@
+/* A simple stub implementation of some of the various functions which
+   don't exist in HVM mode. */
+#include <linux/kernel.h>
+#include "netchannel2_core.h"
+
+/* Receiver map mode. */
+struct sk_buff *handle_receiver_map_packet(struct netchannel2 *nc,
+                                           struct netchannel2_msg_packet *msg,
+                                           struct netchannel2_msg_hdr *hdr,
+                                           unsigned nr_frags,
+                                           unsigned frags_off)
+{
+        return NULL;
+}
+
+int init_receive_map_mode(void)
+{
+        return 0;
+}
+
+void resume_receive_map_mode(void)
+{
+}
+
+void deinit_receive_map_mode(void)
+{
+}
+
+void suspend_receive_map_mode(void)
+{
+}
diff --git a/drivers/net/xen-netchannel2/limiter.c b/drivers/net/xen-netchannel2/limiter.c
new file mode 100644 (file)
index 0000000..e948eb0
--- /dev/null
@@ -0,0 +1,173 @@
+/* Support for rate and asymmetry limiters intended to prevent
+ * denial-of-service conditions. */
+#include <linux/kernel.h>
+#include "netchannel2_core.h"
+
+/* Set the timer to fire once the number of tokens available exceeds
+   restart_thresh.  Called under the lock. */
+static void set_restart_timer(struct nc2_rate_limiter *nrl)
+{
+        unsigned needed_tokens;
+        u64 needed_ticks;
+        needed_tokens = nrl->restart_thresh - nrl->cur_tokens;
+        needed_ticks = needed_tokens >> nrl->tokens_per_tick_ord;
+        mod_timer(&nrl->timer, nrl->last_fill_time + needed_ticks);
+}
+
+/* Add nr_tokens to the pot.  Restart if that takes us above the
+   threshold.  If a threshold is set and we didn't reach it, tweak the
+   timer. */
+static void _rate_limiter_credit(struct nc2_rate_limiter *nrl,
+                                 u64 nr_tokens)
+{
+        if (nrl->cur_tokens + nr_tokens > nrl->max_tokens)
+                nrl->cur_tokens = nrl->max_tokens;
+        else
+                nrl->cur_tokens += nr_tokens;
+        if (nrl->restart_thresh <= nrl->max_tokens) {
+                if (nrl->cur_tokens >= nrl->restart_thresh) {
+                        del_timer(&nrl->timer);
+                        nrl->restart_thresh = nrl->max_tokens + 1;
+                        nrl->start(nrl->ctxt);
+                } else {
+                        set_restart_timer(nrl);
+                }
+        }
+}
+
+/* Sample jiffies, and refill the bucket as appropriate. */
+static void _refill_nrl(struct nc2_rate_limiter *nrl)
+{
+        u64 now;
+        u64 elapsed;
+
+        now = get_jiffies_64();
+        elapsed = now - nrl->last_fill_time;
+        nrl->last_fill_time = now;
+        _rate_limiter_credit(nrl, elapsed << nrl->tokens_per_tick_ord);
+}
+
+/* The timer function.  Refill the bucket according to how much time
+   has passed, and restart if necessary. */
+static void restart_timer(unsigned long data)
+{
+        struct nc2_rate_limiter *nrl = (struct nc2_rate_limiter *)data;
+        spin_lock_bh(&nrl->lock);
+        _refill_nrl(nrl);
+        spin_unlock_bh(&nrl->lock);
+}
+
+/* Initialise a rate limiter.  (1 << tokens_per_tick_ord) tokens will
+   be added to the bucket every jiffy, up to a limit of max_tokens.
+   When we run out of tokens, stop() is called, and start() will be
+   called if there are subsequently sufficient tokens to satisfy some
+   request.  We try to arrange that start() won't be called until at
+   least fill_granularity_tokens tokens are available.
+
+   Call nc2_cleanup_rate_limiter() once you're finished with the
+   limiter.
+
+   It is guaranteed that any call to stop() will be followed by a call
+   to start(), unless nc2_cleanup_rate_limiter() is called before we
+   get around to it.  stop() and start() are both called under the
+   limiter lock, so must not call back into nc2_rate_limiter_credit()
+   or nc2_rate_limiter_debit().  stop() is only called from in
+   _debit().  start() can be called from either _credit(), _debit(),
+   or from a timer tasklet.  The two methods are always called with
+   bottom halves disabled.
+*/
+void nc2_init_rate_limiter(struct nc2_rate_limiter *nrl,
+                           unsigned tokens_per_tick_ord,
+                           unsigned max_tokens,
+                           unsigned fill_granularity_tokens,
+                           void (*stop)(void *ctxt),
+                           void (*start)(void *ctxt),
+                           void *ctxt)
+{
+        BUG_ON(tokens_per_tick_ord > 31);
+        BUG_ON(max_tokens >= (unsigned)-1);
+        BUG_ON(fill_granularity_tokens > max_tokens);
+
+        memset(nrl, 0, sizeof(*nrl));
+        nrl->tokens_per_tick_ord = tokens_per_tick_ord;
+        nrl->max_tokens = max_tokens;
+        nrl->fill_granularity_tokens = fill_granularity_tokens;
+        nrl->stop = stop;
+        nrl->start = start;
+        nrl->ctxt = ctxt;
+
+        spin_lock_init(&nrl->lock);
+        nrl->cur_tokens = max_tokens;
+        nrl->restart_thresh = nrl->max_tokens + 1;
+        nrl->last_fill_time = get_jiffies_64();
+
+        setup_timer(&nrl->timer, restart_timer, (unsigned long)nrl);
+}
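+
+/* Illustrative numbers only: with tokens_per_tick_ord == 4 the bucket
+   gains 1 << 4 == 16 tokens per jiffy, so at HZ == 250 it sustains
+   roughly 4000 tokens per second; max_tokens bounds the burst that can
+   accumulate while the consumer is idle. */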
+
+/* Clean up a rate limiter.  It is guaranteed that neither start() nor
+   stop() will be called after this returns.  The caller must ensure
+   that neither _credit() nor _debit() are called on the rate limiter
+   after this starts. */
+void nc2_cleanup_rate_limiter(struct nc2_rate_limiter *nrl)
+{
+        del_timer_sync(&nrl->timer);
+
+        /* There shouldn't be anyone using it now. */
+        BUG_ON(spin_is_locked(&nrl->lock));
+}
+
+/* Put some tokens in the bucket.  This is mostly used when you're
+   doing asymmetry limiting rather than rate limiting. */
+void nc2_rate_limiter_credit(struct nc2_rate_limiter *nrl,
+                             unsigned nr_tokens)
+{
+        spin_lock_bh(&nrl->lock);
+        _rate_limiter_credit(nrl, nr_tokens);
+        spin_unlock_bh(&nrl->lock);
+}
+
+/* Take some tokens out of the bucket.  If tokens are available, take
+   them and return 1.  Otherwise, call the stop() method and return
+   0. */
+int nc2_rate_limiter_debit(struct nc2_rate_limiter *nrl,
+                           unsigned nr_tokens)
+{
+        unsigned new_restart_thresh;
+
+        BUG_ON(nr_tokens > nrl->max_tokens);
+
+        spin_lock_bh(&nrl->lock);
+        if (nrl->cur_tokens >= nr_tokens) {
+                nrl->cur_tokens -= nr_tokens;
+                spin_unlock_bh(&nrl->lock);
+                return 1;
+        }
+
+        _refill_nrl(nrl);
+        if (nrl->cur_tokens >= nr_tokens) {
+                nrl->cur_tokens -= nr_tokens;
+                spin_unlock_bh(&nrl->lock);
+                return 1;
+        }
+
+        /* Okay, we really have hit the limiter. */
+        if (nrl->restart_thresh == nrl->max_tokens + 1)
+                nrl->stop(nrl->ctxt);
+
+        if (nr_tokens < nrl->fill_granularity_tokens)
+                new_restart_thresh = nrl->fill_granularity_tokens;
+        else
+                new_restart_thresh = nr_tokens;
+        if (nrl->restart_thresh > new_restart_thresh) {
+                nrl->restart_thresh = new_restart_thresh;
+                set_restart_timer(nrl);
+        } else {
+                /* If the restart thresh is already less than the new
+                   restart thresh, then (a) the timer's already
+                   running, and (b) someone can already make use of
+                   the lower token pool, so it wouldn't be a good idea
+                   to move it backwards.  Therefore, do nothing. */
+        }
+        spin_unlock_bh(&nrl->lock);
+        return 0;
+}
diff --git a/drivers/net/xen-netchannel2/netback2.c b/drivers/net/xen-netchannel2/netback2.c
new file mode 100644 (file)
index 0000000..5325400
--- /dev/null
@@ -0,0 +1,538 @@
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <xen/gnttab.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/netchannel2.h>
+
+#include "netchannel2_core.h"
+#include "netchannel2_endpoint.h"
+#include "netchannel2_uspace.h"
+
+static struct netchannel2 *device_to_nc2(struct device *dev);
+
+#include "sysfs.c"
+
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+#include "vmq.h"
+#define NR_TX_BUFS (VMQ_MAX_BUFFERS+256)
+#else
+#define NR_TX_BUFS 256
+#endif
+
+static atomic_t next_handle;
+/* A list of all currently-live netback2 interfaces. */
+static LIST_HEAD(all_netbacks);
+/* A lock to protect the above list. */
+static DEFINE_MUTEX(all_netbacks_lock);
+
+#define NETBACK2_MAGIC 0xb5e99485
+struct netback2 {
+        unsigned magic;
+        struct xenbus_device *xenbus_device;
+
+        int handle;
+        struct list_head list;
+
+        struct netchannel2 *chan;
+
+        struct grant_mapping b2f_mapping;
+        struct grant_mapping f2b_mapping;
+        struct grant_mapping control_mapping;
+
+        int attached;
+
+        struct xenbus_watch shutdown_watch;
+        int have_shutdown_watch;
+};
+
+static struct netback2 *xenbus_device_to_nb2(struct xenbus_device *xd)
+{
+        struct netback2 *nb = xd->dev.driver_data;
+        BUG_ON(nb->magic != NETBACK2_MAGIC);
+        return nb;
+}
+
+static struct netchannel2 *device_to_nc2(struct device *dev)
+{
+        return xenbus_device_to_nb2(to_xenbus_device(dev))->chan;
+}
+
+/* Read a range of grants out of xenstore and map them in gm.  Any
+   existing mapping in gm is released.  Returns 0 on success or <0 on
+   error.  On error, gm is preserved, and xenbus_dev_fatal() is
+   called. */
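+/* The xenstore layout consumed here looks roughly like this
+   (illustrative values; "b2f-ring" is one of the prefixes passed in by
+   attach_to_frontend()):
+
+        b2f-ring-nr-pages = "2"
+        b2f-ring-ref-0    = "17"
+        b2f-ring-ref-1    = "18"
+
+   A missing <prefix>-nr-pages node is treated as a single page. */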
+static int map_grants(struct netback2 *nd, const char *prefix,
+                      struct grant_mapping *gm)
+{
+        struct xenbus_device *xd = nd->xenbus_device;
+        int err;
+        char buf[32];
+        int i;
+        unsigned nr_pages;
+        grant_ref_t grefs[MAX_GRANT_MAP_PAGES];
+
+        sprintf(buf, "%s-nr-pages", prefix);
+        err = xenbus_scanf(XBT_NIL, xd->otherend, buf, "%u", &nr_pages);
+        if (err == -ENOENT) {
+                nr_pages = 1;
+        } else if (err != 1) {
+                if (err < 0) {
+                        xenbus_dev_fatal(xd, err, "reading %s", buf);
+                        return err;
+                } else {
+                        xenbus_dev_fatal(xd, err, "reading %s as integer",
+                                         buf);
+                        return -EINVAL;
+                }
+        }
+
+        /* Don't overflow grefs[] if the frontend lies about the page
+           count. */
+        if (nr_pages > MAX_GRANT_MAP_PAGES) {
+                xenbus_dev_fatal(xd, -EINVAL,
+                                 "too many ring pages for %s: %u > %d",
+                                 prefix, nr_pages, MAX_GRANT_MAP_PAGES);
+                return -EINVAL;
+        }
+
+        for (i = 0; i < nr_pages; i++) {
+                sprintf(buf, "%s-ref-%d", prefix, i);
+                err = xenbus_scanf(XBT_NIL, xd->otherend, buf, "%u",
+                                   &grefs[i]);
+                if (err != 1) {
+                        if (err < 0) {
+                                xenbus_dev_fatal(xd,
+                                                 err,
+                                                 "reading gref %d from %s/%s",
+                                                 i,
+                                                 xd->otherend,
+                                                 buf);
+                        } else {
+                                xenbus_dev_fatal(xd,
+                                                 -EINVAL,
+                                                 "expected an integer at %s/%s",
+                                                 xd->otherend,
+                                                 buf);
+                                err = -EINVAL;
+                        }
+                        return err;
+                }
+        }
+
+        err = nc2_map_grants(gm, grefs, nr_pages, xd->otherend_id);
+        if (err < 0)
+                xenbus_dev_fatal(xd, err, "mapping ring %s from %s",
+                                 prefix, xd->otherend);
+        return err;
+}
+
+/* Undo the effects of attach_to_frontend */
+static void detach_from_frontend(struct netback2 *nb)
+{
+        if (!nb->attached)
+                return;
+        nc2_detach_rings(nb->chan);
+        nc2_unmap_grants(&nb->b2f_mapping);
+        nc2_unmap_grants(&nb->f2b_mapping);
+        nc2_unmap_grants(&nb->control_mapping);
+        nb->attached = 0;
+}
+
+static int attach_to_frontend(struct netback2 *nd)
+{
+        int err;
+        int evtchn;
+        struct xenbus_device *xd = nd->xenbus_device;
+        struct netchannel2 *nc = nd->chan;
+        struct netchannel2_backend_shared *nbs;
+
+        if (nd->attached)
+                return 0;
+
+        /* Attach the shared memory bits */
+        err = map_grants(nd, "b2f-ring", &nd->b2f_mapping);
+        if (err)
+                return err;
+        err = map_grants(nd, "f2b-ring", &nd->f2b_mapping);
+        if (err)
+                return err;
+        err = map_grants(nd, "control", &nd->control_mapping);
+        if (err)
+                return err;
+        nbs = nd->control_mapping.mapping->addr;
+        err = nc2_attach_rings(nc,
+                               &nbs->cons,
+                               nd->f2b_mapping.mapping->addr,
+                               nd->f2b_mapping.nr_pages * PAGE_SIZE,
+                               &nbs->prod,
+                               nd->b2f_mapping.mapping->addr,
+                               nd->b2f_mapping.nr_pages * PAGE_SIZE,
+                               xd->otherend_id);
+        if (err < 0) {
+                xenbus_dev_fatal(xd, err, "attaching to rings");
+                return err;
+        }
+
+        /* Connect the event channel. */
+        err = xenbus_scanf(XBT_NIL, xd->otherend, "event-channel", "%u",
+                           &evtchn);
+        if (err < 0) {
+                xenbus_dev_fatal(xd, err,
+                                 "reading %s/event-channel",
+                                 xd->otherend);
+                return err;
+        }
+        err = nc2_connect_evtchn(nd->chan, xd->otherend_id, evtchn);
+        if (err < 0) {
+                xenbus_dev_fatal(xd, err, "binding to event channel");
+                return err;
+        }
+
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+        nc2_vmq_connect(nc);
+#endif
+
+        /* All done */
+        nd->attached = 1;
+
+        return 0;
+}
+
+static void nb2_shutdown(struct netchannel2 *nc)
+{
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+        nc2_vmq_disconnect(nc);
+#endif
+        nc2_set_nr_tx_buffers(nc, 0);
+}
+
+static void frontend_changed(struct xenbus_device *xd,
+                            enum xenbus_state frontend_state)
+{
+        struct netback2 *nb = xenbus_device_to_nb2(xd);
+        int err;
+
+        switch (frontend_state) {
+        case XenbusStateInitialising:
+                /* If the frontend does a kexec following a crash, we
+                   can end up bounced back here even though we're
+                   attached.  Try to recover by detaching from the old
+                   rings. */
+                /* (A normal shutdown, and even a normal kexec, would
+                 * have gone through Closed first, so we'll already be
+                 * detached, and this is pointless but harmless.) */
+                detach_from_frontend(nb);
+
+                nc2_set_nr_tx_buffers(nb->chan, NR_TX_BUFS);
+
+                /* Tell the frontend what sort of rings we're willing
+                   to accept. */
+                xenbus_printf(XBT_NIL, nb->xenbus_device->nodename,
+                              "max-sring-pages", "%d", MAX_GRANT_MAP_PAGES);
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT
+                xenbus_printf(XBT_NIL, nb->xenbus_device->nodename,
+                              "feature-bypass", "1");
+                xenbus_printf(XBT_NIL, nb->xenbus_device->nodename,
+                              "feature-bypass-max-pages", "%d",
+                              MAX_BYPASS_RING_PAGES_GRANTABLE);
+#endif
+
+                /* Start the device bring-up bit of the state
+                 * machine. */
+                xenbus_switch_state(nb->xenbus_device, XenbusStateInitWait);
+                break;
+
+        case XenbusStateInitWait:
+                /* Frontend doesn't use this state */
+                xenbus_dev_fatal(xd, -EINVAL,
+                                 "unexpected frontend state InitWait");
+                break;
+
+        case XenbusStateInitialised:
+        case XenbusStateConnected:
+                /* Frontend has advertised its rings to us */
+                err = attach_to_frontend(nb);
+                if (err >= 0)
+                        xenbus_switch_state(xd, XenbusStateConnected);
+                break;
+
+        case XenbusStateClosing:
+                nb2_shutdown(nb->chan);
+                detach_from_frontend(nb);
+                xenbus_switch_state(xd, XenbusStateClosed);
+                break;
+
+        case XenbusStateClosed:
+                detach_from_frontend(nb);
+                xenbus_switch_state(xd, XenbusStateClosed);
+                if (!xenbus_dev_is_online(xd))
+                        device_unregister(&xd->dev);
+                break;
+
+        case XenbusStateUnknown:
+                detach_from_frontend(nb);
+                xenbus_switch_state(xd, XenbusStateClosed);
+                device_unregister(&xd->dev);
+                break;
+
+        default:
+                /* Ignore transitions to unknown states */
+                break;
+        }
+}
+
+static int netback2_uevent(struct xenbus_device *xd, char **envp,
+                           int num_envp, char *buffer, int buffer_size)
+{
+        struct netback2 *nb = xenbus_device_to_nb2(xd);
+        int i;
+        int length;
+
+        i = 0;
+        length = 0;
+        add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
+                      "vif=%s", nb->chan->net_device->name);
+
+        envp[i] = NULL;
+
+        return 0;
+}
+
+static void netback2_shutdown(struct xenbus_device *xd)
+{
+        struct netback2 *nb = xenbus_device_to_nb2(xd);
+        nb2_shutdown(nb->chan);
+        xenbus_switch_state(xd, XenbusStateClosing);
+}
+
+static void shutdown_watch_callback(struct xenbus_watch *watch,
+                                    const char **vec,
+                                    unsigned int len)
+{
+        struct netback2 *nb =
+                container_of(watch, struct netback2, shutdown_watch);
+        char *type;
+
+        type = xenbus_read(XBT_NIL, nb->xenbus_device->nodename,
+                           "shutdown-request", NULL);
+        if (IS_ERR(type)) {
+                if (PTR_ERR(type) != -ENOENT)
+                        printk(KERN_WARNING "Cannot read %s/%s: %ld\n",
+                               nb->xenbus_device->nodename, "shutdown-request",
+                               PTR_ERR(type));
+                return;
+        }
+        if (strcmp(type, "force") == 0) {
+                detach_from_frontend(nb);
+                xenbus_switch_state(nb->xenbus_device, XenbusStateClosed);
+        } else if (strcmp(type, "normal") == 0) {
+                netback2_shutdown(nb->xenbus_device);
+        } else {
+                printk(KERN_WARNING "Unrecognised shutdown request %s from tools\n",
+                       type);
+        }
+        xenbus_rm(XBT_NIL, nb->xenbus_device->nodename, "shutdown-request");
+        kfree(type);
+}
+
+static ssize_t show_handle(struct device *dev, struct device_attribute *attr,
+                           char *buf)
+{
+        struct xenbus_device *xd = to_xenbus_device(dev);
+        struct netback2 *nb = xenbus_device_to_nb2(xd);
+        return sprintf(buf, "%u\n", nb->handle);
+}
+static DEVICE_ATTR(handle, S_IRUGO, show_handle, NULL);
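+
+/* The handle exposed in sysfs is the identifier which userspace passes
+   back via NETCHANNEL2_IOCTL_ESTABLISH_BYPASS when bypass support is
+   built in (see netchannel2_ioctl_establish_bypass() below). */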
+
+static int netback2_probe(struct xenbus_device *xd,
+                          const struct xenbus_device_id *id)
+{
+        struct netback2 *nb;
+
+        nb = kzalloc(sizeof(*nb), GFP_KERNEL);
+        if (nb == NULL)
+                goto err;
+        nb->magic = NETBACK2_MAGIC;
+        nb->xenbus_device = xd;
+
+        nb->shutdown_watch.node = kasprintf(GFP_KERNEL, "%s/shutdown-request",
+                                            xd->nodename);
+        if (nb->shutdown_watch.node == NULL)
+                goto err;
+        nb->shutdown_watch.callback = shutdown_watch_callback;
+        if (register_xenbus_watch(&nb->shutdown_watch))
+                goto err;
+        nb->have_shutdown_watch = 1;
+
+        nb->chan = nc2_new(xd);
+        if (!nb->chan)
+                goto err;
+
+        xd->dev.driver_data = nb;
+
+        nc2_sysfs_addif(xd);
+
+        nb->handle = atomic_inc_return(&next_handle);
+        mutex_lock(&all_netbacks_lock);
+        list_add(&nb->list, &all_netbacks);
+        mutex_unlock(&all_netbacks_lock);
+
+        device_create_file(&xd->dev, &dev_attr_handle);
+
+        kobject_uevent(&xd->dev.kobj, KOBJ_ONLINE);
+
+        return 0;
+
+err:
+        if (nb != NULL) {
+                if (nb->have_shutdown_watch)
+                        unregister_xenbus_watch(&nb->shutdown_watch);
+                kfree(nb->shutdown_watch.node);
+                kfree(nb);
+        }
+        xenbus_dev_fatal(xd, -ENOMEM, "probing netdev");
+        return -ENOMEM;
+}
+
+static int netback2_remove(struct xenbus_device *xd)
+{
+        struct netback2 *nb = xenbus_device_to_nb2(xd);
+        kobject_uevent(&xd->dev.kobj, KOBJ_OFFLINE);
+        nc2_sysfs_delif(xd);
+        mutex_lock(&all_netbacks_lock);
+        list_del(&nb->list);
+        mutex_unlock(&all_netbacks_lock);
+        if (nb->chan != NULL)
+                nc2_release(nb->chan);
+        if (nb->have_shutdown_watch)
+                unregister_xenbus_watch(&nb->shutdown_watch);
+        kfree(nb->shutdown_watch.node);
+        nc2_unmap_grants(&nb->b2f_mapping);
+        nc2_unmap_grants(&nb->f2b_mapping);
+        nc2_unmap_grants(&nb->control_mapping);
+        kfree(nb);
+        return 0;
+}
+
+static const struct xenbus_device_id netback2_ids[] = {
+        { "vif2" },
+        { "" }
+};
+
+static struct xenbus_driver netback2 = {
+        .name = "vif2",
+        .owner = THIS_MODULE,
+        .ids = netback2_ids,
+        .probe = netback2_probe,
+        .remove = netback2_remove,
+        .otherend_changed = frontend_changed,
+        .run_otherend_changed_during_shutdown = 1,
+        .shutdown = netback2_shutdown,
+        .uevent = netback2_uevent,
+};
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+static struct netback2 *find_netback_by_handle_locked(unsigned handle)
+{
+        struct netback2 *nb;
+
+        list_for_each_entry(nb, &all_netbacks, list) {
+                if (nb->handle == handle)
+                        return nb;
+        }
+        return NULL;
+}
+
+static struct netback2 *find_netback_by_remote_mac_locked(const char *mac)
+{
+        struct netback2 *nb;
+
+        list_for_each_entry(nb, &all_netbacks, list) {
+                if (!memcmp(nb->chan->rings.remote_mac, mac, ETH_ALEN))
+                        return nb;
+        }
+        return NULL;
+}
+
+static long netchannel2_ioctl_establish_bypass(struct netchannel2_ioctl_establish_bypass __user *argsp)
+{
+        struct netchannel2_ioctl_establish_bypass args;
+        struct netback2 *a, *b;
+        int res;
+
+        if (copy_from_user(&args, argsp, sizeof(args)))
+                return -EFAULT;
+
+        mutex_lock(&all_netbacks_lock);
+        a = find_netback_by_handle_locked(args.handle_a);
+        b = find_netback_by_handle_locked(args.handle_b);
+        if (a && b)
+                res = nc2_establish_bypass(a->chan, b->chan);
+        else
+                res = -EINVAL;
+        mutex_unlock(&all_netbacks_lock);
+
+        return res;
+}
+
+void nb2_handle_suggested_bypass(struct netchannel2 *a_chan, const char *mac_b)
+{
+        struct netback2 *b;
+        mutex_lock(&all_netbacks_lock);
+        b = find_netback_by_remote_mac_locked(mac_b);
+        if (b != NULL)
+                nc2_establish_bypass(a_chan, b->chan);
+        mutex_unlock(&all_netbacks_lock);
+}
+
+static long netchannel2_ioctl_destroy_bypass(struct netchannel2_ioctl_destroy_bypass __user *argsp)
+{
+        struct netchannel2_ioctl_destroy_bypass args;
+
+        if (copy_from_user(&args, argsp, sizeof(args)))
+                return -EFAULT;
+
+        return nc2_destroy_bypass(args.handle);
+}
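+
+/* Userspace sketch (hypothetical device path and handle values; the
+   misc device is registered below under the name "netback2"):
+
+        int fd = open("/dev/netback2", O_RDWR);
+        struct netchannel2_ioctl_establish_bypass args = {
+                .handle_a = handle_of_first_vif,
+                .handle_b = handle_of_second_vif,
+        };
+        ioctl(fd, NETCHANNEL2_IOCTL_ESTABLISH_BYPASS, &args);
+*/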
+#endif
+
+static long misc_dev_unlocked_ioctl(struct file *filp, unsigned cmd,
+                                    unsigned long data)
+{
+        switch (cmd) {
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+        case NETCHANNEL2_IOCTL_ESTABLISH_BYPASS:
+                return netchannel2_ioctl_establish_bypass(
+                        (struct netchannel2_ioctl_establish_bypass __user *)data);
+        case NETCHANNEL2_IOCTL_DESTROY_BYPASS:
+                return netchannel2_ioctl_destroy_bypass(
+                        (struct netchannel2_ioctl_destroy_bypass __user *)data);
+#endif
+        default:
+                return -EINVAL;
+        }
+}
+
+static struct file_operations misc_dev_fops = {
+        .owner = THIS_MODULE,
+        .unlocked_ioctl = misc_dev_unlocked_ioctl
+};
+
+static struct miscdevice netback2_misc_dev = {
+        .minor = MISC_DYNAMIC_MINOR,
+        .name = "netback2",
+        .fops = &misc_dev_fops
+};
+
+int __init netback2_init(void)
+{
+        int r;
+
+        r = misc_register(&netback2_misc_dev);
+        if (r < 0) {
+                printk(KERN_ERR "Error %d registering control device.\n",
+                       r);
+                return r;
+        }
+        r = xenbus_register_backend(&netback2);
+        if (r < 0) {
+                printk(KERN_ERR "error %d registering backend driver.\n",
+                       r);
+                misc_deregister(&netback2_misc_dev);
+        }
+        return r;
+}
diff --git a/drivers/net/xen-netchannel2/netchan2.c b/drivers/net/xen-netchannel2/netchan2.c
new file mode 100644 (file)
index 0000000..b4bdeb3
--- /dev/null
@@ -0,0 +1,32 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include "netchannel2_endpoint.h"
+
+static int __init netchan2_init(void)
+{
+        int r;
+
+        r = nc2_init();
+        if (r < 0)
+                return r;
+        r = netfront2_init();
+        if (r < 0)
+                return r;
+        r = netback2_init();
+        if (r < 0)
+                netfront2_exit();
+        return r;
+}
+module_init(netchan2_init);
+
+/* We can't unload if we're acting as a backend. */
+#ifndef CONFIG_XEN_NETDEV2_BACKEND
+static void __exit netchan2_exit(void)
+{
+        netfront2_exit();
+        nc2_exit();
+}
+module_exit(netchan2_exit);
+#endif
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/xen-netchannel2/netchannel2_core.h b/drivers/net/xen-netchannel2/netchannel2_core.h
new file mode 100644 (file)
index 0000000..7c5f46a
--- /dev/null
@@ -0,0 +1,1131 @@
+#ifndef NETCHANNEL2_CORE_H__
+#define NETCHANNEL2_CORE_H__
+
+#include <xen/interface/xen.h>
+#ifdef CONFIG_PARAVIRT
+#include <xen/grant_table.h>
+#else
+#include <xen/gnttab.h>
+#endif
+#include <xen/interface/io/netchannel2.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <asm/xen/hypercall.h>
+
+#include "vmq_def.h"
+
+#ifdef CONFIG_PARAVIRT
+typedef struct gnttab_copy gnttab_copy_t;
+#endif
+
+#if 0
+#define DEBUGMSG(x, ...) do { printk(KERN_NOTICE "%s:%s:%d " x "\n", __FILE__, __func__, __LINE__ , ## __VA_ARGS__ ); } while (0)
+#else
+static inline void DEBUGMSG(const char *fmt, ...)
+{
+}
+#endif
+#define ENTER() DEBUGMSG("===>")
+#define EXIT() DEBUGMSG("<===")
+#define RETURN(x) do { EXIT(); return (x); } while (0)
+
+/* After we send this number of frags, we request the other end to
+ * notify us when sending the corresponding finish packet message */
+#define MAX_MAX_COUNT_FRAGS_NO_EVENT 192
+
+/* Very small packets (e.g. TCP pure acks) are sent inline in the
+ * ring, to avoid the hypercall overhead.  This is the largest packet
+ * which will be sent small, in bytes.  It should be big enough to
+ * cover the normal headers (i.e. ethernet + IP + TCP = 66 bytes) plus
+ * a little bit of slop for options etc. */
+#define PACKET_PREFIX_SIZE 96
+
+/* How many packets can we have outstanding at any one time?  This
+ * must be small enough that it won't be confused with an sk_buff
+ * pointer; see the txp_slot stuff later. */
+#define NR_TX_PACKETS 256
+
+/* A way of keeping track of a mapping of a bunch of grant references
+   into a contiguous chunk of virtual address space.  This is used for
+   things like multi-page rings. */
+#define MAX_GRANT_MAP_PAGES 4
+struct grant_mapping {
+        unsigned nr_pages;
+        grant_handle_t handles[MAX_GRANT_MAP_PAGES];
+        struct vm_struct *mapping;
+};
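+
+/* With 4k pages MAX_GRANT_MAP_PAGES caps a single mapping at 16k;
+   netback2.c advertises the same limit to the frontend as
+   "max-sring-pages". */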
+
+enum transmit_policy {
+        transmit_policy_unknown = 0,
+        transmit_policy_first = 0xf001,
+        transmit_policy_grant = transmit_policy_first,
+        transmit_policy_post,
+        transmit_policy_map,
+        transmit_policy_small,
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+        transmit_policy_vmq,
+        transmit_policy_last = transmit_policy_vmq
+#else
+        transmit_policy_last = transmit_policy_small
+#endif
+};
+
+/* When we send a packet message, we need to tag it with an ID.  That
+   ID is an index into the TXP slot array.  Each slot contains either
+   a pointer to an sk_buff (if it's in use), or the index of the next
+   free slot (if it isn't).  A slot is in use if the stored value is
+   greater than NR_TX_PACKETS, and free otherwise. */
+struct txp_slot {
+        unsigned long __contents;
+};
+
+typedef uint32_t nc2_txp_index_t;
+
+#define INVALID_TXP_INDEX ((nc2_txp_index_t)NR_TX_PACKETS)
+
+static inline int txp_slot_in_use(struct txp_slot *slot)
+{
+        if (slot->__contents <= NR_TX_PACKETS)
+                return 0;
+        else
+                return 1;
+}
+
+static inline void txp_set_skb(struct txp_slot *slot, struct sk_buff *skb)
+{
+        slot->__contents = (unsigned long)skb;
+}
+
+static inline struct sk_buff *txp_get_skb(struct txp_slot *slot)
+{
+        if (txp_slot_in_use(slot))
+                return (struct sk_buff *)slot->__contents;
+        else
+                return NULL;
+}
+
+static inline void txp_set_next_free(struct txp_slot *slot,
+                                     nc2_txp_index_t idx)
+{
+        slot->__contents = idx;
+}
+
+static inline nc2_txp_index_t txp_get_next_free(struct txp_slot *slot)
+{
+        return (nc2_txp_index_t)slot->__contents;
+}
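+
+/* Worked example: with NR_TX_PACKETS == 256, a slot whose __contents
+   is 12 is free and names slot 12 as the next free entry, whereas a
+   slot holding an sk_buff pointer (always much larger than 256) is in
+   use. */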
+
+/* This goes in struct sk_buff::cb */
+struct skb_cb_overlay {
+        struct list_head buffers; /* Only if we're using the posted
+                                     buffer strategy. */
+        struct txp_slot *tp;
+        unsigned nr_fragments;
+        grant_ref_t gref_pool;
+        enum transmit_policy policy;
+        uint8_t failed;
+        uint8_t expecting_finish;
+        uint8_t type;
+        uint16_t inline_prefix_size;
+};
+
+#define __CASSERT2(a, b) a ## b
+#define __CASSERT1(a, b) __CASSERT2(a, b)
+#define CASSERT(x) typedef unsigned __CASSERT1(__cassert_, __LINE__)[(x)-1]
+CASSERT(sizeof(struct skb_cb_overlay) <= sizeof(((struct sk_buff *)0)->cb));
+
+static inline struct skb_cb_overlay *get_skb_overlay(struct sk_buff *skb)
+{
+        return (struct skb_cb_overlay *)skb->cb;
+}
+
+
+struct nc2_rate_limiter {
+        unsigned max_tokens;
+        unsigned tokens_per_tick_ord;
+        unsigned fill_granularity_tokens;
+        void (*stop)(void *ctxt);
+        void (*start)(void *ctxt);
+        void *ctxt;
+
+        unsigned cur_tokens;
+        unsigned restart_thresh;
+        struct timer_list timer;
+        u64 last_fill_time;
+        spinlock_t lock;
+};
+
+void nc2_init_rate_limiter(struct nc2_rate_limiter *nrl,
+                           unsigned tokens_per_tick_ord,
+                           unsigned max_tokens,
+                           unsigned fill_granularity_tokens,
+                           void (*stop)(void *),
+                           void (*start)(void *),
+                           void *ctxt);
+void nc2_cleanup_rate_limiter(struct nc2_rate_limiter *nrl);
+void nc2_rate_limiter_credit(struct nc2_rate_limiter *nrl,
+                             unsigned nr_tokens);
+int nc2_rate_limiter_debit(struct nc2_rate_limiter *nrl,
+                           unsigned nr_tokens);
+
+struct nc2_alternate_ring;
+
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+#define AUTOBYPASS_MAX_HOT_MACS 8
+#define AUTOBYPASS_SUGG_QUEUE_SIZE 8
+struct nc2_auto_bypass {
+        enum {
+                autobypass_state_normal,
+                autobypass_state_considering,
+                autobypass_state_debounce
+        } state;
+        uint32_t nr_bypass_packets;
+        uint64_t nr_non_bypass_packets;
+        unsigned long start_jiffies;
+        unsigned nr_hot_macs;
+        struct {
+                unsigned char mac[ETH_ALEN];
+                /* This won't overflow because the autobypass period
+                   is less than 65536. */
+                uint16_t count;
+        } hot_macs[AUTOBYPASS_MAX_HOT_MACS];
+        unsigned suggestion_head;
+        unsigned suggestion_tail;
+        struct {
+                unsigned char mac[ETH_ALEN];
+        } suggestions[AUTOBYPASS_SUGG_QUEUE_SIZE];
+};
+void nc2_received_bypass_candidate_packet(struct netchannel2 *nc,
+                                          struct sk_buff *skb);
+
+struct nc2_bypass_autoteardown {
+        struct list_head autoteardown_list;
+        uint64_t nr_packets;
+        unsigned seen_count;
+};
+
+void nc2_register_bypass_for_autoteardown(struct nc2_alternate_ring *nar);
+void nc2_unregister_bypass_for_autoteardown(struct nc2_alternate_ring *nar);
+void nc2_shutdown_autoteardown(void);
+#else
+static inline void nc2_shutdown_autoteardown(void)
+{
+}
+static inline void nc2_register_bypass_for_autoteardown(struct nc2_alternate_ring *nar)
+{
+}
+static inline void nc2_unregister_bypass_for_autoteardown(struct nc2_alternate_ring *nar)
+{
+}
+#endif
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+#define NC2_BYPASS_SUGG_QUEUE_SIZE 8
+struct nc2_incoming_bypass_suggestions {
+        spinlock_t lock;
+
+        unsigned head;
+        unsigned tail;
+
+        struct work_struct workitem;
+
+        struct {
+                unsigned char mac[ETH_ALEN];
+        } queue[NC2_BYPASS_SUGG_QUEUE_SIZE];
+};
+
+void nc2_init_incoming_bypass_suggestions(
+        struct netchannel2 *nc,
+        struct nc2_incoming_bypass_suggestions *nibs);
+#endif
+
+/* A buffer which we have allocated for the other end to send us
+   packets in. */
+struct nc2_rx_buffer {
+        struct list_head list;
+        void *buffer;
+        grant_ref_t gref;
+        uint8_t is_posted; /* Set if this buffer is available to the
+                              other end. */
+};
+
+/* A buffer which the other end has provided us which we can use to
+   transmit packets to it. */
+struct nc2_tx_buffer {
+        struct list_head list;
+        uint32_t id; /* ID assigned by the remote endpoint. */
+        grant_ref_t gref;
+        uint16_t off_in_page;
+        uint16_t size;
+        grant_handle_t grant_handle; 
+};
+
+/* Packets for which we need to send FINISH_PACKET messages for as
+   soon as possible. */
+struct pending_finish_packets {
+#define MAX_PENDING_FINISH_PACKETS 256
+        uint32_t ids[MAX_PENDING_FINISH_PACKETS];
+        RING_IDX prod;
+        RING_IDX cons;
+};
+
+struct hypercall_batcher {
+        unsigned nr_pending_gops;
+        gnttab_copy_t gops[16];
+        void *ctxt[16];
+};
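+
+/* The batcher accumulates up to 16 grant-copy operations so they can
+   be issued together instead of one hypercall each; process_ring()
+   flushes it before flushing the rings, so PACKET messages only become
+   visible to the other end once their copies have been issued. */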
+
+struct netchannel2_ring_pair {
+        struct netchannel2 *interface;
+        /* Main ring lock.  Acquired from bottom halves.  The
+           pending_interfaces_lock nests inside this one.  If we have
+           auxiliary rings, the aux ring locks nest inside the master
+           ring lock. */
+        spinlock_t lock;
+
+        /* jiffies the last time the interrupt fired.  Not
+           synchronised at all, because it doesn't usually matter if
+           it's a bit off. */
+        unsigned last_event;
+
+        /* Protected by the lock.  Initialised at attach_ring() time
+           and de-initialised at detach_ring() time. */
+        struct netchannel2_prod_ring prod_ring;
+        struct netchannel2_cons_ring cons_ring;
+        uint8_t is_attached; /* True if the rings are currently safe to
+                                access. */
+        uint8_t detach_pending;
+
+        unsigned max_count_frags_no_event;
+        unsigned expected_finish_messages;
+        struct timer_list polling_timer;
+
+        int need_flush;
+        domid_t otherend_id;
+
+        struct nc2_rate_limiter limiter;
+        uint8_t rlimit_disabled;
+
+        /* The IRQ corresponding to the event channel which is
+           connected to the other end.  This only changes from the
+           xenbus state change handler.  It is notified from lots of
+           other places.  Fortunately, it's safe to notify on an irq
+           after it's been released, so the lack of synchronisation
+           doesn't matter. */
+        /* The irq is disabled whenever either the interface is on the
+           pending list or it's been stopped by the rate limiter.
+           (i.e. irq_disable_count == is_pending +
+           rlimit_disabled). */
+        int irq;
+        int evtchn;
+
+        /* A count of the number of times that the interrupt has been
+           disabled.  This is purely a debugging aid, so we're a bit
+           lax about synchronisation. */
+#ifdef DEBUG
+        unsigned irq_disable_count;
+#endif
+
+        /* The MAC address of our peer. */
+        unsigned char remote_mac[ETH_ALEN];
+
+        /* Set if we need to check the source MAC address on incoming
+           packets. */
+        int filter_mac;
+
+        /* Head of the free list of txp_slots, threaded through the
+           slots' stored next-free indices.  Protected by the lock. */
+        nc2_txp_index_t head_free_tx_packet;
+
+        /* Total number of packets on the allocated list.  Protected
+           by the lock. */
+        unsigned nr_tx_packets_outstanding;
+        /* Maximum number of packets which the other end will allow us
+           to keep outstanding at one time.  Valid whenever
+           is_attached is set. */
+        unsigned max_tx_packets_outstanding;
+
+        /* Count of the frags that we have sent to the other side.
+           When we reach the maximum we request that the other end
+           send an event when it sends the corresponding finish
+           message. */
+        unsigned count_frags_no_event;
+
+        /* Set if we need to send a SET_MAX_PACKETS message.
+           Protected by the lock. */
+        uint8_t need_advertise_max_packets;
+
+        /* Set if there are messages on the ring which are considered
+           time-sensitive, so that it's necessary to notify the remote
+           endpoint as soon as possible. */
+        uint8_t pending_time_sensitive_messages;
+
+        /* Set if we've previously suppressed a remote notification
+           because none of the messages pending at the time of the
+           flush were time-sensitive.  The remote should be notified
+           as soon as the ring is flushed, even if the normal
+           filtering rules would suppress the event. */
+        uint8_t delayed_kick;
+
+        /* A list of packet IDs which we need to return to the other
+           end as soon as there is space on the ring.  Protected by
+           the lock. */
+        struct pending_finish_packets pending_finish;
+
+        /* Packets which are to be transmitted next time the TX
+           tasklet looks at this interface.  Protected by the lock. */
+        struct sk_buff_head pending_tx_queue;
+
+        /* Packets which we'll have finished transmitting as soon as
+           we flush the hypercall batcher.  Protected by the lock. */
+        struct sk_buff_head release_on_flush_batcher;
+
+        /* The pending interface list is threaded through here.  We are
+           on the list iff is_pending is set. Protected by the pending
+           interface lock. */
+        struct list_head pending_interfaces;
+        uint8_t is_pending;
+
+        wait_queue_head_t waitq;
+
+        /* A pre-allocated pool of TX packet slots.  Free slots are
+           chained through head_free_tx_packet, and the array can also
+           be directly indexed by packet ID.  Protected by the lock. */
+        struct txp_slot tx_packets[NR_TX_PACKETS];
+};
+
+struct netchannel2 {
+#define NETCHANNEL2_MAGIC 0x57c68c1d
+        unsigned magic;
+
+        /* Set when the structure is created and never changed */
+        struct net_device *net_device;
+        struct xenbus_device *xenbus_device;
+
+        /* Set if we trust the remote endpoint. */
+        int remote_trusted;
+        /* Set if the remote endpoint is expected to trust us.
+           There's no guarantee that this is actually correct, but
+           it's useful for optimisation. */
+        int local_trusted;
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT
+        /* Alternate rings for this interface.  Protected by the
+           master rings lock. */
+        struct list_head alternate_rings;
+        uint8_t need_aux_ring_state_machine;
+
+        uint8_t pending_bypass_error;
+#endif
+
+        /* Various statistics.  These don't need to be perfectly
+           accurate, so they're not protected by anything. */
+        /* TX stats */
+        struct {
+                /* prepare_xmit_allocate_resources() failed because
+                   we were too busy. */
+                unsigned nr_too_busy;
+                /* We had to queue the tasklet for this interface. */
+                unsigned nr_queue_tasklet;
+                /* The tasklet looked at this interface */
+                unsigned nr_tasklet_action;
+                /* We notified the remote endpoint */
+                unsigned nr_notifies;
+                /* Transmit without checksum */
+                unsigned nr_csum_blank;
+                /* Transmit with validated checksum */
+                unsigned nr_csum_validated;
+                /* Transmit GSO */
+                unsigned nr_gso;
+                /* Too busy: out of packets */
+                unsigned nr_failed_alloc_packet;
+                /* Too busy: out of fragments */
+                unsigned nr_failed_alloc_fragment;
+                /* Too busy: no space on ring */
+                unsigned nr_failed_reserve_ring;
+                /* Too busy: not enough posted buffers */
+                unsigned nr_failed_no_buffers;
+        } tx;
+        /* RX stats */
+        struct {
+                /* IRQ fired */
+                unsigned nr_irqs;
+                /* poll() method called */
+                unsigned nr_polls;
+                /* We consumed more than a whole ring in one poll()
+                 * call */
+                unsigned nr_ring_overflow;
+                /* Total number of messages received. */
+                unsigned nr_messages;
+                /* We raced with something checking for ring
+                 * completion. */
+                unsigned nr_ring_race;
+                /* We notified the remote due to message RX work. */
+                unsigned nr_notify;
+                /* We became unstuck due to RX work. */
+                unsigned nr_unstick;
+                /* The poll() method left work pending. */
+                unsigned nr_incomplete_poll;
+                /* handle_receiver_map_packet() failed because
+                   alloc_rx_packet() did. */
+                unsigned nr_failed_no_packet;
+                /* Failed because dev_alloc_skb() did. */
+                unsigned nr_failed_no_skb;
+                /* We modified the receiver map timer. */
+                unsigned nr_mod_timer;
+                /* The RX timer expired. */
+                unsigned nr_timer_expire;
+                /* We had to copy a mapped packet. */
+                unsigned nr_unmap;
+                /* Calls to put_rx_map_packet. */
+                unsigned nr_put_packet;
+                /* The GC tasklet ran */
+                unsigned nr_gc_tasklet;
+                /* Receive with validated checksum */
+                unsigned nr_csum_validated;
+                /* Receive with blank checksum */
+                unsigned nr_csum_blank;
+                /* Receive GSO */
+                unsigned nr_gso;
+                /* Dropped because the source MAC address was
+                 * wrong. */
+                unsigned dropped_bad_mac;
+        } rx;
+
+        struct netchannel2_ring_pair rings;
+
+        /* A single-entry transmit queue, used because we can't
+         * reliably tell whether we're about to run out of tx slots
+         * and stop the main queue.  Protected by the lock. */
+        /* This should arguably be per-ring, but that's quite hard to
+         * arrange because we only have one queue to stop at the Linux
+         * level.  We could manage our own pending queue, but we're
+         * still going to need to limit it somehow and we've not
+         * really solved the problem.  Alternatively, we could make
+         * the alternate rings fall back to the main ring when they
+         * get full, but that'll cause packet reordering.  The fix
+         * used is just to drop packets when the alt rings get full,
+         * in the hope that that'll cause the sender to back off and
+         * we'll avoid hitting the problem again, but it's not really
+         * very satisfactory.
+         *
+         * (Stopping the main ring just because an ancillary ring is
+         * full isn't good enough, because the peer on an ancillary
+         * ring will often be untrusted, and we can't let them kill
+         * the connection to the principal ring's peer.)
+         */
+        struct sk_buff_head pending_skbs;
+
+        /* Task offload control.  These are all protected by the
+         * lock. */
+        /* Ethtool allows us to use RX checksumming */
+        uint8_t use_rx_csum;
+        /* The remote endpoint allows us to use TX checksumming.
+           Whether we actually use TX checksumming is controlled by
+           the net device feature bits. */
+        uint8_t allow_tx_csum_offload;
+        /* Ethtool allows us to use LRO for TCPv4 */
+        uint8_t use_lro;
+        /* The remote endpoint allows us to use TSO for TCPv4.  As for
+           checksumming, we only actually use the feature if the net
+           device says to. */
+        uint8_t allow_tso;
+        /* At some point in the past, we tried to tell the other end
+           what our current offload policy is and failed.  Try again
+           as soon as possible. */
+        uint8_t need_advertise_offloads;
+
+       /* Flag to indicate that the interface is stopped.  When the
+          interface is stopped we need to run the tasklet after we
+          receive an interrupt so that we can wake it up. */
+       uint8_t is_stopped;
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+        /* Bypass support.  */
+        /* There's some unadvertised bypass in one of the lists. */
+        uint8_t need_advertise_bypasses;
+        uint8_t bypass_max_pages;
+        uint16_t max_bypasses;
+        uint16_t extant_bypasses;
+        struct list_head bypasses_a;
+        struct list_head bypasses_b;
+
+        struct nc2_bypass *current_bypass_frontend;
+        struct nc2_incoming_bypass_suggestions incoming_bypass_suggestions;
+#endif
+
+        /* Infrastructure for managing buffers which we've posted to
+           the other end.  These are all protected by the lock. */
+        /* A list of nc2_rx_buffer structures, threaded on list, which
+           we've posted to the other end. */
+        struct list_head rx_buffers;
+        /* Buffers which we've allocated but not yet sent to the other
+           end. */
+        struct list_head unposted_rx_buffers;
+        /* Buffers which are available but not yet allocated. */
+        struct list_head unused_rx_buffers;
+        /* The number of buffers in the rx_buffers list. */
+        unsigned nr_rx_buffers;
+        /* The maximum number of buffers which we can ever have
+           outstanding, and the size of the rx_buffer_structs
+           array. */
+        unsigned max_nr_rx_buffers;
+        /* A bunch of nc2_rx_buffer structures which can be used for
+           RX buffers. */
+        struct nc2_rx_buffer *rx_buffer_structs;
+        /* Set if we're sufficiently far through device shutdown that
+           posting more RX buffers would be a bad idea. */
+        uint8_t dont_post_buffers;
+
+        /* Infrastructure for managing buffers which the other end has
+           posted to us.  Protected by the lock. */
+        /* A list of nc2_tx_buffer structures, threaded on list, which
+           contains all tx buffers which have been posted by the
+           remote. */
+        struct list_head avail_tx_buffers;
+        /* A list of nc2_tx_buffer structures which the other end
+           hasn't populated yet. */
+        struct list_head unused_tx_buffer_slots;
+        /* A list of nc2_tx_buffer structures which we need to return
+           to the other end. */
+        struct list_head pending_tx_buffer_return;
+        /* Some pre-allocated nc2_tx_buffer structures.  We have to
+           pre-allocate, because we always need to be able to respond
+           to a POST_BUFFER message (up to some limit). */
+        struct nc2_tx_buffer *tx_buffers;
+        /* Non-zero if we need to send the other end a
+           SET_NR_POSTED_BUFFERS message. */
+        uint8_t need_advertise_tx_buffers;
+        /* Number of tx buffers.  This is the actual number of slots
+           in the @tx_buffers array. */
+        uint32_t nr_tx_buffers;
+        /* Number of available tx buffers.  The length of the
+         * avail_tx_buffers list. */
+        uint32_t nr_avail_tx_buffers;
+        /* ``Configured'' number of tx buffers.  We only actually
+           allocate any TX buffers when the local interface is up, but
+           this is set to the desired number of buffers all the
+           time. */
+        uint32_t configured_nr_tx_buffers;
+
+        /* Updates are protected by the lock.  This can be read at any
+         * time without holding any locks, and the rest of Linux is
+         * expected to cope. */
+        struct net_device_stats stats;
+
+        struct hypercall_batcher batcher;
+
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+       /* vmq data for supporting multi-queue devices */
+       nc2_vmq_t vmq;
+#endif
+
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+        struct nc2_auto_bypass auto_bypass;
+#endif
+};
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+#define MAX_BYPASS_RING_PAGES_GRANTABLE 4
+struct nc2_bypass_endpoint {
+        struct list_head list; /* Always ``valid'', but won't actually
+                                  be in any list if we're detached (it
+                                  gets set to the empty list). */
+        struct netchannel2 *nc2; /* Valid provided detached isn't
+                                  * set */
+        grant_ref_t incoming_grefs[MAX_BYPASS_RING_PAGES_GRANTABLE];
+        grant_ref_t outgoing_grefs[MAX_BYPASS_RING_PAGES_GRANTABLE];
+        grant_ref_t control_gref;
+        unsigned long incoming_pages[MAX_BYPASS_RING_PAGES_GRANTABLE];
+
+        uint8_t need_advertise;
+        uint8_t need_disable;
+        uint8_t disable_sent;
+        uint8_t disabled;
+        uint8_t need_detach;
+        uint8_t detach_sent;
+        uint8_t detached;
+};
+
+/* This is the representation of a bypass in the bypassed domain. */
+struct nc2_bypass {
+        /* Cleared to an empty list if both endpoints are detached. */
+        struct list_head list;
+
+        /* Reference count.  Being on the big list, threaded through
+           @list, counts as a single reference. */
+        atomic_t refcnt;
+
+        struct nc2_bypass_endpoint ep_a;
+        struct nc2_bypass_endpoint ep_b;
+        unsigned long control_page;
+        unsigned nr_ring_pages;
+
+        unsigned handle;
+        int evtchn_port;
+
+        wait_queue_head_t detach_waitq;
+};
+
+int nc2_establish_bypass(struct netchannel2 *a, struct netchannel2 *b);
+int nc2_destroy_bypass(int handle);
+void _nc2_advertise_bypasses(struct netchannel2 *nc);
+static inline void nc2_advertise_bypasses(struct netchannel2 *nc)
+{
+        if (nc->need_advertise_bypasses)
+                _nc2_advertise_bypasses(nc);
+}
+void nc2_handle_bypass_disabled(struct netchannel2 *nc,
+                                struct netchannel2_ring_pair *ncrp,
+                                struct netchannel2_msg_hdr *hdr);
+void nc2_handle_bypass_detached(struct netchannel2 *nc,
+                                struct netchannel2_ring_pair *ncrp,
+                                struct netchannel2_msg_hdr *hdr);
+void nc2_handle_bypass_frontend_ready(struct netchannel2 *nc,
+                                      struct netchannel2_ring_pair *ncrp,
+                                      struct netchannel2_msg_hdr *hdr);
+void nc2_handle_suggest_bypass(struct netchannel2 *nc,
+                               struct netchannel2_ring_pair *ncrp,
+                               struct netchannel2_msg_hdr *hdr);
+void release_bypasses(struct netchannel2 *nc);
+void nb2_handle_suggested_bypass(struct netchannel2 *a_chan,
+                                 const char *mac_b);
+void nc2_aux_ring_start_disable_sequence(struct nc2_alternate_ring *nar);
+void nc2_aux_ring_start_detach_sequence(struct nc2_alternate_ring *nar);
+#else
+static inline void release_bypasses(struct netchannel2 *nc)
+{
+}
+static inline void nc2_advertise_bypasses(struct netchannel2 *nc)
+{
+}
+static inline void nc2_handle_bypass_frontend_ready(struct netchannel2 *nc,
+                                                    struct netchannel2_ring_pair *ncrp,
+                                                    struct netchannel2_msg_hdr *hdr)
+{
+}
+static inline void nc2_handle_bypass_disabled(struct netchannel2 *nc,
+                                              struct netchannel2_ring_pair *ncrp,
+                                              struct netchannel2_msg_hdr *hdr)
+{
+}
+static inline void nc2_handle_bypass_detached(struct netchannel2 *nc,
+                                              struct netchannel2_ring_pair *ncrp,
+                                              struct netchannel2_msg_hdr *hdr)
+{
+}
+static inline void nc2_handle_suggest_bypass(struct netchannel2 *nc,
+                                             struct netchannel2_ring_pair *ncrp,
+                                             struct netchannel2_msg_hdr *hdr)
+{
+}
+#endif
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT
+#define MAX_BYPASS_RING_PAGES_MAPPABLE 4
+/* This is the representation of a bypass from the point of view of
+   one of the endpoint domains. */
+struct nc2_alternate_ring {
+        /* List of all alternate rings on a given interface.  Dangles
+         * off of alternate_rings in struct netchannel2.  Protected by
+         * the netchannel2 master ring lock. */
+        struct list_head rings_by_interface;
+        /* The state of the alternate ring.  This only ever goes
+         * forwards.  It is protected by the auxiliary ring lock. */
+        enum {
+                /* This is a frontend, it's just been allocated and
+                   doesn't yet have a port. */
+                nc2_alt_ring_frontend_preparing = 0xf001,
+                /* This is a frontend, it has a port but hasn't told
+                   the parent yet. */
+                nc2_alt_ring_frontend_send_ready_pending,
+                /* We've sent the FRONTEND_READY message and are
+                   waiting for the backend to say it's ready. */
+                nc2_alt_ring_frontend_sent_ready,
+                /* This is a backend.  In theory, we know what port to
+                   use, but we haven't tried to bind to it yet. */
+                nc2_alt_ring_backend_preparing,
+                /* Running normally */
+                nc2_alt_ring_ready,
+                /* Can't be used for more PACKETs, will disable as
+                   soon as all FINISHes arrive. */
+                nc2_alt_ring_disabling,
+                /* All FINISHes arrived, waiting to send DISABLED */
+                nc2_alt_ring_disabled_pending,
+                /* DISABLED sent. */
+                nc2_alt_ring_disabled,
+                /* DETACH received */
+                nc2_alt_ring_detaching,
+                /* Ring has been detached, waiting to send the
+                   DETACHED message. */
+                nc2_alt_ring_detached_pending
+        } state;
+        struct work_struct work_item;
+        struct work_struct detach_work_item;
+
+        struct grant_mapping prod_mapper;
+        struct grant_mapping cons_mapper;
+        struct grant_mapping control_mapper;
+
+        struct netchannel2_ring_pair rings;
+
+        /* A lower bound on the number of times we've called
+           disable_irq() on the irq.  The interrupt handler guarantees
+           to notify the eventq quickly if this increases.  It
+           increases whenever there is work for the worker thread to
+           do. */
+        atomic_t irq_disable_count;
+        wait_queue_head_t eventq;
+        uint32_t handle;
+
+        struct netchannel2_msg_bypass_frontend frontend_setup_msg;
+        struct netchannel2_msg_bypass_backend backend_setup_msg;
+        uint32_t cons_grefs[MAX_BYPASS_RING_PAGES_MAPPABLE];
+        uint32_t prod_grefs[MAX_BYPASS_RING_PAGES_MAPPABLE];
+
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+        struct nc2_bypass_autoteardown autoteardown;
+#endif
+};
+
+void nc2_handle_bypass_ready(struct netchannel2 *nc,
+                             struct netchannel2_ring_pair *ncrp,
+                             struct netchannel2_msg_hdr *hdr);
+int bypass_xmit_packet(struct netchannel2 *nc,
+                       struct nc2_alternate_ring *ncr,
+                       struct sk_buff *skb);
+void _nc2_alternate_ring_disable_finish(struct nc2_alternate_ring *ncr);
+static inline void nc2_alternate_ring_disable_finish(struct netchannel2_ring_pair *ncrp)
+{
+        struct nc2_alternate_ring *nar;
+        nar = container_of(ncrp, struct nc2_alternate_ring, rings);
+        if (nar->state == nc2_alt_ring_disabling &&
+            ncrp->nr_tx_packets_outstanding == 0)
+                _nc2_alternate_ring_disable_finish(nar);
+}
+void _nc2_crank_aux_ring_state_machine(struct netchannel2 *nc);
+static inline void nc2_crank_aux_ring_state_machine(struct netchannel2 *nc)
+{
+        if (nc->need_aux_ring_state_machine)
+                _nc2_crank_aux_ring_state_machine(nc);
+}
+void nc2_release_alt_rings(struct netchannel2 *nc);
+void detach_all_bypasses(struct netchannel2 *nc);
+void nc2_handle_bypass_frontend(struct netchannel2 *nc,
+                                struct netchannel2_ring_pair *ncrp,
+                                struct netchannel2_msg_hdr *hdr);
+void nc2_handle_bypass_backend(struct netchannel2 *nc,
+                               struct netchannel2_ring_pair *ncrp,
+                               struct netchannel2_msg_hdr *hdr);
+void nc2_handle_bypass_disable(struct netchannel2 *nc,
+                               struct netchannel2_ring_pair *ncrp,
+                               struct netchannel2_msg_hdr *hdr);
+void nc2_handle_bypass_detach(struct netchannel2 *nc,
+                              struct netchannel2_ring_pair *ncrp,
+                              struct netchannel2_msg_hdr *hdr);
+#else
+static inline void detach_all_bypasses(struct netchannel2 *nc)
+{
+}
+static inline void nc2_crank_aux_ring_state_machine(struct netchannel2 *nc)
+{
+}
+static inline void nc2_alternate_ring_disable_finish(struct netchannel2_ring_pair *ncrp)
+{
+}
+static inline void nc2_release_alt_rings(struct netchannel2 *nc)
+{
+}
+static inline void nc2_handle_bypass_frontend(struct netchannel2 *nc,
+                                              struct netchannel2_ring_pair *ncrp,
+                                              struct netchannel2_msg_hdr *hdr)
+{
+}
+static inline void nc2_handle_bypass_backend(struct netchannel2 *nc,
+                                             struct netchannel2_ring_pair *ncrp,
+                                             struct netchannel2_msg_hdr *hdr)
+{
+}
+static inline void nc2_handle_bypass_disable(struct netchannel2 *nc,
+                                             struct netchannel2_ring_pair *ncrp,
+                                             struct netchannel2_msg_hdr *hdr)
+{
+}
+static inline void nc2_handle_bypass_detach(struct netchannel2 *nc,
+                                            struct netchannel2_ring_pair *ncrp,
+                                            struct netchannel2_msg_hdr *hdr)
+{
+}
+static inline void nc2_handle_bypass_ready(struct netchannel2 *nc,
+                                           struct netchannel2_ring_pair *ncrp,
+                                           struct netchannel2_msg_hdr *hdr)
+{
+}
+#endif
+
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+void _nc2_autobypass_make_suggestions(struct netchannel2 *nc);
+static inline void nc2_autobypass_make_suggestions(struct netchannel2 *nc)
+{
+        if (nc->auto_bypass.suggestion_tail != nc->auto_bypass.suggestion_head)
+                _nc2_autobypass_make_suggestions(nc);
+}
+#else
+static inline void nc2_autobypass_make_suggestions(struct netchannel2 *nc)
+{
+}
+#endif
+
+static inline void flush_prepared_grant_copies(struct hypercall_batcher *hb,
+                                               void (*on_fail)(void *ctxt,
+                                                               gnttab_copy_t *gop))
+{
+        unsigned x;
+
+        if (hb->nr_pending_gops == 0)
+                return;
+        if (HYPERVISOR_grant_table_op(GNTTABOP_copy, hb->gops,
+                                      hb->nr_pending_gops))
+                BUG();
+        for (x = 0; x < hb->nr_pending_gops; x++)
+                if (hb->gops[x].status != GNTST_okay)
+                        on_fail(hb->ctxt[x], &hb->gops[x]);
+        hb->nr_pending_gops = 0;
+}
+
+static inline gnttab_copy_t *hypercall_batcher_grant_copy(struct hypercall_batcher *hb,
+                                                          void *ctxt,
+                                                          void (*on_fail)(void *,
+                                                                          gnttab_copy_t *gop))
+{
+        if (hb->nr_pending_gops == ARRAY_SIZE(hb->gops))
+                flush_prepared_grant_copies(hb, on_fail);
+        hb->ctxt[hb->nr_pending_gops] = ctxt;
+        return &hb->gops[hb->nr_pending_gops++];
+}
+
+static inline void flush_hypercall_batcher(struct hypercall_batcher *hb,
+                                           void (*on_fail)(void *,
+                                                           gnttab_copy_t *gop))
+{
+        flush_prepared_grant_copies(hb, on_fail);
+}
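+
+/* Illustrative sketch (editor's note, not part of this patch) of how
+ * the batcher above is intended to be used: grab a slot, fill in the
+ * grant copy, and flush once the batch is complete.  ctxt, gref,
+ * otherend, dst and len are placeholder names; pending_rx_hypercalls
+ * and nc2_rscb_on_gntcopy_fail are declared further down this header.
+ *
+ *      gnttab_copy_t *gop;
+ *
+ *      gop = hypercall_batcher_grant_copy(&pending_rx_hypercalls,
+ *                                         ctxt,
+ *                                         nc2_rscb_on_gntcopy_fail);
+ *      gop->source.u.ref = gref;
+ *      gop->source.domid = otherend;
+ *      gop->source.offset = 0;
+ *      gop->dest.u.gmfn = virt_to_mfn(dst);
+ *      gop->dest.domid = DOMID_SELF;
+ *      gop->dest.offset = 0;
+ *      gop->len = len;
+ *      gop->flags = GNTCOPY_source_gref;
+ *      flush_hypercall_batcher(&pending_rx_hypercalls,
+ *                              nc2_rscb_on_gntcopy_fail);
+ */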
+
+static inline struct nc2_tx_buffer *_get_tx_buffer(struct netchannel2 *nc)
+{
+       struct nc2_tx_buffer *buffer;
+       struct list_head *entry = nc->avail_tx_buffers.next;
+       list_del(entry);
+       buffer = list_entry(entry, struct nc2_tx_buffer, list);
+       nc->nr_avail_tx_buffers--;
+       return buffer;
+}
+
+/* recycle a posted buffer: return it to the list of available buffers */
+static inline void recycle_tx_buffer(struct netchannel2 *nc, 
+                                   struct nc2_tx_buffer *buffer)
+{
+       list_add(&buffer->list, &nc->avail_tx_buffers);
+       nc->nr_avail_tx_buffers++;
+}
+
+/* add a buffer to the pending list of buffers to be returned to the other end */
+static inline void return_tx_buffer(struct netchannel2 *nc, 
+                                   struct nc2_tx_buffer *buffer)
+{
+       list_add(&buffer->list, &nc->pending_tx_buffer_return);
+}
+
+/* add a buffer slot to the list of unused buffer slots after it has
+ * been returned to the other end */
+static inline void free_tx_buffer(struct netchannel2 *nc, 
+                                 struct nc2_tx_buffer *buffer)
+{
+       list_add(&buffer->list, &nc->unused_tx_buffer_slots);
+}
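+
+/* Lifecycle summary (editor's note): buffers posted by the remote end
+   sit on avail_tx_buffers; a transmit path takes one with
+   _get_tx_buffer() and, when it has finished with it, either makes it
+   available again with recycle_tx_buffer() or queues it for return
+   with return_tx_buffer().  Once the return message has actually been
+   sent, the slot goes back onto unused_tx_buffer_slots via
+   free_tx_buffer(), ready to be re-populated by a future POST_BUFFER
+   message.  All of these lists are protected by the ring lock. */
+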
+struct sk_buff *handle_receiver_copy_packet(struct netchannel2 *nc,
+                                            struct netchannel2_ring_pair *ncrp,
+                                            struct netchannel2_msg_packet *msg,
+                                            struct netchannel2_msg_hdr *hdr,
+                                            unsigned nr_frags,
+                                            unsigned frags_off);
+struct sk_buff *handle_pre_posted_packet(struct netchannel2 *nc,
+                                         struct netchannel2_msg_packet *msg,
+                                         struct netchannel2_msg_hdr *hdr,
+                                         unsigned nr_frags,
+                                         unsigned frags_off);
+struct sk_buff *handle_receiver_map_packet(struct netchannel2 *nc,
+                                           struct netchannel2_msg_packet *msg,
+                                           struct netchannel2_msg_hdr *hdr,
+                                           unsigned nr_frags,
+                                           unsigned frags_off);
+void nc2_handle_return_posted_buffer(struct netchannel2 *nc,
+                                     struct netchannel2_ring_pair *ncrp,
+                                     struct netchannel2_msg_hdr *hdr);
+void nc2_handle_post_buffer(struct netchannel2 *nc,
+                            struct netchannel2_ring_pair *ncrp,
+                            struct netchannel2_msg_hdr *hdr);
+void nc2_handle_set_nr_posted_buffers(struct netchannel2 *nc,
+                                      struct netchannel2_ring_pair *ncrp,
+                                      struct netchannel2_msg_hdr *hdr);
+void nc2_advertise_tx_buffers(struct netchannel2 *nc);
+
+int prepare_xmit_allocate_small(struct netchannel2_ring_pair *ncrp,
+                                                       struct sk_buff *skb);
+int prepare_xmit_allocate_grant(struct netchannel2_ring_pair *ncrp,
+                                struct sk_buff *skb,
+                                int use_subpage_grants);
+void xmit_grant(struct netchannel2_ring_pair *ncrp,
+                struct sk_buff *skb,
+                int use_subpage_grants,
+                volatile void *msg);
+int prepare_xmit_allocate_post(struct netchannel2 *nc,
+                               struct sk_buff *skb);
+void xmit_post(struct netchannel2 *nc,
+               struct sk_buff *skb,
+               volatile void *msg);
+
+void nc2_replenish_rx_buffers(struct netchannel2 *nc);
+
+void queue_finish_packet_message(struct netchannel2_ring_pair *ncrp,
+                                 uint32_t id, uint8_t flags);
+
+void nc2_return_pending_posted_buffers(struct netchannel2 *nc);
+void nc2_posted_buffer_rx_forget(struct netchannel2 *nc);
+
+int allocate_txp_slot(struct netchannel2_ring_pair *ncrp,
+                      struct sk_buff *skb);
+void release_txp_slot(struct netchannel2_ring_pair *ncrp,
+                      struct sk_buff *skb);
+/* Releases the txp slot, the grant pool, and the skb */
+void release_tx_packet(struct netchannel2_ring_pair *ncrp,
+                       struct sk_buff *skb);
+
+void unprepare_tx_buffers(struct netchannel2 *nc);
+
+void fetch_fragment(struct netchannel2_ring_pair *ncrp,
+                    unsigned idx,
+                    struct netchannel2_fragment *frag,
+                    unsigned off);
+
+void pull_through(struct sk_buff *skb, unsigned count);
+
+void nc2_kick(struct netchannel2_ring_pair *ncrp);
+void nc2_kick_fast(struct netchannel2_ring_pair *ncrp);
+
+int nc2_map_grants(struct grant_mapping *gm,
+                   const grant_ref_t *grefs,
+                   unsigned nr_grefs,
+                   domid_t remote_domain);
+void nc2_unmap_grants(struct grant_mapping *gm);
+
+void _nc2_attach_rings(struct netchannel2_ring_pair *ncrp,
+                       struct netchannel2_sring_cons *cons_sring,
+                       const volatile void *cons_payload,
+                       size_t cons_size,
+                       struct netchannel2_sring_prod *prod_sring,
+                       void *prod_payload,
+                       size_t prod_size,
+                       domid_t otherend_id);
+void queue_packet_to_interface(struct sk_buff *skb,
+                               struct netchannel2_ring_pair *ncrp);
+unsigned get_transmitted_packet_msg_size(struct sk_buff *skb);
+void init_ring_pair(struct netchannel2_ring_pair *ncrp);
+
+irqreturn_t nc2_int(int irq, void *dev_id, struct pt_regs *ptregs);
+
+void cleanup_ring_pair(struct netchannel2_ring_pair *ncrp);
+void nc2_rscb_on_gntcopy_fail(void *ctxt, gnttab_copy_t *gop);
+void nc2_posted_on_gntcopy_fail(void *ctxt, gnttab_copy_t *gop);
+
+int init_receive_map_mode(void);
+void deinit_receive_map_mode(void);
+void suspend_receive_map_mode(void);
+void resume_receive_map_mode(void);
+
+struct netchannel2 *nc2_get_interface_for_page(struct page *p);
+
+int nc2_start_xmit(struct sk_buff *skb, struct net_device *dev);
+void nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp,
+                           struct sk_buff *skb);
+int prepare_xmit_allocate_resources(struct netchannel2 *nc,
+                                    struct sk_buff *skb);
+void nc2_handle_finish_packet_msg(struct netchannel2 *nc,
+                                  struct netchannel2_ring_pair *ncrp,
+                                  struct netchannel2_msg_hdr *hdr);
+void nc2_handle_set_max_packets_msg(struct netchannel2_ring_pair *ncrp,
+                                    struct netchannel2_msg_hdr *hdr);
+void drop_pending_tx_packets(struct netchannel2_ring_pair *ncrp);
+
+void send_finish_packet_messages(struct netchannel2_ring_pair *ncrp);
+void nc2_handle_packet_msg(struct netchannel2 *nc,
+                           struct netchannel2_ring_pair *ncrp,
+                           struct netchannel2_msg_hdr *hdr);
+void advertise_max_packets(struct netchannel2_ring_pair *ncrp);
+void receive_pending_skbs(void);
+
+void advertise_offloads(struct netchannel2 *nc);
+void nc2_handle_set_offload(struct netchannel2 *nc,
+                            struct netchannel2_ring_pair *ncrp,
+                            struct netchannel2_msg_hdr *hdr);
+struct net_device_stats *nc2_get_stats(struct net_device *nd);
+int nc2_change_mtu(struct net_device *nd, int mtu);
+
+#ifdef DEBUG
+void _sanity_check_list(struct list_head *root, const char *file,
+                        int line);
+#define sanity_check_list(x) _sanity_check_list(x, __FILE__, __LINE__)
+void debug_dump_nc2_struct(struct netchannel2 *nc);
+#else
+static inline void sanity_check_list(struct list_head *lh)
+{
+}
+#endif
+
+extern struct hypercall_batcher pending_rx_hypercalls;
+extern struct ethtool_ops nc2_ethtool_ops;
+
+void nc2_init_poller(struct netchannel2_ring_pair *ncrp);
+void nc2_start_polling(struct netchannel2_ring_pair *ncrp);
+void nc2_stop_polling(struct netchannel2_ring_pair *ncrp);
+
+/* Compatibility with PV-ops kernels. */
+#ifdef CONFIG_PARAVIRT
+#define nc2_end_foreign_access_ref gnttab_end_foreign_access_ref
+#define nc2_end_foreign_access gnttab_end_foreign_access
+#else
+static inline int nc2_end_foreign_access_ref(grant_ref_t gref, int readonly)
+{
+        return gnttab_end_foreign_access_ref(gref);
+}
+static inline void nc2_end_foreign_access(grant_ref_t gref, int readonly,
+                                         unsigned long page)
+{
+        gnttab_end_foreign_access(gref, page);
+}
+#endif
+
+#ifndef CONFIG_PARAVIRT
+#include <xen/live_maps.h>
+#else
+static inline int page_is_tracked(struct page *p)
+{
+        return 0;
+}
+static inline void lookup_tracker_page(struct page *p, domid_t *domid,
+                                       grant_ref_t *gref)
+{
+        BUG();
+}
+#endif
+
+#endif /* !NETCHANNEL2_CORE_H__ */
diff --git a/drivers/net/xen-netchannel2/netchannel2_endpoint.h b/drivers/net/xen-netchannel2/netchannel2_endpoint.h
new file mode 100644 (file)
index 0000000..ca3a707
--- /dev/null
@@ -0,0 +1,63 @@
+/* Interface between the endpoint implementations (netfront2.c,
+   netback2.c) and the netchannel2 core (chan.c and the various
+   transmission modes).  */
+#ifndef NETCHANNEL2_ENDPOINT_H__
+#define NETCHANNEL2_ENDPOINT_H__
+
+#include <linux/init.h>
+#include <xen/interface/xen.h>
+
+struct netchannel2_sring_prod;
+struct netchannel2_sring_cons;
+struct netchannel2;
+struct xenbus_device;
+
+struct netchannel2 *nc2_new(struct xenbus_device *xd);
+void nc2_release(struct netchannel2 *nc);
+
+int nc2_attach_rings(struct netchannel2 *nc,
+                     struct netchannel2_sring_cons *cons_sring,
+                     const volatile void *cons_payload,
+                     size_t cons_size,
+                     struct netchannel2_sring_prod *prod_sring,
+                     void *prod_payload,
+                     size_t prod_size,
+                     domid_t otherend_id);
+void nc2_detach_rings(struct netchannel2 *nc);
+#if defined(CONFIG_XEN_NETDEV2_FRONTEND)
+int nc2_listen_evtchn(struct netchannel2 *nc, domid_t dom);
+#endif
+#if defined(CONFIG_XEN_NETDEV2_BACKEND)
+int nc2_connect_evtchn(struct netchannel2 *nc, domid_t domid,
+                       int evtchn);
+#endif
+int nc2_get_evtchn_port(struct netchannel2 *nc);
+void nc2_suspend(struct netchannel2 *nc);
+
+void nc2_set_nr_tx_buffers(struct netchannel2 *nc, unsigned nr_buffers);
+
+/* Interface which the endpoints provide to the core. */
+#ifdef CONFIG_XEN_NETDEV2_FRONTEND
+int __init netfront2_init(void);
+void __exit netfront2_exit(void);
+#else
+static inline int netfront2_init(void)
+{
+    return 0;
+}
+static inline void netfront2_exit(void)
+{
+}
+#endif
+#ifdef CONFIG_XEN_NETDEV2_BACKEND
+int __init netback2_init(void);
+#else
+static inline int netback2_init(void)
+{
+    return 0;
+}
+#endif
+int __init nc2_init(void);
+void __exit nc2_exit(void);
+
+#endif /* NETCHANNEL2_ENDPOINT_H__ */
diff --git a/drivers/net/xen-netchannel2/netchannel2_uspace.h b/drivers/net/xen-netchannel2/netchannel2_uspace.h
new file mode 100644 (file)
index 0000000..5310201
--- /dev/null
@@ -0,0 +1,17 @@
+#ifndef NETCHANNEL2_USPACE_H__
+#define NETCHANNEL2_USPACE_H__
+
+#include <linux/ioctl.h>
+
+struct netchannel2_ioctl_establish_bypass {
+        unsigned handle_a;
+        unsigned handle_b;
+};
+#define NETCHANNEL2_IOCTL_ESTABLISH_BYPASS _IOW('N', 0, struct netchannel2_ioctl_establish_bypass)
+
+struct netchannel2_ioctl_destroy_bypass {
+        unsigned handle;
+};
+#define NETCHANNEL2_IOCTL_DESTROY_BYPASS _IOW('N', 1, struct netchannel2_ioctl_destroy_bypass)
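+
+/* Illustrative userspace usage (editor's sketch, not part of this
+   patch): the device node path is hypothetical, <fcntl.h> and
+   <sys/ioctl.h> are assumed to be included, and handle_a/handle_b
+   are the handles of the two interfaces to bypass.
+
+        struct netchannel2_ioctl_establish_bypass args = {
+                .handle_a = handle_a,
+                .handle_b = handle_b,
+        };
+        int fd = open("/dev/netchannel2", O_RDWR);
+
+        if (fd < 0 ||
+            ioctl(fd, NETCHANNEL2_IOCTL_ESTABLISH_BYPASS, &args) < 0)
+                perror("establish bypass");
+*/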
+
+#endif /* !NETCHANNEL2_USPACE_H__ */
diff --git a/drivers/net/xen-netchannel2/netfront2.c b/drivers/net/xen-netchannel2/netfront2.c
new file mode 100644 (file)
index 0000000..0894740
--- /dev/null
@@ -0,0 +1,542 @@
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/version.h>
+#ifdef CONFIG_PARAVIRT
+#include <xen/grant_table.h>
+#include <xen/page.h>
+#else
+#include <xen/gnttab.h>
+#endif
+#include <xen/xenbus.h>
+
+#include "netchannel2_core.h"
+#include "netchannel2_endpoint.h"
+
+static struct netchannel2 *device_to_nc2(struct device *dev);
+
+#include "sysfs.c"
+
+#define MAX_SRING_PAGES 4
+
+struct netfront2 {
+#define NETFRONT2_MAGIC 0x9268e704
+        unsigned magic;
+        struct xenbus_device *xenbus_device;
+
+        void *f2b_sring;
+        grant_ref_t f2b_grefs[MAX_SRING_PAGES];
+        void *b2f_sring;
+        grant_ref_t b2f_grefs[MAX_SRING_PAGES];
+
+        struct netchannel2_frontend_shared *control_shared;
+        grant_ref_t control_shared_gref;
+
+        int nr_sring_pages;
+        int sring_order;
+
+        grant_ref_t rings_gref_pool; /* Some pre-allocated grant
+                                        references to cover the shared
+                                        rings. */
+
+        struct netchannel2 *chan;
+
+        int attached; /* True if the shared rings are ready to go. */
+};
+
+static struct netfront2 *xenbus_device_to_nf2(struct xenbus_device *xd)
+{
+        struct netfront2 *work = xd->dev.driver_data;
+        BUG_ON(work->magic != NETFRONT2_MAGIC);
+        return work;
+}
+
+static struct netchannel2 *device_to_nc2(struct device *dev)
+{
+        return xenbus_device_to_nf2(to_xenbus_device(dev))->chan;
+}
+
+/* Try to revoke a bunch of grant references and return them to the
+   rings_gref_pool.  Ignores zero entries in the @grefs list and
+   zeroes any entries which are successfully ended.  Returns 0 on
+   success or <0 on error. */
+static int ungrant_access_to_ring(struct netfront2 *nf,
+                                  grant_ref_t *grefs,
+                                  int nr_pages)
+{
+        int i;
+        int succ;
+        int failed;
+
+        failed = 0;
+
+        for (i = 0; i < nr_pages; i++) {
+                if (grefs[i]) {
+                        succ = nc2_end_foreign_access_ref(grefs[i], 0);
+                        if (!succ) {
+                                /* XXX we can't recover when this
+                                 * happens.  Try to do something
+                                 * vaguely plausible, but the device
+                                 * is pretty much doomed. */
+                                printk(KERN_WARNING "Failed to end access to gref %d\n",
+                                       i);
+                                failed = 1;
+                                continue;
+                        }
+                        gnttab_release_grant_reference(&nf->rings_gref_pool,
+                                                       grefs[i]);
+                        grefs[i] = 0;
+                }
+        }
+
+        if (failed)
+                return -EBUSY;
+        else
+                return 0;
+}
+
+/* Allocate and initialise grant references to cover a bunch of pages.
+   @ring should be in the direct-mapped region.  The rings_gref_pool
+   on nf should contain at least @nr_pages references.
+   Already-populated slots in the @grefs list are left unchanged. */
+static void grant_access_to_ring(struct netfront2 *nf,
+                                 domid_t otherend,
+                                 void *ring,
+                                 grant_ref_t *grefs,
+                                 int nr_pages)
+{
+        void *p;
+        int i;
+        grant_ref_t ref;
+
+        for (i = 0; i < nr_pages; i++) {
+
+                if (grefs[i] != 0)
+                        continue;
+
+                p = (void *)((unsigned long)ring + PAGE_SIZE * i);
+
+                ref = gnttab_claim_grant_reference(&nf->rings_gref_pool);
+                /* There should be enough grefs in the pool to handle
+                   the rings. */
+                BUG_ON(ref < 0);
+                gnttab_grant_foreign_access_ref(ref,
+                                                otherend,
+                                                virt_to_mfn(p),
+                                                0);
+                grefs[i] = ref;
+        }
+}
+
+/* Push an already-granted ring into xenstore. */
+static int publish_ring(struct xenbus_transaction xbt,
+                        struct netfront2 *nf,
+                        const char *prefix,
+                        const grant_ref_t *grefs,
+                        int nr_grefs)
+{
+        int i;
+        char buf[32];
+        int err;
+
+        sprintf(buf, "%s-nr-pages", prefix);
+        err = xenbus_printf(xbt, nf->xenbus_device->nodename, buf,
+                            "%u", nr_grefs);
+        if (err)
+                return err;
+
+        for (i = 0; i < nr_grefs; i++) {
+                BUG_ON(grefs[i] == 0);
+                sprintf(buf, "%s-ref-%u", prefix, i);
+                err = xenbus_printf(xbt, nf->xenbus_device->nodename,
+                                    buf, "%u", grefs[i]);
+                if (err)
+                        return err;
+        }
+        return 0;
+}
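+
+/* For example (editor's note), publishing the "f2b-ring" prefix with
+   two grant references ends up writing, under the device's xenstore
+   node:
+
+        f2b-ring-nr-pages = "2"
+        f2b-ring-ref-0    = "<gref of page 0>"
+        f2b-ring-ref-1    = "<gref of page 1>"
+*/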
+
+static int publish_rings(struct netfront2 *nf)
+{
+        int err;
+        struct xenbus_transaction xbt;
+        const char *msg;
+
+again:
+        err = xenbus_transaction_start(&xbt);
+        if (err) {
+                xenbus_dev_fatal(nf->xenbus_device, err,
+                                 "starting transaction");
+                return err;
+        }
+
+        err = publish_ring(xbt, nf, "f2b-ring", nf->f2b_grefs,
+                           nf->nr_sring_pages);
+        if (err) {
+                msg = "publishing f2b-ring";
+                goto abort;
+        }
+        err = publish_ring(xbt, nf, "b2f-ring", nf->b2f_grefs,
+                           nf->nr_sring_pages);
+        if (err) {
+                msg = "publishing b2f-ring";
+                goto abort;
+        }
+        err = publish_ring(xbt, nf, "control", &nf->control_shared_gref, 1);
+        if (err) {
+                msg = "publishing control";
+                goto abort;
+        }
+        err = xenbus_printf(xbt, nf->xenbus_device->nodename,
+                            "event-channel", "%u",
+                            nc2_get_evtchn_port(nf->chan));
+        if (err) {
+                msg = "publishing event channel";
+                goto abort;
+        }
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT
+        err = xenbus_printf(xbt, nf->xenbus_device->nodename,
+                            "feature-bypass", "1");
+        if (!err)
+                err = xenbus_printf(xbt, nf->xenbus_device->nodename,
+                                    "feature-bypass-max-pages", "%d",
+                                    MAX_BYPASS_RING_PAGES_MAPPABLE);
+        if (err) {
+                msg = "publishing bypass info";
+                goto abort;
+        }
+#endif
+
+        err = xenbus_transaction_end(xbt, 0);
+        if (err) {
+                if (err == -EAGAIN)
+                        goto again;
+                xenbus_dev_fatal(nf->xenbus_device, err,
+                                 "completing transaction");
+        }
+
+        return err;
+
+abort:
+        xenbus_transaction_end(xbt, 1);
+        xenbus_dev_fatal(nf->xenbus_device, err, msg);
+        return err;
+}
+
+/* Release the rings.  WARNING: This will leak memory if the other end
+   still has the rings mapped.  There isn't really anything we can do
+   about that; the alternative (giving the other end access to
+   whatever Linux puts in the memory after we released it) is probably
+   worse. */
+static void release_rings(struct netfront2 *nf)
+{
+        int have_outstanding_grants;
+
+        have_outstanding_grants = 0;
+
+        if (nf->f2b_sring) {
+                if (ungrant_access_to_ring(nf, nf->f2b_grefs,
+                                           nf->nr_sring_pages) >= 0) {
+                        free_pages((unsigned long)nf->f2b_sring,
+                                   nf->sring_order);
+                } else {
+                        have_outstanding_grants = 1;
+                }
+                nf->f2b_sring = NULL;
+        }
+
+        if (nf->b2f_sring) {
+                if (ungrant_access_to_ring(nf, nf->b2f_grefs,
+                                           nf->nr_sring_pages) >= 0) {
+                        free_pages((unsigned long)nf->b2f_sring,
+                                   nf->sring_order);
+                } else {
+                        have_outstanding_grants = 1;
+                }
+                nf->b2f_sring = NULL;
+        }
+
+        if (nf->control_shared) {
+                if (ungrant_access_to_ring(nf, &nf->control_shared_gref,
+                                           1) >= 0) {
+                        free_page((unsigned long)nf->control_shared);
+                } else {
+                        have_outstanding_grants = 1;
+                }
+                nf->control_shared = NULL;
+        }
+
+        if (have_outstanding_grants != 0) {
+                printk(KERN_WARNING
+                       "Released shared rings while the backend still had them mapped; leaking memory\n");
+        }
+
+        /* We can't release the gref pool if there are still
+           references outstanding against it. */
+        if (!have_outstanding_grants) {
+                if (nf->rings_gref_pool)
+                        gnttab_free_grant_references(nf->rings_gref_pool);
+                nf->rings_gref_pool = 0;
+        }
+
+        nf->attached = 0;
+}
+
+static int allocate_rings(struct netfront2 *nf, domid_t otherend)
+{
+        int err;
+        int max_sring_pages;
+        int sring_order;
+        int nr_sring_pages;
+        size_t sring_size;
+
+        /* Figure out how big our shared rings are going to be. */
+        err = xenbus_scanf(XBT_NIL, nf->xenbus_device->otherend,
+                           "max-sring-pages", "%d", &max_sring_pages);
+        if (err < 0) {
+                xenbus_dev_fatal(nf->xenbus_device, err,
+                                 "reading %s/max-sring-pages",
+                                 nf->xenbus_device->otherend);
+                return err;
+        }
+        if (max_sring_pages > MAX_SRING_PAGES)
+                max_sring_pages = MAX_SRING_PAGES;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
+        sring_order = long_log2(max_sring_pages);
+#else
+        sring_order = order_base_2(max_sring_pages);
+#endif
+        nr_sring_pages = 1 << sring_order;
+        sring_size = nr_sring_pages * PAGE_SIZE;
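+        /* For example (editor's note): a backend advertising
+           max-sring-pages = 4 gives sring_order = 2, i.e. four pages
+           (16KiB with 4KiB pages) of shared ring in each
+           direction. */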
+
+        release_rings(nf);
+
+        nf->nr_sring_pages = nr_sring_pages;
+        nf->sring_order = sring_order;
+
+        nf->f2b_sring = (void *)__get_free_pages(GFP_KERNEL, sring_order);
+        if (!nf->f2b_sring)
+                return -ENOMEM;
+        memset(nf->f2b_sring, 0, sring_size);
+
+        nf->b2f_sring = (void *)__get_free_pages(GFP_KERNEL, sring_order);
+        if (!nf->b2f_sring)
+                return -ENOMEM;
+        memset(nf->b2f_sring, 0, sring_size);
+
+        nf->control_shared = (void *)get_zeroed_page(GFP_KERNEL);
+        if (!nf->control_shared)
+                return -ENOMEM;
+
+        /* Pre-allocate enough grant references to be sure that we can
+           grant access to both rings without an error. */
+        err = gnttab_alloc_grant_references(nr_sring_pages * 2 + 1,
+                                            &nf->rings_gref_pool);
+        if (err < 0)
+                return err;
+
+        grant_access_to_ring(nf,
+                             otherend,
+                             nf->b2f_sring,
+                             nf->b2f_grefs,
+                             nr_sring_pages);
+        grant_access_to_ring(nf,
+                             otherend,
+                             nf->f2b_sring,
+                             nf->f2b_grefs,
+                             nr_sring_pages);
+        grant_access_to_ring(nf,
+                             otherend,
+                             nf->control_shared,
+                             &nf->control_shared_gref,
+                             1);
+        err = nc2_listen_evtchn(nf->chan, otherend);
+        if (err < 0)
+                return err;
+
+        nf->attached = 1;
+
+        return 0;
+}
+
+static void backend_changed(struct xenbus_device *xd,
+                           enum xenbus_state backend_state)
+{
+        struct netfront2 *nf = xenbus_device_to_nf2(xd);
+        int err;
+
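+        /* Editor's summary of the handshake below: wait for the
+           backend to reach InitWait, allocate and publish our rings
+           and move to Initialised, then, once the backend reports
+           Connected, attach to the rings and report Connected
+           ourselves.  Closing and Closed are mirrored back so that
+           the toolstack can finish tearing the device down. */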
+        switch (backend_state) {
+        case XenbusStateInitialising:
+                /* Backend isn't ready yet, don't do anything. */
+                break;
+
+        case XenbusStateInitWait:
+                /* Backend has advertised the ring protocol.  Allocate
+                   the rings, and tell the backend about them. */
+
+                /* XXX it would make more sense to set this to 0. */
+                nc2_set_nr_tx_buffers(nf->chan, 256);
+
+                err = 0;
+                if (!nf->attached)
+                        err = allocate_rings(nf, xd->otherend_id);
+                if (err < 0) {
+                        xenbus_dev_fatal(xd, err, "allocating shared rings");
+                        break;
+                }
+                err = publish_rings(nf);
+                if (err >= 0)
+                        xenbus_switch_state(xd, XenbusStateInitialised);
+                break;
+
+        case XenbusStateInitialised:
+                /* Backend isn't supposed to use this state. */
+                xenbus_dev_fatal(xd, -EINVAL,
+                                 "unexpected backend state Initialised");
+                break;
+
+        case XenbusStateConnected:
+                /* All ready */
+                err = nc2_attach_rings(nf->chan,
+                                       &nf->control_shared->cons,
+                                       nf->b2f_sring,
+                                       nf->nr_sring_pages * PAGE_SIZE,
+                                       &nf->control_shared->prod,
+                                       nf->f2b_sring,
+                                       nf->nr_sring_pages * PAGE_SIZE,
+                                       nf->xenbus_device->otherend_id);
+                if (err < 0) {
+                        xenbus_dev_fatal(xd, err,
+                                         "failed to attach to rings");
+                } else {
+                        xenbus_switch_state(xd, XenbusStateConnected);
+                }
+                break;
+
+        case XenbusStateClosing:
+                xenbus_switch_state(xd, XenbusStateClosing);
+                break;
+
+        case XenbusStateClosed:
+                /* Tell the tools that it's safe to remove the device
+                   from the bus. */
+                xenbus_frontend_closed(xd);
+                /* Note that we don't release the rings here.  This
+                   means that if the backend moves to a different
+                   domain, we won't be able to reconnect, but it also
+                   limits the amount of memory which can be wasted in
+                   the release_rings() leak if the backend is faulty
+                   or malicious.  It's not obvious which is more
+                   useful, and so I choose the safer but less
+                   featureful approach. */
+                /* This is only a problem if you're using driver
+                   domains and trying to recover from a driver error
+                   by rebooting the backend domain.  The rest of the
+                   tools don't support that, so it's a bit
+                   theoretical.  The memory leaks aren't, though. */
+                break;
+
+        case XenbusStateUnknown:
+                /* The tools have removed the device area from the
+                   store.  Do nothing and rely on xenbus core to call
+                   our remove method. */
+                break;
+
+        default:
+                /* Ignore transitions to unknown states */
+                break;
+        }
+}
+
+static int __devinit netfront_probe(struct xenbus_device *xd,
+                                   const struct xenbus_device_id *id)
+{
+        struct netfront2 *nf;
+
+        nf = kzalloc(sizeof(*nf), GFP_KERNEL);
+        if (nf == NULL)
+                goto err;
+        nf->magic = NETFRONT2_MAGIC;
+        nf->xenbus_device = xd;
+        nf->chan = nc2_new(xd);
+        if (nf->chan == NULL)
+                goto err;
+
+        xd->dev.driver_data = nf;
+
+        nc2_sysfs_addif(xd);
+
+        return 0;
+
+err:
+        kfree(nf);
+        xenbus_dev_fatal(xd, ENOMEM, "probing netdev");
+        return -ENOMEM;
+}
+
+static int netfront_suspend(struct xenbus_device *xd)
+{
+        /* We're about to suspend.  Do the minimum amount of work to
+           make that safe. */
+        struct netfront2 *nf = xenbus_device_to_nf2(xd);
+
+        nc2_suspend(nf->chan);
+
+        return 0;
+}
+
+static int netfront_resume(struct xenbus_device *xd)
+{
+        /* We've been suspended and come back.  The rings are
+           therefore dead.  Tear them down. */
+        /* We rely on the normal xenbus state machine to bring them
+           back to life. */
+        struct netfront2 *nf = xenbus_device_to_nf2(xd);
+
+        nc2_detach_rings(nf->chan);
+        release_rings(nf);
+
+        return 0;
+}
+
+static int __devexit netfront_remove(struct xenbus_device *xd)
+{
+        struct netfront2 *nf = xenbus_device_to_nf2(xd);
+        nc2_sysfs_delif(xd);
+        if (nf->chan != NULL)
+                nc2_release(nf->chan);
+        release_rings(nf);
+        kfree(nf);
+        return 0;
+}
+
+static const struct xenbus_device_id netfront_ids[] = {
+       { "vif2" },
+       { "" }
+};
+MODULE_ALIAS("xen:vif2");
+
+static struct xenbus_driver netfront2 = {
+       .name = "vif2",
+       .owner = THIS_MODULE,
+       .ids = netfront_ids,
+       .probe = netfront_probe,
+       .remove = __devexit_p(netfront_remove),
+       .otherend_changed = backend_changed,
+        .resume = netfront_resume,
+        .suspend = netfront_suspend,
+};
+
+int __init netfront2_init(void)
+{
+#ifndef CONFIG_PARAVIRT
+        if (!is_running_on_xen())
+                return -ENODEV;
+#endif
+        return xenbus_register_frontend(&netfront2);
+}
+
+void __exit netfront2_exit(void)
+{
+        xenbus_unregister_driver(&netfront2);
+}
diff --git a/drivers/net/xen-netchannel2/offload.c b/drivers/net/xen-netchannel2/offload.c
new file mode 100644 (file)
index 0000000..6c86fa4
--- /dev/null
@@ -0,0 +1,156 @@
+/* All the bits used to handle enabling and disabling the various
+ * offloads. */
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include "netchannel2_core.h"
+
+static int nc2_set_tx_csum(struct net_device *nd, u32 val);
+static int nc2_set_sg(struct net_device *nd, u32 val);
+static int nc2_set_tso(struct net_device *nd, u32 val);
+
+/* ---------------- Interface to the other domain ----------------------- */
+void nc2_handle_set_offload(struct netchannel2 *nc,
+                            struct netchannel2_ring_pair *ncrp,
+                            struct netchannel2_msg_hdr *hdr)
+{
+        struct netchannel2_msg_set_offload msg;
+        if (hdr->size != sizeof(msg)) {
+                pr_debug("Strange sized offload message: %d\n",
+                         hdr->size);
+                return;
+        }
+        if (ncrp != &nc->rings) {
+                pr_debug("Setting offloads on an ancillary ring!\n");
+                return;
+        }
+        nc2_copy_from_ring(&nc->rings.cons_ring, &msg, hdr->size);
+        if (msg.csum != nc->allow_tx_csum_offload) {
+                nc->allow_tx_csum_offload = msg.csum;
+                nc2_set_tx_csum(nc->net_device, msg.csum);
+                /* Linux doesn't support scatter-gather mode without
+                   TX csum offload.  We therefore need to disable SG
+                   support whenever the remote turns off csum support.
+                   We also elect to enable SG support whenever the
+                   remote turns on csum support, since that's more
+                   likely to be useful than requiring the user to
+                   manually enable it every time. */
+                nc2_set_sg(nc->net_device, msg.csum);
+        }
+
+        if (msg.tcpv4_segmentation_offload != nc->allow_tso) {
+                nc->allow_tso = msg.tcpv4_segmentation_offload;
+                nc2_set_tso(nc->net_device, msg.tcpv4_segmentation_offload);
+        }
+}
+
+/* Tell the other end what sort of offloads it's allowed to use. */
+void advertise_offloads(struct netchannel2 *nc)
+{
+        struct netchannel2_msg_set_offload msg;
+
+        memset(&msg, 0, sizeof(msg));
+
+        if (nc2_can_send_payload_bytes(&nc->rings.prod_ring, sizeof(msg))) {
+                msg.csum = nc->use_rx_csum;
+                msg.tcpv4_segmentation_offload = nc->use_lro;
+                nc2_send_message(&nc->rings.prod_ring, NETCHANNEL2_MSG_SET_OFFLOAD,
+                                 0, &msg, sizeof(msg));
+                nc->need_advertise_offloads = 0;
+                nc->rings.pending_time_sensitive_messages = 1;
+        } else {
+                nc->need_advertise_offloads = 1;
+        }
+}
+
+
+
+/* ---------------------- Ethtool interface ---------------------------- */
+
+static int nc2_set_rx_csum(struct net_device *nd, u32 val)
+{
+        struct netchannel2 *nc = netdev_priv(nd);
+
+        spin_lock_bh(&nc->rings.lock);
+        if (nc->use_rx_csum != val) {
+                nc->use_rx_csum = val;
+                nc->need_advertise_offloads = 1;
+                spin_unlock_bh(&nc->rings.lock);
+                nc2_kick(&nc->rings);
+        } else {
+                spin_unlock_bh(&nc->rings.lock);
+        }
+
+        return 0;
+}
+
+static u32 nc2_get_rx_csum(struct net_device *nd)
+{
+        struct netchannel2 *nc = netdev_priv(nd);
+        return nc->use_rx_csum;
+}
+
+static int nc2_set_tx_csum(struct net_device *nd, u32 val)
+{
+        struct netchannel2 *nc = netdev_priv(nd);
+
+        /* Can't turn on TX csum offload if the other end can't do RX
+           csum offload. */
+        if (val != 0 && !nc->allow_tx_csum_offload)
+                return -EOPNOTSUPP;
+        return ethtool_op_set_tx_csum(nd, val);
+}
+
+/* ethtool set_sg() handler.  Linux makes sure that TX csum offload is
+   only enabled when scatter-gather mode is, so we don't have to worry
+   about that here. */
+static int nc2_set_sg(struct net_device *nd, u32 val)
+{
+        /* We support both the SG and FRAGLIST variants of
+         * scatter-gather.  FRAGLIST support is the only reason we
+         * can't just use ethtool_op_set_sg.
+         */
+        if (val)
+                nd->features |= NETIF_F_SG|NETIF_F_FRAGLIST;
+        else
+                nd->features &= ~(NETIF_F_SG|NETIF_F_FRAGLIST);
+        return 0;
+}
+
+static int nc2_set_tso(struct net_device *nd, u32 val)
+{
+        struct netchannel2 *nc = netdev_priv(nd);
+        if (val != 0 && !nc->allow_tso)
+                return -EOPNOTSUPP;
+        return ethtool_op_set_tso(nd, val);
+}
+
+struct ethtool_ops nc2_ethtool_ops = {
+        .get_tx_csum = ethtool_op_get_tx_csum,
+        .set_tx_csum = nc2_set_tx_csum,
+        .get_rx_csum = nc2_get_rx_csum,
+        .set_rx_csum = nc2_set_rx_csum,
+        .get_sg      = ethtool_op_get_sg,
+        .set_sg      = nc2_set_sg,
+        .get_tso     = ethtool_op_get_tso,
+        .set_tso     = nc2_set_tso
+};
+
+
+/* ------------------ Other netdevice operations ----------------------- */
+/* These aren't really offloads, per-se, but they really belong with
+   the ethtool operations, so they go here. */
+
+struct net_device_stats *nc2_get_stats(struct net_device *nd)
+{
+        struct netchannel2 *nc = netdev_priv(nd);
+
+        return &nc->stats;
+}
+
+int nc2_change_mtu(struct net_device *nd, int mtu)
+{
+        if (mtu > NETCHANNEL2_MAX_PACKET_BYTES)
+                return -EINVAL;
+        nd->mtu = mtu;
+        return 0;
+}
diff --git a/drivers/net/xen-netchannel2/overrides.mk b/drivers/net/xen-netchannel2/overrides.mk
new file mode 100644 (file)
index 0000000..0ffd242
--- /dev/null
@@ -0,0 +1,6 @@
+ifeq ($(OUT_OF_TREE_BUILD),y)
+EXTRA_CFLAGS += -DCONFIG_XEN_NETCHANNEL2 -DCONFIG_XEN_NETDEV2_FRONTEND -I$(M)/../../../include/xen/interface/io
+
+CONFIG_XEN_NETCHANNEL2 = m
+CONFIG_XEN_NETDEV2_FRONTEND = y
+endif
diff --git a/drivers/net/xen-netchannel2/poll.c b/drivers/net/xen-netchannel2/poll.c
new file mode 100644 (file)
index 0000000..3e9801e
--- /dev/null
@@ -0,0 +1,59 @@
+/* There are a couple of places where we try to minimise wakeups in
+   ways which work in the vast majority of cases, but occasionally
+   cause a needed event to be lost.  Compensate for those with a 1Hz
+   ticker.  The ticker runs whenever we have outstanding TX packets.
+   Once it's running, we never try to modify it, and instead just let
+   it run out. */
+/* If we're relying on this timer for correctness then performance is
+   going to be absolutely dire, but it should be sufficient to avoid
+   outright deadlocks. */
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include "netchannel2_core.h"
+
+#define TICKER_INTERVAL (HZ)
+
+static void poll_timer(unsigned long arg)
+{
+        struct netchannel2_ring_pair *ncrp =
+                (struct netchannel2_ring_pair *)arg;
+
+        /* If the ring appears to be behaving ``normally'', increase
+           the number of messages which we're allowed to have
+           outstanding by some small amount.  If it looks like we've
+           deadlocked, halve it. */
+        /* Arbitrarily define ``normal'' to be at least one interrupt
+           every 100ms, and a small amount to be 10. */
+        /* We don't synchronise against concurrent readers of
+           max_count_frags_no_event, because it doesn't matter too
+           much if it's slightly wrong.  We don't need to worry about
+           concurrent writers, because this timer is the only thing
+           which can change it, and it's only ever run on one cpu at a
+           time. */
+        if (jiffies - ncrp->last_event > HZ/10)
+                ncrp->max_count_frags_no_event /= 2;
+        else if (ncrp->max_count_frags_no_event + 10 <=
+                 MAX_MAX_COUNT_FRAGS_NO_EVENT)
+                ncrp->max_count_frags_no_event += 10;
+
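+        /* Only keep ticking while we're still waiting for FINISH
+           messages for outstanding TX packets; otherwise let the
+           timer lapse, as described above. */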
+        if (ncrp->expected_finish_messages == 0)
+                return;
+        if (ncrp->cons_ring.sring->prod != ncrp->cons_ring.cons_pvt)
+                nc2_kick(ncrp);
+        nc2_start_polling(ncrp);
+}
+
+void nc2_init_poller(struct netchannel2_ring_pair *ncrp)
+{
+        setup_timer(&ncrp->polling_timer, poll_timer, (unsigned long)ncrp);
+}
+
+void nc2_start_polling(struct netchannel2_ring_pair *ncrp)
+{
+        mod_timer(&ncrp->polling_timer, jiffies + TICKER_INTERVAL);
+}
+
+void nc2_stop_polling(struct netchannel2_ring_pair *ncrp)
+{
+        del_timer_sync(&ncrp->polling_timer);
+}
diff --git a/drivers/net/xen-netchannel2/posted_buffers.c b/drivers/net/xen-netchannel2/posted_buffers.c
new file mode 100644 (file)
index 0000000..97eba89
--- /dev/null
@@ -0,0 +1,800 @@
+/* Support for receiver-posted buffers */
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#ifdef CONFIG_PARAVIRT
+#include <xen/interface/xen.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+#include <xen/page.h>
+#else
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#endif
+#include <xen/xenbus.h>
+#include "netchannel2_endpoint.h"
+#include "netchannel2_core.h"
+
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+#include "vmq.h"
+#endif
+
+#define POSTED_BUFFER_SIZE PAGE_SIZE
+
+/* A poison value to make certain buffer management errors more
+ * obvious. */
+#define RX_BUFFER_BIAS 0xbeef0000
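+/* For example, the buffer at index 3 goes on the wire as id 0xbeef0003. */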
+
+static void prepare_tx_buffers(struct netchannel2 *nc);
+
+/* --------------------------- Receive -------------------------------- */
+
+/* The other end just sent us a buffer id.  Convert it back to an
+   nc2_rx_buffer structure.  Returns NULL if the id is invalid, or if
+   it isn't currently owned by the other end. */
+static struct nc2_rx_buffer *find_rx_buffer(struct netchannel2 *nc,
+                                            uint32_t id)
+{
+        struct nc2_rx_buffer *rxb;
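+        /* Strip the poison bias; id is unsigned, so anything below
+           RX_BUFFER_BIAS wraps around and fails the range check
+           below. */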
+        id -= RX_BUFFER_BIAS;
+        if (id >= nc->max_nr_rx_buffers)
+                return NULL;
+        rxb = &nc->rx_buffer_structs[id];
+        if (rxb->is_posted)
+                return rxb;
+        else
+                return NULL;
+}
+
+/* Post a buffer to the other endpoint immediately.  Assumes that the
+   caller has already checked that there is enough space available on
+   the ring. */
+static void _nc2_post_buffer(struct netchannel2 *nc,
+                             struct nc2_rx_buffer *rxb)
+{
+        struct netchannel2_msg_post_buffer msg;
+
+        BUG_ON(!nc->remote_trusted);
+
+        msg.id = rxb - nc->rx_buffer_structs + RX_BUFFER_BIAS;
+        msg.gref = rxb->gref;
+        msg.off_in_page = offset_in_page(rxb->buffer);
+        msg.size = POSTED_BUFFER_SIZE;
+
+        nc2_send_message(&nc->rings.prod_ring, NETCHANNEL2_MSG_POST_BUFFER,
+                         0, &msg, sizeof(msg));
+}
+
+/* Push out all pending buffer posts, until the ring becomes full or
+   we run out of buffers to post.  Called under the lock. */
+static void push_rx_buffer_posts(struct netchannel2 *nc)
+{
+        struct nc2_rx_buffer *buf;
+
+        while (!list_empty(&nc->unposted_rx_buffers) &&
+               nc2_can_send_payload_bytes(&nc->rings.prod_ring,
+                                          sizeof(struct netchannel2_msg_post_buffer))) {
+                buf = list_entry(nc->unposted_rx_buffers.next,
+                                 struct nc2_rx_buffer,
+                                 list);
+                _nc2_post_buffer(nc, buf);
+                buf->is_posted = 1;
+                list_move(&buf->list, &nc->rx_buffers);
+                nc->nr_rx_buffers++;
+
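+                /* Flag the ring so these buffer posts are flushed to
+                   the other end promptly. */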
+                nc->rings.pending_time_sensitive_messages = 1;
+        }
+}
+
+/* Allocate more RX buffers until we reach our target number of RX
+   buffers and post them to the other endpoint.  Call under the
+   lock. */
+void nc2_replenish_rx_buffers(struct netchannel2 *nc)
+{
+        struct nc2_rx_buffer *rb;
+
+        if (nc->dont_post_buffers || !nc->remote_trusted)
+                return;
+
+        while (!list_empty(&nc->unused_rx_buffers)) {
+                rb = list_entry(nc->unused_rx_buffers.next,
+                                struct nc2_rx_buffer,
+                                list);
+                rb->buffer = (void *)__get_free_pages(GFP_ATOMIC|__GFP_NOWARN,
+                                                      0);
+                if (!rb->buffer)
+                        break;
+                rb->gref =
+                        gnttab_grant_foreign_access(nc->rings.otherend_id,
+                                                    virt_to_mfn(rb->buffer),
+                                                    0);
+                if ((int)rb->gref < 0) {
+                        free_page((unsigned long)rb->buffer);
+                        break;
+                }
+
+                list_move(&rb->list, &nc->unposted_rx_buffers);
+        }
+
+        push_rx_buffer_posts(nc);
+}
+
+/* The other endpoint has used @rxb to transmit part of the packet
+   which we're going to represent by @skb.  Attach it to the packet's
+   fragment list.  The caller should make sure that @skb currently has
+   less than MAX_SKB_FRAGS in its shinfo area, and that @size and
+   @offset are appropriate for the buffer.  @size gives the size of
+   the fragment, and @offset gives its offset relative to the start of
+   the receive buffer. */
+/* This effectively transfers ownership of the buffer's page from @rxb
+   to @skb. */
+static void attach_buffer_to_skb(struct sk_buff *skb,
+                                 struct nc2_rx_buffer *rxb,
+                                 unsigned size,
+                                 unsigned offset)
+{
+        struct skb_shared_info *shinfo = skb_shinfo(skb);
+        skb_frag_t *frag = &shinfo->frags[shinfo->nr_frags];
+
+        BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS);
+
+        frag->page = virt_to_page(rxb->buffer);
+        frag->page_offset = offset_in_page(rxb->buffer) + offset;
+        frag->size = size;
+        skb->truesize += size;
+        skb->data_len += size;
+        skb->len += size;
+
+        shinfo->nr_frags++;
+}
+
+/* The other end has sent us a packet using pre-posted buffers.  Parse
+   it up and return an skb representing the packet, or NULL on
+   error. */
+struct sk_buff *handle_pre_posted_packet(struct netchannel2 *nc,
+                                         struct netchannel2_msg_packet *msg,
+                                         struct netchannel2_msg_hdr *hdr,
+                                         unsigned nr_frags,
+                                         unsigned frags_off)
+{
+        struct netchannel2_fragment frag;
+        struct sk_buff *head_skb;
+        struct sk_buff *cur_skb;
+        struct sk_buff *new_skb;
+        unsigned x;
+        struct nc2_rx_buffer *rxb;
+        int is_bad;
+        int dropped;
+        unsigned acc_len;
+        unsigned prefix_len;
+
+#define SKB_MIN_PAYLOAD_SIZE 128
+
+        dropped = 0;
+        is_bad = 0;
+        if (msg->prefix_size < SKB_MIN_PAYLOAD_SIZE)
+                prefix_len = SKB_MIN_PAYLOAD_SIZE;
+        else
+                prefix_len = msg->prefix_size;
+        /* We don't enforce the MAX_PACKET_BYTES limit here.  That's
+           okay, because the amount of memory which the other end can
+           cause us to allocate is still limited, which is all that's
+           really needed. */
+        cur_skb = dev_alloc_skb(prefix_len + NET_IP_ALIGN);
+        if (cur_skb == NULL) {
+                is_bad = 1;
+                dropped = 1;
+                nc->rx.nr_failed_no_skb++;
+        } else {
+                skb_reserve(cur_skb, NET_IP_ALIGN);
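+                /* The inline prefix lives in the ring immediately
+                   after the fragment descriptors, hence the offset. */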
+                nc2_copy_from_ring_off(&nc->rings.cons_ring,
+                                       skb_put(cur_skb, msg->prefix_size),
+                                       msg->prefix_size,
+                                       frags_off + nr_frags * sizeof(frag));
+        }
+        head_skb = cur_skb;
+        acc_len = 0;
+
+        for (x = 0; x < nr_frags; x++) {
+                fetch_fragment(&nc->rings, x, &frag, frags_off);
+                rxb = find_rx_buffer(nc, frag.pre_post.id);
+                if (rxb == NULL) {
+                        pr_debug("RX in bad frag %d.\n", frag.pre_post.id);
+                        is_bad = 1;
+                        continue;
+                }
+
+                if (!is_bad &&
+                    skb_shinfo(cur_skb)->nr_frags == MAX_SKB_FRAGS) {
+                        new_skb = dev_alloc_skb(0);
+                        skb_shinfo(cur_skb)->frag_list = new_skb;
+                        acc_len += cur_skb->len;
+                        cur_skb = new_skb;
+                        if (cur_skb == NULL) {
+                                is_bad = 1;
+                                dropped = 1;
+                        }
+                }
+
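+                /* Only attach the fragment if it fits inside the
+                   posted buffer and we can revoke the grant cleanly;
+                   otherwise mark the packet bad and let
+                   nc2_end_foreign_access() reclaim the buffer. */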
+                if (!is_bad &&
+                    frag.size <= PAGE_SIZE &&
+                    frag.off < PAGE_SIZE &&
+                    frag.size + frag.off <= POSTED_BUFFER_SIZE &&
+                    nc2_end_foreign_access_ref(rxb->gref, 0)) {
+                        gnttab_free_grant_reference(rxb->gref);
+                        attach_buffer_to_skb(cur_skb, rxb, frag.size,
+                                             frag.off);
+
+                } else {
+                        is_bad = 1;
+                        nc2_end_foreign_access(rxb->gref,
+                                               0,
+                                               (unsigned long)rxb->buffer);
+                }
+                rxb->gref = 0;
+                rxb->buffer = NULL;
+                rxb->is_posted = 0;
+                nc->nr_rx_buffers--;
+                list_move(&rxb->list, &nc->unused_rx_buffers);
+        }
+
+        if (is_bad) {
+                pr_debug("Received skb is bad!\n");
+                if (head_skb)
+                        kfree_skb(head_skb);
+                head_skb = NULL;
+                if (dropped)
+                        nc->stats.rx_dropped++;
+                else
+                        nc->stats.rx_errors++;
+        } else {
+                head_skb->len = cur_skb->len + acc_len;
+                head_skb->data_len = cur_skb->data_len + acc_len;
+                head_skb->truesize = cur_skb->truesize + acc_len;
+                if (skb_headlen(head_skb) < SKB_MIN_PAYLOAD_SIZE)
+                        pull_through(head_skb,
+                                     SKB_MIN_PAYLOAD_SIZE - skb_headlen(head_skb));
+        }
+
+        return head_skb;
+}
+
+/* Release a single RX buffer and return it to the unused list. */
+static void release_rx_buffer(struct netchannel2 *nc,
+                              struct nc2_rx_buffer *rxb)
+{
+        rxb->is_posted = 0;
+        nc2_end_foreign_access(rxb->gref,
+                               0,
+                               (unsigned long)rxb->buffer);
+        nc->nr_rx_buffers--;
+        list_move(&rxb->list, &nc->unused_rx_buffers);
+}
+
+/* The other endpoint has finished with one of our RX buffers.  Do
+   something suitable with it. */
+void nc2_handle_return_posted_buffer(struct netchannel2 *nc,
+                                     struct netchannel2_ring_pair *ncrp,
+                                     struct netchannel2_msg_hdr *hdr)
+{
+        struct netchannel2_msg_return_posted_buffer msg;
+        struct nc2_rx_buffer *rxb;
+
+        if (hdr->size != sizeof(msg)) {
+                pr_debug("return rx buffer message wrong size %d != %zd\n",
+                         hdr->size, sizeof(msg));
+                return;
+        }
+        if (ncrp != &nc->rings) {
+                pr_debug("Return a posted buffer on an ancillary ring!\n");
+                return;
+        }
+        nc2_copy_from_ring(&nc->rings.cons_ring, &msg, hdr->size);
+        rxb = find_rx_buffer(nc, msg.id);
+        if (!rxb) {
+                pr_debug("Other end returned buffer id %d which we didn't know about.\n",
+                         msg.id);
+                return;
+        }
+        release_rx_buffer(nc, rxb);
+}
+
+/* Tear down any remaining RX buffers.  The caller should have done
+   something to make sure that the other end isn't going to try and
+   use them any more. */
+void nc2_posted_buffer_rx_forget(struct netchannel2 *nc)
+{
+        struct nc2_rx_buffer *rxb, *next;
+
+        spin_lock_bh(&nc->rings.lock);
+        list_for_each_entry_safe(rxb, next, &nc->rx_buffers, list)
+                release_rx_buffer(nc, rxb);
+        list_for_each_entry_safe(rxb, next, &nc->unposted_rx_buffers, list)
+                release_rx_buffer(nc, rxb);
+
+        BUG_ON(!list_empty(&nc->rx_buffers));
+        BUG_ON(!list_empty(&nc->unposted_rx_buffers));
+
+        INIT_LIST_HEAD(&nc->unused_rx_buffers);
+        kfree(nc->rx_buffer_structs);
+        nc->rx_buffer_structs = NULL;
+        nc->max_nr_rx_buffers = 0;
+        spin_unlock_bh(&nc->rings.lock);
+}
+
+void nc2_handle_set_nr_posted_buffers(struct netchannel2 *nc,
+                                      struct netchannel2_ring_pair *ncrp,
+                                      struct netchannel2_msg_hdr *hdr)
+{
+        struct netchannel2_msg_set_nr_posted_buffers msg;
+        struct nc2_rx_buffer *buffer_structs;
+        unsigned x;
+        unsigned nr_buffers;
+
+        if (ncrp != &nc->rings) {
+                pr_debug("set_nr_posted_buffers on an ancillary ring!\n");
+                return;
+        }
+        if (hdr->size != sizeof(msg)) {
+                pr_debug("set nr posted buffers message wrong size %d != %zd\n",
+                         hdr->size, sizeof(msg));
+                return;
+        }
+        if (nc->rx_buffer_structs != NULL) {
+                pr_debug("Other end tried to change posted buffer settings when they were already set.\n");
+                return;
+        }
+        nc2_copy_from_ring(&nc->rings.cons_ring, &msg, hdr->size);
+        if (msg.nr_buffers <= MAX_POSTED_BUFFERS) {
+                nr_buffers = msg.nr_buffers;
+        } else {
+                pr_debug("remote recommended %d buffers, using %d\n",
+                         msg.nr_buffers, MAX_POSTED_BUFFERS);
+                nr_buffers = MAX_POSTED_BUFFERS;
+        }
+
+        buffer_structs = kzalloc(sizeof(struct nc2_rx_buffer) * nr_buffers,
+                                 GFP_ATOMIC);
+        if (buffer_structs == NULL) {
+                printk(KERN_WARNING "failed to allocate %d rx buffers\n",
+                       nr_buffers);
+                return;
+        }
+
+        for (x = 0; x < nr_buffers; x++)
+                list_add_tail(&buffer_structs[x].list,
+                              &nc->unused_rx_buffers);
+        nc->max_nr_rx_buffers = nr_buffers;
+        nc->rx_buffer_structs = buffer_structs;
+        nc->dont_post_buffers = 0;
+}
+
+
+/* -------------------------- Transmit ------------------------------- */
+
+/* A representation of a packet which is halfway through being
+   prepared for transmission. */
+struct post_packet_plan {
+        unsigned off_in_cur_buffer;
+        struct nc2_tx_buffer *cur_buffer;
+
+        /* We assemble the next fragment in work_frag, and then copy
+           to output_frag once it's done. */
+        struct netchannel2_fragment work_frag;
+        volatile struct netchannel2_fragment *output_frag;
+};
+
+/* A grant copy failed while we were transmitting a packet.  That
+   indicates that the *receiving* domain gave us a bad RX buffer.
+   We're too late to send them an error, so there isn't really
+   anything we can do to help them.  Oh well, nevermind. */
+void nc2_posted_on_gntcopy_fail(void *ctxt,
+                                gnttab_copy_t *gop)
+{
+        printk(KERN_WARNING "Grant copy failed for transmit; domain provided bad RX buffer (source %x, %x, %x, dest %x, %x, %x, len %x, flags %x, status %d).\n",
+               gop->source.u.ref, gop->source.domid, gop->source.offset,
+               gop->dest.u.ref, gop->dest.domid, gop->dest.offset,
+               gop->len, gop->flags, gop->status);
+}
+
+/* Advance to the next transmit buffer/fragment in the packet. */
+static void advance_to_next_buffer(struct post_packet_plan *plan)
+{
+        BUG_ON(plan->off_in_cur_buffer < plan->cur_buffer->size);
+        plan->cur_buffer = list_entry(plan->cur_buffer->list.next,
+                                      struct nc2_tx_buffer,
+                                      list);
+        plan->off_in_cur_buffer = 0;
+
+        *plan->output_frag = plan->work_frag;
+        plan->output_frag++;
+        memset(&plan->work_frag, 0, sizeof(plan->work_frag));
+        plan->work_frag.pre_post.id = plan->cur_buffer->id;
+}
+
+/* Schedule a copy from a range of bytes in a local page into the
+   packet we're building in @plan.  This cannot cross page or TX
+   buffer boundaries. */
+static void prepare_grant_copy(struct netchannel2 *nc,
+                               struct post_packet_plan *plan,
+                               struct page *page,
+                               unsigned page_off,
+                               unsigned count,
+                               domid_t domid)
+{
+        gnttab_copy_t *gop;
+
+        /* XXX: We don't do any error checking on this grant copy.
+           That's okay.  There are only two ways a grant copy can
+           fail:
+
+           -- The source is bad.  But the source is either in our
+              local memory (so must be good), or something we've
+              already mapped (so the grant reference must be good, and
+              must already be pinned so it can't go bad).  Therefore,
+              the source must always be good, and we can't fail
+              because of a bad source.
+
+           -- The destination is bad.  This could happen if the
+              receiving domain sent us a bad page to use as an RX
+              buffer.  In that case, we'll tell the receiving domain
+              that it received some data in a page when the page is
+              actually uninitialised.  The worst case is that the
+              receiving domain ends up copying its own uninitialised
+              memory to its own userspace.  That's not a problem for
+              us (because it can't see *our* uninitialised memory),
+              and if it's a problem for the receiving domain then it
+              should have been more careful about what memory it gave
+              us to use as RX buffers.
+
+           Therefore, the lack of error checking is actually perfectly
+           safe.
+
+           (Even if it isn't exactly great software engineering
+           practice.)
+        */
+        gop = hypercall_batcher_grant_copy(&nc->batcher,
+                                           NULL,
+                                           nc2_posted_on_gntcopy_fail);
+        gop->flags = GNTCOPY_dest_gref;
+        if (page_is_tracked(page)) {
+                lookup_tracker_page(page,
+                                    &gop->source.domid,
+                                    &gop->source.u.ref);
+                gop->flags |= GNTCOPY_source_gref;
+        } else {
+                gop->source.domid = DOMID_SELF;
+                gop->source.u.gmfn = virt_to_mfn(page_address(page));
+        }
+        gop->source.offset = page_off;
+        gop->dest.domid = domid;
+        gop->dest.offset =
+                plan->cur_buffer->off_in_page + plan->off_in_cur_buffer;
+        gop->dest.u.ref = plan->cur_buffer->gref;
+        gop->len = count;
+}
+
+/* Add the bytes from @ptr to @ptr + @size to the packet we're
+   preparing in @plan.  This cannot handle page-crossing local
+   buffers, but will correctly handle buffer-crossing operations. */
+static void prepare_subpage_post(struct netchannel2 *nc,
+                                 struct page *page,
+                                 unsigned off_in_page,
+                                 unsigned size,
+                                 struct post_packet_plan *plan)
+{
+        unsigned remaining_in_buffer;
+        unsigned this_time;
+
+        BUG_ON(off_in_page + size > PAGE_SIZE);
+        while (size != 0) {
+                remaining_in_buffer =
+                        plan->cur_buffer->size -
+                        plan->off_in_cur_buffer;
+                if (remaining_in_buffer == 0) {
+                        advance_to_next_buffer(plan);
+                        remaining_in_buffer = plan->cur_buffer->size;
+                }
+
+                this_time = size;
+                if (this_time > remaining_in_buffer)
+                        this_time = remaining_in_buffer;
+                prepare_grant_copy(nc,
+                                   plan,
+                                   page,
+                                   off_in_page,
+                                   this_time,
+                                   nc->rings.otherend_id);
+                plan->work_frag.size += this_time;
+                plan->off_in_cur_buffer += this_time;
+
+                size -= this_time;
+                off_in_page += this_time;
+        }
+}
+
+/* Add @skb->data to @skb->tail to the packet which is being prepared
+   in @plan. */
+static void prepare_data_area_post(struct netchannel2 *nc, struct sk_buff *skb,
+                                   struct post_packet_plan *plan)
+{
+        void *ptr = skb->data;
+        unsigned len = skb_headlen(skb);
+        unsigned off;
+        unsigned this_time;
+
+        for (off = 0; off < len; off += this_time) {
+                this_time = len - off;
+                if (this_time + offset_in_page(ptr + off) > PAGE_SIZE)
+                        this_time = PAGE_SIZE - offset_in_page(ptr + off);
+                prepare_subpage_post(nc,
+                                     virt_to_page(ptr + off),
+                                     offset_in_page(ptr + off),
+                                     this_time,
+                                     plan);
+        }
+}
+
+/* Allocate some TX buffers suitable for transmitting @skb out of
+   @nc's pool.  The buffers are chained on @fragments.  On success,
+   returns the number of buffers allocated.  Returns -1 if
+   insufficient buffers are available, in which case no buffers are
+   allocated.  We assume that the packet will be offset by
+   NET_IP_ALIGN bytes in the first fragment so that everything after
+   the ethernet header is properly aligned. */
+static int grab_tx_buffers(struct netchannel2 *nc,
+                           struct sk_buff *skb,
+                           struct list_head *fragments)
+{
+        unsigned bytes_to_transmit;
+        unsigned bytes_planned;
+        struct nc2_tx_buffer *current_buffer, *next;
+        int count;
+
+        sanity_check_list(&nc->avail_tx_buffers);
+
+        INIT_LIST_HEAD(fragments);
+        bytes_planned = 0;
+        bytes_to_transmit = skb->len + NET_IP_ALIGN;
+        count = 0;
+        list_for_each_entry_safe(current_buffer, next, &nc->avail_tx_buffers,
+                                 list) {
+                count++;
+                bytes_planned += current_buffer->size;
+                list_move(&current_buffer->list, fragments);
+                if (bytes_planned >= bytes_to_transmit) {
+                        BUG_ON(nc->nr_avail_tx_buffers < count);
+                        nc->nr_avail_tx_buffers -= count;
+                        sanity_check_list(&nc->avail_tx_buffers);
+                        return count;
+                }
+        }
+        BUG_ON(nc->nr_avail_tx_buffers != count);
+        sanity_check_list(&nc->avail_tx_buffers);
+        list_splice_init(fragments, &nc->avail_tx_buffers);
+        sanity_check_list(&nc->avail_tx_buffers);
+        return -1;
+}
+
+int prepare_xmit_allocate_post(struct netchannel2 *nc, struct sk_buff *skb)
+{
+        struct skb_cb_overlay *scb;
+        int nr_fragments;
+
+        scb = get_skb_overlay(skb);
+        nr_fragments = grab_tx_buffers(nc, skb, &scb->buffers);
+        if (nr_fragments < 0) {
+                nc->tx.nr_failed_no_buffers++;
+                return -1;
+        }
+        scb->nr_fragments = nr_fragments;
+        scb->type = NC2_PACKET_TYPE_pre_posted;
+
+        return 0;
+}
+
+void xmit_post(struct netchannel2 *nc, struct sk_buff *skb,
+               volatile void *msg_buf)
+{
+        volatile struct netchannel2_msg_packet *msg = msg_buf;
+        struct skb_cb_overlay *scb;
+        struct sk_buff *cur_skb;
+        struct skb_shared_info *shinfo;
+        skb_frag_t *frag;
+        unsigned x;
+        struct post_packet_plan plan;
+
+        scb = get_skb_overlay(skb);
+        memset(&plan, 0, sizeof(plan));
+
+        plan.cur_buffer = list_entry(scb->buffers.next,
+                                     struct nc2_tx_buffer,
+                                     list);
+        plan.output_frag = msg->frags;
+        memset(&plan.work_frag, 0, sizeof(plan.work_frag));
+        plan.work_frag.pre_post.id = plan.cur_buffer->id;
+
+        /* Burn a couple of bytes at the start of the packet so as we
+           get better alignment in the body. */
+        plan.work_frag.off = NET_IP_ALIGN;
+        plan.off_in_cur_buffer = NET_IP_ALIGN;
+
+        for (cur_skb = skb;
+             cur_skb != NULL;
+             cur_skb = skb_shinfo(cur_skb)->frag_list) {
+                prepare_data_area_post(nc, cur_skb, &plan);
+                shinfo = skb_shinfo(cur_skb);
+                for (x = 0; x < shinfo->nr_frags; x++) {
+                        frag = &shinfo->frags[x];
+                        prepare_subpage_post(nc,
+                                             frag->page,
+                                             frag->page_offset,
+                                             frag->size,
+                                             &plan);
+                }
+        }
+
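+        /* Flush the last, still-pending work fragment into the message. */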
+        *plan.output_frag = plan.work_frag;
+
+        /* All of the buffer slots which have been used in
+           this packet are now available for the other end to
+           fill with new buffers. */
+        list_splice(&scb->buffers, &nc->unused_tx_buffer_slots);
+}
+
+/* The other endpoint has sent us a transmit buffer.  Add it to the
+   list.  Called under the lock. */
+void nc2_handle_post_buffer(struct netchannel2 *nc,
+                            struct netchannel2_ring_pair *ncrp,
+                            struct netchannel2_msg_hdr *hdr)
+{
+        struct netchannel2_msg_post_buffer msg;
+        struct nc2_tx_buffer *txb;
+
+        if (hdr->size != sizeof(msg)) {
+                pr_debug("Strange sized rx buffer post %d\n", hdr->size);
+                return;
+        }
+        if (ncrp != &nc->rings) {
+                pr_debug("Posted buffer on an ancillary ring!\n");
+                return;
+        }
+        nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg));
+        if (list_empty(&nc->unused_tx_buffer_slots) ||
+            msg.size > PAGE_SIZE ||
+            msg.off_in_page > PAGE_SIZE ||
+            msg.size + msg.off_in_page > PAGE_SIZE ||
+            msg.size < 64) {
+                pr_debug("Other end posted too many buffers, or this buffer was strange (%d,%d)\n",
+                         msg.off_in_page, msg.size);
+                return;
+        }
+
+        sanity_check_list(&nc->unused_tx_buffer_slots);
+        txb = list_entry(nc->unused_tx_buffer_slots.next,
+                         struct nc2_tx_buffer,
+                         list);
+        txb->id = msg.id;
+        txb->gref = msg.gref;
+        txb->off_in_page = msg.off_in_page;
+        txb->size = msg.size;
+
+        nc->nr_avail_tx_buffers++;
+
+        sanity_check_list(&nc->avail_tx_buffers);
+        list_move(&txb->list, &nc->avail_tx_buffers);
+        sanity_check_list(&nc->avail_tx_buffers);
+}
+
+/* Process the pending TX buffer return list and push as many as
+   possible onto the ring.  Called under the lock.  Does not
+   automatically flush the ring; that's the caller's
+   responsibility. */
+void nc2_return_pending_posted_buffers(struct netchannel2 *nc)
+{
+        struct netchannel2_msg_return_posted_buffer msg;
+        struct nc2_tx_buffer *txb;
+
+        memset(&msg, 0, sizeof(msg));
+        while (!list_empty(&nc->pending_tx_buffer_return) &&
+               nc2_can_send_payload_bytes(&nc->rings.prod_ring, sizeof(msg))) {
+                txb = list_entry(nc->pending_tx_buffer_return.next,
+                                 struct nc2_tx_buffer,
+                                 list);
+                list_del(&txb->list);
+                msg.id = txb->id;
+                free_tx_buffer(nc, txb);
+                nc2_send_message(&nc->rings.prod_ring,
+                                 NETCHANNEL2_MSG_RETURN_POSTED_BUFFER,
+                                 0,
+                                 &msg,
+                                 sizeof(msg));
+        }
+}
+
+/* If there is space on the ring, tell the other end how many RX
+   buffers we want it to post (i.e. how many TX buffers we're allowed
+   to accept).  Called under the lock. */
+void nc2_advertise_tx_buffers(struct netchannel2 *nc)
+{
+        struct netchannel2_msg_set_nr_posted_buffers msg;
+
+        if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring, sizeof(msg)))
+                return;
+        msg.nr_buffers = nc->nr_tx_buffers;
+        nc2_send_message(&nc->rings.prod_ring,
+                         NETCHANNEL2_MSG_SET_NR_POSTED_BUFFERS,
+                         0, &msg, sizeof(msg));
+        nc->need_advertise_tx_buffers = 0;
+        nc->rings.pending_time_sensitive_messages = 1;
+}
+
+/* Set the target number of TX buffers. */
+void nc2_set_nr_tx_buffers(struct netchannel2 *nc, unsigned nr_buffers)
+{
+        int changed;
+
+        spin_lock_bh(&nc->rings.lock);
+        changed = (nc->configured_nr_tx_buffers != nr_buffers);
+        nc->configured_nr_tx_buffers = nr_buffers;
+        spin_unlock_bh(&nc->rings.lock);
+        if (changed)
+                prepare_tx_buffers(nc);
+}
+
+/* The local ethX interface just came up.  Set up the TX buffers. */
+static void prepare_tx_buffers(struct netchannel2 *nc)
+{
+        struct nc2_tx_buffer *buffers;
+        unsigned x;
+        unsigned nr_buffers;
+
+        nr_buffers = nc->configured_nr_tx_buffers;
+        if (nr_buffers == 0) {
+                /* A count of zero means posted-buffer TX is being shut down. */
+                unprepare_tx_buffers(nc);
+                return;
+        }
+
+        buffers = kzalloc(sizeof(struct nc2_tx_buffer) * nr_buffers,
+                          GFP_KERNEL);
+        if (buffers == NULL) {
+                printk(KERN_ERR "Cannot allocate %d tx buffer slots, posted tx disabled.\n",
+                       nr_buffers);
+                return;
+        }
+
+        spin_lock_bh(&nc->rings.lock);
+
+        /* nc->tx_buffers should be NULL, because starting and
+           stopping the TX buffer management should alternate. */
+        BUG_ON(nc->tx_buffers);
+
+        INIT_LIST_HEAD(&nc->avail_tx_buffers);
+        nc->nr_avail_tx_buffers = 0;
+        for (x = 0; x < nr_buffers; x++)
+                list_add_tail(&buffers[x].list, &nc->unused_tx_buffer_slots);
+        nc->tx_buffers = buffers;
+        nc->nr_tx_buffers = nr_buffers;
+        nc->need_advertise_tx_buffers = 1;
+        spin_unlock_bh(&nc->rings.lock);
+}
+
+/* The local ethX interface is going down.  Release the TX buffers
+   allocated by prepare_tx_buffers().  Note that the poll() method has
+   already been stopped, so messages posted by the other end will not
+   be processed. */
+void unprepare_tx_buffers(struct netchannel2 *nc)
+{
+        spin_lock_bh(&nc->rings.lock);
+        INIT_LIST_HEAD(&nc->pending_tx_buffer_return);
+        INIT_LIST_HEAD(&nc->unused_tx_buffer_slots);
+        INIT_LIST_HEAD(&nc->avail_tx_buffers);
+        nc->nr_tx_buffers = 0;
+        nc->nr_avail_tx_buffers = 0;
+        nc->need_advertise_tx_buffers = 1;
+        kfree(nc->tx_buffers);
+        nc->tx_buffers = NULL;
+        spin_unlock_bh(&nc->rings.lock);
+}
diff --git a/drivers/net/xen-netchannel2/receiver_map.c b/drivers/net/xen-netchannel2/receiver_map.c
new file mode 100644 (file)
index 0000000..f92aa06
--- /dev/null
@@ -0,0 +1,835 @@
+/* Support for mapping packets into the local domain, rather than
+   copying them or using pre-posted buffers.  We only implement
+   receive-side support here; for transmit-side, we use the rscb.c
+   implementation. */
+/* Each netchannel2 device has an arena in which it can map fragments.
+   When we want to map a packet, we allocate a suitable number of
+   slots in this arena and go and map the packet into them.  We also
+   allocate another structure representing the packet itself.  We set
+   a page free callback for the arena pages.  When all of the pages
+   are release, we send the completion message to the other endpoint.
+   There's also a timeout which will go and copy the page if it seems
+   to have gotten stuck. */
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <xen/live_maps.h>
+#include <xen/gnttab.h>
+#include <xen/balloon.h>
+#include <xen/evtchn.h>
+#include "netchannel2_core.h"
+
+#define MAX_MAPPED_FRAGS 1024
+#define MAX_MAPPED_PACKETS MAX_PENDING_FINISH_PACKETS
+#define SKB_MIN_PAYLOAD_SIZE 128
+
+static DEFINE_SPINLOCK(global_map_lock);
+static struct receive_mapper *receive_mapper;
+
+/* How long do we leave the packets in the Linux stack before trying
+   to copy them, in jiffies? */
+#define PACKET_TIMEOUT (HZ/2)
+
+/* A slot into which we could map a fragment. */
+struct rx_map_fragment {
+        struct list_head list;
+        struct rx_map_packet *packet;
+        grant_handle_t handle; /* -1 if the fragment isn't currently
+                                * mapped */
+        struct netchannel2_fragment nc_frag;
+};
+
+struct rx_map_packet {
+        struct list_head list;
+        struct list_head frags;
+        /* We take a reference for every mapped fragment associated
+           with the packet.  When the refcnt goes to zero, the packet
+           is finished, and can be moved to the
+           finished_packets_list. */
+        atomic_t refcnt;
+        unsigned id;
+        unsigned long expires; /* We expect Linux to have finished
+                                  with the packet by this time (in
+                                  jiffies), or we try to copy it. */
+        struct netchannel2 *nc;
+        uint8_t flags;
+};
+
+struct receive_mapper {
+        struct page_foreign_tracker *tracker;
+
+        struct page **pages;
+
+        /* Nests inside the netchannel2 lock.  The
+           finished_packets_lock nests inside this. */
+        spinlock_t rm_lock;
+
+        /* Packet fragments which we've mapped, or slots into which we
+           could map packets.  The free list and count are protected
+           by @rm_lock. */
+        struct rx_map_fragment frags[MAX_MAPPED_FRAGS];
+        struct list_head free_frags;
+
+        struct rx_map_packet packets[MAX_MAPPED_PACKETS];
+        struct list_head free_packets;
+        struct list_head active_packets;
+        unsigned nr_free_packets;
+
+        /* Packets which Linux has finished with but which we haven't
+           returned to the other endpoint yet. */
+        spinlock_t finished_packets_lock; /* BH-safe leaf lock,
+                                           * acquired from the page
+                                           * free callback.  Nests
+                                           * inside the rm_lock. */
+        struct list_head finished_packets;
+
+        struct tasklet_struct gc_tasklet;
+
+        struct timer_list expire_timer;
+
+        /* Set if we're trying to run the mapper down prior to
+           suspending the domain. */
+        uint8_t suspending;
+};
+
+static void suspend_receive_mapper(struct receive_mapper *rm);
+
+static unsigned fragment_idx(const struct rx_map_fragment *frag)
+{
+        return frag - receive_mapper->frags;
+}
+
+static int alloc_rx_frags_for_packet(unsigned nr_frags,
+                                     struct rx_map_packet *packet)
+{
+        struct rx_map_fragment *rmf;
+        unsigned x;
+
+        INIT_LIST_HEAD(&packet->frags);
+        for (x = 0; x < nr_frags; x++) {
+                if (list_empty(&receive_mapper->free_frags))
+                        goto err;
+                rmf = list_entry(receive_mapper->free_frags.next,
+                                 struct rx_map_fragment,
+                                 list);
+                rmf->packet = packet;
+                rmf->handle = -1;
+                list_move(&rmf->list, &packet->frags);
+        }
+        return 0;
+
+err:
+        list_splice_init(&packet->frags, &receive_mapper->free_frags);
+        return -EBUSY;
+}
+
+static struct rx_map_packet *alloc_rx_packet(struct netchannel2 *nc,
+                                             unsigned nr_frags)
+{
+        struct rx_map_packet *rmp;
+
+        spin_lock(&receive_mapper->rm_lock);
+        if (list_empty(&receive_mapper->free_packets) ||
+            receive_mapper->suspending) {
+                spin_unlock(&receive_mapper->rm_lock);
+                return NULL;
+        }
+        rmp = list_entry(receive_mapper->free_packets.next,
+                         struct rx_map_packet, list);
+
+        if (alloc_rx_frags_for_packet(nr_frags, rmp) < 0) {
+                spin_unlock(&receive_mapper->rm_lock);
+                return NULL;
+        }
+        list_del(&rmp->list);
+        atomic_set(&rmp->refcnt, nr_frags);
+        rmp->nc = nc;
+        receive_mapper->nr_free_packets--;
+
+        spin_unlock(&receive_mapper->rm_lock);
+
+        return rmp;
+}
+
+struct grant_unmapper {
+        unsigned nr_gops;
+        gnttab_unmap_grant_ref_t gop_queue[32];
+};
+
+static void do_unmaps(struct grant_unmapper *unmapper)
+{
+        int ret;
+        unsigned x;
+
+        if (unmapper->nr_gops != 0) {
+                ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+                                                unmapper->gop_queue,
+                                                unmapper->nr_gops);
+                BUG_ON(ret);
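+                /* The foreign frames are no longer mapped; invalidate
+                   the p2m entries for the addresses we just unmapped. */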
+                for (x = 0; x < unmapper->nr_gops; x++) {
+                        set_phys_to_machine(
+                                __pa(unmapper->gop_queue[x].host_addr) >>
+                                        PAGE_SHIFT,
+                                INVALID_P2M_ENTRY);
+                }
+        }
+        unmapper->nr_gops = 0;
+}
+
+static void grant_unmap(struct grant_unmapper *unmapper,
+                        void *va,
+                        int handle)
+{
+        gnttab_unmap_grant_ref_t *gop;
+        if (unmapper->nr_gops == ARRAY_SIZE(unmapper->gop_queue))
+                do_unmaps(unmapper);
+        gop = &unmapper->gop_queue[unmapper->nr_gops];
+        gnttab_set_unmap_op(gop, (unsigned long)va, GNTMAP_host_map, handle);
+        unmapper->nr_gops++;
+}
+
+/* A tasklet which is invoked shortly after a packet is released so
+   that we can send the FINISH_PACKET message. */
+static void gc_tasklet(unsigned long _rm)
+{
+        struct list_head packets;
+        struct rx_map_packet *packet;
+        struct rx_map_fragment *rx_frag;
+        struct list_head released_fragments;
+        unsigned nr_released_packets;
+        unsigned idx;
+        struct grant_unmapper unmapper;
+        struct page *page;
+        struct netchannel2 *locked_nc;
+
+        INIT_LIST_HEAD(&packets);
+
+        spin_lock(&receive_mapper->finished_packets_lock);
+        list_splice_init(&receive_mapper->finished_packets, &packets);
+        spin_unlock(&receive_mapper->finished_packets_lock);
+
+        /* Unmap the fragments. */
+        unmapper.nr_gops = 0;
+        BUG_ON(packets.next == NULL);
+        list_for_each_entry(packet, &packets, list) {
+                BUG_ON(packet->list.next == NULL);
+                BUG_ON(atomic_read(&packet->refcnt) != 0);
+                BUG_ON(packet->frags.next == NULL);
+                list_for_each_entry(rx_frag, &packet->frags, list) {
+                        BUG_ON(rx_frag->list.next == NULL);
+                        if (rx_frag->handle == -1)
+                                continue;
+                        idx = fragment_idx(rx_frag);
+                        page = receive_mapper->pages[idx];
+                        stop_tracking_page(page);
+                        grant_unmap(&unmapper, page_address(page),
+                                    rx_frag->handle);
+                }
+        }
+        do_unmaps(&unmapper);
+
+        /* Tell the other end that the packets are finished, and
+           accumulate the fragments into a local free list. */
+        INIT_LIST_HEAD(&released_fragments);
+        nr_released_packets = 0;
+
+        locked_nc = NULL;
+        list_for_each_entry(packet, &packets, list) {
+                if (locked_nc != packet->nc) {
+                        if (locked_nc) {
+                                spin_unlock(&locked_nc->rings.lock);
+                                nc2_kick_fast(&locked_nc->rings);
+                        }
+                        spin_lock(&packet->nc->rings.lock);
+                        locked_nc = packet->nc;
+                }
+                BUG_ON(packet->frags.next == NULL);
+                list_for_each_entry(rx_frag, &packet->frags, list) {
+                        BUG_ON(rx_frag->list.next == NULL);
+                        idx = fragment_idx(rx_frag);
+                        gnttab_reset_grant_page(receive_mapper->pages[idx]);
+                }
+                nr_released_packets++;
+                list_splice_init(&packet->frags, &released_fragments);
+                queue_finish_packet_message(&locked_nc->rings, packet->id,
+                                            packet->flags);
+        }
+
+        if (locked_nc) {
+                spin_unlock(&locked_nc->rings.lock);
+                nc2_kick_fast(&locked_nc->rings);
+                locked_nc = NULL;
+
+                spin_lock(&receive_mapper->rm_lock);
+                list_splice(&packets, &receive_mapper->free_packets);
+                list_splice(&released_fragments, &receive_mapper->free_frags);
+                receive_mapper->nr_free_packets += nr_released_packets;
+
+                /* Reprogram the expire timer. */
+                if (!list_empty(&receive_mapper->active_packets)) {
+                        mod_timer(&receive_mapper->expire_timer,
+                                  list_entry(receive_mapper->active_packets.next,
+                                             struct rx_map_packet,
+                                             list)->expires);
+                }
+                spin_unlock(&receive_mapper->rm_lock);
+        }
+}
+
+/* Decrement the refcnt on @rmp and, if necessary, move it to the
+   finished packets list and schedule the GC tasklet.  This should be
+   called with softirqs enabled, and acquires both the rm_lock and the
+   finished packets lock. */
+static void put_rx_map_packet(struct rx_map_packet *rmp)
+{
+        rmp->nc->rx.nr_put_packet++;
+        if (atomic_dec_and_test(&rmp->refcnt)) {
+                /* Remove it from the active list. */
+                spin_lock_bh(&receive_mapper->rm_lock);
+                list_del(&rmp->list);
+                spin_unlock_bh(&receive_mapper->rm_lock);
+
+                /* Add it to the finished list. */
+                spin_lock_bh(&receive_mapper->finished_packets_lock);
+                list_add_tail(&rmp->list, &receive_mapper->finished_packets);
+                spin_unlock_bh(&receive_mapper->finished_packets_lock);
+
+                tasklet_schedule(&receive_mapper->gc_tasklet);
+        }
+}
+
+
+/* The page @page, which was previously part of a receiver-mapped SKB,
+ * has been released.  If it was the last page involved in its SKB,
+ * the packet is finished and we can tell the other end that it's
+ * finished.
+ */
+static void netchan2_page_release(struct page *page)
+{
+        struct rx_map_fragment *frag;
+        struct rx_map_packet *rmp;
+
+        frag = (struct rx_map_fragment *)page->mapping;
+        rmp = frag->packet;
+
+        put_rx_map_packet(rmp);
+}
+
+/* Unmap the packet, removing all other references to it.  The caller
+ * should take an additional reference to the packet before calling
+ * this, to stop it disappearing underneath us.  The only way of
+ * checking whether this succeeded is to look at the packet's
+ * reference count after it returns.
+ */
+static void unmap_this_packet(struct rx_map_packet *rmp)
+{
+        struct rx_map_fragment *rx_frag;
+        unsigned idx;
+        int r;
+        int cnt;
+
+        /* Unmap every fragment in the packet.  We ignore the return
+           value of gnttab_copy_grant_page(), because success or
+           failure will be inferable from the reference count on the
+           packet. */
+        cnt = 0;
+        list_for_each_entry(rx_frag, &rmp->frags, list) {
+                idx = fragment_idx(rx_frag);
+                if (rx_frag->handle != -1) {
+                        r = gnttab_copy_grant_page(rx_frag->handle,
+                                                   &receive_mapper->pages[idx]);
+                        if (r == 0) {
+                                /* We copied the page, so it's not really
+                                   mapped any more. */
+                                rx_frag->handle = -1;
+                                atomic_dec(&rmp->refcnt);
+                        }
+                }
+                cnt++;
+        }
+
+        /* Caller should hold a reference. */
+        BUG_ON(atomic_read(&rmp->refcnt) == 0);
+}
+
+static void unmap_all_packets(void)
+{
+        struct rx_map_packet *rmp;
+        struct rx_map_packet *next;
+        struct list_head finished_packets;
+        int need_tasklet;
+
+        INIT_LIST_HEAD(&finished_packets);
+
+        spin_lock_bh(&receive_mapper->rm_lock);
+
+        list_for_each_entry_safe(rmp, next, &receive_mapper->active_packets,
+                                 list) {
+                atomic_inc(&rmp->refcnt);
+                unmap_this_packet(rmp);
+                if (atomic_dec_and_test(&rmp->refcnt))
+                        list_move(&rmp->list, finished_packets.prev);
+        }
+        spin_unlock_bh(&receive_mapper->rm_lock);
+
+        need_tasklet = !list_empty(&finished_packets);
+
+        spin_lock_bh(&receive_mapper->finished_packets_lock);
+        list_splice(&finished_packets, receive_mapper->finished_packets.prev);
+        spin_unlock_bh(&receive_mapper->finished_packets_lock);
+
+        if (need_tasklet)
+                tasklet_schedule(&receive_mapper->gc_tasklet);
+}
+
+static void free_receive_mapper(struct receive_mapper *rm)
+{
+        unsigned x;
+
+        /* Get rid of any packets which are currently mapped. */
+        suspend_receive_mapper(rm);
+
+        /* Stop the expiry timer.  We know it won't get requeued
+         * because there are no packets outstanding and rm->suspending
+         * is set (because of suspend_receive_mapper()). */
+        del_timer_sync(&rm->expire_timer);
+
+        /* Wait for any last instances of the tasklet to finish. */
+        tasklet_kill(&rm->gc_tasklet);
+
+        if (rm->pages != NULL) {
+                for (x = 0; x < MAX_MAPPED_FRAGS; x++) {
+                        if (PageForeign(rm->pages[x]))
+                                ClearPageForeign(rm->pages[x]);
+                        rm->pages[x]->mapping = NULL;
+                }
+                free_empty_pages_and_pagevec(rm->pages, MAX_MAPPED_FRAGS);
+        }
+        if (rm->tracker != NULL)
+                free_page_foreign_tracker(rm->tracker);
+        kfree(rm);
+}
+
+/* Timer invoked shortly after a packet expires, so that we can copy
+   the data and get it back from Linux.  This is necessary if a packet
+   gets stuck in a socket RX queue somewhere; otherwise we risk a
+   deadlock. */
+static void expire_timer(unsigned long data)
+{
+        struct rx_map_packet *rmp, *next;
+        struct list_head finished_packets;
+        int need_tasklet;
+
+        INIT_LIST_HEAD(&finished_packets);
+
+        spin_lock(&receive_mapper->rm_lock);
+        list_for_each_entry_safe(rmp, next, &receive_mapper->active_packets, list) {
+                if (time_after(rmp->expires, jiffies)) {
+                        mod_timer(&receive_mapper->expire_timer, rmp->expires);
+                        break;
+                }
+                atomic_inc(&rmp->refcnt);
+                unmap_this_packet(rmp);
+                if (atomic_dec_and_test(&rmp->refcnt)) {
+                        list_move(&rmp->list, finished_packets.prev);
+                } else {
+                        /* Couldn't unmap the packet, either because
+                           it's in use by real hardware or we've run
+                           out of memory.  Send the packet to the end
+                           of the queue and update the expiry time so
+                           that we try again later. */
+                        /* Note that this can make the active packet
+                           list slightly out of order.  Oh well; it
+                           won't be by more than a few jiffies, and it
+                           doesn't really matter that much. */
+                        rmp->expires = jiffies + PACKET_TIMEOUT;
+                        list_move(&rmp->list, receive_mapper->active_packets.prev);
+                }
+        }
+        spin_unlock(&receive_mapper->rm_lock);
+
+        need_tasklet = !list_empty(&finished_packets);
+
+        spin_lock(&receive_mapper->finished_packets_lock);
+        list_splice(&finished_packets, receive_mapper->finished_packets.prev);
+        spin_unlock(&receive_mapper->finished_packets_lock);
+
+        if (need_tasklet)
+                tasklet_schedule(&receive_mapper->gc_tasklet);
+}
+
+static struct receive_mapper *new_receive_mapper(void)
+{
+        struct receive_mapper *rm;
+        unsigned x;
+
+        rm = kzalloc(sizeof(*rm), GFP_KERNEL);
+        if (!rm)
+                goto err;
+        INIT_LIST_HEAD(&rm->free_frags);
+        INIT_LIST_HEAD(&rm->free_packets);
+        INIT_LIST_HEAD(&rm->active_packets);
+        INIT_LIST_HEAD(&rm->finished_packets);
+        spin_lock_init(&rm->rm_lock);
+        spin_lock_init(&rm->finished_packets_lock);
+        for (x = 0; x < MAX_MAPPED_FRAGS; x++)
+                list_add_tail(&rm->frags[x].list, &rm->free_frags);
+        for (x = 0; x < MAX_MAPPED_PACKETS; x++)
+                list_add_tail(&rm->packets[x].list, &rm->free_packets);
+        rm->nr_free_packets = MAX_MAPPED_PACKETS;
+
+        setup_timer(&rm->expire_timer, expire_timer, 0);
+        tasklet_init(&rm->gc_tasklet, gc_tasklet, 0);
+
+        rm->tracker = alloc_page_foreign_tracker(MAX_MAPPED_FRAGS);
+        if (!rm->tracker)
+                goto err;
+        rm->pages = alloc_empty_pages_and_pagevec(MAX_MAPPED_FRAGS);
+        if (!rm->pages)
+                goto err;
+        for (x = 0; x < MAX_MAPPED_FRAGS; x++) {
+                SetPageForeign(rm->pages[x], netchan2_page_release);
+                rm->pages[x]->mapping = (void *)&rm->frags[x];
+        }
+
+        return rm;
+
+err:
+        if (rm != NULL)
+                free_receive_mapper(rm);
+        return NULL;
+}
+
+/* Allocate a chain of SKBs which can represent @nr_frags fragments.
+   Returns NULL on error.  The first SKB has a prefix_size data area;
+   the rest have 0. */
+static struct sk_buff *alloc_skb_chain(unsigned nr_frags,
+                                       unsigned prefix_size)
+{
+        unsigned nr_skbs;
+        struct sk_buff *head_skb, *cur_skb, **prev_skb;
+        unsigned x;
+
+        head_skb = dev_alloc_skb(prefix_size);
+        if (head_skb == NULL)
+                return NULL;
+
+        if (nr_frags <= MAX_SKB_FRAGS)
+                return head_skb;
+
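+        /* Round up: the head skb plus each chained skb can carry up to
+           MAX_SKB_FRAGS fragments. */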
+        nr_skbs = (nr_frags + MAX_SKB_FRAGS - 1) / MAX_SKB_FRAGS;
+        prev_skb = &skb_shinfo(head_skb)->frag_list;
+        for (x = 1; x < nr_skbs; x++) {
+                cur_skb = dev_alloc_skb(0);
+                if (cur_skb == NULL) {
+                        dev_kfree_skb(head_skb);
+                        return NULL;
+                }
+                *prev_skb = cur_skb;
+                prev_skb = &skb_shinfo(cur_skb)->frag_list;
+        }
+
+        return head_skb;
+}
+
+static void attach_frag_to_skb(struct sk_buff **_skb,
+                               struct rx_map_fragment *frag)
+{
+        struct sk_buff *skb = *_skb;
+        unsigned idx;
+        struct skb_shared_info *shinfo;
+        skb_frag_t *sk_frag;
+
+        shinfo = skb_shinfo(skb);
+        if (shinfo->nr_frags == MAX_SKB_FRAGS) {
+                /* This skb is full; move on to the next one in the chain. */
+                skb = shinfo->frag_list;
+                BUG_ON(skb == NULL);
+                *_skb = skb;
+                shinfo = skb_shinfo(skb);
+        }
+        sk_frag = &shinfo->frags[shinfo->nr_frags];
+        idx = fragment_idx(frag);
+        sk_frag->page = receive_mapper->pages[idx];
+        sk_frag->page_offset = frag->nc_frag.off;
+        sk_frag->size = frag->nc_frag.size;
+        shinfo->nr_frags++;
+}
+
+struct rx_plan {
+        int is_failed;
+        unsigned nr_mops;
+        gnttab_map_grant_ref_t mops[8];
+        struct rx_map_fragment *frags[8];
+};
+
+static void flush_grant_operations(struct rx_plan *rp)
+{
+        unsigned x;
+        int ret;
+        gnttab_map_grant_ref_t *mop;
+
+        if (rp->nr_mops == 0)
+                return;
+        if (!rp->is_failed) {
+                ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+                                                rp->mops,
+                                                rp->nr_mops);
+                BUG_ON(ret);
+                for (x = 0; x < rp->nr_mops; x++) {
+                        mop = &rp->mops[x];
+                        if (mop->status != 0) {
+                                rp->is_failed = 1;
+                        } else {
+                                rp->frags[x]->handle = mop->handle;
+                                set_phys_to_machine(
+                                        __pa(mop->host_addr) >> PAGE_SHIFT,
+                                        FOREIGN_FRAME(mop->dev_bus_addr >>
+                                                      PAGE_SHIFT));
+                        }
+                }
+        }
+        rp->nr_mops = 0;
+}
+
+static void map_fragment(struct rx_plan *rp,
+                         struct rx_map_fragment *rx_frag,
+                         struct netchannel2 *nc)
+{
+        unsigned idx = fragment_idx(rx_frag);
+        gnttab_map_grant_ref_t *mop;
+
+        if (rp->nr_mops == ARRAY_SIZE(rp->mops))
+                flush_grant_operations(rp);
+        mop = &rp->mops[rp->nr_mops];
+        gnttab_set_map_op(mop,
+                          (unsigned long)page_address(receive_mapper->pages[idx]),
+                          GNTMAP_host_map | GNTMAP_readonly,
+                          rx_frag->nc_frag.receiver_map.gref,
+                          nc->rings.otherend_id);
+        rp->frags[rp->nr_mops] = rx_frag;
+        rp->nr_mops++;
+}
+
+/* Unmap a packet which has been half-mapped. */
+static void unmap_partial_packet(struct rx_map_packet *rmp)
+{
+        unsigned idx;
+        struct rx_map_fragment *rx_frag;
+        struct grant_unmapper unmapper;
+
+        unmapper.nr_gops = 0;
+        list_for_each_entry(rx_frag, &rmp->frags, list) {
+                if (rx_frag->handle == -1)
+                        continue;
+                idx = fragment_idx(rx_frag);
+                grant_unmap(&unmapper,
+                            page_address(receive_mapper->pages[idx]),
+                            rx_frag->handle);
+        }
+        do_unmaps(&unmapper);
+}
+
+struct sk_buff *handle_receiver_map_packet(struct netchannel2 *nc,
+                                           struct netchannel2_msg_packet *msg,
+                                           struct netchannel2_msg_hdr *hdr,
+                                           unsigned nr_frags,
+                                           unsigned frags_off)
+{
+        struct sk_buff *head_skb, *cur_skb;
+        struct rx_map_fragment *rx_frag;
+        unsigned x;
+        unsigned len;
+        struct rx_map_packet *rmp;
+        unsigned idx;
+        struct rx_plan plan;
+        unsigned prefix_size;
+
+        memset(&plan, 0, sizeof(plan));
+
+        rmp = alloc_rx_packet(nc, nr_frags);
+        if (rmp == NULL) {
+                nc->rx.nr_failed_no_packet++;
+                return NULL;
+        }
+
+        if (msg->prefix_size < SKB_MIN_PAYLOAD_SIZE)
+                prefix_size = SKB_MIN_PAYLOAD_SIZE;
+        else
+                prefix_size = msg->prefix_size;
+        /* As in posted_buffers.c, we don't limit the total size of
+           the packet: the fragments are mapped rather than copied, so
+           a very large packet doesn't need any extra local memory.
+           The prefix is safe because its size is only a 16 bit
+           number.  A 64k allocation won't always succeed, but it's
+           unlikely to trigger the OOM killer or otherwise interfere
+           with the normal operation of the local domain. */
+        head_skb = alloc_skb_chain(nr_frags, prefix_size + NET_IP_ALIGN);
+        if (head_skb == NULL) {
+                nc->rx.nr_failed_no_skb++;
+                spin_lock(&receive_mapper->rm_lock);
+                list_splice(&rmp->frags, &receive_mapper->free_frags);
+                list_add(&rmp->list, &receive_mapper->free_packets);
+                receive_mapper->nr_free_packets++;
+                spin_unlock(&receive_mapper->rm_lock);
+                return NULL;
+        }
+        skb_reserve(head_skb, NET_IP_ALIGN);
+
+        rmp->id = msg->id;
+        rmp->flags = msg->flags;
+
+        rx_frag = list_entry(rmp->frags.next, struct rx_map_fragment, list);
+        for (x = 0; x < nr_frags; x++) {
+                fetch_fragment(&nc->rings, x, &rx_frag->nc_frag, frags_off);
+                if (rx_frag->nc_frag.size > PAGE_SIZE ||
+                    rx_frag->nc_frag.off >= PAGE_SIZE ||
+                    rx_frag->nc_frag.size + rx_frag->nc_frag.off > PAGE_SIZE) {
+                        plan.is_failed = 1;
+                        break;
+                }
+                map_fragment(&plan, rx_frag, nc);
+                rx_frag = list_entry(rx_frag->list.next,
+                                     struct rx_map_fragment,
+                                     list);
+        }
+
+        flush_grant_operations(&plan);
+        if (plan.is_failed)
+                goto fail_and_unmap;
+
+        /* Grab the prefix off of the ring. */
+        nc2_copy_from_ring_off(&nc->rings.cons_ring,
+                               skb_put(head_skb, msg->prefix_size),
+                               msg->prefix_size,
+                               frags_off +
+                               nr_frags * sizeof(struct netchannel2_fragment));
+
+        /* All fragments mapped, so we know that this is going to
+           work.  Transfer the receive slots into the SKB. */
+        len = 0;
+        cur_skb = head_skb;
+        list_for_each_entry(rx_frag, &rmp->frags, list) {
+                attach_frag_to_skb(&cur_skb, rx_frag);
+                idx = fragment_idx(rx_frag);
+                start_tracking_page(receive_mapper->tracker,
+                                    receive_mapper->pages[idx],
+                                    nc->rings.otherend_id,
+                                    rx_frag->nc_frag.receiver_map.gref,
+                                    idx,
+                                    nc);
+                len += rx_frag->nc_frag.size;
+        }
+
+        head_skb->len += len;
+        head_skb->data_len += len;
+        head_skb->truesize += len;
+
+        spin_lock(&receive_mapper->rm_lock);
+        list_add_tail(&rmp->list, &receive_mapper->active_packets);
+        rmp->expires = jiffies + PACKET_TIMEOUT;
+        if (rmp == list_entry(receive_mapper->active_packets.next,
+                              struct rx_map_packet,
+                              list)) {
+                nc->rx.nr_mod_timer++;
+                mod_timer(&receive_mapper->expire_timer, rmp->expires);
+        }
+        spin_unlock(&receive_mapper->rm_lock);
+
+        if (skb_headlen(head_skb) < SKB_MIN_PAYLOAD_SIZE)
+                pull_through(head_skb,
+                             SKB_MIN_PAYLOAD_SIZE - skb_headlen(head_skb));
+
+        return head_skb;
+
+fail_and_unmap:
+        pr_debug("Failed to map received packet!\n");
+        unmap_partial_packet(rmp);
+
+        spin_lock(&receive_mapper->rm_lock);
+        list_splice(&rmp->frags, &receive_mapper->free_frags);
+        list_add_tail(&rmp->list, &receive_mapper->free_packets);
+        receive_mapper->nr_free_packets++;
+        spin_unlock(&receive_mapper->rm_lock);
+
+        kfree_skb(head_skb);
+        return NULL;
+}
+
+static void suspend_receive_mapper(struct receive_mapper *rm)
+{
+        spin_lock_bh(&rm->rm_lock);
+        /* Stop any more packets coming in. */
+        rm->suspending = 1;
+
+        /* Wait for Linux to give back all of the SKBs which we've
+           given it. */
+        while (rm->nr_free_packets != MAX_MAPPED_PACKETS) {
+                spin_unlock_bh(&rm->rm_lock);
+                unmap_all_packets();
+                msleep(100);
+                spin_lock_bh(&rm->rm_lock);
+        }
+        spin_unlock_bh(&rm->rm_lock);
+}
+
+static void resume_receive_mapper(void)
+{
+        spin_lock_bh(&receive_mapper->rm_lock);
+        receive_mapper->suspending = 0;
+        spin_unlock_bh(&receive_mapper->rm_lock);
+}
+
+
+int init_receive_map_mode(void)
+{
+        struct receive_mapper *new_rm;
+        spin_lock(&global_map_lock);
+        while (receive_mapper == NULL) {
+                spin_unlock(&global_map_lock);
+                new_rm = new_receive_mapper();
+                if (new_rm == NULL)
+                        return -ENOMEM;
+                spin_lock(&global_map_lock);
+                if (receive_mapper == NULL) {
+                        receive_mapper = new_rm;
+                } else {
+                        spin_unlock(&global_map_lock);
+                        free_receive_mapper(new_rm);
+                        spin_lock(&global_map_lock);
+                }
+        }
+        spin_unlock(&global_map_lock);
+        return 0;
+}
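
init_receive_map_mode() builds its candidate mapper with global_map_lock dropped and only installs it after re-checking the global pointer, freeing the candidate if another caller won the race. The snippet below is a small pthread-based sketch of that allocate-outside-the-lock / recheck-under-the-lock idiom, not from the patch; make_thing() and free_thing() are invented stand-ins for new_receive_mapper() and free_receive_mapper().

#include <pthread.h>
#include <stdlib.h>

struct thing { int dummy; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct thing *singleton;

/* Invented stand-ins for the real allocation/free helpers. */
static struct thing *make_thing(void) { return calloc(1, sizeof(struct thing)); }
static void free_thing(struct thing *t) { free(t); }

int init_singleton(void)
{
        struct thing *candidate;

        pthread_mutex_lock(&lock);
        while (singleton == NULL) {
                /* Drop the lock while allocating: the allocation may block
                   (or, in the driver, need to take other locks). */
                pthread_mutex_unlock(&lock);
                candidate = make_thing();
                if (candidate == NULL)
                        return -1;
                pthread_mutex_lock(&lock);
                if (singleton == NULL) {
                        singleton = candidate;  /* we won the race */
                } else {
                        /* Someone else installed theirs while we were
                           unlocked; free ours (again without the lock). */
                        pthread_mutex_unlock(&lock);
                        free_thing(candidate);
                        pthread_mutex_lock(&lock);
                }
        }
        pthread_mutex_unlock(&lock);
        return 0;
}

int main(void)
{
        return init_singleton();
}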
+
+void deinit_receive_map_mode(void)
+{
+        if (!receive_mapper)
+                return;
+        BUG_ON(spin_is_locked(&global_map_lock));
+        free_receive_mapper(receive_mapper);
+        receive_mapper = NULL;
+}
+
+void suspend_receive_map_mode(void)
+{
+        if (!receive_mapper)
+                return;
+        suspend_receive_mapper(receive_mapper);
+}
+
+void resume_receive_map_mode(void)
+{
+        if (!receive_mapper)
+                return;
+        resume_receive_mapper();
+}
+
+struct netchannel2 *nc2_get_interface_for_page(struct page *p)
+{
+        BUG_ON(!page_is_tracked(p));
+        if (!receive_mapper ||
+            tracker_for_page(p) != receive_mapper->tracker)
+                return NULL;
+        return get_page_tracker_ctxt(p);
+}
diff --git a/drivers/net/xen-netchannel2/recv_packet.c b/drivers/net/xen-netchannel2/recv_packet.c
new file mode 100644 (file)
index 0000000..430ed6f
--- /dev/null
@@ -0,0 +1,296 @@
+/* Support for receiving individual packets, and all the stuff which
+ * goes with that. */
+#include <linux/kernel.h>
+#include <linux/etherdevice.h>
+#include <linux/version.h>
+#include "netchannel2_core.h"
+
+/* Only accessed from the tasklet, so no synchronisation needed. */
+static struct sk_buff_head pending_rx_queue;
+
+/* Send as many finish packet messages as will fit on the ring. */
+void send_finish_packet_messages(struct netchannel2_ring_pair *ncrp)
+{
+        struct pending_finish_packets *pfp = &ncrp->pending_finish;
+        struct netchannel2_msg_finish_packet msg;
+        RING_IDX cons;
+
+        while (pfp->prod != pfp->cons &&
+               nc2_can_send_payload_bytes(&ncrp->prod_ring, sizeof(msg))) {
+                cons = pfp->cons;
+                msg.id = pfp->ids[pfp->cons % MAX_PENDING_FINISH_PACKETS];
+                pfp->cons++;
+                nc2_send_message(&ncrp->prod_ring,
+                                 NETCHANNEL2_MSG_FINISH_PACKET,
+                                 0,
+                                 &msg,
+                                 sizeof(msg));
+        }
+}
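
The pending-finish queue is a fixed array indexed by free-running prod/cons counters: the counters are only reduced modulo MAX_PENDING_FINISH_PACKETS when touching the ids[] array, so "full" is simply prod - cons == MAX_PENDING_FINISH_PACKETS and "empty" is prod == cons. A standalone sketch of that convention follows (illustrative only, with an arbitrary power-of-two QUEUE_SIZE in place of MAX_PENDING_FINISH_PACKETS):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* A power-of-two size keeps "index = counter % SIZE" correct even when
   the 32-bit counters eventually wrap. */
#define QUEUE_SIZE 8

struct fifo {
        uint32_t prod;                  /* free-running, never reduced */
        uint32_t cons;
        uint32_t ids[QUEUE_SIZE];
};

static int fifo_full(const struct fifo *f)  { return f->prod - f->cons == QUEUE_SIZE; }
static int fifo_empty(const struct fifo *f) { return f->prod == f->cons; }

static void fifo_push(struct fifo *f, uint32_t id)
{
        assert(!fifo_full(f));
        f->ids[f->prod % QUEUE_SIZE] = id;
        f->prod++;
}

static uint32_t fifo_pop(struct fifo *f)
{
        uint32_t id;

        assert(!fifo_empty(f));
        id = f->ids[f->cons % QUEUE_SIZE];
        f->cons++;
        return id;
}

int main(void)
{
        struct fifo f = { 0 };
        uint32_t i;

        for (i = 0; i < 5; i++)
                fifo_push(&f, i);
        while (!fifo_empty(&f))
                printf("%u\n", fifo_pop(&f));
        return 0;
}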
+
+/* Add a packet ID to the finish packet queue.  The caller should
+   arrange for send_finish_packet_messages() to be called soon to
+   flush the requests out. */
+void queue_finish_packet_message(struct netchannel2_ring_pair *ncrp,
+                                 uint32_t id, uint8_t flags)
+{
+        struct pending_finish_packets *pfp = &ncrp->pending_finish;
+        RING_IDX prod;
+
+        prod = pfp->prod;
+        pfp->ids[prod % MAX_PENDING_FINISH_PACKETS] = id;
+        pfp->prod++;
+
+        if (flags & NC2_PACKET_FLAG_need_event)
+                ncrp->pending_time_sensitive_messages = 1;
+}
+
+/* Handle a packet message from the other end.  On success, queues the
+   new skb to the pending skb list.  If the packet is invalid, it is
+   discarded without generating a FINISH message. */
+void nc2_handle_packet_msg(struct netchannel2 *nc,
+                           struct netchannel2_ring_pair *ncrp,
+                           struct netchannel2_msg_hdr *hdr)
+{
+        unsigned nr_frags;
+        struct netchannel2_msg_packet msg;
+        struct sk_buff *skb;
+        const unsigned frags_off = sizeof(msg);
+        unsigned frags_bytes;
+
+        if (ncrp->pending_finish.prod - ncrp->pending_finish.cons ==
+            MAX_PENDING_FINISH_PACKETS) {
+                pr_debug("Remote endpoint sent too many packets!\n");
+                nc->stats.tx_errors++;
+                return;
+        }
+
+        if (hdr->size < sizeof(msg)) {
+                pr_debug("Packet message too small (%d < %zd)\n", hdr->size,
+                         sizeof(msg));
+                nc->stats.tx_errors++;
+                return;
+        }
+
+        if (hdr->size & 7) {
+                pr_debug("Packet size in ring not multiple of 8: %d\n",
+                         hdr->size);
+                nc->stats.tx_errors++;
+                return;
+        }
+
+        nc2_copy_from_ring(&ncrp->cons_ring, &msg, sizeof(msg));
+
+        if (msg.type != NC2_PACKET_TYPE_receiver_copy &&
+            msg.type != NC2_PACKET_TYPE_small &&
+            ncrp != &nc->rings) {
+                pr_debug("Received strange packet type %d on bypass ring.\n",
+                         msg.type);
+                nc->stats.tx_errors++;
+                return;
+        }
+
+        frags_bytes = hdr->size - sizeof(msg) - msg.prefix_size;
+        nr_frags = frags_bytes / sizeof(struct netchannel2_fragment);
+
+        switch (msg.type) {
+        case NC2_PACKET_TYPE_small:
+                if (nr_frags != 0) {
+                        /* Small packets, by definition, have no
+                         * fragments */
+                        pr_debug("Received small packet with %d frags?\n",
+                                 nr_frags);
+                        nc->stats.tx_errors++;
+                        return;
+                }
+                /* Any of the receiver functions can handle small
+                   packets as a trivial special case.  Use receiver
+                   copy, since that's the simplest. */
+                skb = handle_receiver_copy_packet(nc, ncrp, &msg, hdr,
+                                                  nr_frags, frags_off);
+                /* No finish message */
+                break;
+        case NC2_PACKET_TYPE_receiver_copy:
+                skb = handle_receiver_copy_packet(nc, ncrp, &msg, hdr,
+                                                  nr_frags, frags_off);
+                queue_finish_packet_message(ncrp, msg.id, msg.flags);
+                break;
+        case NC2_PACKET_TYPE_pre_posted:
+                skb = handle_pre_posted_packet(nc, &msg, hdr, nr_frags,
+                                               frags_off);
+                /* No finish message */
+                break;
+        case NC2_PACKET_TYPE_receiver_map:
+                if (!nc->local_trusted) {
+                        /* The remote doesn't trust us, so they
+                           shouldn't be sending us receiver-map
+                           packets.  Just treat it as an RSCB
+                           packet. */
+                        skb = NULL;
+                } else {
+                        skb = handle_receiver_map_packet(nc, &msg, hdr,
+                                                         nr_frags,
+                                                         frags_off);
+                        /* Finish message will be sent when we unmap
+                         * the packet. */
+                }
+                if (skb == NULL) {
+                        /* We can't currently map this skb.  Use a
+                           receiver copy instead. */
+                        skb = handle_receiver_copy_packet(nc, ncrp, &msg, hdr,
+                                                          nr_frags, frags_off);
+                        queue_finish_packet_message(ncrp, msg.id, msg.flags);
+                }
+                break;
+        default:
+                pr_debug("Unknown packet type %d\n", msg.type);
+                nc->stats.rx_errors++;
+                skb = NULL;
+                break;
+        }
+        if (skb != NULL) {
+                nc->stats.rx_bytes += skb->len;
+                nc->stats.rx_packets++;
+                skb->dev = nc->net_device;
+
+                if (ncrp->filter_mac &&
+                    skb_headlen(skb) >= sizeof(struct ethhdr) &&
+                    memcmp(((struct ethhdr *)skb->data)->h_source,
+                           ncrp->remote_mac,
+                           ETH_ALEN)) {
+                        /* We're in filter MACs mode and the source
+                           MAC on this packet is wrong.  Drop it. */
+                        /* (We know that any packet big enough to
+                           contain an ethernet header at all will
+                           contain it in the head space because we do
+                           a pull_through at the end of the type
+                           handler.) */
+                        nc->rx.dropped_bad_mac++;
+                        goto err;
+                }
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
+                if (msg.flags & NC2_PACKET_FLAG_data_validated) {
+                        nc->rx.nr_csum_validated++;
+                        skb->ip_summed = CHECKSUM_UNNECESSARY;
+#ifdef CONFIG_XEN
+                        skb->proto_data_valid = 1;
+#endif
+                }
+                if (msg.flags & NC2_PACKET_FLAG_csum_blank) {
+                        nc->rx.nr_csum_blank++;
+#ifdef CONFIG_XEN
+                        skb->proto_csum_blank = 1;
+#else
+                        /* This is actually pretty bad: if we forward
+                           this packet on again, it's likely to go out
+                           with a bad checksum.  Oh well, nothing we
+                           can do. */
+#endif
+                }
+#else
+                switch (msg.flags & (NC2_PACKET_FLAG_data_validated |
+                                     NC2_PACKET_FLAG_csum_blank)) {
+                case 0:
+                        skb->ip_summed = CHECKSUM_NONE;
+                        break;
+                case NC2_PACKET_FLAG_data_validated:
+                        skb->ip_summed = CHECKSUM_UNNECESSARY;
+                        break;
+                default:
+                        skb->ip_summed = CHECKSUM_PARTIAL;
+                        break;
+                }
+#endif
+
+#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS
+                if (ncrp == &nc->rings) {
+                        if (msg.flags & NC2_PACKET_FLAG_bypass_candidate)
+                                nc2_received_bypass_candidate_packet(nc, skb);
+                        else
+                                nc->auto_bypass.nr_non_bypass_packets++;
+                } else {
+                        container_of(ncrp, struct nc2_alternate_ring, rings)->
+                                autoteardown.nr_packets++;
+                }
+#endif
+
+                switch (msg.segmentation_type) {
+                case NC2_PACKET_SEGMENTATION_TYPE_none:
+                        break;
+                case NC2_PACKET_SEGMENTATION_TYPE_tcpv4:
+                        if (msg.mss == 0) {
+                                pr_debug("TSO request with mss == 0?\n");
+                                goto err;
+                        }
+                        nc->rx.nr_gso++;
+                        skb_shinfo(skb)->gso_type =
+                                SKB_GSO_TCPV4 | SKB_GSO_DODGY;
+                        skb_shinfo(skb)->gso_size = msg.mss;
+                        skb_shinfo(skb)->gso_segs = 0;
+                        break;
+                default:
+                        pr_debug("Unknown segmentation offload type %d!\n",
+                                 msg.segmentation_type);
+                        goto err;
+                }
+                skb->protocol = eth_type_trans(skb, nc->net_device);
+                __skb_queue_tail(&pending_rx_queue, skb);
+        }
+        return;
+
+err:
+        /* We may need to send a FINISH message here if this was a
+           receiver-map packet.  That should be handled automatically
+           by the kfree_skb(). */
+        kfree_skb(skb);
+        nc->stats.rx_errors++;
+        return;
+}
+
+/* If there is space on the ring, tell the other end how many packets
+   it's allowed to send at one time and clear the
+   need_advertise_max_packets flag. */
+void advertise_max_packets(struct netchannel2_ring_pair *ncrp)
+{
+        struct netchannel2_msg_set_max_packets msg;
+
+        if (!nc2_can_send_payload_bytes(&ncrp->prod_ring, sizeof(msg)))
+                return;
+        msg.max_outstanding_packets = MAX_PENDING_FINISH_PACKETS;
+        nc2_send_message(&ncrp->prod_ring,
+                         NETCHANNEL2_MSG_SET_MAX_PACKETS,
+                         0,
+                         &msg,
+                         sizeof(msg));
+        ncrp->need_advertise_max_packets = 0;
+        ncrp->pending_time_sensitive_messages = 1;
+}
+
+void receive_pending_skbs(void)
+{
+        struct sk_buff *skb;
+        struct skb_cb_overlay *sco;
+        while (!skb_queue_empty(&pending_rx_queue)) {
+                skb = __skb_dequeue(&pending_rx_queue);
+                sco = get_skb_overlay(skb);
+                if (unlikely(sco->failed))
+                        kfree_skb(skb);
+                else
+                        netif_receive_skb(skb);
+        }
+}
+
+
+/* These don't really belong here, but it's as good a place as any. */
+int __init nc2_init(void)
+{
+        skb_queue_head_init(&pending_rx_queue);
+        return 0;
+}
+
+void __exit nc2_exit(void)
+{
+        nc2_shutdown_autoteardown();
+        deinit_receive_map_mode();
+        skb_queue_purge(&pending_rx_queue);
+}
diff --git a/drivers/net/xen-netchannel2/rscb.c b/drivers/net/xen-netchannel2/rscb.c
new file mode 100644 (file)
index 0000000..985e476
--- /dev/null
@@ -0,0 +1,474 @@
+/* Receiver-side copy buffer support */
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#ifdef CONFIG_PARAVIRT
+#include <xen/grant_table.h>
+#include <xen/page.h>
+#else
+#include <xen/gnttab.h>
+#endif
+
+#include "netchannel2_core.h"
+
+/* -------------------------- Receive -------------------------------- */
+
+/* This is called whenever an RSCB grant copy fails. */
+void nc2_rscb_on_gntcopy_fail(void *ctxt, gnttab_copy_t *gop)
+{
+        struct sk_buff *skb = ctxt;
+        struct skb_cb_overlay *sco = get_skb_overlay(skb);
+        if (!sco->failed && net_ratelimit())
+                printk(KERN_WARNING "Dropping RX packet because of copy error\n");
+        sco->failed = 1;
+}
+
+
+/* Copy @size bytes from @offset in grant ref @gref against domain
+   @domid and shove them on the end of *@cur_skb_p.  This can extend
+   the skb chain, in which case *@cur_skb_p becomes the new tail.  On
+   failure, get_skb_overlay(head_skb)->failed is set to 1. */
+/* There are a lot of recursive tail calls here.  Trust that the
+   compiler will do the right thing. */
+static void batched_grant_copy(struct sk_buff **cur_skb_p,
+                               struct sk_buff *head_skb,
+                               unsigned offset,
+                               unsigned size,
+                               grant_ref_t gref,
+                               domid_t domid)
+{
+        struct sk_buff *skb = *cur_skb_p;
+        struct sk_buff *new_skb;
+        gnttab_copy_t *gop;
+        unsigned first;
+        unsigned frag_nr;
+        struct skb_shared_info *shinfo;
+        struct page *new_page;
+        void *tail;
+        void *end;
+
+        if (size > PAGE_SIZE)
+                goto fail;
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,20)
+        tail = skb_tail_pointer(skb);
+        end = skb_end_pointer(skb);
+#else
+        tail = skb->tail;
+        end = skb->end;
+#endif
+
+        /* Is there any space left in the SKB head? */
+        if (end != tail) {
+                /* Yes.  How much? */
+                first = end - tail;
+                /* Limit ourselves to this fragment. */
+                if (first > size)
+                        first = size;
+                /* And don't cross page boundaries. */
+                if (unlikely(offset_in_page(tail) + first > PAGE_SIZE))
+                        first = PAGE_SIZE - offset_in_page(tail);
+
+                /* Copy this fragment to the header. */
+                gop = hypercall_batcher_grant_copy(&pending_rx_hypercalls,
+                                                   skb,
+                                                   nc2_rscb_on_gntcopy_fail);
+                gop->flags = GNTCOPY_source_gref;
+                gop->source.domid = domid;
+                gop->source.offset = offset;
+                gop->source.u.ref = gref;
+                gop->dest.domid = DOMID_SELF;
+                gop->dest.offset = offset_in_page(tail);
+                gop->dest.u.gmfn = virt_to_mfn(tail);
+                gop->len = first;
+
+                if (skb != head_skb) {
+                        head_skb->truesize += size;
+                        head_skb->data_len += size;
+                        head_skb->len += size;
+                }
+                skb_put(skb, first);
+
+                if (size != first)
+                        batched_grant_copy(cur_skb_p, head_skb,
+                                           offset + first, size - first,
+                                           gref, domid);
+                return;
+        }
+
+        /* Okay, we're in fragment space. */
+        shinfo = skb_shinfo(skb);
+        frag_nr = shinfo->nr_frags;
+        if (frag_nr == MAX_SKB_FRAGS) {
+                /* Advance to a new skb */
+                /* size is probably PAGE_SIZE, in which case we'll end
+                   up kmalloc()ing PAGE_SIZE plus about 200 bytes,
+                   which is pretty inefficient.  This should be rare,
+                   though, so just let it be. */
+                new_skb = dev_alloc_skb(size);
+                if (!new_skb) {
+                        /* Uh oh, no memory.  Give up. */
+                        /* (We'll keep trying to transfer fragments to
+                           this skb until we hit the end of the
+                           packet, which isn't immensely efficient,
+                           but this should be rare enough that it
+                           doesn't matter). */
+                        goto fail;
+                }
+                skb_shinfo(skb)->frag_list = new_skb;
+                *cur_skb_p = new_skb;
+                batched_grant_copy(cur_skb_p, head_skb, offset,
+                                   size, gref, domid);
+                return;
+        }
+
+        /* Allocate a new page for the fragment */
+        new_page = alloc_page(GFP_ATOMIC);
+        if (!new_page)
+                goto fail;
+
+        gop = hypercall_batcher_grant_copy(&pending_rx_hypercalls,
+                                           head_skb,
+                                           nc2_rscb_on_gntcopy_fail);
+        gop->flags = GNTCOPY_source_gref;
+        gop->source.domid = domid;
+        gop->source.offset = offset;
+        gop->source.u.ref = gref;
+        gop->dest.domid = DOMID_SELF;
+        gop->dest.offset = 0;
+        gop->dest.u.gmfn = pfn_to_mfn(page_to_pfn(new_page));
+        gop->len = size;
+
+        shinfo->frags[frag_nr].page = new_page;
+        shinfo->frags[frag_nr].page_offset = 0;
+        shinfo->frags[frag_nr].size = size;
+        shinfo->nr_frags++;
+
+        head_skb->truesize += size;
+        head_skb->data_len += size;
+        head_skb->len += size;
+
+        return;
+
+fail:
+        get_skb_overlay(head_skb)->failed = 1;
+        return;
+}
+
+/* We've received a receiver-copy packet message from the remote.
+   Parse it up, build an sk_buff, and return it.  Returns NULL on
+   error. */
+struct sk_buff *handle_receiver_copy_packet(struct netchannel2 *nc,
+                                            struct netchannel2_ring_pair *ncrp,
+                                            struct netchannel2_msg_packet *msg,
+                                            struct netchannel2_msg_hdr *hdr,
+                                            unsigned nr_frags,
+                                            unsigned frags_off)
+{
+        struct netchannel2_fragment frag;
+        unsigned nr_bytes;
+        unsigned x;
+        struct sk_buff *skb, *cur_skb;
+        unsigned skb_headsize;
+
+        if (msg->prefix_size > NETCHANNEL2_MAX_INLINE_BYTES) {
+                pr_debug("Inline prefix too big! (%d > %d)\n",
+                         msg->prefix_size, NETCHANNEL2_MAX_INLINE_BYTES);
+                return NULL;
+        }
+
+        /* Count the number of bytes in the packet.  Be careful: the
+           other end can still access the packet on the ring, so the
+           size could change later. */
+        nr_bytes = msg->prefix_size;
+        for (x = 0; x < nr_frags; x++) {
+                fetch_fragment(ncrp, x, &frag, frags_off);
+                nr_bytes += frag.size;
+        }
+        if (nr_bytes > NETCHANNEL2_MAX_PACKET_BYTES) {
+                pr_debug("Packet too big! (%d > %d)\n", nr_bytes,
+                         NETCHANNEL2_MAX_PACKET_BYTES);
+                return NULL;
+        }
+        if (nr_bytes < 64) {
+                /* Linux sometimes has problems with very small SKBs.
+                   Impose a minimum size of 64 bytes. */
+                nr_bytes = 64;
+        }
+
+        /* We prefer to put the packet in the head if possible,
+           provided that won't cause the head to be allocated with a
+           multi-page kmalloc().  If we can't manage that, we fall
+           back to just putting the inline part in the head, with the
+           rest of the packet attached as fragments. */
+        /* We could also consider having a maximally-sized head and
+           put the rest in fragments, but that would mean that the
+           Linux-side fragments wouldn't match up with the NC2-side
+           fragments, which would mean we'd need twice as many
+           hypercalls. */
+        skb_headsize = nr_bytes + NET_IP_ALIGN;
+        if (skb_headsize >
+            ((PAGE_SIZE - sizeof(struct skb_shared_info) - NET_SKB_PAD) & ~(SMP_CACHE_BYTES - 1))) {
+                skb_headsize = msg->prefix_size + NET_IP_ALIGN;
+        }
+
+        skb = dev_alloc_skb(skb_headsize);
+        if (!skb) {
+                /* Drop the packet. */
+                pr_debug("Couldn't allocate a %d byte skb.\n", nr_bytes);
+                nc->stats.rx_dropped++;
+                return NULL;
+        }
+
+        /* Arrange that the IP header is nicely aligned in memory. */
+        skb_reserve(skb, NET_IP_ALIGN);
+
+        /* The inline prefix should always fit in the SKB head. */
+        nc2_copy_from_ring_off(&ncrp->cons_ring,
+                               skb_put(skb, msg->prefix_size),
+                               msg->prefix_size,
+                               frags_off + nr_frags * sizeof(frag));
+
+        cur_skb = skb;
+        for (x = 0; x < nr_frags; x++) {
+                fetch_fragment(ncrp, x, &frag, frags_off);
+                batched_grant_copy(&cur_skb, skb, frag.off, frag.size,
+                                   frag.receiver_copy.gref,
+                                   ncrp->otherend_id);
+        }
+        return skb;
+}
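
The head-size choice in handle_receiver_copy_packet() boils down to: put the whole packet in the linear area unless that would push the skb allocation past what fits in a single page next to struct skb_shared_info, in which case only the inline prefix goes in the head and everything else becomes fragments. A worked userspace version of that arithmetic follows; it is a sketch only, and the constants are illustrative stand-ins for the real PAGE_SIZE, SMP_CACHE_BYTES, NET_SKB_PAD, NET_IP_ALIGN and sizeof(struct skb_shared_info).

#include <stdio.h>

/* Illustrative values only; the real ones come from the kernel headers. */
#define PAGE_SIZE        4096u
#define SMP_CACHE_BYTES  64u
#define NET_SKB_PAD      32u
#define NET_IP_ALIGN     2u
#define SHINFO_SIZE      320u   /* stand-in for sizeof(struct skb_shared_info) */

/* Largest head that still lets the skb allocation stay within one page,
   rounded down to a cache line. */
static unsigned max_single_page_head(void)
{
        return (PAGE_SIZE - SHINFO_SIZE - NET_SKB_PAD) & ~(SMP_CACHE_BYTES - 1);
}

static unsigned choose_headsize(unsigned nr_bytes, unsigned prefix_size)
{
        unsigned headsize = nr_bytes + NET_IP_ALIGN;

        if (headsize > max_single_page_head())
                headsize = prefix_size + NET_IP_ALIGN;  /* fall back: prefix only */
        return headsize;
}

int main(void)
{
        printf("budget = %u\n", max_single_page_head());         /* 3712 */
        printf("small packet -> head %u\n", choose_headsize(1500, 64));
        printf("large packet -> head %u\n", choose_headsize(9000, 64));
        return 0;
}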
+
+
+
+/* ------------------------------- Transmit ---------------------------- */
+/* XXX We should really have a timeout when transmitting via RSCB, in
+   case the remote domain is being naughty. */
+
+struct grant_packet_plan {
+        volatile struct netchannel2_fragment *out_fragment;
+        unsigned gref_pool_left;
+        grant_ref_t gref_pool;
+        int use_subpage_grants;
+        unsigned prefix_avail;
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+        int could_have_used_bypass;
+#endif
+};
+
+int prepare_xmit_allocate_grant(struct netchannel2_ring_pair *ncrp,
+                                struct sk_buff *skb,
+                                int use_subpage_grants)
+{
+        struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
+        unsigned nr_fragments;
+        struct sk_buff *cur_skb;
+        grant_ref_t gref_pool;
+        int err;
+        unsigned inline_bytes_left;
+        unsigned inline_prefix_size;
+
+        if (allocate_txp_slot(ncrp, skb) < 0)
+                return -1;
+
+        inline_prefix_size = PACKET_PREFIX_SIZE;
+        if (skb_headlen(skb) < inline_prefix_size)
+                inline_prefix_size = skb_headlen(skb);
+
+        if (skb_co->nr_fragments == 0) {
+                inline_bytes_left = inline_prefix_size;
+                nr_fragments = 0;
+                for (cur_skb = skb;
+                     cur_skb != NULL;
+                     cur_skb = skb_shinfo(cur_skb)->frag_list) {
+                        /* How many fragments are we going to need for
+                         * the data area? */
+                        if (inline_bytes_left < skb_headlen(cur_skb)) {
+                                unsigned long start_grant;
+                                unsigned long end_grant;
+                                start_grant =
+                                        ((unsigned long)cur_skb->data +
+                                         inline_bytes_left) &
+                                        ~(PAGE_SIZE-1);
+                                end_grant =
+                                        ((unsigned long)cur_skb->data +
+                                         skb_headlen(cur_skb) +
+                                         PAGE_SIZE - 1) & ~(PAGE_SIZE-1);
+                                nr_fragments +=
+                                        (end_grant - start_grant) / PAGE_SIZE;
+                        } else {
+                                /* No fragments if the head is
+                                   entirely in the prefix */
+                        }
+                        inline_bytes_left = 0;
+                        /* And for the skb fragment area? */
+                        nr_fragments += skb_shinfo(cur_skb)->nr_frags;
+                }
+
+                /* No-fragments packets should be policy small, not
+                 * policy grant. */
+                BUG_ON(nr_fragments == 0);
+
+                skb_co->nr_fragments = nr_fragments;
+        }
+
+        /* Grab the grant references. */
+        err = gnttab_alloc_grant_references(skb_co->nr_fragments, &gref_pool);
+        if (err < 0) {
+                release_txp_slot(ncrp, skb);
+                /* Leave skb_co->nr_fragments set, so that we don't
+                   have to recompute it next time around. */
+                return -1;
+        }
+        skb_co->gref_pool = gref_pool;
+        skb_co->inline_prefix_size = inline_prefix_size;
+
+        if (use_subpage_grants)
+                skb_co->type = NC2_PACKET_TYPE_receiver_copy;
+        else
+                skb_co->type = NC2_PACKET_TYPE_receiver_map;
+
+        return 0;
+}
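
The fragment count computed above for each linear data area is just the number of distinct pages touched by the bytes that don't fit into the inline prefix: round the first post-prefix byte down to a page boundary, round the end of the head up, and divide by PAGE_SIZE. A small standalone check of that arithmetic is below; count_head_frags() is a hypothetical helper that mirrors the loop above, not a function in the patch.

#include <stdio.h>

#define PAGE_SIZE 4096ul

/* Number of page-sized grants needed for a linear buffer at address
   'data' of length 'len', once the first 'prefix' bytes have been sent
   inline on the ring instead. */
static unsigned long count_head_frags(unsigned long data, unsigned long len,
                                      unsigned long prefix)
{
        unsigned long start_grant, end_grant;

        if (len <= prefix)
                return 0;                        /* head entirely inline */
        start_grant = (data + prefix) & ~(PAGE_SIZE - 1);
        end_grant = (data + len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
        return (end_grant - start_grant) / PAGE_SIZE;
}

int main(void)
{
        /* 100 bytes starting 60 bytes before a page boundary: 2 pages. */
        printf("%lu\n", count_head_frags(0x11000ul - 60, 100, 0));
        /* Same buffer, but the first 70 bytes go inline: 1 page. */
        printf("%lu\n", count_head_frags(0x11000ul - 60, 100, 70));
        /* Head no bigger than the prefix: 0 pages. */
        printf("%lu\n", count_head_frags(0x11000ul - 60, 64, 64));
        return 0;
}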
+
+static void prepare_subpage_grant(struct netchannel2_ring_pair *ncrp,
+                                  struct page *page,
+                                  unsigned off_in_page,
+                                  unsigned size,
+                                  struct grant_packet_plan *plan)
+{
+        volatile struct netchannel2_fragment *frag;
+        domid_t trans_domid;
+        grant_ref_t trans_gref;
+        grant_ref_t gref;
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+        struct netchannel2 *orig_iface;
+#endif
+
+        if (size <= plan->prefix_avail) {
+                /* This fragment is going to be inline -> nothing to
+                 * do. */
+                plan->prefix_avail -= size;
+                return;
+        }
+        if (plan->prefix_avail > 0) {
+                /* Part inline, part in payload. */
+                size -= plan->prefix_avail;
+                off_in_page += plan->prefix_avail;
+                plan->prefix_avail = 0;
+        }
+        frag = plan->out_fragment;
+        gref = gnttab_claim_grant_reference(&plan->gref_pool);
+        frag->receiver_copy.gref = gref;
+        if (page_is_tracked(page)) {
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+                orig_iface = nc2_get_interface_for_page(page);
+                if (orig_iface &&
+                    orig_iface->extant_bypasses < orig_iface->max_bypasses)
+                        plan->could_have_used_bypass = 1;
+#endif
+                lookup_tracker_page(page, &trans_domid, &trans_gref);
+                gnttab_grant_foreign_access_ref_trans(gref,
+                                                      ncrp->otherend_id,
+                                                      GTF_readonly,
+                                                      trans_domid,
+                                                      trans_gref);
+        } else if (plan->use_subpage_grants) {
+                gnttab_grant_foreign_access_ref_subpage(gref,
+                                                        ncrp->otherend_id,
+                                                        virt_to_mfn(page_address(page)),
+                                                        GTF_readonly,
+                                                        off_in_page,
+                                                        size);
+        } else {
+                gnttab_grant_foreign_access_ref(gref,
+                                                ncrp->otherend_id,
+                                                virt_to_mfn(page_address(page)),
+                                                GTF_readonly);
+        }
+
+        frag->off = off_in_page;
+        frag->size = size;
+        plan->out_fragment++;
+        plan->gref_pool_left--;
+}
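
prepare_subpage_grant() quietly consumes the bytes that were already sent as the inline prefix: a fragment wholly covered by the prefix emits no grant at all, and a fragment straddling the boundary is trimmed so that only its tail is granted. A minimal sketch of that consumption logic follows, not from the patch; emit_grant() is an invented stand-in for claiming a gref and filling in the fragment descriptor.

#include <stdio.h>

static unsigned prefix_avail;   /* bytes of the packet already sent inline */

/* Invented stand-in for claiming a gref and writing the fragment. */
static void emit_grant(unsigned off, unsigned size)
{
        printf("grant: off=%u size=%u\n", off, size);
}

static void add_fragment(unsigned off_in_page, unsigned size)
{
        if (size <= prefix_avail) {
                /* Entirely covered by the inline prefix: nothing to grant. */
                prefix_avail -= size;
                return;
        }
        if (prefix_avail > 0) {
                /* Straddles the boundary: grant only the tail. */
                off_in_page += prefix_avail;
                size -= prefix_avail;
                prefix_avail = 0;
        }
        emit_grant(off_in_page, size);
}

int main(void)
{
        prefix_avail = 96;
        add_fragment(0, 64);      /* fully inline, no grant */
        add_fragment(64, 100);    /* 32 bytes inline, grant off=96 size=68 */
        add_fragment(0, 1000);    /* grant off=0 size=1000 */
        return 0;
}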
+
+static int grant_data_area(struct netchannel2_ring_pair *ncrp,
+                           struct sk_buff *skb,
+                           struct grant_packet_plan *plan)
+{
+        void *ptr = skb->data;
+        unsigned len = skb_headlen(skb);
+        unsigned off;
+        unsigned this_time;
+
+        for (off = 0; off < len; off += this_time) {
+                this_time = len - off;
+                if (this_time + offset_in_page(ptr + off) > PAGE_SIZE)
+                        this_time = PAGE_SIZE - offset_in_page(ptr + off);
+                prepare_subpage_grant(ncrp,
+                                      virt_to_page(ptr + off),
+                                      offset_in_page(ptr + off),
+                                      this_time,
+                                      plan);
+        }
+        return 0;
+}
+
+void xmit_grant(struct netchannel2_ring_pair *ncrp,
+                struct sk_buff *skb,
+                int use_subpage_grants,
+                volatile void *msg_buf)
+{
+        volatile struct netchannel2_msg_packet *msg = msg_buf;
+        struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
+        struct grant_packet_plan plan;
+        struct sk_buff *cur_skb;
+        unsigned x;
+        struct skb_shared_info *shinfo;
+        skb_frag_t *frag;
+
+        memset(&plan, 0, sizeof(plan));
+        plan.use_subpage_grants = use_subpage_grants;
+        plan.prefix_avail = skb_co->inline_prefix_size;
+        plan.out_fragment = msg->frags;
+
+        /* We allocate a gref pool with one gref for every fragment in
+           prepare_xmit_allocate_grant() */
+        plan.gref_pool = skb_co->gref_pool;
+        plan.gref_pool_left = skb_co->nr_fragments;
+
+        ncrp->count_frags_no_event += skb_co->nr_fragments;
+        if (ncrp->count_frags_no_event >= ncrp->max_count_frags_no_event) {
+                msg->flags |= NC2_PACKET_FLAG_need_event;
+                ncrp->count_frags_no_event = 0;
+        }
+
+        for (cur_skb = skb;
+             cur_skb != NULL;
+             cur_skb = skb_shinfo(cur_skb)->frag_list) {
+                grant_data_area(ncrp, cur_skb, &plan);
+                if (skb_is_nonlinear(cur_skb)) {
+                        shinfo = skb_shinfo(cur_skb);
+                        for (x = 0; x < shinfo->nr_frags; x++) {
+                                frag = &shinfo->frags[x];
+                                prepare_subpage_grant(ncrp,
+                                                      frag->page,
+                                                      frag->page_offset,
+                                                      frag->size,
+                                                      &plan);
+                        }
+                }
+        }
+
+        BUG_ON(plan.gref_pool_left != 0);
+
+#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE
+        if (plan.could_have_used_bypass &&
+            ncrp == &ncrp->interface->rings &&
+            ncrp->interface->extant_bypasses < ncrp->interface->max_bypasses)
+                msg->flags |= NC2_PACKET_FLAG_bypass_candidate;
+#endif
+}
+
diff --git a/drivers/net/xen-netchannel2/sysfs.c b/drivers/net/xen-netchannel2/sysfs.c
new file mode 100644 (file)
index 0000000..177fb23
--- /dev/null
@@ -0,0 +1,121 @@
+/* HACK HACK HACK: this gets #include'd into netback2.c and
+   netfront2.c, and provides their sysfs statistics files. */
+
+#define TX_STATS()                                                      \
+TX_STAT(nr_too_busy)                                                    \
+TX_STAT(nr_queue_tasklet)                                               \
+TX_STAT(nr_tasklet_action)                                              \
+TX_STAT(nr_notifies)                                                    \
+TX_STAT(nr_csum_blank)                                                  \
+TX_STAT(nr_csum_validated)                                              \
+TX_STAT(nr_gso)                                                         \
+TX_STAT(nr_failed_alloc_packet)                                         \
+TX_STAT(nr_failed_alloc_fragment)                                       \
+TX_STAT(nr_failed_reserve_ring)                                         \
+TX_STAT(nr_failed_no_buffers)
+#define RX_STATS()                                                      \
+RX_STAT(nr_irqs)                                                        \
+RX_STAT(nr_polls)                                                       \
+RX_STAT(nr_ring_overflow)                                               \
+RX_STAT(nr_messages)                                                    \
+RX_STAT(nr_ring_race)                                                   \
+RX_STAT(nr_notify)                                                      \
+RX_STAT(nr_unstick)                                                     \
+RX_STAT(nr_incomplete_poll)                                             \
+RX_STAT(nr_failed_no_packet)                                            \
+RX_STAT(nr_failed_no_skb)                                               \
+RX_STAT(nr_mod_timer)                                                   \
+RX_STAT(nr_timer_expire)                                                \
+RX_STAT(nr_unmap)                                                       \
+RX_STAT(nr_put_packet)                                                  \
+RX_STAT(nr_gc_tasklet)                                                  \
+RX_STAT(nr_csum_validated)                                              \
+RX_STAT(nr_csum_blank)                                                  \
+RX_STAT(nr_gso)                                                         \
+RX_STAT(dropped_bad_mac)
+#define OPERATION_FIELDS()                                              \
+OPERATION_FIELD(nr_tx_buffers)                                          \
+OPERATION_FIELD(nr_rx_buffers)                                          \
+OPERATION_FIELD(nr_avail_tx_buffers)
+
+#define NC2_SHOW(rxtx, name, field, format)                             \
+static ssize_t show_##rxtx##_##name(struct device *_dev,                \
+                                    struct device_attribute *attr,      \
+                                    char *buf)                          \
+{                                                                       \
+        struct netchannel2 *nc = device_to_nc2(_dev);                   \
+                                                                        \
+        return sprintf(buf, format, nc-> rxtx . field );                \
+}                                                                       \
+static DEVICE_ATTR( rxtx##_##name, S_IRUGO, show_##rxtx##_##name, NULL);
+#define RX_STAT(name) NC2_SHOW(rx, name, name, "%u\n")
+RX_STATS()
+#define TX_STAT(name) NC2_SHOW(tx, name, name, "%u\n")
+TX_STATS()
+#undef TX_STAT
+#undef RX_STAT
+#undef NC2_SHOW
+
+#define NC2_SHOW_OP(name, format)                                       \
+static ssize_t show_operation_ ## name(struct device *_dev,             \
+                                       struct device_attribute *attr,   \
+                                       char *buf)                       \
+{                                                                       \
+        struct netchannel2 *nc = device_to_nc2(_dev);                   \
+                                                                        \
+        return sprintf(buf, format, nc-> name );                        \
+}                                                                       \
+static DEVICE_ATTR(name, S_IRUGO, show_operation_##name, NULL);
+
+#define OPERATION_FIELD(name) NC2_SHOW_OP(name, "%u\n")
+OPERATION_FIELDS()
+
+#undef OPERATION_FIELD
+#undef NC2_SHOW_OP
+
+#ifdef DEBUG
+static ssize_t do_debug_dump(struct device *device,
+                             struct device_attribute *attr,
+                             const char *buf,
+                             size_t count)
+{
+        struct netchannel2 *nc = device_to_nc2(device);
+        debug_dump_nc2_struct(nc);
+        return count;
+}
+static DEVICE_ATTR(debug_dump, S_IWUSR, NULL, do_debug_dump);
+#endif
+
+#define ATTR(rxtx, name) &dev_attr_##rxtx##_##name .attr,
+#define RX_STAT(x) ATTR(rx, x)
+#define TX_STAT(x) ATTR(tx, x)
+#define OPERATION_FIELD(x) &dev_attr_##x .attr,
+
+static struct attribute *nc2stat_attrs[] = {
+        RX_STATS()
+        TX_STATS()
+        OPERATION_FIELDS()
+#ifdef DEBUG
+        &dev_attr_debug_dump.attr,
+#endif
+        NULL
+};
+#undef OPERATION_FIELD
+#undef TX_STAT
+#undef RX_STAT
+#undef ATTR
+
+static struct attribute_group nc2_stat_group = {
+        .name = "statistics",
+        .attrs = nc2stat_attrs,
+};
+
+static int nc2_sysfs_addif(struct xenbus_device *xd)
+{
+        return sysfs_create_group(&xd->dev.kobj, &nc2_stat_group);
+}
+
+static void nc2_sysfs_delif(struct xenbus_device *xd)
+{
+        sysfs_remove_group(&xd->dev.kobj, &nc2_stat_group);
+}
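
The statistics files above are generated with a standard X-macro pattern: a single list macro names every counter, and NC2_SHOW/ATTR are redefined around successive expansions of that list to emit first the show_*() functions and then the attribute table. A tiny self-contained userspace illustration of the same trick follows (the struct and counter names here are made up):

#include <stdio.h>

struct stats { unsigned rx_ok; unsigned rx_err; unsigned tx_ok; };

/* The single list of counters; everything else is generated from it. */
#define STAT_FIELDS()   \
        STAT(rx_ok)     \
        STAT(rx_err)    \
        STAT(tx_ok)

/* First expansion: one accessor per counter. */
#define STAT(name) \
static unsigned show_##name(const struct stats *s) { return s->name; }
STAT_FIELDS()
#undef STAT

/* Second expansion: a table tying names to accessors. */
struct stat_attr { const char *name; unsigned (*show)(const struct stats *); };
#define STAT(name) { #name, show_##name },
static const struct stat_attr attrs[] = { STAT_FIELDS() };
#undef STAT

int main(void)
{
        struct stats s = { 3, 1, 7 };
        unsigned i;

        for (i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++)
                printf("%s = %u\n", attrs[i].name, attrs[i].show(&s));
        return 0;
}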
diff --git a/drivers/net/xen-netchannel2/tools/destroy_bypass.c b/drivers/net/xen-netchannel2/tools/destroy_bypass.c
new file mode 100644 (file)
index 0000000..7e66535
--- /dev/null
@@ -0,0 +1,25 @@
+#include <sys/ioctl.h>
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "../netchannel2_uspace.h"
+
+int
+main(int argc, char *argv[])
+{
+        int fd;
+        struct netchannel2_ioctl_destroy_bypass ioc;
+        int r;
+
+        if (argc < 2)
+                errx(1, "usage: %s <bypass handle>", argv[0]);
+
+        fd = open("/dev/netback2", O_RDWR);
+        if (fd < 0)
+                err(1, "opening /dev/netback2");
+        ioc.handle = atoi(argv[1]);
+
+        r = ioctl(fd, NETCHANNEL2_IOCTL_DESTROY_BYPASS, &ioc);
+        if (r < 0)
+                err(1, "destroying bypass");
+        return 0;
+}
diff --git a/drivers/net/xen-netchannel2/tools/establish_bypass.c b/drivers/net/xen-netchannel2/tools/establish_bypass.c
new file mode 100644 (file)
index 0000000..e2a2e8c
--- /dev/null
@@ -0,0 +1,31 @@
+#include <sys/ioctl.h>
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "../netchannel2_uspace.h"
+
+int
+main(int argc, char *argv[])
+{
+        int fd;
+        unsigned a;
+        unsigned b;
+        struct netchannel2_ioctl_establish_bypass ioc;
+        int r;
+
+        if (argc < 3)
+                errx(1, "usage: %s <handle a> <handle b>", argv[0]);
+
+        fd = open("/dev/netback2", O_RDWR);
+        if (fd < 0)
+                err(1, "opening /dev/netback2");
+        a = atoi(argv[1]);
+        b = atoi(argv[2]);
+        ioc.handle_a = a;
+        ioc.handle_b = b;
+
+        r = ioctl(fd, NETCHANNEL2_IOCTL_ESTABLISH_BYPASS, &ioc);
+        if (r < 0)
+                err(1, "establishing bypass");
+        printf("%d\n", r);
+        return 0;
+}
diff --git a/drivers/net/xen-netchannel2/util.c b/drivers/net/xen-netchannel2/util.c
new file mode 100644 (file)
index 0000000..edef7f6
--- /dev/null
@@ -0,0 +1,490 @@
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/version.h>
+#ifdef CONFIG_XEN_NETDEV2_BACKEND
+#include <xen/driver_util.h>
+#endif
+#ifdef CONFIG_PARAVIRT
+#include <xen/grant_table.h>
+#else
+#include <xen/gnttab.h>
+#endif
+#include "netchannel2_core.h"
+
+#ifdef DEBUG
+static int list_length(struct list_head *head)
+{
+        struct list_head *lh;
+        int cntr;
+        cntr = 0;
+        __list_for_each(lh, head)
+                cntr++;
+        return cntr;
+}
+
+void _sanity_check_list(struct list_head *root, const char *file, int line)
+{
+        struct list_head *cursor1;
+        struct list_head *cursor2;
+        unsigned counter;
+        counter = 1;
+        cursor1 = cursor2 = root;
+        for (;;) {
+                if (counter++ % 10000 == 0) {
+                        printk(KERN_EMERG "Stupidly long list.\n");
+                        goto bad;
+                }
+                cursor1 = cursor1->next;
+                if (cursor1 == root)
+                        return;
+                if (cursor1 == cursor2)
+                        goto bad;
+                cursor1 = cursor1->next;
+                if (cursor1 == root)
+                        return;
+                if (cursor1 == cursor2)
+                        goto bad;
+                cursor2 = cursor2->next;
+                if (cursor1 == cursor2)
+                        goto bad;
+        }
+bad:
+        cursor1 = cursor2 = root;
+        printk(KERN_CRIT "Bad %p list at %s:%d\n", root, file, line);
+        for (;;) {
+                cursor1 = cursor1->next;
+                printk(KERN_CRIT "1 %p\n", cursor1);
+                if (cursor1 == cursor2)
+                        break;
+                cursor1 = cursor1->next;
+                printk(KERN_CRIT "1 %p\n", cursor1);
+                if (cursor1 == cursor2)
+                        break;
+                cursor2 = cursor2->next;
+                printk(KERN_CRIT "2 %p\n", cursor2);
+                if (cursor1 == cursor2)
+                        break;
+        }
+        BUG();
+}
+#define sanity_check_list(x) _sanity_check_list(x, __FILE__, __LINE__)
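
_sanity_check_list() is a tortoise-and-hare walk over the circular list: cursor1 advances two nodes per iteration and cursor2 one, so the two can only meet again before returning to the root if corruption has turned part of the list into a cycle that bypasses the root. A standalone sketch of the same detection on a bare singly linked list (illustrative only, no kernel list_head involved):

#include <stdio.h>

struct node { struct node *next; };

/* Returns 1 if following ->next from 'head' never returns to 'head'
   (i.e. the list is corrupt), 0 if the list is a proper cycle through
   'head'.  Same two-pointer idea as _sanity_check_list(). */
static int list_is_corrupt(struct node *head)
{
        struct node *slow = head, *fast = head;

        for (;;) {
                fast = fast->next;
                if (fast == head)
                        return 0;
                if (fast == slow)
                        return 1;       /* met away from the root: corrupt */
                fast = fast->next;
                if (fast == head)
                        return 0;
                if (fast == slow)
                        return 1;
                slow = slow->next;
                if (fast == slow)
                        return 1;
        }
}

int main(void)
{
        struct node a, b, c;

        /* Well-formed circular list: a -> b -> c -> a. */
        a.next = &b; b.next = &c; c.next = &a;
        printf("good list corrupt? %d\n", list_is_corrupt(&a));

        /* Corrupt it: c now points back at b, so the cycle skips 'a'. */
        c.next = &b;
        printf("bad list corrupt? %d\n", list_is_corrupt(&a));
        return 0;
}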
+
+static void debug_dump_sring_prod(struct netchannel2_sring_prod *nsp)
+{
+        printk(KERN_DEBUG "\t\t\tprod_sring %p\n", nsp);
+        if (!nsp)
+                return;
+        printk(KERN_DEBUG "\t\t\tprod %d, cons %d, prod_event %d, cons_event %d\n",
+               nsp->prod, nsp->cons, nsp->prod_event, nsp->cons_event);
+}
+
+static void debug_dump_sring_cons(struct netchannel2_sring_cons *nsc)
+{
+        printk(KERN_DEBUG "\t\t\tcons_sring %p\n", nsc);
+        if (!nsc)
+                return;
+        printk(KERN_DEBUG "\t\t\tprod %d, cons %d, prod_event %d, cons_event %d\n",
+               nsc->prod, nsc->cons, nsc->prod_event, nsc->cons_event);
+}
+
+static void debug_dump_prod_ring(struct netchannel2_prod_ring *npr)
+{
+        printk(KERN_DEBUG "\t\tprod_ring %p\n", npr);
+        if (!npr)
+                return;
+        debug_dump_sring_prod(npr->sring);
+        printk(KERN_DEBUG "\t\tprod_pvt %d, bytes_available %d, reserve %d, payload %zd\n",
+               npr->prod_pvt, npr->bytes_available, npr->reserve,
+               npr->payload_bytes);
+}
+
+static void debug_dump_cons_ring(struct netchannel2_cons_ring *ncr)
+{
+        printk(KERN_DEBUG "\t\tcons_ring %p\n", ncr);
+        if (!ncr)
+                return;
+        debug_dump_sring_cons(ncr->sring);
+        printk(KERN_DEBUG "\t\tcons_pvt %d, payload %zd\n",
+               ncr->cons_pvt, ncr->payload_bytes);
+}
+
+static void debug_dump_rate_limiter(struct nc2_rate_limiter *nrl)
+{
+        printk(KERN_DEBUG "\t\trate limiter %p\n", nrl);
+        if (!nrl)
+                return;
+        printk(KERN_DEBUG "\t\tmax %d, per_tick_ord %d, fill_gran %d, stop %p, start %p, ctxt %p, cur %d, restart_thresh %d, last_fill %lld (now %lld)\n",
+               nrl->max_tokens, nrl->tokens_per_tick_ord, nrl->fill_granularity_tokens,
+               nrl->stop, nrl->start, nrl->ctxt, nrl->cur_tokens,
+               nrl->restart_thresh, nrl->last_fill_time, get_jiffies_64());
+}
+
+static void debug_dump_tx_frag(struct transmitted_fragment *txf)
+{
+        printk(KERN_DEBUG "\t\t\ttransmitted_fragment %p\n", txf);
+        printk(KERN_DEBUG "\t\t\tsize %d, off %d, data 0x%x\n",
+               txf->frag.size, txf->frag.off, txf->frag.pre_post.id);
+}
+
+static void debug_dump_pending_finish_packets(struct pending_finish_packets *pfp)
+{
+        printk(KERN_DEBUG "\t\tpending_finish_packets %p\n", pfp);
+        printk(KERN_DEBUG "\t\tprod %d, cons %d\n", pfp->prod, pfp->cons);
+}
+
+void debug_dump_ring_pair(struct netchannel2_ring_pair *ncr)
+{
+        struct transmitted_packet *txp;
+
+        printk(KERN_DEBUG "\tring pair %p\n", ncr);
+        debug_dump_prod_ring(&ncr->prod_ring);
+        debug_dump_cons_ring(&ncr->cons_ring);
+        printk(KERN_DEBUG "\tis_attached %d, detach_pending %d, need_flush %d, otherend_id %d\n",
+               ncr->is_attached, ncr->detach_pending, ncr->need_flush,
+               ncr->otherend_id);
+        debug_dump_rate_limiter(&ncr->limiter);
+        printk(KERN_DEBUG "\trlimit_disabled %d\n", ncr->rlimit_disabled);
+        printk(KERN_DEBUG "\tirq %d (disable %d), remote_mac %02x:%02x:%02x:%02x:%02x:%02x, filter_mac %d\n",
+               ncr->irq, ncr->irq_disable_count, ncr->remote_mac[0],
+               ncr->remote_mac[1], ncr->remote_mac[2], ncr->remote_mac[3],
+               ncr->remote_mac[4], ncr->remote_mac[5], ncr->filter_mac);
+        printk(KERN_DEBUG "\tnr_tx_packets_outstanding %d, max_tx_packets_outstanding %d, need_advertise_max_packets %d\n",
+               ncr->nr_tx_packets_outstanding, ncr->max_tx_packets_outstanding,
+               ncr->need_advertise_max_packets);
+        printk(KERN_DEBUG "\tpending_time_sensitive_messages %d, delayed_kick %d\n",
+               ncr->pending_time_sensitive_messages, ncr->delayed_kick);
+        debug_dump_pending_finish_packets(&ncr->pending_finish);
+        printk(KERN_DEBUG "\tis_pending %d\n", ncr->is_pending);
+}
+
+static void debug_dump_alternate_ring(struct nc2_alternate_ring *nar)
+{
+        printk(KERN_DEBUG "\talt ring %p.\n", nar);
+        printk(KERN_DEBUG "\tstate %x\n", nar->state);
+        debug_dump_ring_pair(&nar->rings);
+        printk(KERN_DEBUG "\tirq_disable %d\n", atomic_read(&nar->irq_disable_count));
+        printk(KERN_DEBUG "\thandle %d\n", nar->handle);
+}
+
+static void debug_dump_bypass(struct nc2_bypass *bypass)
+{
+        printk(KERN_DEBUG "bypass %p\n", bypass);
+        printk(KERN_DEBUG "refcnt %d, handle %d, need_advertise %d %d, need_disable %d %d, disable_sent %d %d, disabled %d %d\n",
+               atomic_read(&bypass->refcnt), bypass->handle,
+               bypass->need_advertise_a, bypass->need_advertise_b,
+               bypass->need_disable_a, bypass->need_disable_b,
+               bypass->disable_sent_a, bypass->disable_sent_b,
+               bypass->disabled_a, bypass->disabled_b);
+        printk(KERN_DEBUG "need_detach %d %d, detach_sent %d %d, detached %d %d\n",
+               bypass->need_detach_a, bypass->need_detach_b,
+               bypass->detach_sent_a, bypass->detach_sent_b,
+               bypass->detached_a, bypass->detached_b);
+}
+
+static void debug_dump_rx_buffer(struct nc2_rx_buffer *rxb)
+{
+        printk(KERN_DEBUG "\trx buffer %p\n", rxb);
+        printk(KERN_DEBUG "\tbuffer %p, gref %d, is_posted %d\n",
+               rxb->buffer, rxb->gref, rxb->is_posted);
+}
+
+static void debug_dump_tx_buffer(struct nc2_tx_buffer *txb)
+{
+        printk(KERN_DEBUG "\ttx buffer %p\n", txb);
+        printk(KERN_DEBUG "\tid %x, gref %d, off_in_page %d, size %d\n",
+               txb->id, txb->gref, txb->off_in_page, txb->size);
+}
+
+void debug_dump_nc2_struct(struct netchannel2 *nc)
+{
+        struct nc2_alternate_ring *ncr;
+        struct nc2_bypass *bypass;
+        struct nc2_rx_buffer *rxb;
+        struct nc2_tx_buffer *txb;
+
+        printk(KERN_DEBUG "nc2 %p\n", nc);
+        BUG_ON(nc->magic != NETCHANNEL2_MAGIC);
+        printk(KERN_DEBUG "remote_trusted %d, local_trusted %d.\n",
+               nc->remote_trusted, nc->local_trusted);
+        list_for_each_entry(ncr, &nc->alternate_rings, rings_by_interface)
+                debug_dump_alternate_ring(ncr);
+        printk(KERN_DEBUG "need_advertise_rings %d\n", nc->need_advertise_rings);
+        debug_dump_ring_pair(&nc->rings);
+        printk(KERN_DEBUG "pending_skb %p\n", nc->pending_skb);
+        printk(KERN_DEBUG "use_rx_csum %d, allow_tx_csum_offload %d, use_lro %d, allow_tso %d, need_advertise_offloads %d\n",
+               nc->use_rx_csum, nc->allow_tx_csum_offload, nc->use_lro,
+               nc->allow_tso, nc->need_advertise_offloads);
+        printk(KERN_DEBUG "need_advertise_bypasses %d\n",
+               nc->need_advertise_bypasses);
+        printk(KERN_DEBUG "bypasses a:\n");
+        list_for_each_entry(bypass, &nc->bypasses_a, a_list)
+                debug_dump_bypass(bypass);
+        list_for_each_entry(bypass, &nc->bypasses_b, b_list)
+                debug_dump_bypass(bypass);
+        printk(KERN_DEBUG "rx buffers:\n");
+        list_for_each_entry(rxb, &nc->rx_buffers, list)
+                debug_dump_rx_buffer(rxb);
+        printk(KERN_DEBUG "unposted rx buffers:\n");
+        list_for_each_entry(rxb, &nc->unposted_rx_buffers, list)
+                debug_dump_rx_buffer(rxb);
+        printk(KERN_DEBUG "unused rx buffers:\n");
+        list_for_each_entry(rxb, &nc->unused_rx_buffers, list)
+                debug_dump_rx_buffer(rxb);
+        printk(KERN_DEBUG "nr_rx_buffers %d, max_nr_rx_buffers %d\n",
+               nc->nr_rx_buffers, nc->max_nr_rx_buffers);
+        printk(KERN_DEBUG "rx_buffer_structs at %p.\n", nc->rx_buffer_structs);
+        printk(KERN_DEBUG "dont_post_buffers %d\n", nc->dont_post_buffers);
+        printk(KERN_DEBUG "avail_tx_buffers:\n");
+        list_for_each_entry(txb, &nc->avail_tx_buffers, list)
+                debug_dump_tx_buffer(txb);
+        printk(KERN_DEBUG "unused_tx_buffer_slots:\n");
+        list_for_each_entry(txb, &nc->unused_tx_buffer_slots, list)
+                debug_dump_tx_buffer(txb);
+        printk(KERN_DEBUG "pending_tx_buffer_return:\n");
+        list_for_each_entry(txb, &nc->pending_tx_buffer_return, list)
+                debug_dump_tx_buffer(txb);
+        printk(KERN_DEBUG "tx buffers at %p.\n", nc->tx_buffers);
+        printk(KERN_DEBUG "need_advertise_tx_buffers %d, nr_tx_buffers %d, nr_avail_tx_buffers %d, configured_nr_tx_buffers %d\n",
+               nc->need_advertise_tx_buffers, nc->nr_tx_buffers,
+               nc->nr_avail_tx_buffers, nc->configured_nr_tx_buffers);
+}
+#endif
+
+int allocate_txp_slot(struct netchannel2_ring_pair *ncrp,
+                      struct sk_buff *skb)
+{
+        struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
+        struct txp_slot *tp;
+
+        BUG_ON(skb_co->tp);
+
+        if (ncrp->head_free_tx_packet == INVALID_TXP_INDEX ||
+            ncrp->nr_tx_packets_outstanding ==
+                    ncrp->max_tx_packets_outstanding) {
+                return -1;
+        }
+
+        tp = &ncrp->tx_packets[ncrp->head_free_tx_packet];
+        ncrp->head_free_tx_packet = txp_get_next_free(tp);
+
+        txp_set_skb(tp, skb);
+        skb_co->tp = tp;
+        ncrp->nr_tx_packets_outstanding++;
+        return 0;
+}
+
+static inline void nc2_free_skb(struct netchannel2 *nc,
+                                struct sk_buff *skb)
+{
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+       nc2_vmq_t *vmq = &nc->vmq;
+       if (get_skb_overlay(skb)->policy == transmit_policy_vmq )
+               skb_queue_tail(&vmq->dealloc_queue, skb);
+       else
+#endif
+               dev_kfree_skb(skb);
+}
+
+void release_txp_slot(struct netchannel2_ring_pair *ncrp,
+                      struct sk_buff *skb)
+{
+        struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
+        struct txp_slot *tp = skb_co->tp;
+
+        BUG_ON(txp_get_skb(tp) != skb);
+
+        /* Try to keep the free TX packet list in order as far as
+         * possible, since that gives slightly better cache behaviour.
+         * It's not worth spending a lot of effort getting this right,
+         * though, so just use a simple heuristic: if we're freeing a
+         * packet, and the previous packet is already free, chain this
+         * packet directly after it, rather than putting it at the
+         * head of the list.  This isn't perfect by any means, but
+         * it's enough that you get nice long runs of contiguous
+         * packets in the free list, and that's all we really need.
+         * Runs much bigger than a cache line aren't really very
+         * useful, anyway. */
+        if (tp != ncrp->tx_packets && !txp_slot_in_use(tp - 1)) {
+                txp_set_next_free(tp, txp_get_next_free(tp - 1));
+                txp_set_next_free(tp - 1, tp - ncrp->tx_packets);
+        } else {
+                txp_set_next_free(tp, ncrp->head_free_tx_packet);
+                ncrp->head_free_tx_packet = tp - ncrp->tx_packets;
+        }
+        skb_co->tp = NULL;
+        ncrp->nr_tx_packets_outstanding--;
+}
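To make the heuristic described in the comment above concrete, here is a small worked example (editorial illustration only, not part of the patch):

    /* Suppose tx_packets[4] is already free and packets 5 and 6 are
     * released in that order.  Slot 5 is chained directly after slot 4
     * rather than at the head of the free list, and slot 6 after slot 5,
     * so a later burst of allocations walks 4, 5, 6 contiguously instead
     * of in reverse order of freeing. */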
+
+void release_tx_packet(struct netchannel2_ring_pair *ncrp,
+                       struct sk_buff *skb)
+{
+        struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
+        struct txp_slot *tp = skb_co->tp;
+        grant_ref_t gref;
+        int r;
+
+        if (skb_co->type == NC2_PACKET_TYPE_receiver_copy) {
+                while (1) {
+                        r = gnttab_claim_grant_reference(&skb_co->gref_pool);
+                        if (r == -ENOSPC)
+                                break;
+                        gref = (grant_ref_t)r;
+                        /* It's a subpage grant reference, so Xen
+                           guarantees to release it quickly.  Sit and
+                           wait for it to do so. */
+                        while (!nc2_end_foreign_access_ref(gref, 1)) {
+                                cpu_relax();
+                        }
+                        gnttab_free_grant_reference(gref);
+                }
+        } else if (skb_co->type == NC2_PACKET_TYPE_receiver_map) {
+                while (1) {
+                        r = gnttab_claim_grant_reference(&skb_co->gref_pool);
+                        if (r == -ENOSPC)
+                                break;
+                        gref = (grant_ref_t)r;
+                        r = nc2_end_foreign_access_ref(gref, 1);
+                        if (r == 0) {
+                                printk(KERN_WARNING "Failed to end remote access to packet memory.\n");
+                        } else {
+                                gnttab_free_grant_reference(gref);
+                        }
+                }
+        } else if (skb_co->gref_pool != 0) {
+                gnttab_free_grant_references(skb_co->gref_pool);
+        }
+
+        if (tp != NULL)
+                release_txp_slot(ncrp, skb);
+
+        nc2_free_skb(ncrp->interface, skb);
+}
+
+void fetch_fragment(struct netchannel2_ring_pair *ncrp,
+                    unsigned idx,
+                    struct netchannel2_fragment *frag,
+                    unsigned off)
+{
+        nc2_copy_from_ring_off(&ncrp->cons_ring,
+                               frag,
+                               sizeof(*frag),
+                               off + idx * sizeof(*frag));
+}
+
+/* Copy @count bytes from the skb's data area into its head, updating
+ * the pointers as appropriate.  The caller should ensure that there
+ * is actually enough space in the head. */
+void pull_through(struct sk_buff *skb, unsigned count)
+{
+        unsigned frag = 0;
+        unsigned this_frag;
+        void *buf;
+        void *va;
+
+        while (count != 0 && frag < skb_shinfo(skb)->nr_frags) {
+                this_frag = skb_shinfo(skb)->frags[frag].size;
+                if (this_frag > count)
+                        this_frag = count;
+                va = page_address(skb_shinfo(skb)->frags[frag].page);
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,20)
+                buf = skb_tail_pointer(skb);
+#else
+                buf = skb->tail;
+#endif
+                memcpy(buf, va + skb_shinfo(skb)->frags[frag].page_offset,
+                       this_frag);
+                skb->tail += this_frag;
+                BUG_ON(skb->tail > skb->end);
+                skb_shinfo(skb)->frags[frag].size -= this_frag;
+                skb_shinfo(skb)->frags[frag].page_offset += this_frag;
+                skb->data_len -= this_frag;
+                count -= this_frag;
+                frag++;
+        }
+        for (frag = 0;
+             frag < skb_shinfo(skb)->nr_frags &&
+                     skb_shinfo(skb)->frags[frag].size == 0;
+             frag++) {
+                put_page(skb_shinfo(skb)->frags[frag].page);
+        }
+        skb_shinfo(skb)->nr_frags -= frag;
+        memmove(skb_shinfo(skb)->frags,
+                skb_shinfo(skb)->frags+frag,
+                sizeof(skb_shinfo(skb)->frags[0]) *
+                skb_shinfo(skb)->nr_frags);
+}
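A caller would typically use pull_through() to make sure the headers it is about to parse sit in the linear area of the skb.  A minimal sketch, assuming the caller has already checked that the skb head has enough tailroom as the comment above requires (the 64-byte threshold is purely illustrative):

    /* Pull up to 64 bytes of fragment data into the linear head before
     * inspecting protocol headers. */
    if (skb_headlen(skb) < 64 && skb->data_len != 0)
            pull_through(skb, 64 - skb_headlen(skb));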
+
+#ifdef CONFIG_XEN_NETDEV2_BACKEND
+
+/* Zap a grant_mapping structure, releasing all mappings and the
+   reserved virtual address space.  Prepare the grant_mapping for
+   re-use. */
+void nc2_unmap_grants(struct grant_mapping *gm)
+{
+        struct gnttab_unmap_grant_ref op[MAX_GRANT_MAP_PAGES];
+        int i;
+
+        if (gm->mapping == NULL)
+                return;
+        for (i = 0; i < gm->nr_pages; i++) {
+                gnttab_set_unmap_op(&op[i],
+                                    (unsigned long)gm->mapping->addr +
+                                            i * PAGE_SIZE,
+                                    GNTMAP_host_map,
+                                    gm->handles[i]);
+        }
+        if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, op, i))
+                BUG();
+        free_vm_area(gm->mapping);
+        memset(gm, 0, sizeof(*gm));
+}
+
+int nc2_map_grants(struct grant_mapping *gm,
+                   const grant_ref_t *grefs,
+                   unsigned nr_grefs,
+                   domid_t remote_domain)
+{
+        struct grant_mapping work;
+        struct gnttab_map_grant_ref op[MAX_GRANT_MAP_PAGES];
+        int i;
+
+        memset(&work, 0, sizeof(work));
+
+        if (nr_grefs > MAX_GRANT_MAP_PAGES || nr_grefs == 0)
+                return -EINVAL;
+
+        if (nr_grefs & (nr_grefs-1)) {
+                /* Must map a power-of-two number of pages. */
+                return -EINVAL;
+        }
+
+        work.nr_pages = nr_grefs;
+        work.mapping = alloc_vm_area(PAGE_SIZE * work.nr_pages);
+        if (!work.mapping)
+                return -ENOMEM;
+        for (i = 0; i < nr_grefs; i++)
+                gnttab_set_map_op(&op[i],
+                                  (unsigned long)work.mapping->addr +
+                                          i * PAGE_SIZE,
+                                  GNTMAP_host_map,
+                                  grefs[i],
+                                  remote_domain);
+
+        if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, nr_grefs))
+                BUG();
+
+        for (i = 0; i < nr_grefs; i++) {
+                if (op[i].status) {
+                        work.nr_pages = i;
+                        nc2_unmap_grants(&work);
+                        return -EFAULT;
+                }
+                work.handles[i] = op[i].handle;
+        }
+
+        nc2_unmap_grants(gm);
+        *gm = work;
+        return 0;
+}
+#endif
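For reference, a hedged usage sketch of the two helpers above, assuming a backend that has already read a power-of-two number of grant references for a shared ring from xenstore (ring_gm, grefs and otherend_domid are illustrative names, not from the patch):

    struct grant_mapping ring_gm;
    grant_ref_t grefs[4];              /* filled in by the caller */
    int err;

    memset(&ring_gm, 0, sizeof(ring_gm));
    err = nc2_map_grants(&ring_gm, grefs, 4, otherend_domid);
    if (err)                           /* -EINVAL, -ENOMEM or -EFAULT */
            return err;
    /* ... use the mapping at ring_gm.mapping->addr ... */
    nc2_unmap_grants(&ring_gm);        /* releases mappings and the vm area */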
diff --git a/drivers/net/xen-netchannel2/vmq.c b/drivers/net/xen-netchannel2/vmq.c
new file mode 100644 (file)
index 0000000..b98d910
--- /dev/null
@@ -0,0 +1,771 @@
+/*****************************************************************************
+ * vmq.c
+ *
+ * Support multi-queue network devices.
+ *
+ * Copyright (c) 2008, Kaushik Kumar Ram, Rice University.
+ * Copyright (c) 2008, Jose Renato Santos, Hewlett-Packard Co.
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/netvmq.h>
+#include <linux/skbuff.h>
+#include <xen/xenbus.h>
+#include <xen/balloon.h>
+#include "netchannel2_core.h"
+
+#include "vmq.h"
+
+/* state of device queue when operating in vmq mode */
+#define VMQ_QUEUE_DISABLED  0
+#define VMQ_QUEUE_STARTING  1
+#define VMQ_QUEUE_ENABLED   2
+#define VMQ_QUEUE_CLOSING   3
+
+static inline unsigned long vmq_idx_to_pfn(nc2_vmq_t *vmq, unsigned int idx)
+{
+        return page_to_pfn(vmq->pages[idx]);
+}
+
+static inline unsigned long vmq_idx_to_kaddr(nc2_vmq_t *vmq, unsigned int idx)
+{
+       return (unsigned long)pfn_to_kaddr(vmq_idx_to_pfn(vmq, idx));
+}
+
+/* get vmq idx from page struct  */
+static long nc2_vmq_page_index(struct page *page)
+{
+       nc2_vmq_buf_t *vmq_buf;
+        vmq_buf = (nc2_vmq_buf_t *)page->mapping;
+       return (vmq_buf - vmq_buf->nc->vmq.buffer);
+}
+
+/* Read a physical device name from xenstore and return a pointer to
+ * the associated net_device structure.  Returns NULL on error. */
+static struct net_device *read_pdev(struct xenbus_device *dev)
+{
+       char *pdevstr;
+       struct net_device *pdev = NULL;
+
+       pdevstr = xenbus_read(XBT_NIL, dev->nodename, "pdev", NULL);
+       if (IS_ERR(pdevstr))
+               return NULL;
+
+       if (pdevstr) {
+               pdev = dev_get_by_name(pdevstr);
+       }
+
+       kfree(pdevstr);
+
+       return pdev;
+}
+
+static void nc2_vmq_page_release(struct page *page)
+{
+       printk(KERN_ERR "%s: unexpected release of netchannel2 vmq page\n",
+              __FUNCTION__);
+       BUG();
+}
+
+static inline int nc2_vmq_is_disabled(struct netchannel2 *nc)
+{
+       return (nc->vmq.vmq_state == VMQ_QUEUE_DISABLED);
+}
+
+static inline int nc2_vmq_is_starting(struct netchannel2 *nc)
+{
+       return (nc->vmq.vmq_state == VMQ_QUEUE_STARTING);
+}
+
+static inline int nc2_vmq_is_enabled(struct netchannel2 *nc)
+{
+       return (nc->vmq.vmq_state == VMQ_QUEUE_ENABLED);
+}
+
+static inline int nc2_vmq_is_closing(struct netchannel2 *nc)
+{
+       return (nc->vmq.vmq_state == VMQ_QUEUE_CLOSING);
+}
+
+static inline void nc2_vmq_enable(struct netchannel2 *nc)
+{
+       nc2_vmq_t *vmq = &nc->vmq;
+       vmq_get(vmq);
+       vmq_enable_queue(vmq->pdev, vmq->vmq_id);
+       vmq->vmq_state = VMQ_QUEUE_ENABLED;
+}
+
+void nc2_vmq_disconnect(struct netchannel2 *nc)
+{
+       nc2_vmq_t *vmq = &nc->vmq;
+
+       if ( nc2_vmq_is_enabled(nc) ) {
+               vmq_disable_queue(vmq->pdev, vmq->vmq_id);
+               vmq_free_queue(vmq->pdev, vmq->vmq_id);
+               vmq->vmq_state = VMQ_QUEUE_CLOSING;
+               /* wait until all buffers have been returned by dev driver */
+               wait_event(vmq->waiting_to_free,
+                          atomic_read(&vmq->refcnt) == 0);
+               return;
+       }
+
+       if ( nc2_vmq_is_starting(nc) ) {
+               vmq_free_queue(vmq->pdev, vmq->vmq_id);
+               vmq->vmq_state = VMQ_QUEUE_CLOSING;
+               return;
+       }
+
+}
+
+
+static void nc2_vmq_end_map_buffers(gnttab_map_grant_ref_t *mop, int count,
+                                   struct netchannel2 *nc, u16 *alloc_idx)
+{
+       int i, err;
+       u16 idx;
+       unsigned int prod;
+       nc2_vmq_t *vmq = &nc->vmq;
+
+       prod = vmq->mapped_pages_prod;
+
+       for (i = 0; i < count; i++) {
+               
+               idx = alloc_idx[i];
+
+               /* Check error status */
+               err = mop->status;
+               if (likely(!err)) {
+                       set_phys_to_machine(
+                                           __pa(vmq_idx_to_kaddr(vmq, idx)) 
+                                           >> PAGE_SHIFT,
+                                           FOREIGN_FRAME(mop->dev_bus_addr 
+                                                         >> PAGE_SHIFT));
+                       
+                       /* Store the handle */
+                       vmq->buffer[idx].buf->grant_handle = mop->handle;
+       
+                       /* Add it to the mapped pages list */   
+                       vmq->mapped_pages[VMQ_IDX_MASK(prod++)] = idx;
+                       mop++;
+                       continue;
+               }
+
+               /* Error mapping page: return posted buffer to other end. 
+                * TODO: We might need an error field on the return buffer 
+                 * message */
+               return_tx_buffer(nc, vmq->buffer[idx].buf);
+
+               /* Add the page back to the free list */
+               vmq->unmapped_pages[VMQ_IDX_MASK(vmq->unmapped_pages_prod++)] 
+                       = idx;
+
+                mop++;
+       }
+       
+       smp_wmb();
+       vmq->mapped_pages_prod = prod;
+
+       return;
+}
+
+/* Map guest buffers and place them in the mapped buffers list. The mapped
+ * pages in this list are used when allocating a skb (vmq_alloc_skb()).
+ */
+static void nc2_vmq_map_buffers(struct netchannel2 *nc)
+{
+        u16 idx;
+       int count = 0;
+       unsigned int cons;
+       int nbufs;
+       int buf_avail;
+        struct nc2_tx_buffer *buf;
+       struct nc2_vmq *vmq = &nc->vmq;
+       int n_mapped = nr_vmq_bufs(nc);
+
+
+       /*
+        * Putting hundreds of bytes on the stack is considered rude.
+        * Static works because a tasklet can only be on one CPU at any time.
+        */
+       static gnttab_map_grant_ref_t rx_map_ops[VMQ_MAX_BUFFERS];
+       static u16 alloc_idx[VMQ_MAX_BUFFERS];
+
+       /* If there are at least VMQ_MIN_BUFFERS buffers mapped, there is
+        * no work to do. */
+       if (n_mapped >= VMQ_MIN_BUFFERS)
+               return;
+
+       /* Try to get VMQ_MAX_BUFFERS mapped buffers, if there are
+           sufficient buffers posted by the other end  */
+       nbufs = VMQ_MAX_BUFFERS - n_mapped;     
+       buf_avail = nc->nr_avail_tx_buffers;
+       if (nbufs > buf_avail)
+               nbufs = buf_avail;
+
+       /* Xen cannot handle more than 512 grant ops in a single hypercall */
+       if (nbufs > 512)
+               nbufs = 512;
+
+       /* give up if there are no buffers available */
+       if (nbufs <= 0)
+               return;
+       
+       /* Note that we *should* have free pages to consume here
+        * and no checks are needed.
+        */
+       cons = vmq->unmapped_pages_cons;
+
+       while (count < nbufs) {
+               idx = vmq->unmapped_pages[VMQ_IDX_MASK(cons++)];
+               buf = vmq->buffer[idx].buf = _get_tx_buffer(nc);
+               /* Setup grant map operation */
+               gnttab_set_map_op(&rx_map_ops[count], 
+                                 vmq_idx_to_kaddr(vmq, idx),
+                                 GNTMAP_host_map, 
+                                 buf->gref, 
+                                 nc->rings.otherend_id);               
+               alloc_idx[count] = idx;
+               count++;
+       }
+
+       vmq->unmapped_pages_cons = cons;
+
+        /* Map all the pages */
+        BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 
+                                        rx_map_ops, nbufs));
+       
+        /* Finalize buffer mapping after checking if the grant operations 
+          succeeded */
+        nc2_vmq_end_map_buffers(rx_map_ops, nbufs, nc, alloc_idx);
+
+       vmq->nbufs += nbufs;
+}
+
+static void nc2_vmq_unmap_buf(struct netchannel2 *nc,
+                             unsigned int idx, int recycle)
+{
+       nc2_vmq_t *vmq = &nc->vmq;
+       unsigned long pfn;
+       gnttab_unmap_grant_ref_t gop;
+       unsigned prod;
+       int ret;
+
+       pfn = vmq_idx_to_pfn(vmq, idx); 
+       /* Already unmapped? */
+       if (!phys_to_machine_mapping_valid(pfn))
+               return;
+       gnttab_set_unmap_op(&gop, vmq_idx_to_kaddr(vmq, idx),
+                           GNTMAP_host_map,
+                           vmq->buffer[idx].buf->grant_handle);
+       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &gop, 1);
+       BUG_ON(ret);
+
+       vmq->nbufs--;
+
+       set_phys_to_machine(__pa(vmq_idx_to_kaddr(vmq, idx)) >> 
+                           PAGE_SHIFT,
+                           INVALID_P2M_ENTRY);
+       /* Ready for next use. */
+       gnttab_reset_grant_page(vmq->pages[idx]);
+       /* Add the page back to the unmapped list */
+       prod = vmq->unmapped_pages_prod;
+       vmq->unmapped_pages[VMQ_IDX_MASK(prod++)] = idx;
+       if (recycle)
+               recycle_tx_buffer(nc, vmq->buffer[idx].buf);
+       else
+               free_tx_buffer(nc, vmq->buffer[idx].buf);
+       smp_wmb();
+       vmq->unmapped_pages_prod = prod;        
+}
+
+static void nc2_vmq_free_mapped_bufs(struct netchannel2 *nc) 
+{
+       nc2_vmq_t *vmq = &nc->vmq;
+       unsigned int idx;
+       unsigned prod, cons;
+
+       /* The queue should be disabled before this function is called */
+       BUG_ON(vmq->vmq_state == VMQ_QUEUE_ENABLED);
+
+       cons = vmq->mapped_pages_cons;
+       prod = vmq->mapped_pages_prod;
+       smp_rmb();
+       
+       while(cons != prod) {
+               idx = vmq->mapped_pages[VMQ_IDX_MASK(cons++)];
+               nc2_vmq_unmap_buf(nc, idx, 1);
+       }
+
+       vmq->mapped_pages_cons = cons;
+
+}
+
+static void nc2_vmq_free_skb(struct sk_buff *skb)
+{
+       struct netchannel2 *nc;
+       nc2_vmq_t *vmq;
+       unsigned int idx;
+       int nr_frags, i;
+       struct skb_shared_info *shinfo = skb_shinfo(skb); 
+       skb_frag_t *frags = shinfo->frags;
+
+       nc = netdev_priv(skb->dev);
+       vmq = &nc->vmq;
+
+       nr_frags = shinfo->nr_frags;
+       for (i = 0; i < nr_frags; i++) {
+               idx = nc2_vmq_page_index(frags[i].page);
+               nc2_vmq_unmap_buf(nc, idx, 1);
+       }
+       
+       shinfo->frag_list = NULL;
+       shinfo->nr_frags = 0;
+       
+       /* Add the skb back to the free pool */
+       skb_queue_tail(&vmq->free_skb_list, skb);               
+}
+
+/* Initialize the free socket buffer list */
+static int vmq_init_free_skb_list(int n, struct sk_buff_head *free_skb_list) {
+       int i;  
+       struct sk_buff *skb;
+
+       skb_queue_head_init(free_skb_list);     
+       
+       for(i = 0; i < n; i++) {
+               skb = alloc_skb(VMQ_SKB_SIZE, GFP_ATOMIC);
+               if(!skb) {
+                       printk("Netchannel2 vmq: Failed to allocate socket "
+                              "buffer %d (max=%d)\n", i,(int)n);
+                       goto error;
+               }
+               skb_queue_tail(free_skb_list, skb);     
+       }
+
+       return 0;
+error:
+       /* Free all the allocated buffers and return Error */
+       while(!skb_queue_empty(free_skb_list)) {
+               kfree_skb(skb_dequeue(free_skb_list));
+       }
+       
+       return -1;
+}
+
+/* Initialize vmq. Return 1 if vmq is used and 0 otherwise */
+int nc2_vmq_connect(struct netchannel2 *nc)
+{
+       nc2_vmq_t *vmq = &nc->vmq;
+       struct page *page;
+       int q_id;
+       int size;       
+       int i;
+
+       vmq->vmq_mode = 0;
+        vmq->pdev = read_pdev(nc->xenbus_device);
+
+       /* cannot use vmq mode if physical device not found */
+       if (!vmq->pdev)
+               return 0;
+
+       /* Allocate a RX queue */
+       if((q_id = vmq_alloc_queue(vmq->pdev, VMQ_TYPE_RX)) < 0)
+               /* Allocation failed, cannot use multi-queue */
+               goto free_pdev;
+
+       vmq->vmq_id = q_id;     
+
+       /* Set the size of the queue */
+       size = vmq_get_maxsize(vmq->pdev);              
+       if (size > VMQ_QUEUE_SIZE)
+               size = VMQ_QUEUE_SIZE;
+       if(vmq_set_size(vmq->pdev, q_id, size) < 0) {
+               /* Failure, free up the queue and return error */
+               printk("%s: could not set queue size on net device\n",
+                      __FUNCTION__);
+               goto free_queue;
+       }
+       vmq->vmq_size = size;
+
+       /* Set the mac address of the queue */
+       if(vmq_set_mac(vmq->pdev, q_id, nc->rings.remote_mac) < 0) {
+               /* Failure, free up the queue and return error */       
+               printk("%s: could not set MAC address for net device queue\n",
+                      __FUNCTION__);
+               goto free_queue;
+       }
+
+       vmq->pages = alloc_empty_pages_and_pagevec(VMQ_MAX_BUFFERS);
+       if (vmq->pages == NULL) {
+               printk("%s: out of memory\n", __FUNCTION__);
+               goto free_queue;
+       }
+
+       skb_queue_head_init(&vmq->dealloc_queue);       
+       skb_queue_head_init(&vmq->rx_queue);    
+
+       if(vmq_init_free_skb_list(VMQ_MAX_BUFFERS, 
+                                 &vmq->free_skb_list)) {
+               printk("%s: Could not allocate free socket buffers",
+                       __FUNCTION__);
+               goto free_pagevec;
+       }
+
+       for (i = 0; i < VMQ_MAX_BUFFERS; i++) {
+               vmq->buffer[i].nc = nc;
+               page = vmq->pages[i];
+               SetPageForeign(page, nc2_vmq_page_release);
+                page->mapping = (void *)&vmq->buffer[i];
+               vmq->unmapped_pages[i] = i;
+       }
+
+       vmq->unmapped_pages_prod = VMQ_MAX_BUFFERS;
+       vmq->unmapped_pages_cons = 0;
+       
+       vmq->mapped_pages_prod = 0;
+       vmq->mapped_pages_cons = 0;
+               
+       vmq->nbufs = 0;
+       vmq->vmq_mode = 1;
+
+       /* Store the pointer to netchannel2 device in pdev */
+       BUG_ON((vmq->pdev->vmq == NULL) || (vmq->pdev->vmq->queue == NULL)); 
+       vmq->pdev->vmq->queue[q_id].guest = (void*) nc->net_device;
+
+       atomic_set(&vmq->refcnt, 0);
+       init_waitqueue_head(&vmq->waiting_to_free);
+
+       printk(KERN_INFO "Netchannel2 using vmq mode for guest %d\n",
+              nc->xenbus_device->otherend_id);
+
+       vmq->vmq_state = VMQ_QUEUE_STARTING;
+
+       return 1;       /* Success */
+
+
+free_pagevec:
+       free_empty_pages_and_pagevec(vmq->pages, VMQ_MAX_BUFFERS);
+free_queue:
+       vmq_free_queue(vmq->pdev, vmq->vmq_id);
+free_pdev:
+       dev_put(vmq->pdev);
+       vmq->pdev = NULL;
+       return 0;
+       
+}
+
+void nc2_vmq_shutdown(struct netchannel2 *nc)
+{
+       nc2_vmq_t *vmq = &nc->vmq;
+       int i;
+
+       if (!vmq->vmq_mode)
+               return;
+
+       /* All posted bufs should have been returned */
+       BUG_ON(nr_vmq_bufs(nc) != nr_vmq_mapped_bufs(nc));
+
+       /* free the mapped bufs */
+       nc2_vmq_free_mapped_bufs(nc);
+       
+        /* Free the vmq pages */
+       if (vmq->pages) {
+               for (i = 0; i < VMQ_MAX_BUFFERS; i++) {
+                        if (PageForeign(vmq->pages[i]))
+                                ClearPageForeign(vmq->pages[i]);
+                        vmq->pages[i]->mapping = NULL;
+                }
+               free_empty_pages_and_pagevec(vmq->pages, VMQ_MAX_BUFFERS);
+               vmq->pages = NULL;
+       }
+       
+        while(!skb_queue_empty(&vmq->free_skb_list)) {
+               /* Free the socket buffer pool */
+               kfree_skb(skb_dequeue(&vmq->free_skb_list));
+       }
+       vmq->vmq_state = VMQ_QUEUE_DISABLED;
+       vmq->vmq_mode = 0;
+
+       if (vmq->pdev) {
+               dev_put(vmq->pdev);
+               vmq->pdev = NULL;
+       }
+
+       vmq_put(vmq);
+}
+
+static int prepare_xmit_allocate_vmq(struct netchannel2 *nc,
+                                     struct sk_buff *skb)
+{
+        unsigned msg_size;
+
+        msg_size = get_transmitted_packet_msg_size(skb);
+        if (!nc2_reserve_payload_bytes(&nc->rings.prod_ring, msg_size))
+                return -1;
+        return 0;
+}
+
+void do_vmq_work(struct netchannel2 *nc)
+{
+       nc2_vmq_t *vmq = &nc->vmq;
+       struct sk_buff *skb;
+        unsigned long flags;
+
+       /* if not in vmq mode do nothing */
+       if (!nc2_in_vmq_mode(nc))
+               return;
+
+       /* Map guest buffers for dedicated NIC RX queue if needed */
+       if (nr_vmq_bufs(nc) < VMQ_MIN_BUFFERS) {
+               nc2_vmq_map_buffers(nc);
+               /* We delay enabling the queue until we have enough
+                  posted buffers. Check if it is time to enable it */
+               if (nc2_vmq_is_starting(nc) && 
+                   (nr_vmq_bufs(nc) >= VMQ_MIN_BUFFERS)) {
+                       nc2_vmq_enable(nc);
+               }
+       }
+
+       /* free vmq skb's returned by the physical device driver */ 
+       while(!skb_queue_empty(&nc->vmq.dealloc_queue)) {
+               nc2_vmq_free_skb(skb_dequeue(&nc->vmq.dealloc_queue));
+       }
+
+       /* complete vmq closing after all packets returned by physical 
+        * device driver */
+
+       if (nc2_vmq_is_closing(nc) && 
+           (nr_vmq_bufs(nc) == nr_vmq_mapped_bufs(nc))) {
+               nc->vmq.vmq_state = VMQ_QUEUE_DISABLED;
+               nc2_vmq_shutdown(nc);
+       }
+
+       spin_lock_irqsave(&vmq->rx_queue.lock, flags);
+       while (!skb_queue_empty(&vmq->rx_queue)) {
+               skb = __skb_dequeue(&nc->vmq.rx_queue);
+               if (prepare_xmit_allocate_vmq(nc, skb) < 0) {
+                       __skb_queue_head(&vmq->rx_queue, skb);
+                       spin_unlock_irqrestore(&vmq->rx_queue.lock,flags);
+                       return;
+               }
+                __skb_queue_tail(&nc->rings.pending_tx_queue, skb);
+       }
+       spin_unlock_irqrestore(&vmq->rx_queue.lock,flags);
+}
+
+/* Return the netchannel2 device corresponding to the given queue in pdev */
+static inline struct net_device *nc2_vmq_queue_to_vif(struct net_device *pdev, 
+                                                     int queue_id) 
+{
+       net_vmq_t *n_vmq;
+        vmq_queue_t *vmq_q;
+
+       n_vmq = pdev->vmq;
+       BUG_ON(n_vmq == NULL);
+       vmq_q = &n_vmq->queue[queue_id];
+       BUG_ON(vmq_q == NULL);
+
+       return (struct net_device*)vmq_q->guest;
+}
+
+/* Handle incoming vmq packet */
+int vmq_netif_rx(struct sk_buff *skb, int queue_id) 
+{
+        struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
+       struct net_device *dev;
+       struct netchannel2 *nc;
+       nc2_vmq_t *vmq;
+
+        memset(skb_co, 0, sizeof(*skb_co));
+
+       skb_co->nr_fragments = skb_shinfo(skb)->nr_frags;
+        skb_co->type = NC2_PACKET_TYPE_pre_posted;
+        skb_co->policy = transmit_policy_vmq;
+
+       /* get the netchannel2 interface corresponding to this queue */
+       dev = nc2_vmq_queue_to_vif(skb->dev, queue_id);
+       nc = netdev_priv(dev);
+       vmq = &nc->vmq;
+
+       /* replace source dev with destination dev */
+       skb->dev = dev;
+       /* add skb to rx_queue */
+       skb_queue_tail(&vmq->rx_queue, skb);
+
+       /* Trigger thread execution to process new packets */
+        nc2_kick(&nc->rings);
+
+       return 0;
+}
+
+
+/* Allocate a socket buffer from the free list, attach enough
+ * guest-posted buffers to cover @length, and return it.
+ */
+struct sk_buff *vmq_alloc_skb(struct net_device *netdevice, int queue_id,
+                             unsigned int length) 
+{
+       struct sk_buff *skb;
+        struct netchannel2 *nc;
+       nc2_vmq_t *vmq;
+       unsigned int idx;
+       int nr_bufs, i;
+       unsigned int cons;
+       unsigned int prod;
+
+       /* get the netchannel2 interface corresponding to this queue */
+        nc = netdev_priv(nc2_vmq_queue_to_vif(netdevice, queue_id));
+
+       vmq = &nc->vmq;
+
+       /* Get a free buffer from the pool */
+       if(skb_queue_empty(&vmq->free_skb_list)) {
+               /* No buffers to allocate */
+               return NULL;
+       }
+
+
+       skb = skb_dequeue(&vmq->free_skb_list);
+       BUG_ON(skb == NULL);
+
+       nr_bufs = VMQ_NUM_BUFFERS(length);
+
+       cons = vmq->mapped_pages_cons;
+       prod = vmq->mapped_pages_prod;
+       smp_rmb();      
+
+       if(nr_bufs > (prod - cons))
+               /* Not enough mapped buffers in the pool */
+               goto kick_nc2;
+
+       if(nr_bufs > MAX_SKB_FRAGS)
+               goto error;
+
+       for(i = 0; i < nr_bufs; i++) {
+               idx = vmq->mapped_pages[VMQ_IDX_MASK(cons)];    
+               /* FIX ME: This can be simplified */
+               skb_shinfo(skb)->frags[i].page = 
+                       virt_to_page(vmq_idx_to_kaddr(vmq,idx));
+               skb_shinfo(skb)->frags[i].page_offset = 0;
+               skb_shinfo(skb)->frags[i].size = PAGE_SIZE;
+               skb_shinfo(skb)->nr_frags++;
+               skb->dev = netdevice;
+               cons++;
+       }
+
+       vmq->mapped_pages_cons = cons;
+
+       /* If the number of buffers gets low, run the tasklet to map more buffers */
+       if (nr_vmq_bufs(nc)  < VMQ_MIN_BUFFERS)
+               nc2_kick(&nc->rings);
+
+       return skb;
+
+kick_nc2:
+       /* kick netchannel2 interface to get any recently posted buffers */
+       nc2_kick(&nc->rings);   
+error:
+       /* Add the skb back to the free pool */
+       skb_queue_tail(&vmq->free_skb_list, skb);
+       return NULL;
+}
+
+/* Detach the guest pages and free the socket buffer */
+void vmq_free_skb(struct sk_buff *skb, int queue_id)
+{
+       struct net_device *dev;
+       struct netchannel2 *nc;
+       nc2_vmq_t *vmq;
+       
+       /* get the netchannel2 interface corresponding to this queue */
+       dev = nc2_vmq_queue_to_vif(skb->dev, queue_id);
+
+        nc = netdev_priv(dev);
+       vmq = &nc->vmq;
+
+       /* Add skb to the dealloc queue */
+       skb->dev = dev;
+       skb_queue_tail(&vmq->dealloc_queue, skb);
+
+       /* kick netchannel2 interface  */
+       nc2_kick(&nc->rings);   
+
+}
+
+int nc2_is_vmq_packet(struct netchannel2 *nc, struct sk_buff *skb)
+{
+       int nr_frags;
+       long idx;
+       nc2_vmq_t *vmq = &nc->vmq;
+
+        nr_frags = skb_shinfo(skb)->nr_frags;
+       if (vmq->vmq_mode && nr_frags &&
+           PageForeign(skb_shinfo(skb)->frags[0].page)) {
+               idx = nc2_vmq_page_index(skb_shinfo(skb)->frags[0].page);
+               if ( (idx >= 0) && (idx < VMQ_MAX_BUFFERS) )
+                       return 1;
+       }
+
+       return 0;
+}
+
+/* Prepare to transmit a vmq packet */
+void xmit_vmq(struct netchannel2 *nc, struct sk_buff *skb,
+              volatile void *msg_buf)
+{
+        volatile struct netchannel2_msg_packet *msg = msg_buf;
+        volatile struct netchannel2_fragment *out_frag;
+       nc2_vmq_t *vmq = &nc->vmq;
+        skb_frag_t *frag;
+        struct nc2_tx_buffer *txbuf;
+       int nr_frags;
+       unsigned int idx;
+        unsigned x;
+
+       nr_frags = skb_shinfo(skb)->nr_frags;
+        for (x = 0; x < nr_frags; x++) {
+                frag = &skb_shinfo(skb)->frags[x];
+                out_frag = &msg->frags[x];
+
+               idx = nc2_vmq_page_index(frag->page);
+               txbuf = vmq->buffer[idx].buf;
+               out_frag->pre_post.id = txbuf->id;
+               out_frag->off  = frag->page_offset;
+               out_frag->size = frag->size;
+               /* TODO: need to batch unmap grants */
+               nc2_vmq_unmap_buf(nc, idx, 0);
+       }
+
+       /* Avoid unmapping the fragments' grants when the skb is freed
+        * later by nc2_vmq_free_skb(). */
+       skb_shinfo(skb)->nr_frags = 0;
+}
+
+EXPORT_SYMBOL(vmq_alloc_skb);
+EXPORT_SYMBOL(vmq_netif_rx);
+EXPORT_SYMBOL(vmq_free_skb);
diff --git a/drivers/net/xen-netchannel2/vmq.h b/drivers/net/xen-netchannel2/vmq.h
new file mode 100644 (file)
index 0000000..0af7924
--- /dev/null
@@ -0,0 +1,39 @@
+#ifndef VMQ_H__
+#define VMQ_H__
+
+#include "netchannel2_core.h"
+
+int nc2_vmq_connect(struct netchannel2 *nc);
+void nc2_vmq_disconnect(struct netchannel2 *nc);
+void do_vmq_work(struct netchannel2 *nc);
+int nc2_is_vmq_packet(struct netchannel2 *nc, struct sk_buff *skb);
+void xmit_vmq(struct netchannel2 *nc, struct sk_buff *skb,
+              volatile void *msg);
+
+#define vmq_get(_b)                                            \
+               atomic_inc(&(_b)->refcnt);
+
+#define vmq_put(_b)                                            \
+       do {                                                    \
+               if ( atomic_dec_and_test(&(_b)->refcnt) ) {     \
+                       wake_up(&(_b)->waiting_to_free);        \
+               }                                               \
+       } while (0)
+
+static inline int nr_vmq_mapped_bufs(struct netchannel2 *nc)
+{      
+       return nc->vmq.mapped_pages_prod - 
+               nc->vmq.mapped_pages_cons;
+}
+
+static inline int nr_vmq_bufs(struct netchannel2 *nc)
+{      
+       return nc->vmq.nbufs;
+}
+
+static inline int nc2_in_vmq_mode(struct netchannel2 *nc)
+{
+       return nc->vmq.vmq_mode;
+}
+
+#endif /* !VMQ_H__ */
diff --git a/drivers/net/xen-netchannel2/vmq_def.h b/drivers/net/xen-netchannel2/vmq_def.h
new file mode 100644 (file)
index 0000000..d187a2b
--- /dev/null
@@ -0,0 +1,72 @@
+#ifndef VMQ_DEF_H__
+#define VMQ_DEF_H__
+
+
+/* No matter what the other end wants, we never post more than this
+   number of RX buffers to it. */
+#define MAX_POSTED_BUFFERS (2048+256)
+
+/* size of HW queue in VMQ device */
+#define VMQ_QUEUE_SIZE 1024
+
+/* Minimum number of buffers needed for VMQ.
+ * This is the low-water mark that triggers mapping more guest buffers.
+ * It should be larger than the queue size to allow for in-flight packets.
+ */
+#define VMQ_MIN_BUFFERS 1920
+
+/* Maximum number of posted buffers which are reserved for VMQ.
+ * This should be less than MAX_POSTED_BUFFERS; for now, the difference can
+ * be used for intra-node guest-to-guest traffic.  When we map guest buffers
+ * we try to have VMQ_MAX_BUFFERS mapped; the difference
+ * (VMQ_MAX_BUFFERS - VMQ_MIN_BUFFERS) helps batch multiple grant map
+ * operations.
+ * VMQ_QUEUE_SIZE < VMQ_MIN_BUFFERS < VMQ_MAX_BUFFERS < MAX_POSTED_BUFFERS.
+ * VMQ_MAX_BUFFERS must be a power of 2.
+ */
+#define VMQ_MAX_BUFFERS 2048
+
+/* skb size is zero since packet data uses fragments */
+#define VMQ_SKB_SIZE 0
+
+#define VMQ_NUM_BUFFERS(len) (((len) + PAGE_SIZE - 1) / PAGE_SIZE)
+
+#define VMQ_IDX_MASK(_i) ((_i)&(VMQ_MAX_BUFFERS-1))
+
+typedef struct nc2_vmq_buf {
+       struct nc2_tx_buffer *buf;
+       struct netchannel2   *nc;
+} nc2_vmq_buf_t;
+
+typedef struct nc2_vmq {
+       struct net_device *pdev;        /* Pointer to physical device */
+       int vmq_mode;                   /* indicate if vif is in vmq mode   */ 
+       struct page **pages;            /* pages for mapping guest RX bufs  */
+       struct sk_buff_head free_skb_list;     /* Free socket buffer pool   */
+       struct sk_buff_head dealloc_queue;     /* list of skbs to be freed  */
+       struct sk_buff_head rx_queue;          /* list of received packets  */
+
+       /* guest mapped buffers */
+       nc2_vmq_buf_t buffer[VMQ_MAX_BUFFERS];
+
+       /* Ring with free pages available for mapping guest RX buffers */
+       u16 unmapped_pages[VMQ_MAX_BUFFERS];
+       unsigned int unmapped_pages_prod;          
+       unsigned int unmapped_pages_cons;
+
+       /* Ring of mapped RX pages available for the vmq device */
+       u16 mapped_pages[VMQ_MAX_BUFFERS];
+       unsigned int mapped_pages_prod;
+       unsigned int mapped_pages_cons;
+
+       unsigned int nbufs;           /* number of vmq buffers: posted to   */
+                                     /* HW queue or available to be posted */
+       int vmq_id;                   /* Queue id    */
+       int vmq_size;                 /* Queue size  */
+       int vmq_state;                /* queue state */
+
+       atomic_t         refcnt;
+       wait_queue_head_t waiting_to_free;
+
+} nc2_vmq_t;
+
+#endif /* !VMQ_DEF_H__ */
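The unmapped_pages[] and mapped_pages[] arrays above are used as simple power-of-two rings: the prod and cons counters run freely and are only masked with VMQ_IDX_MASK() on access, which is why VMQ_MAX_BUFFERS must be a power of 2.  A minimal sketch of the pattern as vmq.c uses it (example_push/example_pop are illustrative helpers, not part of the patch):

    /* Producer: publish the entry, then advance prod. */
    static void example_push(nc2_vmq_t *vmq, u16 idx)
    {
            unsigned int prod = vmq->unmapped_pages_prod;

            vmq->unmapped_pages[VMQ_IDX_MASK(prod++)] = idx;
            smp_wmb();                 /* entry visible before prod update */
            vmq->unmapped_pages_prod = prod;
    }

    /* Consumer: the ring is empty when cons catches up with prod. */
    static int example_pop(nc2_vmq_t *vmq, u16 *idx)
    {
            unsigned int cons = vmq->unmapped_pages_cons;

            if (cons == vmq->unmapped_pages_prod)
                    return 0;
            smp_rmb();
            *idx = vmq->unmapped_pages[VMQ_IDX_MASK(cons++)];
            vmq->unmapped_pages_cons = cons;
            return 1;
    }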
diff --git a/drivers/net/xen-netchannel2/xmit_packet.c b/drivers/net/xen-netchannel2/xmit_packet.c
new file mode 100644 (file)
index 0000000..9b890c1
--- /dev/null
@@ -0,0 +1,424 @@
+/* Things related to actually sending packet messages, and which is
+   shared across all transmit modes. */
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include "netchannel2_core.h"
+
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+#include "vmq.h"
+#endif
+
+/* We limit the number of transmitted packets which can be in flight
+   at any one time, as a somewhat paranoid safety catch. */
+#define MAX_TX_PACKETS MAX_PENDING_FINISH_PACKETS
+
+static enum transmit_policy transmit_policy(struct netchannel2 *nc,
+                                            struct sk_buff *skb)
+{
+        if (skb->len <= PACKET_PREFIX_SIZE && !skb_is_nonlinear(skb))
+                return transmit_policy_small;
+        else if (nc->remote_trusted)
+                return transmit_policy_map;
+        else
+                return transmit_policy_grant;
+}
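In practice (editorial illustration, not part of the patch):

    /* A linear skb no longer than PACKET_PREFIX_SIZE goes out entirely
     * inline (transmit_policy_small).  Larger packets are granted to the
     * peer: full mappable grants (transmit_policy_map) if the remote end
     * is trusted, copy-only grants (transmit_policy_grant) otherwise. */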
+
+/* Allocate resources for a small packet.  The entire thing will be
+   transmitted in the ring.  This is only called for small, linear
+   SKBs.  It always succeeds, but has an int return type for symmetry
+   with the other prepare_xmit_*() functions. */
+int prepare_xmit_allocate_small(struct netchannel2_ring_pair *ncrp,
+                                struct sk_buff *skb)
+{
+        struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
+
+        BUG_ON(skb_is_nonlinear(skb));
+        BUG_ON(skb->len > NETCHANNEL2_MAX_INLINE_BYTES);
+
+        skb_co->type = NC2_PACKET_TYPE_small;
+        skb_co->gref_pool = 0;
+        skb_co->inline_prefix_size = skb->len;
+
+        return 0;
+}
+
+
+/* Figure out how much space @skb will take up on the ring. */
+unsigned get_transmitted_packet_msg_size(struct sk_buff *skb)
+{
+        struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
+        return (sizeof(struct netchannel2_msg_packet) +
+                sizeof(struct netchannel2_fragment) * skb_co->nr_fragments +
+                skb_co->inline_prefix_size + 7) & ~7;
+}
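The trailing "+ 7) & ~7" keeps every PACKET message 8-byte aligned on the ring.  A worked example with illustrative structure sizes (the real sizes are fixed by the wire format in netchannel2.h):

    /* Say sizeof(struct netchannel2_msg_packet) == 24 and
     * sizeof(struct netchannel2_fragment) == 8.  A packet carrying three
     * fragments plus a 14-byte inline prefix then needs
     *         24 + 3 * 8 + 14 = 62 bytes,
     * which rounds up to (62 + 7) & ~7 = 64 bytes of ring space. */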
+
+/* Do the minimum amount of work to be certain that when we come to
+   transmit this packet we won't run out of resources.  This includes
+   figuring out how we're going to fragment the packet for
+   transmission, which buffers we're going to use, etc. Return <0 if
+   insufficient resources are available right now, or 0 if we
+   succeed. */
+/* Careful: this may allocate e.g. a TXP slot and then discover that
+   it can't reserve ring space.  In that case, the TXP remains
+   allocated.  The expected case is that the caller will arrange for
+   us to retry the allocation later, in which case we'll pick up the
+   already-allocated buffers. */
+int prepare_xmit_allocate_resources(struct netchannel2 *nc,
+                                    struct sk_buff *skb)
+{
+        struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
+        enum transmit_policy policy;
+        unsigned msg_size;
+        int r;
+
+        if (skb_co->policy == transmit_policy_unknown) {
+                policy = transmit_policy(nc, skb);
+                switch (policy) {
+                case transmit_policy_small:
+                        r = prepare_xmit_allocate_small(&nc->rings, skb);
+                        break;
+                case transmit_policy_grant:
+                        r = prepare_xmit_allocate_grant(&nc->rings, skb, 1);
+                        break;
+                case transmit_policy_map:
+                        r = prepare_xmit_allocate_grant(&nc->rings, skb, 0);
+                        break;
+                case transmit_policy_post:
+                        r = prepare_xmit_allocate_post(nc, skb);
+                        break;
+                default:
+                        BUG();
+                        /* Shut the compiler up. */
+                        r = -1;
+                }
+                if (r < 0)
+                        return r;
+                skb_co->policy = policy;
+        }
+
+        msg_size = get_transmitted_packet_msg_size(skb);
+        if (nc2_reserve_payload_bytes(&nc->rings.prod_ring, msg_size))
+                return 0;
+
+        nc->tx.nr_failed_reserve_ring++;
+        return -1;
+}
+
+static void set_offload_flags(struct sk_buff *skb,
+                              volatile struct netchannel2_msg_packet *msg)
+{
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,20)
+        if (skb->ip_summed == CHECKSUM_PARTIAL)
+                msg->flags |=
+                        NC2_PACKET_FLAG_csum_blank |
+                        NC2_PACKET_FLAG_data_validated;
+#else
+        if (skb->ip_summed == CHECKSUM_HW)
+                msg->flags |=
+                        NC2_PACKET_FLAG_csum_blank |
+                        NC2_PACKET_FLAG_data_validated;
+#ifdef CONFIG_XEN
+        if (skb->proto_data_valid)
+                msg->flags |= NC2_PACKET_FLAG_data_validated;
+        if (skb->proto_csum_blank)
+                msg->flags |= NC2_PACKET_FLAG_csum_blank;
+#endif
+#endif
+
+        if (skb_shinfo(skb)->gso_size != 0) {
+                msg->mss = skb_shinfo(skb)->gso_size;
+                msg->segmentation_type = NC2_PACKET_SEGMENTATION_TYPE_tcpv4;
+        } else {
+                msg->mss = 0;
+                msg->segmentation_type = NC2_PACKET_SEGMENTATION_TYPE_none;
+        }
+}
+
+/* Transmit a packet which has previously been prepared with
+   prepare_xmit_allocate_resources(). */
+/* Once this has been called, the ring must not be flushed until the
+   TX hypercall batcher is (assuming this ring has a hypercall
+   batcher). */
+void nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp,
+                           struct sk_buff *skb)
+{
+        struct skb_cb_overlay *skb_co = get_skb_overlay(skb);
+        struct netchannel2 *nc = ncrp->interface;
+        unsigned msg_size;
+        volatile struct netchannel2_msg_packet *msg;
+        unsigned nr_credits;
+
+        ENTER();
+
+        msg_size = get_transmitted_packet_msg_size(skb);
+        /* Un-reserve the space we reserved for the packet. */
+        BUG_ON(ncrp->prod_ring.reserve < msg_size);
+        ncrp->prod_ring.reserve -= msg_size;
+        if (!nc2_can_send_payload_bytes(&ncrp->prod_ring, msg_size)) {
+                /* Aw, crud.  We had to transmit a PAD message at just
+                   the wrong time, and our attempt to reserve ring
+                   space failed.  Back all the way back out of
+                   transmitting this packet, stop the queue, and get
+                   out. */
+                printk("<0>Ring reservation failed, trying to recover...\n");
+                nc = ncrp->interface;
+                if (ncrp == &nc->rings) {
+                        /* Requeue the packet so that we'll try again
+                           when the ring's less busy */
+                        __skb_queue_head(&nc->pending_skbs, skb);
+                        nc->is_stopped = 1;
+                        netif_stop_queue(nc->net_device);
+                } else {
+                        /* Just drop it on the floor.  There isn't
+                           really anything else we can do. */
+                        release_tx_packet(ncrp, skb);
+                }
+                return;
+        }
+
+        __nc2_avoid_ring_wrap(&ncrp->prod_ring, msg_size);
+
+        /* Set up part of the message.  We do the message header
+           itself and the inline prefix.  The individual xmit_*
+           methods are responsible for the fragments.  They may also
+           set some more msg flags. */
+        msg = __nc2_get_message_ptr(&ncrp->prod_ring);
+        msg->hdr.type = NETCHANNEL2_MSG_PACKET;
+        msg->hdr.flags = 0;
+        msg->hdr.size = msg_size;
+        msg->id = skb_co->tp - ncrp->tx_packets;
+        msg->type = skb_co->type;
+        msg->flags = 0;
+        msg->prefix_size = skb_co->inline_prefix_size;
+
+        /* We cast away the volatile to avoid compiler warnings, and
+           then use barrier()s to discourage gcc from using msg->frags
+           in CSE or somesuch.  It's kind of unlikely that it would,
+           but better to make sure. */
+        barrier();
+        memcpy((void *)(msg->frags + skb_co->nr_fragments),
+               skb->data,
+               skb_co->inline_prefix_size);
+        barrier();
+
+        set_offload_flags(skb, msg);
+
+        switch (skb_co->policy) {
+#ifdef CONFIG_XEN_NETDEV2_VMQ
+        case transmit_policy_vmq:
+                xmit_vmq(nc, skb, msg);
+                break;
+#endif
+        case transmit_policy_small:
+                /* Nothing to do */
+                break;
+        case transmit_policy_grant:
+                xmit_grant(ncrp, skb, 1, msg);
+                break;
+        case transmit_policy_post:
+                xmit_post(nc, skb, msg);
+                break;
+        case transmit_policy_map:
+                xmit_grant(ncrp, skb, 0, msg);
+                break;
+        default:
+                BUG();
+        }
+
+        ncrp->prod_ring.prod_pvt += msg_size;
+
+        BUG_ON(ncrp->prod_ring.bytes_available < msg_size);
+
+        ncrp->prod_ring.bytes_available -= msg_size;
+
+        ncrp->pending_time_sensitive_messages = 1;
+
+        if (skb_co->tp) {
+                ncrp->expected_finish_messages++;
+                if (ncrp->expected_finish_messages == 1 &&
+                    !timer_pending(&ncrp->polling_timer))
+                        nc2_start_polling(ncrp);
+        }
+
+        /* Sending a message over the ring allows the other end to
+           send us (a) POST_BUFFERs to replenish the ones we've used,
+           (b) asymmetry_factor more packets, and (c) a FINISH_PACKET
+           message (if we're not using posted buffers, which don't
+           need FINISH messages). */
+        nr_credits = 4;
+        if (skb_co->policy == transmit_policy_post)
+                nr_credits += skb_co->nr_fragments;
+        else
+                nr_credits ++;
+        nc2_rate_limiter_credit(&ncrp->limiter, nr_credits);
+
+        if (skb_co->tp) {
+                /* We're now ready to accept a FINISH message for this
+                   packet. */
+                skb_co->expecting_finish = 1;
+        } else {
+                /* This packet doesn't need a FINISH message.  Queue
+                   it up to be released as soon as we flush the
+                   hypercall batcher and the ring. */
+                nc->stats.tx_bytes += skb->len;
+                nc->stats.tx_packets++;
+                __skb_queue_tail(&ncrp->release_on_flush_batcher, skb);
+        }
+
+        ncrp->need_flush = 1;
+
+        EXIT();
+}
+
+/* Arrange that @skb will be sent on ring @ncrp soon.  Assumes that
+   prepare_xmit_allocate_resources() has been successfully called on
+   @skb already. */
+void queue_packet_to_interface(struct sk_buff *skb,
+                               struct netchannel2_ring_pair *ncrp)
+{
+        __skb_queue_tail(&ncrp->pending_tx_queue, skb);
+        if (ncrp->pending_tx_queue.qlen == 1)
+                nc2_kick(ncrp);
+}
+
+int nc2_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+        struct netchannel2 *nc = netdev_priv(dev);
+        struct skb_cb_overlay *sco = get_skb_overlay(skb);
+        int r;
+
+        ENTER();
+
+        memset(sco, 0, sizeof(*sco));
+
+        spin_lock_bh(&nc->rings.lock);
+
+        /* If we have a bypass suitable for this packet then we prefer
+         * that to the main ring pair. */
+#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT
+        {
+                struct nc2_alternate_ring *ncr;
+                list_for_each_entry(ncr, &nc->alternate_rings,
+                                    rings_by_interface) {
+                        if (bypass_xmit_packet(nc, ncr, skb)) {
+                                spin_unlock_bh(&nc->rings.lock);
+                                return NETDEV_TX_OK;
+                        }
+                }
+        }
+#endif
+
+        if (!nc->rings.is_attached) {
+                spin_unlock_bh(&nc->rings.lock);
+                dev_kfree_skb(skb);
+                nc->stats.tx_dropped++;
+                return NETDEV_TX_OK;
+        }
+
+        r = prepare_xmit_allocate_resources(nc, skb);
+        if (r < 0)
+                goto out_busy;
+        queue_packet_to_interface(skb, &nc->rings);
+        spin_unlock_bh(&nc->rings.lock);
+
+       return NETDEV_TX_OK;
+
+out_busy:
+        /* Some more buffers may have arrived, so kick the worker
+         * thread to go and have a look. */
+        nc2_kick(&nc->rings);
+
+        __skb_queue_tail(&nc->pending_skbs, skb);
+       nc->is_stopped = 1;
+        netif_stop_queue(dev);
+        spin_unlock_bh(&nc->rings.lock);
+       return NETDEV_TX_OK;
+}
+
+
+void nc2_handle_finish_packet_msg(struct netchannel2 *nc,
+                                  struct netchannel2_ring_pair *ncrp,
+                                  struct netchannel2_msg_hdr *hdr)
+{
+        struct skb_cb_overlay *sco;
+        struct netchannel2_msg_finish_packet msg;
+        struct txp_slot *tp;
+        struct sk_buff *skb;
+
+        if (hdr->size < sizeof(msg)) {
+                pr_debug("Packet finish message had strange size %d\n",
+                         hdr->size);
+                return;
+        }
+        nc2_copy_from_ring(&ncrp->cons_ring, &msg, sizeof(msg));
+        if (msg.id >= NR_TX_PACKETS) {
+                pr_debug("Other end tried to end bad packet id %d\n",
+                         msg.id);
+                return;
+        }
+        tp = &ncrp->tx_packets[msg.id];
+        skb = txp_get_skb(tp);
+        if (!skb) {
+                pr_debug("Other end tried to end packet id %d which wasn't in use\n",
+                         msg.id);
+                return;
+        }
+        sco = get_skb_overlay(skb);
+        /* Careful: if the remote is malicious, they may try to end a
+           packet after we allocate it but before we send it (e.g. if
+           we've had to back out because we didn't have enough ring
+           space). */
+        if (!sco->expecting_finish) {
+                pr_debug("Other end finished packet before we sent it?\n");
+                return;
+        }
+        nc->stats.tx_bytes += skb->len;
+        nc->stats.tx_packets++;
+        release_tx_packet(ncrp, skb);
+        ncrp->expected_finish_messages--;
+}
+
+
+/* ------------------------ Control-path operations ---------------------- */
+void nc2_handle_set_max_packets_msg(struct netchannel2_ring_pair *ncrp,
+                                    struct netchannel2_msg_hdr *hdr)
+{
+        struct netchannel2_msg_set_max_packets msg;
+
+        if (hdr->size != sizeof(msg)) {
+                pr_debug("Set max packets message had strange size %d\n",
+                         hdr->size);
+                return;
+        }
+        if (ncrp->max_tx_packets_outstanding != 0) {
+                pr_debug("Other end tried to change number of outstanding packets from %d.\n",
+                         ncrp->max_tx_packets_outstanding);
+                return;
+        }
+        nc2_copy_from_ring(&ncrp->cons_ring, &msg, sizeof(msg));
+        /* Limit the number of outstanding packets to something sane.
+           This is a little bit paranoid (it should be safe to set
+           this arbitrarily high), but limiting it avoids nasty
+           surprises in untested configurations. */
+        if (msg.max_outstanding_packets > MAX_TX_PACKETS) {
+                pr_debug("Other end tried to set max outstanding to %d, limiting to %d.\n",
+                         msg.max_outstanding_packets, MAX_TX_PACKETS);
+                ncrp->max_tx_packets_outstanding = MAX_TX_PACKETS;
+        } else {
+                ncrp->max_tx_packets_outstanding = msg.max_outstanding_packets;
+        }
+}
+
+/* Release all packets on the transmitted and pending_tx lists. */
+void drop_pending_tx_packets(struct netchannel2_ring_pair *ncrp)
+{
+        struct sk_buff *skb;
+        unsigned x;
+
+        for (x = 0; x < NR_TX_PACKETS; x++) {
+                skb = txp_get_skb(&ncrp->tx_packets[x]);
+                if (skb)
+                        release_tx_packet(ncrp, skb);
+        }
+}
+
diff --git a/include/xen/interface/io/netchannel2.h b/include/xen/interface/io/netchannel2.h
new file mode 100644 (file)
index 0000000..f746b29
--- /dev/null
@@ -0,0 +1,367 @@
+#ifndef __NETCHANNEL2_H__
+#define __NETCHANNEL2_H__
+
+#include <xen/interface/io/uring.h>
+
+/* Tell the other end how many packets it is allowed to have
+ * simultaneously outstanding for transmission.  An endpoint must not
+ * send PACKET messages which would take it over this limit.
+ *
+ * The SET_MAX_PACKETS message must be sent before any PACKET
+ * messages.  It should only be sent once, unless the ring is
+ * disconnected and reconnected.
+ */
+#define NETCHANNEL2_MSG_SET_MAX_PACKETS 1
+struct netchannel2_msg_set_max_packets {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t max_outstanding_packets;
+};
+
+/* Pass a packet to the other end.  The packet consists of a header,
+ * followed by a bunch of fragment descriptors, followed by an inline
+ * packet prefix.  Every fragment descriptor in a packet must be the
+ * same type, and the type is determined by the header.  The receiving
+ * endpoint should respond with a finished_packet message as soon as
+ * possible.  The prefix may be no more than
+ * NETCHANNEL2_MAX_INLINE_BYTES.  Packets may contain no more than
+ * NETCHANNEL2_MAX_PACKET_BYTES bytes of data, including all fragments
+ * and the prefix.
+ */
+#define NETCHANNEL2_MSG_PACKET 2
+#define NETCHANNEL2_MAX_PACKET_BYTES 65536
+#define NETCHANNEL2_MAX_INLINE_BYTES 256
+struct netchannel2_fragment {
+        uint16_t size;
+        /* The offset is always relative to the start of the page.
+           For pre_posted packet types, it is not relative to the
+           start of the buffer (although the fragment range will
+           obviously be within the buffer range). */
+        uint16_t off;
+        union {
+                struct {
+                        grant_ref_t gref;
+                } receiver_copy;
+                struct {
+                        /* The id of a buffer which was previously
+                           posted in a POST_BUFFER message. */
+                        uint32_t id;
+                } pre_post;
+                struct {
+                        grant_ref_t gref;
+                } receiver_map;
+        };
+};
+struct netchannel2_msg_packet {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t id; /* Opaque ID which is echoed into the finished
+                        packet message. */
+        uint8_t type;
+        uint8_t flags;
+        uint8_t segmentation_type;
+        uint8_t pad;
+        uint16_t prefix_size;
+        uint16_t mss;
+        /* Variable-size array.  The number of elements is determined
+           by the size of the message. */
+        struct netchannel2_fragment frags[0];
+};
+#define NC2_PACKET_FLAG_csum_blank 1
+#define NC2_PACKET_FLAG_data_validated 2
+#define NC2_PACKET_FLAG_bypass_candidate 4
+#define NC2_PACKET_FLAG_need_event 8
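
Because the layout is header, fragment array, then inline prefix (with any
padding smaller than one fragment descriptor), a receiver can recover the
fragment count from the header alone.  A sketch of that arithmetic, assuming
the layout described above:

static unsigned example_nr_frags(const struct netchannel2_msg_packet *msg)
{
        /* hdr.size covers the fixed header, the fragment array, the
           inline prefix, and at most 7 bytes of padding; the padding
           is smaller than one fragment, so the division discards it. */
        return (msg->hdr.size - sizeof(*msg) - msg->prefix_size) /
                sizeof(struct netchannel2_fragment);
}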
+
+/* The mechanism which should be used to receive the data part of
+ * a packet:
+ *
+ * receiver_copy -- The transmitting domain has granted the receiving
+ *                  domain access to the original RX buffers using
+ *                  copy-only grant references.  The receiving domain
+ *                  should copy the data out of the buffers and issue
+ *                  a FINISH message.
+ *
+ *                  Due to backend bugs, it is not safe to use this
+ *                  packet type except on bypass rings.
+ *
+ * pre_posted -- The transmitting domain has copied the packet to
+ *               buffers which were previously provided in POST_BUFFER
+ *               messages.  No FINISH message is required, and it is
+ *               an error to send one.
+ *
+ *               This packet type may not be used on bypass rings.
+ *
+ * receiver_map -- The transmitting domain has granted the receiving
+ *                 domain access to the original RX buffers using
+ *                 full (mappable) grant references.  This can be
+ *                 treated the same way as receiver_copy, but the
+ *                 receiving domain also has the option of mapping
+ *                 the fragments, rather than copying them.  If it
+ *                 decides to do so, it should ensure that the fragments
+ *                 will be unmapped in a reasonably timely fashion,
+ *                 and don't e.g. become stuck in a receive buffer
+ *                 somewhere.  In general, anything longer than about
+ *                 a second is likely to cause problems.  Once all
+ *                 grant references have been unmapped, the receiving
+ *                 domain should send a FINISH message.
+ *
+ *                 This packet type may not be used on bypass rings.
+ *
+ * small -- The packet does not have any fragment descriptors
+ *          (i.e. the entire thing is inline in the ring).  The receiving
+ *          domain should simply copy the packet out of the ring
+ *          into a locally allocated buffer.  No FINISH message is required
+ *          or allowed.
+ *
+ *          This packet type may be used on any ring.
+ *
+ * All endpoints must be able to receive all packet types, but note
+ * that it is correct to treat receiver_map and small packets as
+ * receiver_copy ones. */
+#define NC2_PACKET_TYPE_receiver_copy 1
+#define NC2_PACKET_TYPE_pre_posted 2
+#define NC2_PACKET_TYPE_receiver_map 3
+#define NC2_PACKET_TYPE_small 4
+
+#define NC2_PACKET_SEGMENTATION_TYPE_none  0
+#define NC2_PACKET_SEGMENTATION_TYPE_tcpv4 1
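
A minimal sketch of the receive-side dispatch these rules imply; the actual
copy/map mechanics and all error handling are omitted:

static void example_handle_packet(const struct netchannel2_msg_packet *msg)
{
        switch (msg->type) {
        case NC2_PACKET_TYPE_small:
                /* Everything is inline in the ring: copy it out.  No
                   FINISH message is sent. */
                break;
        case NC2_PACKET_TYPE_receiver_copy:
                /* Copy each fragment through its copy-only gref, then
                   send FINISH_PACKET echoing msg->id. */
                break;
        case NC2_PACKET_TYPE_receiver_map:
                /* Either copy as above or map the grefs; send
                   FINISH_PACKET once every gref has been unmapped. */
                break;
        case NC2_PACKET_TYPE_pre_posted:
                /* Data has already been copied into buffers we posted
                   earlier.  No FINISH message is sent. */
                break;
        }
}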
+
+/* Tell the other end that we're finished with a message it sent us,
+   and it can release the transmit buffers etc.  This must be sent in
+   response to receiver_copy and receiver_map packets.  It must not be
+   sent in response to pre_posted or small packets. */
+#define NETCHANNEL2_MSG_FINISH_PACKET 3
+struct netchannel2_msg_finish_packet {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t id;
+};
+
+/* Tell the other end what sort of offloads we're going to let it use.
+ * An endpoint must not use any offload unless it has been enabled
+ * by a previous SET_OFFLOAD message. */
+/* Note that there is no acknowledgement for this message.  This means
+ * that an endpoint can continue to receive PACKET messages which
+ * require offload support for some time after it disables task
+ * offloading.  The endpoint is expected to handle this case correctly
+ * (which may just mean dropping the packet and returning a FINISH
+ * message, if appropriate).
+ */
+#define NETCHANNEL2_MSG_SET_OFFLOAD 4
+struct netchannel2_msg_set_offload {
+       struct netchannel2_msg_hdr hdr;
+        /* Checksum offload.  If this is 0, the other end must
+         * calculate checksums before sending the packet.  If it is 1,
+         * the other end does not have to perform the calculation.
+         */
+        uint8_t csum;
+        /* Segmentation offload.  If this is 0, the other end must not
+         * generate any packet messages with a segmentation type other
+         * than NC2_PACKET_SEGMENTATION_TYPE_none.  If it is 1, the
+         * other end may also generate packets with a type of
+         * NC2_PACKET_SEGMENTATION_TYPE_tcpv4.
+         */
+        uint8_t tcpv4_segmentation_offload;
+       uint16_t reserved;
+};
+
+/* Provide a buffer to the other end.  The buffer is initially empty.
+ * The other end is expected to either:
+ *
+ * -- Put some packet data in it, and return it as part of a
+ *    pre_posted PACKET message, or
+ * -- Not do anything with it, and return it in a RETURN_BUFFER
+ *    message.
+ *
+ * The other end is allowed to hold on to the buffer for as long as it
+ * wants before returning the buffer.  Buffers may be used out of
+ * order.
+ *
+ * This message cannot be sent unless the VM has received a
+ * SET_NR_POSTED_BUFFERS message.  The total number of outstanding
+ * buffers must not exceed the limit specified in the
+ * SET_NR_POSTED_BUFFERS message.
+ *
+ * The grant reference should be a whole-page reference, and not a
+ * subpage reference, because the receiving domain may need to map it
+ * in order to make the buffer available to hardware.  The current
+ * Linux implementation doesn't do this, but a future version will.
+ */
+#define NETCHANNEL2_MSG_POST_BUFFER 5
+struct netchannel2_msg_post_buffer {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t id;
+        grant_ref_t gref;
+        uint16_t off_in_page;
+        uint16_t size;
+};
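
For illustration, posting a single empty buffer might look like the sketch
below.  It assumes the caller has already granted the page to the other end
(obtaining @gref) and manages buffer ids and accounting itself:

static void example_post_buffer(struct netchannel2_prod_ring *ring,
                                grant_ref_t gref, uint32_t id,
                                uint16_t size)
{
        struct netchannel2_msg_post_buffer msg;

        memset(&msg, 0, sizeof(msg));
        msg.id = id;
        msg.gref = gref;
        msg.off_in_page = 0;
        msg.size = size;
        if (nc2_can_send_payload_bytes(ring, sizeof(msg)))
                nc2_send_message(ring, NETCHANNEL2_MSG_POST_BUFFER, 0,
                                 &msg, sizeof(msg));
}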
+
+/* The other end has decided not to use the buffer for some reason
+ * (usually because it's shutting down).  The buffer is returned
+ * containing no data.
+ */
+#define NETCHANNEL2_MSG_RETURN_POSTED_BUFFER 6
+struct netchannel2_msg_return_posted_buffer {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t id;
+};
+
+/* The other end is allowing us to post up to @nr_buffers buffers to
+ * it.  If @nr_buffers is 0, the use of posted buffers is disabled.
+ *
+ * If there are buffers outstanding, a SET_NR_POSTED_BUFFERS message
+ * implicitly returns all of them, as if they had been returned with a
+ * run of RETURN_POSTED_BUFFER messages.  This is true even if
+ * @nr_buffers is unchanged.
+ *
+ * @nr_buffers only ever provides an upper bound on the number of
+ * buffers posted; an endpoint may elect to post fewer than that.
+ */
+#define NETCHANNEL2_MSG_SET_NR_POSTED_BUFFERS 7
+struct netchannel2_msg_set_nr_posted_buffers {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t nr_buffers;
+};
+
+/* Attach to a bypass ring as a frontend.  The receiving domain should
+ * map the bypass ring (which will be in the sending domain's memory)
+ * and attach to it in the same way as it attached to the original ring.
+ * This bypass ring will, once it's been successfully set up, be used
+ * for all packets destined for @remote_mac (excluding broadcasts).
+ *
+ * @ring_domid indicates which domain allocated the ring pages, and
+ * hence which domain should be specified when grant mapping
+ * @control_gref, @prod_gref, and @cons_gref.
+ *
+ * @peer_domid indicates the domain ID of the domain on the other end
+ * of the ring.
+ *
+ * @handle gives a unique handle for the bypass which will be used in
+ * future messages.
+ *
+ * @peer_trusted is true if the peer should be trusted by the domain
+ * which sent the bypass message.
+ *
+ * @ring_pages gives the number of valid grefs in the @prod_grefs and
+ * @cons_grefs arrays.
+ *
+ * @is_backend_like indicates which ring attach the receiving domain
+ * should use.  If @is_backend_like is set, the receiving domain
+ * should interpret the control area as a netchannel2_backend_shared.
+ * Otherwise, it's a netchannel2_frontend_shared.  Also, a
+ * backend-like endpoint should receive an event channel from the peer
+ * domain, while a frontend-like one should send one.  Once
+ * established, the ring is symmetrical.
+ *
+ *
+ * BYPASS messages can only be sent by a trusted endpoint.  They may
+ * not be sent over bypass rings.
+ *
+ * No packets may be sent over the ring until a READY message is
+ * received.  Until that point, all packets must be sent over the
+ * parent ring.
+ */
+struct netchannel2_msg_bypass_common {
+        uint16_t ring_domid;
+        uint16_t peer_domid;
+        uint32_t handle;
+
+        uint8_t remote_mac[6];
+        uint8_t peer_trusted;
+        uint8_t ring_pages;
+
+        uint32_t control_gref;
+        uint32_t pad;
+
+        /* Followed by a run of @ring_pages uint32_t producer ring
+           grant references, then a run of @ring_pages uint32_t
+           consumer ring grant references */
+};
+
+#define NETCHANNEL2_MSG_BYPASS_FRONTEND 8
+struct netchannel2_msg_bypass_frontend {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t pad;
+        struct netchannel2_msg_bypass_common common;
+};
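
Since the gref arrays are carried inline after the fixed part of the message,
the total message size follows directly from @ring_pages; a sketch of the
arithmetic (illustrative only):

static size_t example_bypass_frontend_msg_size(unsigned ring_pages)
{
        /* Fixed part, then @ring_pages producer grefs followed by
           @ring_pages consumer grefs. */
        return sizeof(struct netchannel2_msg_bypass_frontend) +
                2 * ring_pages * sizeof(uint32_t);
}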
+
+#define NETCHANNEL2_MSG_BYPASS_BACKEND 9
+struct netchannel2_msg_bypass_backend {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t port;
+        struct netchannel2_msg_bypass_common common;
+};
+
+#define NETCHANNEL2_MSG_BYPASS_FRONTEND_READY 10
+struct netchannel2_msg_bypass_frontend_ready {
+       struct netchannel2_msg_hdr hdr;
+        int32_t port;
+};
+
+/* This message is sent on a bypass ring once the sending domain is
+ * ready to receive packets.  Until it has been received, the bypass
+ * ring cannot be used to transmit packets.  It may only be sent once.
+ *
+ * Note that it is valid to send packet messages before *sending* a
+ * BYPASS_READY message, provided a BYPASS_READY message has been
+ * *received*.
+ *
+ * This message can only be sent on a bypass ring.
+ */
+#define NETCHANNEL2_MSG_BYPASS_READY 11
+struct netchannel2_msg_bypass_ready {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t pad;
+};
+
+/* Disable an existing bypass.  This is sent over the *parent* ring,
+ * in the same direction as the original BYPASS message, when the
+ * bypassed domain wishes to disable the ring.  The receiving domain
+ * should stop sending PACKET messages over the ring, wait for FINISH
+ * messages for any outstanding PACKETs, and then acknowledge this
+ * message with a DISABLED message.
+ *
+ * This message may not be sent on bypass rings.
+ */
+#define NETCHANNEL2_MSG_BYPASS_DISABLE 12
+struct netchannel2_msg_bypass_disable {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t handle;
+};
+#define NETCHANNEL2_MSG_BYPASS_DISABLED 13
+struct netchannel2_msg_bypass_disabled {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t handle;
+};
+
+/* Detach from an existing bypass.  This is sent over the *parent*
+ * ring, in the same direction as the original BYPASS message, when the
+ * bypassed domain wishes to destroy the ring.  The receiving domain
+ * should immediately unmap the ring and respond with a DETACHED
+ * message.  Any PACKET messages which haven't already received a
+ * FINISH message are dropped.
+ *
+ * During a normal shutdown, this message will be sent after DISABLED
+ * messages have been received from both endpoints.  However, it can
+ * also be sent without a preceding DISABLE message if the other
+ * endpoint appears to be misbehaving or has crashed.
+ *
+ * This message may not be sent on bypass rings.
+ */
+#define NETCHANNEL2_MSG_BYPASS_DETACH 14
+struct netchannel2_msg_bypass_detach {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t handle;
+};
+#define NETCHANNEL2_MSG_BYPASS_DETACHED 15
+struct netchannel2_msg_bypass_detached {
+       struct netchannel2_msg_hdr hdr;
+        uint32_t handle;
+};
+
+#define NETCHANNEL2_MSG_SUGGEST_BYPASS 16
+struct netchannel2_msg_suggest_bypass {
+        struct netchannel2_msg_hdr hdr;
+        unsigned char mac[6];
+        uint16_t pad1;
+        uint32_t pad2;
+};
+
+#endif /* !__NETCHANNEL2_H__ */
diff --git a/include/xen/interface/io/uring.h b/include/xen/interface/io/uring.h
new file mode 100644 (file)
index 0000000..3b1c9f5
--- /dev/null
@@ -0,0 +1,437 @@
+#ifndef __XEN_PUBLIC_IO_URING_H__
+#define __XEN_PUBLIC_IO_URING_H__
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/system.h>
+
+typedef unsigned RING_IDX;
+
+#define NETCHANNEL2_MSG_PAD 255
+
+/* The sring structures themselves.  The _cons and _prod variants are
+   different views of the same bit of shared memory, and are supposed
+   to provide better checking of the expected use patterns.  Fields in
+   the shared ring are owned by either the producer end or the
+   consumer end.  If a field is owned by your end, the other end will
+   never modify it.  If it's owned by the other end, the other end is
+   allowed to modify it whenever it likes, and you can never do so.
+
+   Fields owned by the other end are always const (because you can't
+   change them).  They're also volatile, because there are a bunch
+   of places where we go:
+
+   local_x = sring->x;
+   validate(local_x);
+   use(local_x);
+
+   and it would be very bad if the compiler turned that into:
+
+   local_x = sring->x;
+   validate(sring->x);
+   use(local_x);
+
+   because that contains a potential TOCTOU race (hard to exploit, but
+   still present).  The compiler is only allowed to do that
+   optimisation because it knows that local_x == sring->x at the start
+   of the call to validate(), and it only knows that if it can reorder
+   the read of sring->x over the sequence point at the end of the
+   first statement.  In other words, it can only do the bad
+   optimisation if it knows that reads of sring->x are side-effect
+   free.  volatile stops it from making that assumption.
+
+   We don't need a full memory barrier here, because it's sufficient
+   to copy the volatile data into stable guest-local storage, and
+   volatile achieves that.  i.e. we don't need local_x to be precisely
+   sring->x, but we do need it to be a stable snapshot of some
+   previous value of sring->x.
+
+   Note that there are still plenty of other places where we *do* need
+   full barriers.  volatile just deals with this one, specific, case.
+
+   We could also deal with it by putting compiler barriers in all over
+   the place.  The downside of that approach is that you need to put
+   the barrier()s in lots of different places (basically, everywhere
+   which needs to access these fields), and it's easy to forget one.
+   barrier()s also have somewhat heavier semantics than volatile
+   (because they prevent all reordering, rather than just reordering
+   on this one field), although that's pretty much irrelevant because
+   gcc usually treats pretty much any volatile access as a call to
+   barrier().
+*/
+
+/* Messages are sent over sring pairs.  Each sring in a pair provides
+ * a unidirectional byte stream which can generate events when either
+ * the producer or consumer pointers cross a particular threshold.
+ *
+ * We define both sring_prod and sring_cons structures.  The two
+ * structures will always map onto the same physical bytes in memory,
+ * but they provide different views of that memory which are
+ * appropriate to either producers or consumers.
+ *
+ * Obviously, the endpoints need to agree on which end produces
+ * messages on which ring.  The endpoint which provided the memory
+ * backing the ring always produces on the first sring, and the one
+ * which just mapped the ring produces on the second.  By convention,
+ * these are known as the frontend and backend, respectively.
+ */
+
+/* For both rings, the producer (consumer) pointers point at the
+ * *next* byte which is going to be produced (consumed).  An endpoint
+ * must generate an event on the event channel port if it moves the
+ * producer pointer (consumer pointer) across prod_event (cons_event).
+ *
+ * i.e. if an endpoint ever updates a pointer so that the old pointer
+ * is strictly less than the event, and the new pointer is greater
+ * than or equal to the event, then the remote must be notified.  If
+ * the pointer overflows the ring, treat the new value as if it were
+ * (actual new value) + (1 << 32).
+ */
+struct netchannel2_sring_prod {
+        RING_IDX prod;
+        volatile const RING_IDX cons;
+        volatile const RING_IDX prod_event;
+        RING_IDX cons_event;
+        unsigned char pad[48];
+};
+
+struct netchannel2_sring_cons {
+        volatile const RING_IDX prod;
+        RING_IDX cons;
+        RING_IDX prod_event;
+        volatile const RING_IDX cons_event;
+        unsigned char pad[48];
+};
+
+struct netchannel2_frontend_shared {
+        struct netchannel2_sring_prod prod;
+        struct netchannel2_sring_cons cons;
+};
+
+struct netchannel2_backend_shared {
+        struct netchannel2_sring_cons cons;
+        struct netchannel2_sring_prod prod;
+};
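
The "crosses the event pointer" rule above reduces to a single wrap-safe
unsigned comparison; this is the same test nc2_flush_ring() and
nc2_finish_messages() use below, written out on its own for clarity:

/* Returns non-zero iff @event lies in the half-open interval
   (@prev, @cur], computed modulo 2^32 so that ring wrap is handled
   correctly. */
static inline int example_crossed_event(RING_IDX prev, RING_IDX cur,
                                        RING_IDX event)
{
        return (RING_IDX)(cur - event) < (RING_IDX)(cur - prev);
}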
+
+struct netchannel2_prod_ring {
+        struct netchannel2_sring_prod *sring;
+        void *payload;
+        RING_IDX prod_pvt;
+        /* This is the number of bytes available after prod_pvt last
+           time we checked, minus the number of bytes which we've
+           consumed since then.  It's used to avoid a bunch of
+           memory barriers when checking for ring space. */
+        unsigned bytes_available;
+        /* Number of bytes reserved by nc2_reserve_payload_bytes() */
+        unsigned reserve;
+        size_t payload_bytes;
+};
+
+struct netchannel2_cons_ring {
+        struct netchannel2_sring_cons *sring;
+        const volatile void *payload;
+        RING_IDX cons_pvt;
+        size_t payload_bytes;
+};
+
+/* A message header.  There is one of these at the start of every
+ * message.  @type is one of the #define's below, and @size is the
+ * size of the message, including the header and any padding.
+ * @size should be a multiple of 8 so that we avoid unaligned memory
+ * copies.  Structs defining message formats should have sizes which
+ * are a multiple of 8 bytes and should use padding fields if needed.
+ */
+struct netchannel2_msg_hdr {
+        uint8_t type;
+        uint8_t flags;
+        uint16_t size;
+};
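
A compile-time guard along these lines could be used when defining a new
message struct to enforce the multiple-of-8 rule; it is illustrative and not
part of this interface:

#define NC2_ASSERT_MSG_SIZE_OK(type) \
        BUILD_BUG_ON(sizeof(type) % 8 != 0)

/* e.g., from some init function:
   NC2_ASSERT_MSG_SIZE_OK(struct netchannel2_msg_set_max_packets); */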
+
+/* Copy some bytes from the shared ring to a stable local buffer,
+ * starting at the private consumer pointer.  Does not update the
+ * private consumer pointer.
+ */
+static inline void nc2_copy_from_ring_off(struct netchannel2_cons_ring *ring,
+                            void *buf,
+                            size_t nbytes,
+                            unsigned off)
+{
+        unsigned start, end;
+
+        start = (ring->cons_pvt + off) & (ring->payload_bytes-1);
+        end = (ring->cons_pvt + nbytes + off) & (ring->payload_bytes-1);
+        /* We cast away the volatile modifier to get rid of an
+           irritating compiler warning, and compensate with a
+           barrier() at the end. */
+        memcpy(buf, (const void *)ring->payload + start, nbytes);
+        barrier();
+}
+
+static inline void nc2_copy_from_ring(struct netchannel2_cons_ring *ring,
+                        void *buf,
+                        size_t nbytes)
+{
+        nc2_copy_from_ring_off(ring, buf, nbytes, 0);
+}
+
+
+/* Copy some bytes to the shared ring, starting at the private
+ * producer pointer.  Does not update the private pointer.
+ */
+static inline void nc2_copy_to_ring_off(struct netchannel2_prod_ring *ring,
+                         const void *src,
+                         unsigned nr_bytes,
+                         unsigned off)
+{
+        unsigned start, end;
+
+        start = (ring->prod_pvt + off) & (ring->payload_bytes-1);
+        end = (ring->prod_pvt + nr_bytes + off) & (ring->payload_bytes-1);
+        memcpy(ring->payload + start, src, nr_bytes);
+}
+
+static inline void nc2_copy_to_ring(struct netchannel2_prod_ring *ring,
+                     const void *src,
+                     unsigned nr_bytes)
+{
+        nc2_copy_to_ring_off(ring, src, nr_bytes, 0);
+}
+
+static inline void __nc2_send_pad(struct netchannel2_prod_ring *ring,
+                                  unsigned nr_bytes)
+{
+        struct netchannel2_msg_hdr msg;
+        msg.type = NETCHANNEL2_MSG_PAD;
+        msg.flags = 0;
+        msg.size = nr_bytes;
+        nc2_copy_to_ring(ring, &msg, sizeof(msg));
+        ring->prod_pvt += nr_bytes;
+        ring->bytes_available -= nr_bytes;
+}
+
+static inline int __nc2_ring_would_wrap(struct netchannel2_prod_ring *ring,
+                                        unsigned nr_bytes)
+{
+        RING_IDX mask;
+        mask = ~(ring->payload_bytes - 1);
+        return (ring->prod_pvt & mask) != ((ring->prod_pvt + nr_bytes) & mask);
+}
+
+static inline unsigned __nc2_pad_needed(struct netchannel2_prod_ring *ring)
+{
+        return ring->payload_bytes -
+                        (ring->prod_pvt & (ring->payload_bytes - 1));
+}
+
+static inline void __nc2_avoid_ring_wrap(struct netchannel2_prod_ring *ring,
+                                         unsigned nr_bytes)
+{
+        if (!__nc2_ring_would_wrap(ring, nr_bytes))
+                return;
+        __nc2_send_pad(ring, __nc2_pad_needed(ring));
+
+}
+
+/* Prepare a message for the other end and place it on the shared
+ * ring, updating the private producer pointer.  You need to call
+ * nc2_flush_messages() before the message is actually made visible to
+ * the other end.  It is permissible to send several messages in a
+ * batch and only flush them once.
+ */
+static inline void nc2_send_message(struct netchannel2_prod_ring *ring,
+                      unsigned type,
+                      unsigned flags,
+                      const void *msg,
+                      size_t size)
+{
+        struct netchannel2_msg_hdr *hdr = (struct netchannel2_msg_hdr *)msg;
+
+        __nc2_avoid_ring_wrap(ring, size);
+
+        hdr->type = type;
+        hdr->flags = flags;
+        hdr->size = size;
+
+        nc2_copy_to_ring(ring, msg, size);
+        ring->prod_pvt += size;
+        BUG_ON(ring->bytes_available < size);
+        ring->bytes_available -= size;
+}
+
+static inline volatile void *__nc2_get_message_ptr(struct netchannel2_prod_ring *ncrp)
+{
+        return (volatile void *)ncrp->payload +
+                (ncrp->prod_pvt & (ncrp->payload_bytes-1));
+}
+
+/* Copy the private producer pointer to the shared producer pointer,
+ * with a suitable memory barrier such that all messages placed on the
+ * ring are stable before we do the copy.  This effectively pushes any
+ * messages which we've just sent out to the other end.  Returns 1 if
+ * we need to notify the other end and 0 otherwise.
+ */
+static inline int nc2_flush_ring(struct netchannel2_prod_ring *ring)
+{
+        RING_IDX old_prod, new_prod;
+
+        old_prod = ring->sring->prod;
+        new_prod = ring->prod_pvt;
+
+        wmb();
+
+        ring->sring->prod = new_prod;
+
+        /* We need the update to prod to happen before we read
+         * event. */
+        mb();
+
+        /* We notify if the producer pointer moves across the event
+         * pointer. */
+        if ( (RING_IDX)(new_prod - ring->sring->prod_event) <
+             (RING_IDX)(new_prod - old_prod) ) {
+                return 1;
+        } else {
+                return 0;
+        }
+}
+
+/* Copy the private consumer pointer to the shared consumer pointer,
+ * with a memory barrier so that any previous reads from the ring
+ * complete before the pointer is updated.  This tells the other end
+ * that we're finished with the messages, and that it can re-use the
+ * ring space for more messages.  Returns 1 if we need to notify the
+ * other end and 0 otherwise.
+ */
+static inline int nc2_finish_messages(struct netchannel2_cons_ring *ring)
+{
+        RING_IDX old_cons, new_cons;
+
+        old_cons = ring->sring->cons;
+        new_cons = ring->cons_pvt;
+
+        /* Need to finish reading from the ring before updating
+           cons */
+        mb();
+        ring->sring->cons = ring->cons_pvt;
+
+        /* Need to publish our new consumer pointer before checking
+           event. */
+        mb();
+        if ( (RING_IDX)(new_cons - ring->sring->cons_event) <
+             (RING_IDX)(new_cons - old_cons) )
+                return 1;
+        else
+                return 0;
+}
+
+/* Check whether there are any unconsumed messages left on the shared
+ * ring.  Returns 1 if there are, and 0 if there aren't.  If there are
+ * no more messages, set the producer event so that we'll get a
+ * notification as soon as another one gets sent.  It is assumed that
+ * all messages up to @prod have been processed, and none of the ones
+ * after it have been. */
+static inline int nc2_final_check_for_messages(struct netchannel2_cons_ring *ring,
+                                 RING_IDX prod)
+{
+        if (prod != ring->sring->prod)
+                return 1;
+        /* Request an event when more stuff gets poked on the ring. */
+        ring->sring->prod_event = prod + 1;
+
+        /* Publish event before final check for responses. */
+        mb();
+        if (prod != ring->sring->prod)
+                return 1;
+        else
+                return 0;
+}
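
Putting the consumer-side helpers together, a polling loop might look like
the following sketch.  Message validation and dispatch are elided, and
notify_remote_via_irq() is the usual Xen event-channel notification helper
(the irq plumbing is assumed to be set up elsewhere):

static void example_poll_ring(struct netchannel2_cons_ring *ring, int irq)
{
        struct netchannel2_msg_hdr hdr;
        RING_IDX prod;

again:
        prod = ring->sring->prod;
        rmb(); /* see @prod before reading the payload behind it */
        while (ring->cons_pvt != prod) {
                nc2_copy_from_ring(ring, &hdr, sizeof(hdr));
                /* ... validate hdr.size and dispatch on hdr.type ... */
                ring->cons_pvt += hdr.size;
        }
        if (nc2_finish_messages(ring))
                notify_remote_via_irq(irq);
        if (nc2_final_check_for_messages(ring, prod))
                goto again;
}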
+
+/* Can we send a message with @nr_bytes payload bytes?  Returns 1 if
+ * we can or 0 if we can't.  If there isn't space right now, set the
+ * consumer event so that we'll get notified when space is
+ * available. */
+static inline int nc2_can_send_payload_bytes(struct netchannel2_prod_ring *ring,
+                               unsigned nr_bytes)
+{
+        unsigned space;
+        RING_IDX cons;
+        BUG_ON(ring->bytes_available > ring->payload_bytes);
+        /* Times 2 because we might need to send a pad message */
+        if (likely(ring->bytes_available > nr_bytes * 2 + ring->reserve))
+                return 1;
+        if (__nc2_ring_would_wrap(ring, nr_bytes))
+                nr_bytes += __nc2_pad_needed(ring);
+retry:
+        cons = ring->sring->cons;
+        space = ring->payload_bytes - (ring->prod_pvt - cons);
+        if (likely(space >= nr_bytes + ring->reserve)) {
+                /* We have enough space to send the message. */
+
+                /* Need to make sure that the read of cons happens
+                   before any following memory writes. */
+                mb();
+
+                ring->bytes_available = space;
+
+                return 1;
+        } else {
+                /* Not enough space available.  Set an event pointer
+                   when cons changes.  We need to be sure that the
+                   @cons used here is the same as the cons used to
+                   calculate @space above, and the volatile modifier
+                   on sring->cons achieves that. */
+                ring->sring->cons_event = cons + 1;
+
+                /* Check whether more space became available while we
+                   were messing about. */
+
+                /* Need the event pointer to be stable before we do
+                   the check. */
+                mb();
+                if (unlikely(cons != ring->sring->cons)) {
+                        /* Cons pointer changed.  Try again. */
+                        goto retry;
+                }
+
+                /* There definitely isn't space on the ring now, and
+                   an event has been set such that we'll be notified
+                   if more space becomes available. */
+                /* XXX we get a notification as soon as any more space
+                   becomes available.  We could maybe optimise by
+                   setting the event such that we only get notified
+                   when we know that enough space is available.  The
+                   main complication is handling the case where you
+                   try to send a message of size A, fail due to lack
+                   of space, and then try to send one of size B, where
+                   B < A.  It's not clear whether you want to set the
+                   event for A bytes or B bytes.  The obvious answer
+                   is B, but that means moving the event pointer
+                   backwards, and it's not clear that that's always
+                   safe.  Always setting for a single byte is safe, so
+                   stick with that for now. */
+                return 0;
+        }
+}
+
+static inline int nc2_reserve_payload_bytes(struct netchannel2_prod_ring *ring,
+                              unsigned nr_bytes)
+{
+        if (nc2_can_send_payload_bytes(ring, nr_bytes)) {
+                ring->reserve += nr_bytes;
+                return 1;
+        } else {
+                return 0;
+        }
+}
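
The producer-side pieces combine along similar lines; again,
notify_remote_via_irq() is the standard Xen notification helper, and the
message body is assumed to start with a struct netchannel2_msg_hdr and to
have a size which is a multiple of 8:

static inline int example_send_one(struct netchannel2_prod_ring *ring,
                                   unsigned type, void *msg, size_t size,
                                   int irq)
{
        if (!nc2_can_send_payload_bytes(ring, size))
                return 0; /* no room; a cons_event has been requested */
        nc2_send_message(ring, type, 0, msg, size);
        if (nc2_flush_ring(ring))
                notify_remote_via_irq(irq);
        return 1;
}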
+
+#endif /* __XEN_PUBLIC_IO_URING_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */