From: Jean Guyader Date: Tue, 6 Oct 2009 16:12:21 +0000 (+0100) Subject: Add netchannel2, disable v2v-core. X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=a215d350dac479d66aced78d61c27eed7b0d03e6;p=xenclient%2Flinux-2.6.27-pq.git Add netchannel2, disable v2v-core. --- diff --git a/master/netback_map_foreign b/master/netback_map_foreign new file mode 100644 index 0000000..5ce9bf2 --- /dev/null +++ b/master/netback_map_foreign @@ -0,0 +1,362 @@ + + A mechanism for netback to register its mappings of foreign + pages in a way which will be accessible to netback2. This is + necessary when forwarding packets between the two backends. + + Signed-off-by: Steven Smith + +diff --git a/drivers/xen/core/Makefile b/drivers/xen/core/Makefile +index 546f0b2..967981a 100644 +--- a/drivers/xen/core/Makefile ++++ b/drivers/xen/core/Makefile +@@ -2,7 +2,7 @@ + # Makefile for the linux kernel. + # + +-obj-y := evtchn.o gnttab.o features.o reboot.o machine_reboot.o firmware.o ++obj-y := evtchn.o gnttab.o features.o reboot.o machine_reboot.o firmware.o live_maps.o + + obj-$(CONFIG_PCI) += pci.o + obj-$(CONFIG_PROC_FS) += xen_proc.o +diff --git a/drivers/xen/core/gnttab.c b/drivers/xen/core/gnttab.c +index cea08c0..aaf526d 100644 +--- a/drivers/xen/core/gnttab.c ++++ b/drivers/xen/core/gnttab.c +@@ -589,6 +589,7 @@ int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep) + + new_page->mapping = page->mapping; + new_page->index = page->index; ++ new_page->private = page->private; + set_bit(PG_foreign, &new_page->flags); + *pagep = new_page; + +diff --git a/drivers/xen/core/live_maps.c b/drivers/xen/core/live_maps.c +new file mode 100644 +index 0000000..69d41f4 +--- /dev/null ++++ b/drivers/xen/core/live_maps.c +@@ -0,0 +1,61 @@ ++#include ++#include ++#include ++#include ++#include ++ ++/* This lock protects allocation and release of trackers, but is not ++ held when we're actually looking stuff up. The caller is ++ responsible for making sure that suitable locks are held around ++ data path operations. */ ++static DEFINE_SPINLOCK(tracker_lock); ++ ++struct page_foreign_tracker *foreign_trackers[LIVE_MAP_NR_TRACKERS]; ++EXPORT_SYMBOL(foreign_trackers); ++ ++/* Allocate a foreign page tracker. @size is the maximum index in the ++ tracker. Returns NULL on error. */ ++struct page_foreign_tracker *alloc_page_foreign_tracker(unsigned size) ++{ ++ struct page_foreign_tracker *work; ++ unsigned x; ++ ++ BUG_ON(size & ~LIVE_MAP_TRACKER_IDX_MASK); ++ ++ work = kzalloc(sizeof(*work) + ++ size * sizeof(struct page_foreign_tracked), ++ GFP_KERNEL); ++ if (!work) ++ return work; ++ work->size = size; ++ ++ spin_lock(&tracker_lock); ++ for (x = 0; x < LIVE_MAP_NR_TRACKERS; x++) { ++ if (foreign_trackers[x] == NULL) { ++ work->id = x; ++ foreign_trackers[x] = work; ++ break; ++ } ++ } ++ spin_unlock(&tracker_lock); ++ if (x == LIVE_MAP_NR_TRACKERS) { ++ printk(KERN_WARNING "Out of foreign page trackers!\n"); ++ kfree(work); ++ return NULL; ++ } ++ return work; ++} ++ ++/* Release a tracker allocated with alloc_page_foreign_tracker. There ++ should be no tracked pages when this is called. 
*/ ++void free_page_foreign_tracker(struct page_foreign_tracker *pft) ++{ ++ spin_lock(&tracker_lock); ++ BUG_ON(foreign_trackers[pft->id] != pft); ++ foreign_trackers[pft->id] = NULL; ++ spin_unlock(&tracker_lock); ++ kfree(pft); ++} ++ ++EXPORT_SYMBOL(alloc_page_foreign_tracker); ++EXPORT_SYMBOL(free_page_foreign_tracker); +diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c +index fe6eff7..76641b1 100644 +--- a/drivers/xen/netback/netback.c ++++ b/drivers/xen/netback/netback.c +@@ -37,6 +37,7 @@ + #include "common.h" + #include + #include ++#include + #include + + /*define NETBE_DEBUG_INTERRUPT*/ +@@ -133,6 +134,8 @@ typedef unsigned int PEND_RING_IDX; + static PEND_RING_IDX pending_prod, pending_cons; + #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + ++static struct page_foreign_tracker *foreign_page_tracker; ++ + /* Freed TX SKBs get batched on this ring before return to pending_ring. */ + static u16 dealloc_ring[MAX_PENDING_REQS]; + static PEND_RING_IDX dealloc_prod, dealloc_cons; +@@ -438,16 +441,14 @@ static void netbk_gop_frag_copy(netif_t *netif, + { + gnttab_copy_t *copy_gop; + struct netbk_rx_meta *meta; +- int idx = netif_page_index(page); + + meta = npo->meta + npo->meta_prod - 1; + + copy_gop = npo->copy + npo->copy_prod++; + copy_gop->flags = GNTCOPY_dest_gref; +- if (idx > -1) { +- struct pending_tx_info *src_pend = &pending_tx_info[idx]; +- copy_gop->source.domid = src_pend->netif->domid; +- copy_gop->source.u.ref = src_pend->req.gref; ++ if (page_is_tracked(page)) { ++ lookup_tracker_page(page, ©_gop->source.domid, ++ ©_gop->source.u.ref); + copy_gop->flags |= GNTCOPY_source_gref; + } else { + copy_gop->source.domid = DOMID_SELF; +@@ -1081,6 +1082,8 @@ inline static void net_tx_action_dealloc(void) + if (!phys_to_machine_mapping_valid(pfn)) + continue; + ++ stop_tracking_page(mmap_pages[pending_idx]); ++ + gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx), + GNTMAP_host_map, + grant_tx_handle[pending_idx]); +@@ -1219,6 +1222,13 @@ static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif, + netif_get(netif); + pending_tx_info[pending_idx].netif = netif; + frags[i].page = (void *)pending_idx; ++ ++ start_tracking_page(foreign_page_tracker, ++ mmap_pages[pending_idx], ++ netif->domid, ++ pending_tx_info[pending_idx].req.gref, ++ pending_idx, ++ NULL); + } + + return mop; +@@ -1526,6 +1536,13 @@ static void net_tx_action(void) + txreq.gref, netif->domid); + mop++; + ++ start_tracking_page(foreign_page_tracker, ++ mmap_pages[pending_idx], ++ netif->domid, ++ txreq.gref, ++ pending_idx, ++ NULL); ++ + memcpy(&pending_tx_info[pending_idx].req, + &txreq, sizeof(txreq)); + pending_tx_info[pending_idx].netif = netif; +@@ -1804,9 +1821,13 @@ static int __init netback_init(void) + netbk_tx_pending_timer.data = 0; + netbk_tx_pending_timer.function = netbk_tx_pending_timeout; + ++ foreign_page_tracker = alloc_page_foreign_tracker(MAX_PENDING_REQS); ++ if (!foreign_page_tracker) ++ return -ENOMEM; + mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS); + if (mmap_pages == NULL) { + printk("%s: out of memory\n", __FUNCTION__); ++ free_page_foreign_tracker(foreign_page_tracker); + return -ENOMEM; + } + +diff --git a/include/xen/live_maps.h b/include/xen/live_maps.h +new file mode 100644 +index 0000000..96e080d +--- /dev/null ++++ b/include/xen/live_maps.h +@@ -0,0 +1,165 @@ ++#ifndef XEN_LIVE_MAPS_H__ ++#define XEN_LIVE_MAPS_H__ ++ ++/* A mechanism for tracking where pages have been grant mapped from. 
++ Anything which can map pages through a grant reference is supposed ++ to allocate a page_tracker and then, whenever they map a grant: ++ ++ a) Flag the page as foreign with SetPageForeign(), and ++ b) Register the struct page with a tracker through start_tracking_page(). ++ ++ If you later need to grant access to the page (either with a normal ++ grant or implicitly in a copy grant operation), you should use ++ lookup_tracker_page() to find out what domain and grant reference ++ it was mapped from. ++ ++ Obviously, if a backend knows that the page will never need to be ++ re-granted once it's been mapped, it can avoid doing all this ++ stuff. ++ ++ The number of trackers is quite limited, so they shouldn't be ++ allocated unnecessarily. One per backend class is reasonable ++ (i.e. netback, blkback, etc.), but one per backend device probably ++ isn't. ++*/ ++ ++#include ++#include ++ ++#ifdef CONFIG_XEN ++ ++/* We use page->private to store some index information so that we can ++ find the tracking information later. The top few bits are used to ++ identify the tracker, and the rest are used as an index into that ++ tracker. */ ++ ++/* How many bits to use for tracker IDs. */ ++#define LIVE_MAP_TRACKER_BITS 2 ++ ++/* How many bits to use for tracker indexes. */ ++#define LIVE_MAP_TRACKER_IDX_BITS (32 - LIVE_MAP_TRACKER_BITS) ++ ++/* Maximum number of trackers */ ++#define LIVE_MAP_NR_TRACKERS (1 << LIVE_MAP_TRACKER_BITS) ++ ++/* Bitmask of index inside tracker */ ++#define LIVE_MAP_TRACKER_IDX_MASK (~0u >> LIVE_MAP_TRACKER_BITS) ++ ++/* Turn off some moderately expensive debug checks. */ ++#undef LIVE_MAPS_DEBUG ++ ++struct page_foreign_tracked { ++ domid_t dom; ++ grant_ref_t gref; ++ void *ctxt; ++#ifdef LIVE_MAPS_DEBUG ++ unsigned in_use; ++#endif ++}; ++ ++struct page_foreign_tracker { ++ unsigned size; ++ unsigned id; ++ struct page_foreign_tracked contents[]; ++}; ++ ++extern struct page_foreign_tracker *foreign_trackers[LIVE_MAP_NR_TRACKERS]; ++ ++/* Allocate a foreign page tracker. @size is the maximum index in the ++ tracker. Returns NULL on error. */ ++struct page_foreign_tracker *alloc_page_foreign_tracker(unsigned size); ++ ++/* Release a tracker allocated with alloc_page_foreign_tracker. There ++ should be no tracked pages when this is called. */ ++void free_page_foreign_tracker(struct page_foreign_tracker *pft); ++ ++static inline struct page_foreign_tracker *tracker_for_page(struct page *p) ++{ ++ unsigned idx = page_private(p); ++ return foreign_trackers[idx >> LIVE_MAP_TRACKER_IDX_BITS]; ++} ++ ++static inline void *get_page_tracker_ctxt(struct page *p) ++{ ++ struct page_foreign_tracker *pft = tracker_for_page(p); ++ unsigned idx = page_private(p); ++ return pft->contents[idx & LIVE_MAP_TRACKER_IDX_MASK].ctxt; ++} ++ ++/* Start tracking a page. @idx is an index in the tracker which is ++ not currently in use, and must be less than the size of the ++ tracker. The page must be marked as foreign before this is called. ++ The caller is expected to make sure that the page is not a ++ simulataneous target of lookup_tracker_page(). The page should be ++ passed to stop_tracking_page() when the grant is unmapped. 
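For reference, the expected call sequence condenses to roughly the sketch below. The real in-tree user is the netback change earlier in this patch; everything prefixed "example_" is hypothetical, and only the live_maps calls, page_is_tracked() and the gnttab_copy_t fields are taken from the patch itself.

#include <xen/live_maps.h>   /* plus the usual mm / grant-table headers */

static struct page_foreign_tracker *example_tracker;

static int example_backend_init(void)
{
        /* One tracker per backend class, sized to the mapping table. */
        example_tracker = alloc_page_foreign_tracker(256);
        return example_tracker ? 0 : -ENOMEM;
}

/* After grant-mapping @gref from @domid into @page, which occupies
   slot @idx of the backend's mapping table.  The mapping path must
   already have flagged the page with SetPageForeign(). */
static void example_page_mapped(struct page *page, domid_t domid,
                                grant_ref_t gref, unsigned idx)
{
        start_tracking_page(example_tracker, page, domid, gref, idx, NULL);
}

/* When building a grant copy whose source may be a foreign page
   (compare the netbk_gop_frag_copy() hunk above). */
static void example_fill_copy_source(struct page *page, gnttab_copy_t *op)
{
        if (page_is_tracked(page)) {
                lookup_tracker_page(page, &op->source.domid,
                                    &op->source.u.ref);
                op->flags |= GNTCOPY_source_gref;
        } else {
                op->source.domid = DOMID_SELF;
        }
}

/* Just before the grant is unmapped. */
static void example_page_unmapped(struct page *page)
{
        stop_tracking_page(page);
}
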
*/ ++static inline void start_tracking_page(struct page_foreign_tracker *pft, ++ struct page *p, ++ domid_t dom, ++ grant_ref_t gref, ++ unsigned idx, ++ void *ctxt) ++{ ++ BUG_ON(!PageForeign(p)); ++#ifdef LIVE_MAPS_DEBUG ++ BUG_ON(idx > pft->size); ++ BUG_ON(pft->contents[idx].in_use); ++ pft->contents[idx].in_use = 1; ++#endif ++ pft->contents[idx].dom = dom; ++ pft->contents[idx].gref = gref; ++ pft->contents[idx].ctxt = ctxt; ++ set_page_private(p, idx | (pft->id << LIVE_MAP_TRACKER_IDX_BITS)); ++} ++ ++static inline void stop_tracking_page(struct page *p) ++{ ++#ifdef LIVE_MAPS_DEBUG ++ struct page_foreign_tracker *pft; ++ unsigned idx = page_private(p); ++ BUG_ON(!PageForeign(p)); ++ pft = tracker_for_page(p); ++ BUG_ON((idx & LIVE_MAP_TRACKER_IDX_MASK) >= pft->size); ++ BUG_ON(!pft->contents[idx & LIVE_MAP_TRACKER_IDX_MASK].in_use); ++ pft->contents[idx & LIVE_MAP_TRACKER_IDX_MASK].in_use = 0; ++ set_page_private(p, 0); ++#endif ++} ++ ++/* Lookup a page which is tracked in some tracker. ++ start_tracking_page() must have been called previously. *@dom and ++ *@gref will be set to the values which were specified when ++ start_tracking_page() was called. */ ++static inline void lookup_tracker_page(struct page *p, domid_t *dom, ++ grant_ref_t *gref) ++{ ++ struct page_foreign_tracker *pft; ++ unsigned idx = page_private(p); ++ BUG_ON(!PageForeign(p)); ++ pft = tracker_for_page(p); ++#ifdef LIVE_MAPS_DEBUG ++ BUG_ON(!pft); ++ BUG_ON((idx & LIVE_MAP_TRACKER_IDX_MASK) >= pft->size); ++ BUG_ON(!pft->contents[idx & LIVE_MAP_TRACKER_IDX_MASK].in_use); ++#endif ++ *dom = pft->contents[idx & LIVE_MAP_TRACKER_IDX_MASK].dom; ++ *gref = pft->contents[idx & LIVE_MAP_TRACKER_IDX_MASK].gref; ++} ++ ++static inline int page_is_tracked(struct page *p) ++{ ++ return PageForeign(p) && p->mapping; ++} ++ ++#else /* !CONFIG_XEN */ ++static inline int page_is_tracked(struct page *p) ++{ ++ return 0; ++} ++static void lookup_tracker_page(struct page *p, domid_t *domid, ++ grant_ref_t *gref) ++{ ++ BUG(); ++} ++#endif ++ ++#endif /* !XEN_LIVE_MAPS_H__ */ diff --git a/master/netchannel2 b/master/netchannel2 new file mode 100644 index 0000000..56391c6 --- /dev/null +++ b/master/netchannel2 @@ -0,0 +1,8151 @@ + + netchannel2 + + Signed-off-by: Steven Smith + +diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig +index 266e3e2..7d3033b 100644 +--- a/drivers/xen/Kconfig ++++ b/drivers/xen/Kconfig +@@ -210,6 +210,57 @@ config XEN_SCSI_FRONTEND + The SCSI frontend driver allows the kernel to access SCSI Devices + within another guest OS. + ++config XEN_NETCHANNEL2 ++ tristate "Net channel 2 support" ++ depends on XEN && NET ++ default y ++ help ++ Xen netchannel2 driver support. This allows a domain to act as ++ either the backend or frontend part of a netchannel2 connection. ++ Unless you are building a dedicated device-driver domain, you ++ almost certainly want to say Y here. ++ ++ If you say Y or M here, you should also say Y to one or both of ++ ``Net channel2 backend support'' and ``Net channel2 frontend ++ support'', below. ++ ++config XEN_NETDEV2_BACKEND ++ bool "Net channel 2 backend support" ++ depends on XEN_BACKEND && XEN_NETCHANNEL2 ++ default XEN_BACKEND ++ ++config XEN_NETDEV2_FRONTEND ++ bool "Net channel 2 frontend support" ++ depends on XEN_NETCHANNEL2 ++ default y ++ ++config XEN_NETDEV2_BYPASSABLE ++ bool "Net channel 2 bypassee support" ++ depends on XEN_NETDEV2_BACKEND ++ default y ++ help ++ This option allows net channel 2 endpoints in this domain to ++ be bypassed. 
If this domain is acting as a bridge between ++ domains on a single host, bypass support will allow faster ++ inter-domain communication and reduce load in this domain. ++ ++config XEN_NETDEV2_BYPASS_ENDPOINT ++ bool "Net channel 2 bypass endpoint support" ++ depends on XEN_NETDEV2_BACKEND && XEN_NETDEV2_FRONTEND ++ default y ++ help ++ Support for acting as the endpoint of a netchannel2 bypass. ++ Bypasses allow faster inter-domain communication, provided ++ every VM supports them. ++ ++config XEN_NETDEV2_AUTOMATIC_BYPASS ++ bool "Automatically manage netchannel2 bypasses" ++ depends on XEN_NETDEV2_BYPASS_ENDPOINT ++ default y ++ help ++ Try to detect when bypasses would be useful, and manage ++ them automatically. ++ + config XEN_GRANT_DEV + tristate "User-space granted page access driver" + default XEN_PRIVILEGED_GUEST +diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile +index ab35228..f7a72d3 100644 +--- a/drivers/xen/Makefile ++++ b/drivers/xen/Makefile +@@ -30,3 +30,4 @@ obj-$(CONFIG_XEN_GRANT_DEV) += gntdev/ + obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL) += sfc_netutil/ + obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND) += sfc_netfront/ + obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_BACKEND) += sfc_netback/ ++obj-$(CONFIG_XEN_NETCHANNEL2) += netchannel2/ +diff --git a/drivers/xen/netchannel2/Makefile b/drivers/xen/netchannel2/Makefile +new file mode 100644 +index 0000000..d779de5 +--- /dev/null ++++ b/drivers/xen/netchannel2/Makefile +@@ -0,0 +1,25 @@ ++obj-$(CONFIG_XEN_NETCHANNEL2) += netchannel2.o ++ ++netchannel2-objs := chan.o netchan2.o rscb.o util.o \ ++ xmit_packet.o offload.o recv_packet.o poll.o \ ++ receiver_map.o ++ ++ifeq ($(CONFIG_XEN_NETDEV2_BACKEND),y) ++netchannel2-objs += netback2.o ++endif ++ ++ifeq ($(CONFIG_XEN_NETDEV2_FRONTEND),y) ++netchannel2-objs += netfront2.o ++endif ++ ++ifeq ($(CONFIG_XEN_NETDEV2_BYPASSABLE),y) ++netchannel2-objs += bypassee.o ++endif ++ ++ifeq ($(CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT),y) ++netchannel2-objs += bypass.o ++endif ++ ++ifeq ($(CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS),y) ++netchannel2-objs += autobypass.o ++endif +diff --git a/drivers/xen/netchannel2/autobypass.c b/drivers/xen/netchannel2/autobypass.c +new file mode 100644 +index 0000000..a55df09 +--- /dev/null ++++ b/drivers/xen/netchannel2/autobypass.c +@@ -0,0 +1,316 @@ ++#include ++#include ++#include "netchannel2_core.h" ++ ++/* The state machine works like this: ++ ++ -- We start in state NORMAL. In this state, we count how many ++ bypass and non-bypass packets we receive, and don't do anything ++ else. ++ ++ -- After receiving AUTOBYPASS_PERIOD packets, we look at the ++ bypass-candidate to non-bypass-candidate ratio. If the number ++ of non-bypass packets exceeds the number of bypass packets by ++ more than a factor of AUTOBYPASS_RATIO, reset the counters and ++ go back to state NORMAL. Otherwise, go to state CONSIDERING. ++ We also reset and go back to normal if it took more than ++ AUTOBYPASS_MAX_PERIOD_JIFFIES jiffies to get here. ++ ++ -- In state CONSIDERING, continue to count up the bypass and ++ non-bypass packets. In addition, whenever we get a bypass ++ packet, pull the source MAC address out of the header and ++ compare it to the hot list. If it's in the hot list, increment ++ that entry's count. ++ ++ -- After another AUTOBYPASS_PERIOD, check the packet counts again. 
++ Provided the total bypass ratio is good enough (see the NORMAL ++ exit criteria), walk the hot list, and if any entry accounts for ++ more than AUTOBYPASS_RATIO2 of the total traffic, suggest to ++ dom0 that it create a new bypass for us. The go to DEBOUNCE. ++ ++ -- In DEBOUNCE, wait until we've received at least ++ AUTOBYPASS_DEBOUNCE_PERIOD bypass packets, then go to NORMAL. ++ ++ So, we establish a bypass if total traffic > PERIOD/MAX_PERIOD ++ packets per second, of which at least PERIOD/(MAX_PERIOD*(RATIO+1)) ++ are bypass candidates and PERIOD/(MAX_PERIOD*(RATIO2+1)) are for ++ one specific bypass. This needs to be sustained for at least ++ PERIOD*2 before we actually establish a bypass. ++*/ ++ ++/* If you increase this past 65536, consider changing the type of ++ auto_bypass.hot_macs[...].count, to avoid overflow. */ ++#define AUTOBYPASS_PERIOD 1024 ++#define AUTOBYPASS_RATIO 2 ++#define AUTOBYPASS_RATIO2 4 ++#define AUTOBYPASS_DEBOUNCE_PERIOD 1024 ++#define AUTOBYPASS_MAX_PERIOD_JIFFIES (HZ/2) ++ ++ ++#define TEARDOWN_PERIOD_JIFFIES (HZ*5) ++#define TEARDOWN_MIN_PACKETS (256*TEARDOWN_PERIOD_JIFFIES) ++ ++static void autoteardown_timer_fn(unsigned long ignore); ++ ++static DEFINE_SPINLOCK(autoteardown_lock); ++static LIST_HEAD(autoteardown_list); ++static DEFINE_TIMER(autoteardown_timer, autoteardown_timer_fn, 0, 0); ++ ++static void autoteardown_timer_fn(unsigned long ignore) ++{ ++ struct nc2_alternate_ring *nar; ++ ++ spin_lock(&autoteardown_lock); ++ list_for_each_entry(nar, &autoteardown_list, ++ autoteardown.autoteardown_list) { ++ if (nar->autoteardown.seen_count < 2) { ++ /* Give it at least two periods to get started, ++ to avoid flapping. */ ++ /* One period isn't enough, because we reset ++ the seen_count without holding the teardown ++ lock from ++ nc2_aux_ring_start_disable_sequence, and ++ there's a risk that we'll see it non-zero ++ when it should be zero. However, the ++ chances of that happening twice in a row ++ are so small that we can ignore them. Even ++ if it does go wrong twice, the worst case ++ is that we drop a few packets by forcing a ++ detach when the remote is behaving ++ correctly. */ ++ nar->autoteardown.seen_count++; ++ continue; ++ } ++ switch (nar->state) { ++ case nc2_alt_ring_frontend_sent_ready: ++ /* Interesting. We're ready to go, but the ++ backend isn't. Furthermore, this isn't the ++ first time we've seen this interface, so ++ we've been trying to establish it for at ++ least TEARDOWN_PERIOD_JIFFIES. Conclude ++ that the backend is misbehaving and start a ++ disable sequence. */ ++ nc2_aux_ring_start_disable_sequence(nar); ++ break; ++ case nc2_alt_ring_ready: ++ if (nar->autoteardown.nr_packets < ++ TEARDOWN_MIN_PACKETS) { ++ /* This interface isn't busy enough -> ++ needs to be torn down. */ ++ nc2_aux_ring_start_disable_sequence(nar); ++ } ++ nar->autoteardown.nr_packets = 0; ++ break; ++ case nc2_alt_ring_disabling: ++ /* We seem to have gotten stuck trying to ++ disable the ring, probably because the ++ remote isn't sending FINISH messages fast ++ enough. Be a bit more aggressive. */ ++ nc2_aux_ring_start_detach_sequence(nar); ++ break; ++ default: ++ /* Other states are waiting either for the ++ local operating system to complete work ++ items, or for the upstream interface to ++ process messages. Upstream is always ++ trusted, so just assume that this'll fix ++ itself sooner or later. 
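To put numbers on the summary above, here is a stand-alone model of the NORMAL-state exit test, i.e. the same check that nc2_received_bypass_candidate_packet() and busy_enough_for_bypass() make further down. The HZ value and the sample counts in main() are assumptions for illustration only.

#include <stdio.h>

#define HZ                              1000    /* assumed for the worked numbers */
#define AUTOBYPASS_PERIOD               1024
#define AUTOBYPASS_RATIO                2
#define AUTOBYPASS_MAX_PERIOD_JIFFIES   (HZ/2)

/* Returns non-zero if the counters justify moving from NORMAL to
   CONSIDERING: AUTOBYPASS_PERIOD bypass candidates arrived within half
   a second (>= roughly 2048 candidates/sec) and non-candidates did not
   outnumber them by more than AUTOBYPASS_RATIO to one. */
static int worth_considering(unsigned long nr_bypass,
                             unsigned long nr_non_bypass,
                             unsigned long elapsed_jiffies)
{
        if (nr_bypass < AUTOBYPASS_PERIOD)
                return 0;
        if (elapsed_jiffies > AUTOBYPASS_MAX_PERIOD_JIFFIES)
                return 0;
        if (nr_non_bypass > AUTOBYPASS_PERIOD * AUTOBYPASS_RATIO)
                return 0;
        return 1;
}

int main(void)
{
        /* 1024 candidates plus 1500 other packets in 0.3s: qualifies. */
        printf("%d\n", worth_considering(1024, 1500, 300));
        /* The same mix spread over a full second: too slow, stay NORMAL. */
        printf("%d\n", worth_considering(1024, 1500, 1000));
        return 0;
}
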
*/ ++ break; ++ } ++ } ++ if (!list_empty(&autoteardown_list)) { ++ mod_timer(&autoteardown_timer, ++ jiffies + TEARDOWN_PERIOD_JIFFIES); ++ } ++ spin_unlock(&autoteardown_lock); ++} ++ ++void nc2_register_bypass_for_autoteardown(struct nc2_alternate_ring *nar) ++{ ++ spin_lock_bh(&autoteardown_lock); ++ if (list_empty(&autoteardown_list)) ++ mod_timer(&autoteardown_timer, ++ jiffies + TEARDOWN_PERIOD_JIFFIES); ++ list_move(&nar->autoteardown.autoteardown_list, &autoteardown_list); ++ spin_unlock_bh(&autoteardown_lock); ++} ++ ++void nc2_unregister_bypass_for_autoteardown(struct nc2_alternate_ring *nar) ++{ ++ spin_lock_bh(&autoteardown_lock); ++ list_del_init(&nar->autoteardown.autoteardown_list); ++ if (list_empty(&autoteardown_list)) ++ del_timer(&autoteardown_timer); ++ spin_unlock_bh(&autoteardown_lock); ++} ++ ++static int busy_enough_for_bypass(struct netchannel2 *nc) ++{ ++ uint64_t nr_non_bypass; ++ unsigned long start_jiffies; ++ ++ nr_non_bypass = nc->auto_bypass.nr_non_bypass_packets; ++ start_jiffies = nc->auto_bypass.start_jiffies; ++ nc->auto_bypass.nr_non_bypass_packets = 0; ++ nc->auto_bypass.nr_bypass_packets = 0; ++ if (nr_non_bypass > AUTOBYPASS_PERIOD * AUTOBYPASS_RATIO || ++ jiffies - start_jiffies > AUTOBYPASS_MAX_PERIOD_JIFFIES) { ++ /* Either took too long to collect the bypass ++ packets, or too many non-bypass relative to ++ number of bypasses. Either way, not a good ++ time to consider doing bypasses. */ ++ nc->auto_bypass.start_jiffies = jiffies; ++ return 0; ++ } else { ++ return 1; ++ } ++} ++ ++static void record_source_mac(struct netchannel2 *nc, struct sk_buff *skb) ++{ ++ struct ethhdr *eh; ++ unsigned x; ++ ++ if (skb_headlen(skb) < sizeof(struct ethhdr)) ++ return; ++ eh = (struct ethhdr *)skb->data; ++ for (x = 0; x < nc->auto_bypass.nr_hot_macs; x++) { ++ if (!memcmp(eh->h_source, nc->auto_bypass.hot_macs[x].mac, ++ sizeof(eh->h_source))) { ++ nc->auto_bypass.hot_macs[x].count++; ++ return; ++ } ++ } ++ if (x == AUTOBYPASS_MAX_HOT_MACS) { ++ /* Communicating with too many bypass candidates -> ++ can't keep track of them all -> drop a couple at ++ random. */ ++ return; ++ } ++ nc->auto_bypass.hot_macs[x].count = 1; ++ memcpy(nc->auto_bypass.hot_macs[x].mac, ++ eh->h_source, ++ sizeof(eh->h_source)); ++ nc->auto_bypass.nr_hot_macs++; ++} ++ ++static void queue_suggested_bypass(struct netchannel2 *nc, ++ const char *mac) ++{ ++ int ind; ++ ++ ind = nc->auto_bypass.suggestion_head % AUTOBYPASS_SUGG_QUEUE_SIZE; ++ if (nc->auto_bypass.suggestion_head == ++ nc->auto_bypass.suggestion_tail + AUTOBYPASS_SUGG_QUEUE_SIZE) { ++ /* We've overflowed the suggestion queue. That means ++ that, even though we're receiving a massive number ++ of packets, we've never had enough free ring space ++ to actually send a suggestion message. I'm not ++ convinced that's actually possible, but it's ++ trivial to handle, so we might as well. */ ++ /* Drop the oldest pending suggestion, since it's the ++ most likely to be out of date and therefore ++ useless. 
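The drop-oldest discipline described here reduces to a small free-running head/tail ring. A stand-alone model follows; the struct and size names are illustrative, while the real fields are suggestion_head, suggestion_tail and suggestions[] in struct nc2_auto_bypass, drained by _nc2_autobypass_make_suggestions() below.

#include <stdint.h>
#include <string.h>

#define SUGG_QUEUE_SIZE 8       /* stands in for AUTOBYPASS_SUGG_QUEUE_SIZE */

struct sugg_queue {
        uint8_t  macs[SUGG_QUEUE_SIZE][6];
        unsigned head, tail;    /* free-running; slot = value % size */
};

/* Producer: called for each MAC worth suggesting a bypass for. */
static void push_suggestion(struct sugg_queue *q, const uint8_t *mac)
{
        if (q->head == q->tail + SUGG_QUEUE_SIZE)
                q->tail++;      /* full: discard the oldest entry */
        memcpy(q->macs[q->head % SUGG_QUEUE_SIZE], mac, 6);
        q->head++;
}

/* Consumer: drains one entry at a time while ring space is available. */
static int pop_suggestion(struct sugg_queue *q, uint8_t *mac)
{
        if (q->tail == q->head)
                return 0;       /* empty */
        memcpy(mac, q->macs[q->tail % SUGG_QUEUE_SIZE], 6);
        q->tail++;
        return 1;
}
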
*/ ++ nc->auto_bypass.suggestion_tail++; ++ } ++ nc->auto_bypass.suggestion_head++; ++ memcpy(&nc->auto_bypass.suggestions[ind], ++ mac, ++ ETH_ALEN); ++} ++ ++static void suggest_bypasses(struct netchannel2 *nc) ++{ ++ unsigned x; ++ unsigned threshold; ++ ++ BUG_ON(nc->auto_bypass.nr_hot_macs == 0); ++ threshold = ++ (nc->auto_bypass.nr_non_bypass_packets + ++ nc->auto_bypass.nr_bypass_packets) / AUTOBYPASS_RATIO2; ++ for (x = 0; x < nc->auto_bypass.nr_hot_macs; x++) { ++ if (nc->auto_bypass.hot_macs[x].count > threshold) { ++ queue_suggested_bypass( ++ nc, ++ nc->auto_bypass.hot_macs[x].mac); ++ } ++ } ++} ++ ++/* Called under the master ring lock whenever we receive a packet with ++ NC2_PACKET_FLAG_bypass_candidate set. */ ++void nc2_received_bypass_candidate_packet(struct netchannel2 *nc, ++ struct sk_buff *skb) ++{ ++ nc->auto_bypass.nr_bypass_packets++; ++ switch (nc->auto_bypass.state) { ++ case autobypass_state_normal: ++ if (nc->auto_bypass.nr_bypass_packets != AUTOBYPASS_PERIOD) ++ return; ++ if (!busy_enough_for_bypass(nc)) ++ return; ++ nc->auto_bypass.nr_hot_macs = 0; ++ nc->auto_bypass.state = autobypass_state_considering; ++ break; ++ case autobypass_state_considering: ++ record_source_mac(nc, skb); ++ if (nc->auto_bypass.nr_bypass_packets != AUTOBYPASS_PERIOD) ++ return; ++ if (busy_enough_for_bypass(nc)) ++ suggest_bypasses(nc); ++ nc->auto_bypass.state = autobypass_state_debounce; ++ break; ++ case autobypass_state_debounce: ++ if (nc->auto_bypass.nr_bypass_packets == AUTOBYPASS_PERIOD) { ++ nc->auto_bypass.state = autobypass_state_normal; ++ nc->auto_bypass.nr_non_bypass_packets = 0; ++ nc->auto_bypass.nr_bypass_packets = 0; ++ nc->auto_bypass.start_jiffies = jiffies; ++ } ++ break; ++ } ++} ++ ++static int send_suggestion(struct netchannel2_ring_pair *ncrp, ++ const char *mac) ++{ ++ struct netchannel2_msg_suggest_bypass msg; ++ ++ if (!nc2_can_send_payload_bytes(&ncrp->prod_ring, sizeof(msg))) ++ return 0; ++ ++ memset(&msg, 0, sizeof(msg)); ++ memcpy(msg.mac, mac, ETH_ALEN); ++ nc2_send_message(&ncrp->prod_ring, ++ NETCHANNEL2_MSG_SUGGEST_BYPASS, ++ 0, ++ &msg, ++ sizeof(msg)); ++ ncrp->pending_time_sensitive_messages = 1; ++ return 1; ++} ++ ++void _nc2_autobypass_make_suggestions(struct netchannel2 *nc) ++{ ++ struct nc2_auto_bypass *nab = &nc->auto_bypass; ++ struct netchannel2_ring_pair *ncrp = &nc->rings; ++ unsigned ind; ++ ++ while (nab->suggestion_tail != nab->suggestion_head) { ++ BUG_ON(nab->suggestion_head - nab->suggestion_tail > ++ AUTOBYPASS_SUGG_QUEUE_SIZE); ++ ind = nab->suggestion_tail % AUTOBYPASS_SUGG_QUEUE_SIZE; ++ if (!send_suggestion(ncrp, nab->suggestions[ind].mac)) ++ break; ++ nab->suggestion_tail++; ++ } ++} ++ ++void nc2_shutdown_autoteardown(void) ++{ ++ /* There shouldn't be any interfaces at all, so there ++ certainly won't be any bypasses, and we don't have to worry ++ about the timer getting requeued. Make sure it's finished ++ and then get out. */ ++ del_timer_sync(&autoteardown_timer); ++} +diff --git a/drivers/xen/netchannel2/bypass.c b/drivers/xen/netchannel2/bypass.c +new file mode 100644 +index 0000000..0bee4ea +--- /dev/null ++++ b/drivers/xen/netchannel2/bypass.c +@@ -0,0 +1,824 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "netchannel2_core.h" ++ ++/* Can we send this packet on this bypass? True if the destination ++ MAC address matches. 
*/ ++static int can_bypass_packet(struct nc2_alternate_ring *ncr, ++ struct sk_buff *skb) ++{ ++ struct ethhdr *eh; ++ ++ if (skb_headlen(skb) < sizeof(*eh)) ++ return 0; ++ eh = (struct ethhdr *)skb->data; ++ if (memcmp(eh->h_dest, ncr->rings.remote_mac, ETH_ALEN)) ++ return 0; ++ else ++ return 1; ++} ++ ++/* Called from the netdev start_xmit method. We're holding the master ++ nc ring lock, but not the bypass ring lock. */ ++int bypass_xmit_packet(struct netchannel2 *nc, ++ struct nc2_alternate_ring *ncr, ++ struct sk_buff *skb) ++{ ++ struct netchannel2_ring_pair *rings = &ncr->rings; ++ struct skb_cb_overlay *skb_co = get_skb_overlay(skb); ++ size_t msg_size; ++ enum transmit_policy policy; ++ int r; ++ ++ if (!can_bypass_packet(ncr, skb)) ++ return 0; ++ ++ spin_lock(&rings->lock); ++ if (ncr->state != nc2_alt_ring_ready) { ++ spin_unlock(&rings->lock); ++ return 0; ++ } ++ /* We're now committed to either transmitting this packet on ++ this ring or dropping it outright. */ ++ if (skb->len <= PACKET_PREFIX_SIZE && !skb_is_nonlinear(skb)) { ++ r = prepare_xmit_allocate_small(rings, skb); ++ policy = transmit_policy_small; ++ } else { ++ r = prepare_xmit_allocate_grant(rings, skb, 1); ++ policy = transmit_policy_grant; ++ } ++ if (r < 0) { ++ spin_unlock(&rings->lock); ++ dev_kfree_skb(skb); ++ return 1; ++ } ++ ++ skb_co->policy = policy; ++ msg_size = get_transmitted_packet_msg_size(skb); ++ if (!nc2_reserve_payload_bytes(&rings->prod_ring, msg_size)) { ++ /* Uh oh. */ ++ release_tx_packet(rings, skb); ++ spin_unlock(&rings->lock); ++ return 1; ++ } ++ ++#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS ++ ncr->autoteardown.nr_packets++; ++#endif ++ ++ queue_packet_to_interface(skb, rings); ++ ++ spin_unlock(&rings->lock); ++ ++ return 1; ++} ++ ++void nc2_aux_ring_start_disable_sequence(struct nc2_alternate_ring *nar) ++{ ++ spin_lock(&nar->rings.lock); ++ if (nar->state < nc2_alt_ring_disabling) { ++#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS ++ /* We should really hold the autoteardown lock for ++ this, but see the big comment in ++ autoteardown_timer_fn() */ ++ nar->autoteardown.seen_count = 0; ++#endif ++ nar->state = nc2_alt_ring_disabling; ++ nc2_kick(&nar->rings); ++ } ++ spin_unlock(&nar->rings.lock); ++} ++ ++static void start_detach_worker(struct work_struct *ws) ++{ ++ struct nc2_alternate_ring *ncr = ++ container_of(ws, struct nc2_alternate_ring, detach_work_item); ++ ++ /* Detach from the ring. Note that it may still be running at ++ this point. In that case, we need to stop it and then go ++ and discard any outstanding messages on it. */ ++ ++ /* Stop the IRQ and change state. This will prevent us from ++ being added to the schedule list again, but we may still be ++ on it for other reasons, so we need to get back into the ++ worker thread to finish up. */ ++ ++ /* We defer actually unmapping the rings to ++ nc2_advertise_rings(), since that's on the worker thread ++ and we therefore know we're not going to race anything ++ doing it there. 
*/ ++ ++ if (ncr->rings.irq >= 0) ++ unbind_from_irqhandler(ncr->rings.irq, &ncr->rings); ++ ncr->rings.irq = -1; ++ ++ nc2_unregister_bypass_for_autoteardown(ncr); ++ ++ spin_lock_bh(&ncr->rings.lock); ++ ncr->state = nc2_alt_ring_detached_pending; ++ ncr->rings.interface->need_aux_ring_state_machine = 1; ++ nc2_kick(&ncr->rings.interface->rings); ++ spin_unlock_bh(&ncr->rings.lock); ++} ++ ++void nc2_aux_ring_start_detach_sequence(struct nc2_alternate_ring *nar) ++{ ++ spin_lock(&nar->rings.lock); ++ if (nar->state >= nc2_alt_ring_detaching) { ++ spin_unlock(&nar->rings.lock); ++ return; ++ } ++ nar->state = nc2_alt_ring_detaching; ++ spin_unlock(&nar->rings.lock); ++ ++ /* We can't do unbind_from_irqhandler() from a tasklet, so ++ punt it to a workitem. */ ++ INIT_WORK(&nar->detach_work_item, ++ start_detach_worker); ++ schedule_work(&nar->detach_work_item); ++} ++ ++/* Crank through the auxiliary ring state machine. Called holding the ++ * master ring lock. */ ++void _nc2_crank_aux_ring_state_machine(struct netchannel2 *nc) ++{ ++ struct nc2_alternate_ring *nar; ++ struct nc2_alternate_ring *next_nar; ++ struct netchannel2_msg_bypass_disabled disabled_msg; ++ struct netchannel2_msg_bypass_detached detached_msg; ++ struct netchannel2_msg_bypass_frontend_ready frontend_ready_msg; ++ ++ memset(&disabled_msg, 0, sizeof(disabled_msg)); ++ memset(&detached_msg, 0, sizeof(detached_msg)); ++ memset(&frontend_ready_msg, 0, sizeof(frontend_ready_msg)); ++ ++ if (nc->pending_bypass_error) { ++ if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring, ++ sizeof(frontend_ready_msg))) ++ return; ++ frontend_ready_msg.port = -1; ++ nc2_send_message(&nc->rings.prod_ring, ++ NETCHANNEL2_MSG_BYPASS_FRONTEND_READY, ++ 0, ++ &frontend_ready_msg, ++ sizeof(frontend_ready_msg)); ++ nc->rings.pending_time_sensitive_messages = 1; ++ nc->pending_bypass_error = 0; ++ } ++ ++ list_for_each_entry_safe(nar, next_nar, &nc->alternate_rings, ++ rings_by_interface) { ++ ++ spin_lock(&nar->rings.lock); ++ if (nar->state == nc2_alt_ring_frontend_send_ready_pending) { ++ if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring, ++ sizeof(frontend_ready_msg))) { ++ spin_unlock(&nar->rings.lock); ++ return; ++ } ++ frontend_ready_msg.port = ++ irq_to_evtchn_port(nar->rings.irq); ++ nc2_send_message(&nc->rings.prod_ring, ++ NETCHANNEL2_MSG_BYPASS_FRONTEND_READY, ++ 0, ++ &frontend_ready_msg, ++ sizeof(frontend_ready_msg)); ++ nar->state = nc2_alt_ring_frontend_sent_ready; ++ nc->rings.pending_time_sensitive_messages = 1; ++ } ++ if (nar->state == nc2_alt_ring_disabled_pending) { ++ if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring, ++ sizeof(disabled_msg))) { ++ spin_unlock(&nar->rings.lock); ++ return; ++ } ++ disabled_msg.handle = nar->handle; ++ nc2_send_message(&nc->rings.prod_ring, ++ NETCHANNEL2_MSG_BYPASS_DISABLED, ++ 0, ++ &disabled_msg, ++ sizeof(disabled_msg)); ++ nar->state = nc2_alt_ring_disabled; ++ nc->rings.pending_time_sensitive_messages = 1; ++ } ++ if (nar->state == nc2_alt_ring_detached_pending) { ++ if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring, ++ sizeof(detached_msg))) { ++ spin_unlock(&nar->rings.lock); ++ return; ++ } ++ ++ /* If we get here then we know that nobody ++ else is going to touch the ring, because ++ that's what detached_pending means. 
*/ ++ /* Deferred from start_detach_worker() */ ++ nc2_unmap_grants(&nar->prod_mapper); ++ nc2_unmap_grants(&nar->cons_mapper); ++ nc2_unmap_grants(&nar->control_mapper); ++ ++ detached_msg.handle = nar->handle; ++ nc2_send_message(&nc->rings.prod_ring, ++ NETCHANNEL2_MSG_BYPASS_DETACHED, ++ 0, ++ &detached_msg, ++ sizeof(detached_msg)); ++ nc->rings.pending_time_sensitive_messages = 1; ++ ++ list_del(&nar->rings_by_interface); ++ ++ spin_unlock(&nar->rings.lock); ++ ++ kfree(nar); ++ } else { ++ spin_unlock(&nar->rings.lock); ++ } ++ } ++ nc->need_aux_ring_state_machine = 0; ++} ++ ++static int map_rings_common(struct nc2_alternate_ring *ncr, ++ struct netchannel2_msg_bypass_common *msg) ++{ ++ int err; ++ ++ if (msg->ring_domid == DOMID_SELF) ++ msg->ring_domid = ncr->rings.interface->rings.otherend_id; ++ ++ err = nc2_map_grants(&ncr->prod_mapper, ++ ncr->prod_grefs, ++ msg->ring_pages, ++ msg->ring_domid); ++ if (err < 0) { ++ printk(KERN_ERR "%d mapping producer ring", err); ++ return err; ++ } ++ ++ err = nc2_map_grants(&ncr->cons_mapper, ++ ncr->cons_grefs, ++ msg->ring_pages, ++ msg->ring_domid); ++ if (err < 0) { ++ printk(KERN_ERR "%d mapping consumer ring", err); ++ return err; ++ } ++ ++ err = nc2_map_grants(&ncr->control_mapper, ++ &msg->control_gref, ++ 1, ++ msg->ring_domid); ++ if (err < 0) ++ printk(KERN_ERR "%d mapping control ring", err); ++ return err; ++} ++ ++static int map_rings_frontend(struct nc2_alternate_ring *ncr) ++{ ++ struct netchannel2_frontend_shared *nfs; ++ struct netchannel2_sring_prod *prod_sring; ++ struct netchannel2_sring_cons *cons_sring; ++ int err; ++ ++ err = map_rings_common(ncr, &ncr->frontend_setup_msg.common); ++ if (err < 0) ++ return err; ++ ++ nfs = ncr->control_mapper.mapping->addr; ++ cons_sring = &nfs->cons; ++ prod_sring = &nfs->prod; ++ _nc2_attach_rings(&ncr->rings, ++ cons_sring, ++ ncr->cons_mapper.mapping->addr, ++ ncr->frontend_setup_msg.common.ring_pages * PAGE_SIZE, ++ prod_sring, ++ ncr->prod_mapper.mapping->addr, ++ ncr->frontend_setup_msg.common.ring_pages * PAGE_SIZE, ++ ncr->frontend_setup_msg.common.peer_domid); ++ ++ return 0; ++} ++ ++static int map_rings_backend(struct nc2_alternate_ring *ncr) ++{ ++ struct netchannel2_backend_shared *nbs; ++ struct netchannel2_sring_prod *prod_sring; ++ struct netchannel2_sring_cons *cons_sring; ++ int err; ++ ++ err = map_rings_common(ncr, &ncr->backend_setup_msg.common); ++ if (err < 0) ++ return err; ++ ++ nbs = ncr->control_mapper.mapping->addr; ++ cons_sring = &nbs->cons; ++ prod_sring = &nbs->prod; ++ _nc2_attach_rings(&ncr->rings, ++ cons_sring, ++ ncr->cons_mapper.mapping->addr, ++ ncr->backend_setup_msg.common.ring_pages * PAGE_SIZE, ++ prod_sring, ++ ncr->prod_mapper.mapping->addr, ++ ncr->backend_setup_msg.common.ring_pages * PAGE_SIZE, ++ ncr->backend_setup_msg.common.peer_domid); ++ ++ return 0; ++} ++ ++static void send_ready_message(struct nc2_alternate_ring *ncr) ++{ ++ struct netchannel2_msg_bypass_ready msg; ++ ++ memset(&msg, 0, sizeof(msg)); ++ if (nc2_can_send_payload_bytes(&ncr->rings.prod_ring, sizeof(msg))) { ++ nc2_send_message(&ncr->rings.prod_ring, ++ NETCHANNEL2_MSG_BYPASS_READY, ++ 0, &msg, sizeof(msg)); ++ if (nc2_flush_ring(&ncr->rings.prod_ring)) ++ notify_remote_via_irq(ncr->rings.irq); ++ } else { ++ /* This shouldn't happen, because the producer ring ++ should be essentially empty at this stage. If it ++ does, it probably means the other end is playing ++ silly buggers with the ring indexes. Drop the ++ message. 
*/ ++ printk(KERN_WARNING "Failed to send bypass ring ready message.\n"); ++ } ++} ++ ++void nc2_handle_bypass_ready(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct nc2_alternate_ring *ncr; ++ ++ if (ncrp == &nc->rings) { ++ pr_debug("bypass ready on principal interface?\n"); ++ return; ++ } ++ ncr = container_of(ncrp, struct nc2_alternate_ring, rings); ++ /* We're now allowed to start sending packets over this ++ * ring. */ ++ if (ncr->state == nc2_alt_ring_frontend_sent_ready) { ++ ncr->state = nc2_alt_ring_ready; ++#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS ++ ncr->autoteardown.seen_count = 0; ++#endif ++ } ++} ++ ++/* Called holding the aux ring lock. */ ++void _nc2_alternate_ring_disable_finish(struct nc2_alternate_ring *ncr) ++{ ++ /* No more packets will ever come out of this ring -> it is ++ now disabled. */ ++ ncr->state = nc2_alt_ring_disabled_pending; ++ ncr->rings.interface->need_aux_ring_state_machine = 1; ++ nc2_kick(&ncr->rings.interface->rings); ++} ++ ++static void initialise_bypass_frontend_work_item(struct work_struct *ws) ++{ ++ struct nc2_alternate_ring *ncr = ++ container_of(ws, struct nc2_alternate_ring, work_item); ++ struct netchannel2 *interface = ncr->rings.interface; ++ int err; ++ ++ memcpy(&ncr->rings.remote_mac, ++ ncr->frontend_setup_msg.common.remote_mac, 6); ++ err = map_rings_frontend(ncr); ++ if (err < 0) ++ goto err; ++ ++ BUG_ON(ncr->rings.cons_ring.sring == NULL); ++ ++ err = bind_listening_port_to_irqhandler(ncr->rings.otherend_id, ++ nc2_int, ++ 0, ++ "netchannel2_bypass", ++ &ncr->rings); ++ if (err < 0) ++ goto err; ++ ncr->rings.irq = err; ++ ++ /* Get it going. */ ++ nc2_kick(&ncr->rings); ++ ++ /* And get the master ring to send a FRONTEND_READY message */ ++ ncr->state = nc2_alt_ring_frontend_send_ready_pending; ++ spin_lock_bh(&interface->rings.lock); ++ interface->need_aux_ring_state_machine = 1; ++ nc2_kick(&interface->rings); ++ spin_unlock_bh(&interface->rings.lock); ++ ++ nc2_register_bypass_for_autoteardown(ncr); ++ ++ return; ++ ++err: ++ printk(KERN_ERR "Error %d setting up bypass ring!\n", err); ++ ++ spin_lock_bh(&interface->rings.lock); ++ interface->pending_bypass_error = 1; ++ interface->need_aux_ring_state_machine = 1; ++ nc2_kick(&interface->rings); ++ list_del(&ncr->rings_by_interface); ++ spin_unlock_bh(&interface->rings.lock); ++ ++ nc2_unmap_grants(&ncr->prod_mapper); ++ nc2_unmap_grants(&ncr->cons_mapper); ++ nc2_unmap_grants(&ncr->control_mapper); ++ kfree(ncr); ++ return; ++} ++ ++static void initialise_bypass_backend_work_item(struct work_struct *ws) ++{ ++ struct nc2_alternate_ring *ncr = ++ container_of(ws, struct nc2_alternate_ring, work_item); ++ struct netchannel2 *interface = ncr->rings.interface; ++ int err; ++ ++ memcpy(&ncr->rings.remote_mac, ++ ncr->backend_setup_msg.common.remote_mac, 6); ++ err = map_rings_backend(ncr); ++ if (err < 0) ++ goto err; ++ ++ err = bind_interdomain_evtchn_to_irqhandler(ncr->rings.otherend_id, ++ ncr->backend_setup_msg.port, ++ nc2_int, ++ 0, ++ "netchannel2_bypass", ++ &ncr->rings); ++ if (err < 0) ++ goto err; ++ ncr->rings.irq = err; ++ ++ send_ready_message(ncr); ++ ++#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS ++ ncr->autoteardown.seen_count = 0; ++#endif ++ ++ spin_lock_bh(&ncr->rings.lock); ++ ncr->state = nc2_alt_ring_ready; ++ spin_unlock_bh(&ncr->rings.lock); ++ ++ nc2_kick(&ncr->rings); ++ ++ nc2_register_bypass_for_autoteardown(ncr); ++ ++ return; ++ ++err: ++ printk(KERN_ERR "Error %d setting up bypass 
ring!\n", err); ++ ++ spin_lock_bh(&interface->rings.lock); ++ list_del(&ncr->rings_by_interface); ++ spin_unlock_bh(&interface->rings.lock); ++ ++ nc2_unmap_grants(&ncr->prod_mapper); ++ nc2_unmap_grants(&ncr->cons_mapper); ++ nc2_unmap_grants(&ncr->control_mapper); ++ kfree(ncr); ++ return; ++} ++ ++void nc2_handle_bypass_frontend(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct nc2_alternate_ring *work; ++ ++ if (hdr->size < sizeof(work->frontend_setup_msg)) { ++ pr_debug("Bypass message had strange size %d\n", hdr->size); ++ return; ++ } ++ if (ncrp != &nc->rings) { ++ pr_debug("Bypass message on ancillary ring!\n"); ++ return; ++ } ++ if (!nc->remote_trusted) { ++ pr_debug("Untrusted domain tried to set up a bypass.\n"); ++ return; ++ } ++ if (nc->pending_bypass_error) { ++ pr_debug("Remote tried to establish a bypass when we already had a pending error\n"); ++ return; ++ } ++ work = kzalloc(sizeof(*work), GFP_ATOMIC); ++ if (!work) { ++ printk(KERN_WARNING "no memory for alternative ring pair!\n"); ++ nc->pending_bypass_error = 1; ++ nc->need_aux_ring_state_machine = 1; ++ return; ++ } ++ nc2_copy_from_ring(&nc->rings.cons_ring, &work->frontend_setup_msg, ++ sizeof(work->frontend_setup_msg)); ++ if (hdr->size != sizeof(work->frontend_setup_msg) + ++ sizeof(uint32_t) * 2 * ++ work->frontend_setup_msg.common.ring_pages) { ++ printk(KERN_WARNING "inconsistent bypass message size (%d for %d pages)\n", ++ hdr->size, work->frontend_setup_msg.common.ring_pages); ++ goto err; ++ } ++ if (work->frontend_setup_msg.common.ring_pages > ++ MAX_BYPASS_RING_PAGES_MAPPABLE) { ++ printk(KERN_WARNING "too many ring pages: %d > %d\n", ++ work->frontend_setup_msg.common.ring_pages, ++ MAX_BYPASS_RING_PAGES_MAPPABLE); ++err: ++ kfree(work); ++ nc->pending_bypass_error = 1; ++ nc->need_aux_ring_state_machine = 1; ++ return; ++ } ++ nc2_copy_from_ring_off(&ncrp->cons_ring, ++ &work->prod_grefs, ++ sizeof(uint32_t) * ++ work->frontend_setup_msg.common.ring_pages, ++ sizeof(work->frontend_setup_msg)); ++ nc2_copy_from_ring_off(&ncrp->cons_ring, ++ &work->cons_grefs, ++ sizeof(uint32_t) * ++ work->frontend_setup_msg.common.ring_pages, ++ sizeof(work->frontend_setup_msg) + ++ sizeof(uint32_t) * ++ work->frontend_setup_msg.common.ring_pages); ++ ++ work->state = nc2_alt_ring_frontend_preparing; ++#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS ++ INIT_LIST_HEAD(&work->autoteardown.autoteardown_list); ++#endif ++ init_waitqueue_head(&work->eventq); ++ work->handle = work->frontend_setup_msg.common.handle; ++ INIT_WORK(&work->work_item, initialise_bypass_frontend_work_item); ++ if (init_ring_pair(&work->rings, nc) < 0) ++ goto err; ++ work->rings.filter_mac = 1; ++ ++ list_add(&work->rings_by_interface, &nc->alternate_rings); ++ schedule_work(&work->work_item); ++} ++ ++void nc2_handle_bypass_backend(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct nc2_alternate_ring *work; ++ ++ if (hdr->size < sizeof(work->backend_setup_msg)) { ++ pr_debug("Bypass message had strange size %d\n", hdr->size); ++ return; ++ } ++ if (ncrp != &nc->rings) { ++ pr_debug("Bypass message on ancillary ring!\n"); ++ return; ++ } ++ if (!nc->remote_trusted) { ++ pr_debug("Untrusted domain tried to set up a bypass.\n"); ++ return; ++ } ++ work = kzalloc(sizeof(*work), GFP_ATOMIC); ++ if (!work) { ++ printk(KERN_WARNING "no memory for alternative ring pair!\n"); ++ return; ++ } ++ nc2_copy_from_ring(&nc->rings.cons_ring, 
&work->backend_setup_msg, ++ sizeof(work->backend_setup_msg)); ++ if (hdr->size != sizeof(work->backend_setup_msg) + ++ sizeof(uint32_t) * 2 * ++ work->backend_setup_msg.common.ring_pages) { ++ printk(KERN_WARNING "inconsistent bypass message size (%d for %d pages)\n", ++ hdr->size, work->backend_setup_msg.common.ring_pages); ++ goto err; ++ } ++ if (work->backend_setup_msg.common.ring_pages > ++ MAX_BYPASS_RING_PAGES_MAPPABLE) { ++ printk(KERN_WARNING "too many ring pages: %d > %d\n", ++ work->backend_setup_msg.common.ring_pages, ++ MAX_BYPASS_RING_PAGES_MAPPABLE); ++err: ++ kfree(work); ++ return; ++ } ++ nc2_copy_from_ring_off(&ncrp->cons_ring, ++ &work->prod_grefs, ++ sizeof(uint32_t) * ++ work->backend_setup_msg.common.ring_pages, ++ sizeof(work->backend_setup_msg)); ++ nc2_copy_from_ring_off(&ncrp->cons_ring, ++ &work->cons_grefs, ++ sizeof(uint32_t) * ++ work->backend_setup_msg.common.ring_pages, ++ sizeof(work->backend_setup_msg) + ++ sizeof(uint32_t) * ++ work->backend_setup_msg.common.ring_pages); ++ ++#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS ++ INIT_LIST_HEAD(&work->autoteardown.autoteardown_list); ++#endif ++ work->state = nc2_alt_ring_backend_preparing; ++ init_waitqueue_head(&work->eventq); ++ work->handle = work->backend_setup_msg.common.handle; ++ INIT_WORK(&work->work_item, initialise_bypass_backend_work_item); ++ if (init_ring_pair(&work->rings, nc) < 0) ++ goto err; ++ work->rings.filter_mac = 1; ++ ++ list_add(&work->rings_by_interface, &nc->alternate_rings); ++ schedule_work(&work->work_item); ++} ++ ++/* Called under the nc master ring. */ ++static struct nc2_alternate_ring *find_ring_by_handle(struct netchannel2 *nc, ++ uint32_t handle) ++{ ++ struct nc2_alternate_ring *nar; ++ list_for_each_entry(nar, &nc->alternate_rings, rings_by_interface) { ++ if (nar->handle == handle) ++ return nar; ++ } ++ return NULL; ++} ++ ++void nc2_handle_bypass_disable(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct netchannel2_msg_bypass_disable msg; ++ struct nc2_alternate_ring *nar; ++ ++ if (ncrp != &nc->rings) { ++ pr_debug("Bypass disable on ancillary ring!\n"); ++ return; ++ } ++ if (!nc->remote_trusted) { ++ pr_debug("Untrusted remote requested bypass disable.\n"); ++ return; ++ } ++ if (hdr->size != sizeof(msg)) { ++ printk(KERN_WARNING "Strange size bypass disable message; %d != %zd.\n", ++ hdr->size, sizeof(msg)); ++ return; ++ } ++ nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg)); ++ nar = find_ring_by_handle(nc, msg.handle); ++ if (nar == NULL) { ++ printk(KERN_WARNING "Request to disable unknown alternate ring %d.\n", ++ msg.handle); ++ return; ++ } ++ nc2_aux_ring_start_disable_sequence(nar); ++} ++ ++/* We've received a BYPASS_DETACH message on the master ring. Do ++ what's needed to process it. */ ++/* Called from the tasklet holding the master ring lock. 
*/ ++void nc2_handle_bypass_detach(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct netchannel2_msg_bypass_detach msg; ++ struct nc2_alternate_ring *nar; ++ ++ if (ncrp != &nc->rings) { ++ pr_debug("Bypass detach on wrong ring.\n"); ++ return; ++ } ++ if (!nc->remote_trusted) { ++ pr_debug("Detach request from untrusted peer.\n"); ++ return; ++ } ++ if (hdr->size != sizeof(msg)) { ++ printk(KERN_WARNING "Strange size bypass detach message; %d != %zd.\n", ++ hdr->size, sizeof(msg)); ++ return; ++ } ++ nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg)); ++ nar = find_ring_by_handle(nc, msg.handle); ++ if (nar == NULL) { ++ printk(KERN_WARNING "Request to detach from unknown alternate ring %d.\n", ++ msg.handle); ++ return; ++ } ++ ++ nc2_aux_ring_start_detach_sequence(nar); ++} ++ ++/* This is only called once the irqs have been stopped and the ++ interfaces have been de-pended, so it shouldn't have to worry about ++ any async activity. */ ++static void release_alt_ring(struct nc2_alternate_ring *nar) ++{ ++ flush_scheduled_work(); ++ ++ nc2_unmap_grants(&nar->prod_mapper); ++ nc2_unmap_grants(&nar->cons_mapper); ++ nc2_unmap_grants(&nar->control_mapper); ++ ++ cleanup_ring_pair(&nar->rings); ++} ++ ++void nc2_release_alt_rings(struct netchannel2 *nc) ++{ ++ struct nc2_alternate_ring *nar, *next_nar; ++ ++ list_for_each_entry_safe(nar, next_nar, &nc->alternate_rings, ++ rings_by_interface) { ++ release_alt_ring(nar); ++ } ++} ++ ++/* This is called from a suspend callback just before the VM goes down ++ for suspend/resume. When it returns, we must have unmapped all ++ bypass rings. There is no possibility of failing. */ ++void detach_all_bypasses(struct netchannel2 *nc) ++{ ++ struct nc2_alternate_ring *nar; ++ ++ int cntr; ++ ++ spin_lock_bh(&nc->rings.lock); ++ cntr = 0; ++ while (!list_empty(&nc->alternate_rings) && cntr < 500) { ++ list_for_each_entry(nar, &nc->alternate_rings, ++ rings_by_interface) { ++ spin_lock(&nar->rings.lock); ++ /* If we're currently in an operating state, ++ pretend we received a DISABLE message, so ++ we eventually generate a DISABLED message. ++ The peer will then start the detach state ++ machine, which will eventually destroy the ++ bypass. */ ++ /* nc2_alt_ring_frontend_sent_ready is a bit ++ odd. We are frontend-like, and we've told ++ the backend who we are, but we haven't yet ++ received a READY from the backend. We ++ don't necessarily trust the backend, so we ++ can't wait for it. The best we can do is ++ to tell the peer that we've disabled, and ++ let it drive the backend into shutdown. */ ++ if (nar->state == nc2_alt_ring_frontend_sent_ready || ++ nar->state == nc2_alt_ring_ready) { ++ nar->state = nc2_alt_ring_disabling; ++ nc2_kick(&nar->rings); ++ } ++ spin_unlock(&nar->rings.lock); ++ } ++ spin_unlock_bh(&nc->rings.lock); ++ /* Bit of a hack... */ ++ msleep(10); ++ cntr++; ++ spin_lock_bh(&nc->rings.lock); ++ } ++ spin_unlock_bh(&nc->rings.lock); ++ ++ if (cntr < 500) ++ return; ++ ++ /* Okay, doing it the nice way didn't work. This can happen ++ if the domain at the other end of the bypass isn't picking ++ up messages, so we can't flush through all of our pending ++ packets and disable ourselves cleanly. Force it through ++ instead, by pretending that we've received a DETACH message ++ from the parent. 
*/ ++ printk(KERN_WARNING "timed out trying to disable a bypass nicely, being more forceful\n"); ++ spin_lock_bh(&nc->rings.lock); ++ cntr = 0; ++ while (!list_empty(&nc->alternate_rings)) { ++ list_for_each_entry(nar, &nc->alternate_rings, ++ rings_by_interface) { ++ spin_lock(&nar->rings.lock); ++ if (nar->state >= nc2_alt_ring_detaching) { ++ /* Okay, we're already detaching, and ++ we're waiting either for our work ++ item to run or for an opportunity ++ to tell the parent that we're ++ detached. The parent is trusted, ++ so just wait for whatever it is ++ that we're waiting for to ++ happen. */ ++ spin_unlock(&nar->rings.lock); ++ continue; ++ } ++ nar->state = nc2_alt_ring_detaching; ++ spin_unlock(&nar->rings.lock); ++ INIT_WORK(&nar->detach_work_item, ++ start_detach_worker); ++ schedule_work(&nar->detach_work_item); ++ } ++ spin_unlock_bh(&nc->rings.lock); ++ msleep(10); ++ cntr++; ++ if (cntr % 100 == 0) ++ printk(KERN_WARNING ++ "taking a long time to detach from bypasses (%d)\n", ++ cntr); ++ spin_lock_bh(&nc->rings.lock); ++ } ++ spin_unlock_bh(&nc->rings.lock); ++} +diff --git a/drivers/xen/netchannel2/bypassee.c b/drivers/xen/netchannel2/bypassee.c +new file mode 100644 +index 0000000..ce44334 +--- /dev/null ++++ b/drivers/xen/netchannel2/bypassee.c +@@ -0,0 +1,802 @@ ++/* All the bits which allow a domain to be bypassed. */ ++#include ++#include ++#include ++#include "netchannel2_core.h" ++ ++/* Bypass disable is a bit tricky. Enable is relatively easy: ++ ++ 1) We decide to establish a bypass between two interfaces. ++ 2) We allocate the pages for the rings and grant them to ++ the relevant domains. ++ 3) We nominate one endpoint as the ``backend''. ++ 4) We send both endpoints BYPASS messages. ++ 5) As far as we're concerned, the bypass is now ready. The ++ endpoints will do the rest of the negotiation without any help ++ from us. ++ ++ Disable is harder. Each bypass endpoint can be in one of three ++ states: ++ ++ -- Running normally. ++ -- Disabled. ++ -- Detached. ++ ++ A disabled endpoint won't generate any new operations (which means ++ that it can't send packets, but can send FINISHED_PACKET messages ++ and so forth). A detached endpoint is one which has no longer ++ mapped the ring pages, so it can neither send nor receive. There ++ is no provision for transitioning ``backwards'' i.e. from Disabled ++ to Running, Detached to Running, or Detached to Disabled. There ++ are a couple of messages relevant to changing state: ++ ++ -- DISABLE -- go to state Disabled if we're in Running. Ignored in ++ other states (we won't even get an ACK). We send this to the ++ endpoint. ++ -- DISABLED -- endpoint has transitioned to Disabled, whether of ++ its own accord or due to a DISABLE message. We receive this ++ from the endpoint. ++ -- DETACH -- go to state Detached if we're in Running or Disabled. ++ Ignore in other states (without an ACK). Sent to the endpoint. ++ -- DETACHED -- endpoint has transitioned to DETACHED. Received ++ from the endpoint. ++ ++ A bypass in which both endpoints are Detached can be safely ++ destroyed. ++ ++ Once either endpoint has transitioned out of Running, the bypass is ++ pretty useless, so we try to push things so that we go to ++ Detached/Detached as quickly as possible. 
In particular: ++ ++ A state B state Action ++ Running Disabled Send A a DISABLE ++ Running Detached Send A a DETACH ++ Disabled Disabled Send both endpoints DETACH ++ Disabled Detached Send A a DETACH ++ Detached Detached Destroy the interface ++ ++ (And the obvious mirror images) ++ ++ There's some filtering so that we never send a given endpoint more ++ than one DISABLE message or more than one DETACH message. If we ++ want to tear the bypass down from this end, we send both endpoints ++ DISABLE messages and let the state machine take things from ++ there. ++ ++ The core state machine is implemented in ++ crank_bypass_state_machine(). ++*/ ++ ++/* A list of all currently-live nc2_bypass interfaces. Only touched ++ from the worker thread. */ ++static LIST_HEAD(all_bypasses); ++ ++/* Bottom-half safe lock protecting pretty much all of the bypass ++ state, across all interfaces. The pending_list_lock is sometimes ++ acquired while this is held. It is acquired while holding the ring ++ lock. */ ++static DEFINE_SPINLOCK(bypasses_lock); ++ ++/* Encourage the endpoint to detach as soon as possible. */ ++/* Called under the bypass lock. */ ++static void schedule_detach(struct nc2_bypass_endpoint *ep) ++{ ++ if (!ep->detached && !ep->need_detach && !ep->detach_sent) { ++ BUG_ON(ep->nc2 == NULL); ++ ep->need_detach = 1; ++ ep->nc2->need_advertise_bypasses = 1; ++ nc2_kick(&ep->nc2->rings); ++ } ++} ++ ++/* Encourage the endpoint to disable as soon as possible. */ ++/* Called under the bypass lock. */ ++static void schedule_disable(struct nc2_bypass_endpoint *ep) ++{ ++ if (!ep->disabled && !ep->need_disable && !ep->disable_sent) { ++ BUG_ON(ep->detached); ++ BUG_ON(ep->nc2 == NULL); ++ ep->need_disable = 1; ++ ep->nc2->need_advertise_bypasses = 1; ++ nc2_kick(&ep->nc2->rings); ++ } ++} ++ ++static void grant_end(grant_ref_t *gref) ++{ ++ if (*gref && gnttab_end_foreign_access_ref(*gref)) { ++ gnttab_free_grant_reference(*gref); ++ *gref = 0; ++ } ++} ++ ++/* Release all resources associated with the bypass. It is assumed ++ that the caller has ensured that nobody else is going to access it ++ any more. */ ++static void release_bypass(struct nc2_bypass *bypass) ++{ ++ int i; ++ ++ BUG_ON(atomic_read(&bypass->refcnt) != 0); ++ ++ for (i = 0; i < bypass->nr_ring_pages; i++) { ++ grant_end(&bypass->ep_a.incoming_grefs[i]); ++ grant_end(&bypass->ep_b.incoming_grefs[i]); ++ grant_end(&bypass->ep_a.outgoing_grefs[i]); ++ grant_end(&bypass->ep_b.outgoing_grefs[i]); ++ if (bypass->ep_a.incoming_pages[i] && ++ !bypass->ep_a.incoming_grefs[i] && ++ !bypass->ep_b.outgoing_grefs[i]) ++ free_page(bypass->ep_a.incoming_pages[i]); ++ if (bypass->ep_b.incoming_pages[i] && ++ !bypass->ep_b.incoming_grefs[i] && ++ !bypass->ep_a.outgoing_grefs[i]) ++ free_page(bypass->ep_b.incoming_pages[i]); ++ } ++ grant_end(&bypass->ep_a.control_gref); ++ grant_end(&bypass->ep_b.control_gref); ++ if (bypass->control_page && ++ !bypass->ep_a.control_gref && ++ !bypass->ep_b.control_gref) ++ free_page(bypass->control_page); ++ ++ kfree(bypass); ++} ++ ++static void put_bypass(struct nc2_bypass *bypass) ++{ ++ if (atomic_dec_and_test(&bypass->refcnt)) ++ release_bypass(bypass); ++} ++ ++/* The state of one of the bypass endpoints has changed. Crank ++ through the state machine, scheduling any messages which are ++ needed. Tear the bypass down if both ends have detached. */ ++/* Called under the bypass lock. 
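The action table above can be read as a pure function of the two endpoint states. A stand-alone rendering follows (enum and function names are illustrative; "A" denotes the less-advanced endpoint, as in the left-hand column), which may help when reading crank_bypass_state_machine() below.

enum ep_state { EP_RUNNING, EP_DISABLED, EP_DETACHED };

enum bypass_action {
        DO_NOTHING,             /* Running/Running: leave it alone */
        SEND_A_DISABLE,
        SEND_A_DETACH,
        SEND_BOTH_DETACH,
        DESTROY_BYPASS,
};

static enum bypass_action next_action(enum ep_state a, enum ep_state b)
{
        /* Normalise so that 'a' is the less-advanced endpoint. */
        if (a > b) {
                enum ep_state t = a;
                a = b;
                b = t;
        }
        if (a == EP_RUNNING && b == EP_DISABLED)
                return SEND_A_DISABLE;
        if (a == EP_RUNNING && b == EP_DETACHED)
                return SEND_A_DETACH;
        if (a == EP_DISABLED && b == EP_DISABLED)
                return SEND_BOTH_DETACH;
        if (a == EP_DISABLED && b == EP_DETACHED)
                return SEND_A_DETACH;
        if (a == EP_DETACHED && b == EP_DETACHED)
                return DESTROY_BYPASS;
        return DO_NOTHING;
}
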
*/ ++static void crank_bypass_state_machine(struct nc2_bypass *bypass) ++{ ++ if (bypass->ep_a.disabled != bypass->ep_b.disabled) { ++ schedule_disable(&bypass->ep_a); ++ schedule_disable(&bypass->ep_b); ++ } ++ if (bypass->ep_a.disabled && bypass->ep_b.disabled) { ++ schedule_detach(&bypass->ep_b); ++ schedule_detach(&bypass->ep_a); ++ } ++ if (bypass->ep_a.detached != bypass->ep_b.detached) { ++ schedule_detach(&bypass->ep_b); ++ schedule_detach(&bypass->ep_a); ++ } ++ if (bypass->ep_a.detached && bypass->ep_b.detached) { ++ /* Okay, neither endpoint knows about the bypass any ++ more. It is therefore dead. */ ++ /* XXX: Should there be a concept of zombie bypasses? ++ * i.e. keep the bypass around until userspace ++ * explicitly reaps it, so as to avoid the usual ID ++ * reuse races. */ ++ list_del_init(&bypass->list); ++ wake_up_all(&bypass->detach_waitq); ++ put_bypass(bypass); ++ } ++} ++ ++/* A bypass disabled message has been received on @ncrp (which should ++ be the main ring for @nc, or someone's misbehaving). */ ++/* Called from the tasklet. */ ++void nc2_handle_bypass_disabled(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct netchannel2_msg_bypass_disabled msg; ++ struct nc2_bypass *bypass; ++ ++ if (hdr->size != sizeof(msg)) { ++ pr_debug("Strange size bypass disabled message; %d != %zd.\n", ++ hdr->size, sizeof(msg)); ++ return; ++ } ++ if (ncrp != &nc->rings) { ++ pr_debug("bypass_disabled on wrong ring.\n"); ++ return; ++ } ++ nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg)); ++ spin_lock(&bypasses_lock); ++ list_for_each_entry(bypass, &nc->bypasses_a, ep_a.list) { ++ if (bypass->handle == msg.handle) { ++ bypass->ep_a.disabled = 1; ++ crank_bypass_state_machine(bypass); ++ spin_unlock(&bypasses_lock); ++ return; ++ } ++ } ++ list_for_each_entry(bypass, &nc->bypasses_b, ep_b.list) { ++ if (bypass->handle == msg.handle) { ++ bypass->ep_b.disabled = 1; ++ crank_bypass_state_machine(bypass); ++ spin_unlock(&bypasses_lock); ++ return; ++ } ++ } ++ spin_unlock(&bypasses_lock); ++ ++ pr_debug("Disabled message was on the wrong ring (%d)?\n", ++ msg.handle); ++ return; ++} ++ ++static void detach(struct nc2_bypass_endpoint *ep) ++{ ++ if (ep->detached) ++ return; ++ list_del_init(&ep->list); ++ ep->disabled = ep->detached = 1; ++ ep->nc2->extant_bypasses--; ++ ep->nc2 = NULL; ++} ++ ++/* One of our peers has sent us a bypass detached message i.e. it was ++ previously bypassing us, and it's not any more. Do the appropriate ++ thing. 
*/ ++void nc2_handle_bypass_detached(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct netchannel2_msg_bypass_detached msg; ++ struct nc2_bypass *bypass; ++ ++ if (hdr->size != sizeof(msg)) { ++ pr_debug("Strange size bypass detached message; %d != %zd.\n", ++ hdr->size, sizeof(msg)); ++ return; ++ } ++ if (ncrp != &nc->rings) { ++ pr_debug("bypass_disabled on wrong ring.\n"); ++ return; ++ } ++ nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg)); ++ spin_lock(&bypasses_lock); ++ list_for_each_entry(bypass, &nc->bypasses_a, ep_a.list) { ++ if (bypass->handle == msg.handle) { ++ detach(&bypass->ep_a); ++ crank_bypass_state_machine(bypass); ++ spin_unlock(&bypasses_lock); ++ return; ++ } ++ } ++ list_for_each_entry(bypass, &nc->bypasses_b, ep_b.list) { ++ if (bypass->handle == msg.handle) { ++ detach(&bypass->ep_b); ++ crank_bypass_state_machine(bypass); ++ spin_unlock(&bypasses_lock); ++ return; ++ } ++ } ++ spin_unlock(&bypasses_lock); ++ pr_debug("Detached message was on the wrong ring (%d)?\n", ++ msg.handle); ++} ++ ++static void process_suggestion_queue_workitem(struct work_struct *ws) ++{ ++ struct netchannel2 *nc = ++ container_of(ws, struct netchannel2, ++ incoming_bypass_suggestions.workitem); ++ struct nc2_incoming_bypass_suggestions *sugg = ++ &nc->incoming_bypass_suggestions; ++ unsigned ind; ++ unsigned char mac[ETH_ALEN]; ++ ++ spin_lock_bh(&sugg->lock); ++ while (sugg->tail != sugg->head) { ++ ind = sugg->tail % NC2_BYPASS_SUGG_QUEUE_SIZE; ++ memcpy(mac, sugg->queue[ind].mac, ETH_ALEN); ++ sugg->tail++; ++ spin_unlock_bh(&sugg->lock); ++ ++ nb2_handle_suggested_bypass(nc, mac); ++ ++ spin_lock_bh(&sugg->lock); ++ } ++ spin_unlock_bh(&sugg->lock); ++} ++ ++void nc2_handle_suggest_bypass(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct nc2_incoming_bypass_suggestions *sugg = ++ &nc->incoming_bypass_suggestions; ++ struct netchannel2_msg_suggest_bypass msg; ++ unsigned ind; ++ ++ if (hdr->size != sizeof(msg)) { ++ pr_debug("strange size suggest bypass message; %d != %zd\n", ++ hdr->size, sizeof(msg)); ++ return; ++ } ++ if (ncrp != &nc->rings) { ++ pr_debug("suggest bypass on bypass ring?\n"); ++ return; ++ } ++ nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg)); ++ ++ spin_lock(&sugg->lock); ++ ind = sugg->head % NC2_BYPASS_SUGG_QUEUE_SIZE; ++ /* Drop if we've overflowed the queue */ ++ if (sugg->head == sugg->tail + NC2_BYPASS_SUGG_QUEUE_SIZE) ++ sugg->tail++; ++ memcpy(&sugg->queue[ind].mac, msg.mac, ETH_ALEN); ++ if (sugg->head == sugg->tail) ++ schedule_work(&sugg->workitem); ++ sugg->head++; ++ spin_unlock(&sugg->lock); ++} ++ ++ ++static int send_disable_bypass_msg(struct netchannel2 *nc, ++ struct nc2_bypass *bypass) ++{ ++ struct netchannel2_msg_bypass_disable msg = { ++ .handle = bypass->handle ++ }; ++ ++ if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring, sizeof(msg))) ++ return 1; ++ nc2_send_message(&nc->rings.prod_ring, NETCHANNEL2_MSG_BYPASS_DISABLE, ++ 0, &msg, sizeof(msg)); ++ nc->rings.pending_time_sensitive_messages = 1; ++ return 0; ++} ++ ++static int send_detach_bypass_msg(struct netchannel2 *nc, ++ struct nc2_bypass *bypass) ++{ ++ struct netchannel2_msg_bypass_detach msg = { ++ .handle = bypass->handle ++ }; ++ ++ if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring, sizeof(msg))) ++ return 1; ++ nc2_send_message(&nc->rings.prod_ring, NETCHANNEL2_MSG_BYPASS_DETACH, ++ 0, &msg, sizeof(msg)); ++ 
nc->rings.pending_time_sensitive_messages = 1; ++ return 0; ++} ++ ++static void init_bypass_msg_common(struct netchannel2_msg_bypass_common *msg, ++ struct nc2_bypass_endpoint *this_ep, ++ struct netchannel2 *remote, ++ struct nc2_bypass *bypass) ++{ ++ msg->control_gref = this_ep->control_gref; ++ ++ msg->ring_domid = DOMID_SELF; ++ msg->ring_pages = bypass->nr_ring_pages;; ++ msg->peer_domid = remote->rings.otherend_id; ++ msg->peer_trusted = remote->remote_trusted; ++ msg->handle = bypass->handle; ++ memcpy(msg->remote_mac, remote->rings.remote_mac, ETH_ALEN); ++} ++ ++static int advertise_bypass_frontend(struct netchannel2 *nc, ++ struct nc2_bypass *bypass) ++{ ++ struct netchannel2_msg_bypass_frontend msg; ++ unsigned msg_size; ++ ++ BUG_ON(nc != bypass->ep_a.nc2); ++ ++ msg_size = sizeof(msg) + bypass->nr_ring_pages * 2 * sizeof(uint32_t); ++ if (!nc->current_bypass_frontend && ++ !nc2_can_send_payload_bytes(&nc->rings.prod_ring, msg_size)) ++ return 1; ++ ++ memset(&msg, 0, sizeof(msg)); ++ ++ init_bypass_msg_common(&msg.common, &bypass->ep_a, bypass->ep_b.nc2, ++ bypass); ++ ++ nc->current_bypass_frontend = bypass; ++ ++ /* Send the message. nc2_send_message doesn't support the ++ right kind of scatter gather, so do it by hand. */ ++ __nc2_avoid_ring_wrap(&nc->rings.prod_ring, msg_size); ++ msg.hdr.type = NETCHANNEL2_MSG_BYPASS_FRONTEND; ++ msg.hdr.size = msg_size; ++ nc2_copy_to_ring(&nc->rings.prod_ring, &msg, sizeof(msg)); ++ nc2_copy_to_ring_off(&nc->rings.prod_ring, ++ bypass->ep_a.outgoing_grefs, ++ sizeof(uint32_t) * bypass->nr_ring_pages, ++ sizeof(msg)); ++ nc2_copy_to_ring_off(&nc->rings.prod_ring, ++ bypass->ep_a.incoming_grefs, ++ sizeof(uint32_t) * bypass->nr_ring_pages, ++ sizeof(msg) + sizeof(uint32_t) * bypass->nr_ring_pages); ++ nc->rings.prod_ring.prod_pvt += msg_size; ++ nc->rings.prod_ring.bytes_available -= msg_size; ++ nc->rings.pending_time_sensitive_messages = 1; ++ return 0; ++} ++ ++static int advertise_bypass_backend(struct netchannel2 *nc, ++ struct nc2_bypass *bypass) ++{ ++ struct netchannel2_msg_bypass_backend msg; ++ unsigned msg_size; ++ ++ BUG_ON(nc != bypass->ep_b.nc2); ++ ++ msg_size = sizeof(msg) + bypass->nr_ring_pages * 2 * sizeof(uint32_t); ++ if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring, msg_size)) ++ return 1; ++ ++ memset(&msg, 0, sizeof(msg)); ++ ++ init_bypass_msg_common(&msg.common, &bypass->ep_b, bypass->ep_a.nc2, ++ bypass); ++ ++ BUG_ON(bypass->evtchn_port == 0); ++ msg.port = bypass->evtchn_port; ++ msg.hdr.type = NETCHANNEL2_MSG_BYPASS_BACKEND; ++ msg.hdr.size = msg_size; ++ nc2_copy_to_ring(&nc->rings.prod_ring, &msg, sizeof(msg)); ++ nc2_copy_to_ring_off(&nc->rings.prod_ring, ++ bypass->ep_b.outgoing_grefs, ++ sizeof(uint32_t) * bypass->nr_ring_pages, ++ sizeof(msg)); ++ nc2_copy_to_ring_off(&nc->rings.prod_ring, ++ bypass->ep_b.incoming_grefs, ++ sizeof(uint32_t) * bypass->nr_ring_pages, ++ sizeof(msg) + sizeof(uint32_t) * bypass->nr_ring_pages); ++ nc->rings.prod_ring.prod_pvt += msg_size; ++ nc->rings.prod_ring.bytes_available -= msg_size; ++ nc->rings.pending_time_sensitive_messages = 1; ++ return 0; ++} ++ ++/* Called from the tasklet, holding the ring lock for nc and the ++ bypass lock. */ ++static int advertise_bypass(struct netchannel2 *nc, struct nc2_bypass *bypass) ++{ ++ if (nc == bypass->ep_a.nc2) ++ return advertise_bypass_frontend(nc, bypass); ++ else ++ return advertise_bypass_backend(nc, bypass); ++} ++ ++/* Called from the tasklet holding the ring and bypass locks. 
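The advertise_bypass_frontend()/advertise_bypass_backend() functions above send a variable-length message: a fixed header followed by the outgoing grant-reference array and then the incoming one, which is why they bypass nc2_send_message() and copy the pieces by hand. A minimal userspace sketch of just that layout follows; fake_bypass_msg and serialise_advert() are made-up names, and only the ordering (header, outgoing grefs, incoming grefs) mirrors the driver.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Simplified stand-in for the fixed part of the advertisement; the
   real netchannel2_msg_bypass_* structures carry more fields. */
struct fake_bypass_msg {
        uint32_t handle;
        uint32_t ring_pages;
};

/* Lay the message out the same way the advertise functions do:
   header, then outgoing grefs, then incoming grefs. */
static size_t serialise_advert(uint8_t *buf, size_t buf_len,
                               const struct fake_bypass_msg *hdr,
                               const uint32_t *outgoing_grefs,
                               const uint32_t *incoming_grefs,
                               unsigned nr_ring_pages)
{
        size_t msg_size = sizeof(*hdr) +
                nr_ring_pages * 2 * sizeof(uint32_t);

        if (msg_size > buf_len)
                return 0;       /* caller retries when the ring has room */
        memcpy(buf, hdr, sizeof(*hdr));
        memcpy(buf + sizeof(*hdr), outgoing_grefs,
               nr_ring_pages * sizeof(uint32_t));
        memcpy(buf + sizeof(*hdr) + nr_ring_pages * sizeof(uint32_t),
               incoming_grefs, nr_ring_pages * sizeof(uint32_t));
        return msg_size;
}

int main(void)
{
        struct fake_bypass_msg hdr = { .handle = 1, .ring_pages = 2 };
        uint32_t out[2] = { 10, 11 }, in[2] = { 20, 21 };
        uint8_t ring[64];

        printf("wrote %zu bytes\n",
               serialise_advert(ring, sizeof(ring), &hdr, out, in, 2));
        return 0;
}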
*/ ++static int nc2_do_bypass_advertise_work(struct nc2_bypass_endpoint *ep, ++ struct netchannel2 *nc, ++ struct nc2_bypass *bypass) ++{ ++ if (ep->need_advertise) { ++ if (advertise_bypass(nc, bypass)) ++ return 0; ++ ep->need_advertise = 0; ++ } ++ if (ep->need_disable) { ++ if (send_disable_bypass_msg(nc, bypass)) ++ return 0; ++ ep->need_disable = 0; ++ ep->disable_sent = 1; ++ } ++ if (ep->need_detach) { ++ if (send_detach_bypass_msg(nc, bypass)) ++ return 0; ++ ep->need_detach = 0; ++ ep->detach_sent = 1; ++ } ++ return 1; ++} ++ ++/* Called from the tasklet holding the ring lock. */ ++void _nc2_advertise_bypasses(struct netchannel2 *nc) ++{ ++ struct nc2_bypass *bypass; ++ int success; ++ ++ spin_lock(&bypasses_lock); ++ success = 1; ++ list_for_each_entry(bypass, &nc->bypasses_a, ep_a.list) { ++ success &= nc2_do_bypass_advertise_work(&bypass->ep_a, ++ nc, ++ bypass); ++ } ++ list_for_each_entry(bypass, &nc->bypasses_b, ep_b.list) { ++ success &= nc2_do_bypass_advertise_work(&bypass->ep_b, ++ nc, ++ bypass); ++ } ++ if (success) ++ nc->need_advertise_bypasses = 0; ++ spin_unlock(&bypasses_lock); ++} ++ ++void nc2_handle_bypass_frontend_ready(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct netchannel2_msg_bypass_frontend_ready msg; ++ struct nc2_bypass *bypass; ++ ++ if (hdr->size != sizeof(msg) || ncrp != &nc->rings || ++ !nc->current_bypass_frontend) ++ return; ++ bypass = nc->current_bypass_frontend; ++ nc->current_bypass_frontend = NULL; ++ nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg)); ++ spin_lock(&bypasses_lock); ++ if (msg.port <= 0) { ++ printk(KERN_WARNING "%d from frontend trying to establish bypass\n", ++ msg.port); ++ detach(&bypass->ep_a); ++ detach(&bypass->ep_b); ++ crank_bypass_state_machine(bypass); ++ spin_unlock(&bypasses_lock); ++ return; ++ } ++ ++ bypass->evtchn_port = msg.port; ++ bypass->ep_b.need_advertise = 1; ++ bypass->ep_b.nc2->need_advertise_bypasses = 1; ++ nc2_kick(&bypass->ep_b.nc2->rings); ++ spin_unlock(&bypasses_lock); ++} ++ ++/* Called from an ioctl not holding any locks. */ ++static int build_bypass_page(int *gref_pool, ++ int *grefp_a, ++ int *grefp_b, ++ domid_t domid_a, ++ domid_t domid_b, ++ unsigned long *pagep) ++{ ++ int gref_a, gref_b; ++ unsigned long page; ++ ++ page = get_zeroed_page(GFP_ATOMIC); ++ if (page == 0) ++ return -ENOMEM; ++ gref_a = gnttab_claim_grant_reference(gref_pool); ++ gref_b = gnttab_claim_grant_reference(gref_pool); ++ BUG_ON(gref_a < 0); ++ BUG_ON(gref_b < 0); ++ gnttab_grant_foreign_access_ref(gref_a, domid_a, virt_to_mfn(page), 0); ++ gnttab_grant_foreign_access_ref(gref_b, domid_b, virt_to_mfn(page), 0); ++ ++ *pagep = page; ++ *grefp_a = gref_a; ++ *grefp_b = gref_b; ++ return 0; ++} ++ ++/* Called from an ioctl or work queue item not holding any locks. */ ++int nc2_establish_bypass(struct netchannel2 *a, struct netchannel2 *b) ++{ ++ struct nc2_bypass *work; ++ struct nc2_bypass *other_bypass; ++ int err; ++ grant_ref_t gref_pool; ++ int i; ++ static atomic_t next_handle; ++ int handle; ++ unsigned nr_pages; ++ ++ /* Can't establish a bypass unless we're trusted by both of ++ the remote endpoints. */ ++ if (!a->local_trusted || !b->local_trusted) ++ return -EPERM; ++ ++ /* Can't establish a bypass unless it's allowed by both ++ * endpoints. 
*/ ++ if (!a->bypass_max_pages || !b->bypass_max_pages) ++ return -EOPNOTSUPP; ++ ++ if (a->extant_bypasses >= a->max_bypasses || ++ b->extant_bypasses >= b->max_bypasses) ++ return -EMFILE; ++ ++ nr_pages = a->bypass_max_pages; ++ if (nr_pages > b->bypass_max_pages) ++ nr_pages = b->bypass_max_pages; ++ if (nr_pages > MAX_BYPASS_RING_PAGES_GRANTABLE) ++ nr_pages = MAX_BYPASS_RING_PAGES_GRANTABLE; ++ if (nr_pages == 0) { ++ printk(KERN_WARNING "tried to establish a null bypass ring?\n"); ++ return -EINVAL; ++ } ++ ++ work = kzalloc(sizeof(*work), GFP_ATOMIC); ++ if (!work) ++ return -ENOMEM; ++ atomic_set(&work->refcnt, 1); ++ init_waitqueue_head(&work->detach_waitq); ++ ++ work->nr_ring_pages = nr_pages; ++ ++ work->ep_a.nc2 = a; ++ work->ep_b.nc2 = b; ++ ++ work->ep_a.need_advertise = 1; ++ ++ handle = atomic_inc_return(&next_handle); ++ work->handle = handle; ++ ++ err = gnttab_alloc_grant_references(work->nr_ring_pages * 4 + 2, ++ &gref_pool); ++ if (err < 0) ++ goto err; ++ ++ err = -ENOMEM; ++ for (i = 0; i < work->nr_ring_pages; i++) { ++ err = build_bypass_page(&gref_pool, ++ &work->ep_a.incoming_grefs[i], ++ &work->ep_b.outgoing_grefs[i], ++ a->rings.otherend_id, ++ b->rings.otherend_id, ++ &work->ep_a.incoming_pages[i]); ++ if (err < 0) ++ goto err; ++ err = build_bypass_page(&gref_pool, ++ &work->ep_b.incoming_grefs[i], ++ &work->ep_a.outgoing_grefs[i], ++ b->rings.otherend_id, ++ a->rings.otherend_id, ++ &work->ep_b.incoming_pages[i]); ++ if (err < 0) ++ goto err; ++ } ++ err = build_bypass_page(&gref_pool, ++ &work->ep_a.control_gref, ++ &work->ep_b.control_gref, ++ a->rings.otherend_id, ++ b->rings.otherend_id, ++ &work->control_page); ++ if (err < 0) ++ goto err; ++ ++ spin_lock_bh(&bypasses_lock); ++ ++ if (work->ep_a.nc2->current_bypass_frontend) { ++ /* We can't establish another bypass until this one ++ has finished (which might be forever, if the remote ++ domain is misbehaving, but that's not a ++ problem). */ ++ err = -EBUSY; ++ spin_unlock_bh(&bypasses_lock); ++ goto err; ++ } ++ ++ /* Don't allow redundant bypasses, because they'll never be used. ++ This doesn't actually matter all that much, because in order ++ to establish a redundant bypass, either: ++ ++ -- The user explicitly requested one, in which case they ++ get what they deserve, or ++ -- They're using the autobypasser, in which case it'll detect ++ that the bypass isn't being used within a few seconds ++ and tear it down. ++ ++ Still, it's better to avoid it (if only so the user gets a ++ sensible error message), and so we do a quick check here. ++ */ ++ list_for_each_entry(other_bypass, &a->bypasses_a, ep_a.list) { ++ BUG_ON(other_bypass->ep_a.nc2 != a); ++ if (other_bypass->ep_b.nc2 == b) { ++ err = -EEXIST; ++ spin_unlock_bh(&bypasses_lock); ++ goto err; ++ } ++ } ++ list_for_each_entry(other_bypass, &a->bypasses_b, ep_b.list) { ++ BUG_ON(other_bypass->ep_b.nc2 != a); ++ if (other_bypass->ep_a.nc2 == b) { ++ err = -EEXIST; ++ spin_unlock_bh(&bypasses_lock); ++ goto err; ++ } ++ } ++ ++ list_add(&work->ep_a.list, &a->bypasses_a); ++ INIT_LIST_HEAD(&work->ep_b.list); ++ a->need_advertise_bypasses = 1; ++ list_add(&work->ep_b.list, &b->bypasses_b); ++ list_add_tail(&work->list, &all_bypasses); ++ ++ a->extant_bypasses++; ++ b->extant_bypasses++; ++ ++ spin_unlock_bh(&bypasses_lock); ++ ++ nc2_kick(&a->rings); ++ ++ return handle; ++ ++err: ++ gnttab_free_grant_references(gref_pool); ++ put_bypass(work); ++ return err; ++} ++ ++/* Called from an ioctl holding the bypass lock. 
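For reference, here is a sketch of how a management tool might drive nc2_establish_bypass()/nc2_destroy_bypass() through the netback2 control device added later in this patch (only built when the bypassable config option is enabled). The ioctl names and the handle_a/handle_b/handle fields match what the driver reads; the /dev/netback2 path, the header's install location and the example handle values are assumptions.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

/* Assumed install location of the uapi header shipped with this
   patch (netchannel2_uspace.h); adjust to wherever it is installed. */
#include "netchannel2_uspace.h"

int main(void)
{
        /* The misc device registers itself as "netback2"; udev would
           normally surface it as /dev/netback2 (assumption). */
        int fd = open("/dev/netback2", O_RDWR);
        struct netchannel2_ioctl_establish_bypass est = {
                .handle_a = 1,  /* netback2 interface handles; example values */
                .handle_b = 2,
        };
        struct netchannel2_ioctl_destroy_bypass dest;
        int bypass;

        if (fd < 0) {
                perror("open /dev/netback2");
                return 1;
        }
        /* On success the ioctl returns the new bypass handle. */
        bypass = ioctl(fd, NETCHANNEL2_IOCTL_ESTABLISH_BYPASS, &est);
        if (bypass < 0) {
                perror("establish bypass");
                close(fd);
                return 1;
        }
        dest.handle = bypass;
        if (ioctl(fd, NETCHANNEL2_IOCTL_DESTROY_BYPASS, &dest) < 0)
                perror("destroy bypass");
        close(fd);
        return 0;
}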
*/ ++static struct nc2_bypass *get_bypass(uint32_t handle) ++{ ++ struct nc2_bypass *bypass; ++ ++ list_for_each_entry(bypass, &all_bypasses, list) { ++ if (bypass->handle == handle) { ++ atomic_inc(&bypass->refcnt); ++ return bypass; ++ } ++ } ++ return NULL; ++} ++ ++static int bypass_fully_detached(struct nc2_bypass *bypass) ++{ ++ int res; ++ spin_lock_bh(&bypasses_lock); ++ res = bypass->ep_a.detached && bypass->ep_b.detached; ++ spin_unlock_bh(&bypasses_lock); ++ return res; ++} ++ ++int nc2_destroy_bypass(int handle) ++{ ++ struct nc2_bypass *bypass; ++ int r; ++ ++ spin_lock_bh(&bypasses_lock); ++ bypass = get_bypass(handle); ++ if (bypass == NULL) { ++ spin_unlock_bh(&bypasses_lock); ++ return -ESRCH; ++ } ++ schedule_disable(&bypass->ep_a); ++ schedule_disable(&bypass->ep_b); ++ spin_unlock_bh(&bypasses_lock); ++ ++ r = wait_event_interruptible_timeout(bypass->detach_waitq, ++ bypass_fully_detached(bypass), ++ 5 * HZ); ++ put_bypass(bypass); ++ if (r < 0) { ++ printk(KERN_WARNING "Failed to destroy a bypass (%d).\n", ++ r); ++ } ++ return r; ++} ++ ++/* We're guaranteed to be the only thing accessing @nc at this point, ++ but we don't know what's happening to the other endpoints of any ++ bypasses which it might have attached. */ ++void release_bypasses(struct netchannel2 *nc) ++{ ++ struct nc2_bypass *bypass, *next_bypass; ++ ++ spin_lock(&bypasses_lock); ++ list_for_each_entry_safe(bypass, next_bypass, &nc->bypasses_a, ++ ep_a.list) { ++ detach(&bypass->ep_a); ++ crank_bypass_state_machine(bypass); ++ } ++ list_for_each_entry_safe(bypass, next_bypass, &nc->bypasses_b, ++ ep_b.list) { ++ detach(&bypass->ep_b); ++ crank_bypass_state_machine(bypass); ++ } ++ spin_unlock(&bypasses_lock); ++ ++ BUG_ON(!list_empty(&nc->bypasses_a)); ++ BUG_ON(!list_empty(&nc->bypasses_b)); ++ ++ flush_scheduled_work(); ++} ++ ++void nc2_init_incoming_bypass_suggestions( ++ struct netchannel2 *nc2, ++ struct nc2_incoming_bypass_suggestions *nibs) ++{ ++ spin_lock_init(&nibs->lock); ++ INIT_WORK(&nibs->workitem, process_suggestion_queue_workitem); ++} +diff --git a/drivers/xen/netchannel2/chan.c b/drivers/xen/netchannel2/chan.c +new file mode 100644 +index 0000000..03913a3 +--- /dev/null ++++ b/drivers/xen/netchannel2/chan.c +@@ -0,0 +1,838 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "netchannel2_endpoint.h" ++#include "netchannel2_core.h" ++ ++static int process_ring(struct napi_struct *napi, ++ int work_avail); ++ ++irqreturn_t nc2_int(int irq, void *dev_id) ++{ ++ struct netchannel2_ring_pair *ncr = dev_id; ++ ++ if (ncr->irq == -1) ++ return IRQ_HANDLED; ++ ncr->last_event = jiffies; ++ if (ncr->cons_ring.sring->prod != ncr->cons_ring.cons_pvt || ++ ncr->interface->is_stopped) ++ nc2_kick(ncr); ++ return IRQ_HANDLED; ++} ++ ++/* Process all incoming messages. The function is given an ++ IRQ-disabled reference for the interface, and must dispose of it ++ (either by enabling the IRQ or re-introducing it to the pending ++ list). Alternatively, the function can stop the ring being ++ processed again by leaking the reference (e.g. when the remote ++ endpoint is misbehaving). */ ++/* Returns -1 if we used all the available work without finishing, or ++ the amount of work used otherwise. 
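process_messages() below walks a byte ring of variable-sized messages, each led by a small header and advanced by the header's size rounded up to an 8-byte boundary. The framing is easy to model on its own; demo_hdr and consume() are stand-ins for the demo, not the real netchannel2_msg_hdr or ring types.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified header for the demo: type plus total message size
   (header included).  The real header has its own layout. */
struct demo_hdr {
        uint8_t type;
        uint8_t pad;
        uint16_t size;
};

static void consume(const uint8_t *ring, size_t prod)
{
        size_t cons = 0;
        struct demo_hdr hdr;

        while (cons != prod) {
                memcpy(&hdr, ring + cons, sizeof(hdr));
                if (hdr.size < sizeof(hdr) || hdr.size > prod - cons) {
                        printf("malformed message, stop processing\n");
                        return;
                }
                printf("message type %u, %u bytes\n", hdr.type, hdr.size);
                /* Messages are always padded to an 8 byte boundary. */
                cons += (hdr.size + 7) & ~7u;
        }
}

int main(void)
{
        uint8_t ring[64] = { 0 };
        struct demo_hdr a = { .type = 1, .pad = 0, .size = 12 }; /* padded to 16 */
        struct demo_hdr b = { .type = 2, .pad = 0, .size = 4 };  /* padded to 8 */

        memcpy(ring, &a, sizeof(a));
        memcpy(ring + 16, &b, sizeof(b));
        consume(ring, 24);
        return 0;
}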
*/ ++static int process_messages(struct netchannel2_ring_pair *ncrp, ++ int work_avail, ++ struct sk_buff_head *pending_rx_queue) ++{ ++ struct netchannel2_msg_hdr hdr; ++ RING_IDX prod; ++ struct netchannel2 *nc = ncrp->interface; ++ int work_done; ++ ++ work_done = 1; ++ ++retry: ++ prod = ncrp->cons_ring.sring->prod; ++ rmb(); ++ while (work_done < work_avail && ++ prod != ncrp->cons_ring.cons_pvt) { ++ nc2_copy_from_ring(&ncrp->cons_ring, &hdr, sizeof(hdr)); ++ if (hdr.size < sizeof(hdr)) { ++ printk(KERN_WARNING "Other end sent too-small message (%d)\n", ++ hdr.size); ++ goto done; ++ } ++ if (hdr.size > ncrp->cons_ring.payload_bytes) { ++ /* This one message is bigger than the whole ++ ring -> other end is clearly misbehaving. ++ We won't take any more messages from this ++ ring. */ ++ printk(KERN_WARNING "Other end sent enormous message (%d > %zd)\n", ++ hdr.size, ++ ncrp->cons_ring.payload_bytes); ++ goto done; ++ } ++ ++ switch (hdr.type) { ++ case NETCHANNEL2_MSG_SET_MAX_PACKETS: ++ nc2_handle_set_max_packets_msg(ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_PACKET: ++ nc2_handle_packet_msg(nc, ncrp, &hdr, ++ pending_rx_queue); ++ break; ++ case NETCHANNEL2_MSG_FINISH_PACKET: ++ nc2_handle_finish_packet_msg(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_SET_OFFLOAD: ++ nc2_handle_set_offload(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_SET_MAX_FRAGMENTS_PER_PACKET: ++ nc2_handle_set_max_fragments_per_packet(nc, ncrp, ++ &hdr); ++ break; ++ case NETCHANNEL2_MSG_BYPASS_FRONTEND: ++ nc2_handle_bypass_frontend(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_BYPASS_BACKEND: ++ nc2_handle_bypass_backend(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_BYPASS_FRONTEND_READY: ++ nc2_handle_bypass_frontend_ready(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_BYPASS_DISABLE: ++ nc2_handle_bypass_disable(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_BYPASS_DISABLED: ++ nc2_handle_bypass_disabled(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_BYPASS_DETACH: ++ nc2_handle_bypass_detach(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_BYPASS_DETACHED: ++ nc2_handle_bypass_detached(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_BYPASS_READY: ++ nc2_handle_bypass_ready(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_SUGGEST_BYPASS: ++ nc2_handle_suggest_bypass(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_PAD: ++ break; ++ default: ++ /* Drop bad messages. We should arguably stop ++ processing the ring at this point, because ++ the ring is probably corrupt. However, if ++ it is corrupt then one of the other checks ++ will hit soon enough, and doing it this way ++ should make it a bit easier to add new ++ message types in future. */ ++ pr_debug("Bad message type %d from peer!\n", ++ hdr.type); ++ break; ++ } ++ hdr.size = (hdr.size + 7) & ~7; ++ ncrp->cons_ring.cons_pvt += hdr.size; ++ ++ work_done++; ++ if (work_done == work_avail) ++ return -1; ++ } ++ ++ if (unlikely(prod != ncrp->cons_ring.sring->prod)) ++ goto retry; ++ ++ /* Dispose of our IRQ-disable reference. */ ++done: ++ napi_complete(&ncrp->napi); ++ enable_irq(ncrp->irq); ++ ++ if (nc2_final_check_for_messages(&ncrp->cons_ring, ++ prod)) { ++ /* More work to do still. */ ++ nc2_kick(ncrp); ++ } ++ ++ return work_done; ++} ++ ++/* Flush out all pending metadata messages on ring @ncrp, and then ++ update the ring pointers to indicate that we've done so. Fire the ++ event channel if necessary. 
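flush_rings() below also decides whether to raise the event channel. The rule is worth spelling out: notify immediately only if something time-sensitive went out; otherwise remember that a kick is owed and deliver it with the next time-sensitive flush. A compact userspace model of just that decision (kick_state and flush_decision() are made-up names):

#include <stdio.h>

struct kick_state {
        int delayed_kick;               /* a kick is owed from earlier */
        int pending_time_sensitive;     /* urgent messages queued this flush */
};

/* Returns 1 if the remote end should be notified on this flush. */
static int flush_decision(struct kick_state *ks, int need_kick)
{
        int notify = 0;

        if (need_kick || (ks->delayed_kick && ks->pending_time_sensitive)) {
                if (ks->pending_time_sensitive) {
                        notify = 1;
                        ks->delayed_kick = 0;
                } else {
                        /* Nothing urgent: defer the notification. */
                        ks->delayed_kick = 1;
                }
                ks->pending_time_sensitive = 0;
        }
        return notify;
}

int main(void)
{
        struct kick_state ks = { 0, 0 };

        /* Flush 1: ring moved, but nothing urgent -> no interrupt. */
        printf("%d\n", flush_decision(&ks, 1));         /* prints 0 */
        /* Flush 2: an urgent message went out -> deliver the owed kick. */
        ks.pending_time_sensitive = 1;
        printf("%d\n", flush_decision(&ks, 0));         /* prints 1 */
        return 0;
}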
*/ ++static void flush_rings(struct netchannel2_ring_pair *ncrp) ++{ ++ struct netchannel2 *nc = ncrp->interface; ++ int need_kick; ++ ++ flush_hypercall_batcher(&ncrp->pending_rx_hypercalls, ++ nc2_rscb_on_gntcopy_fail); ++ send_finish_packet_messages(ncrp); ++ if (ncrp->need_advertise_max_packets) ++ advertise_max_packets(ncrp); ++ if (ncrp->need_advertise_max_fragments_per_packet) ++ advertise_max_fragments_per_packet(ncrp); ++ ++ if (ncrp == &nc->rings) { ++ if (nc->need_advertise_offloads) ++ advertise_offloads(nc); ++ nc2_advertise_bypasses(nc); ++ nc2_crank_aux_ring_state_machine(nc); ++ nc2_autobypass_make_suggestions(nc); ++ } else { ++ nc2_alternate_ring_disable_finish(ncrp); ++ } ++ ++ need_kick = 0; ++ if (nc2_finish_messages(&ncrp->cons_ring)) { ++ need_kick = 1; ++ /* If we need an event on the consumer ring, we always ++ need to notify the other end, even if we don't have ++ any messages which would normally be considered ++ urgent. */ ++ ncrp->pending_time_sensitive_messages = 1; ++ } ++ if (nc2_flush_ring(&ncrp->prod_ring)) ++ need_kick = 1; ++ if (need_kick || ++ (ncrp->delayed_kick && ncrp->pending_time_sensitive_messages)) { ++ if (ncrp->pending_time_sensitive_messages) { ++ notify_remote_via_irq(ncrp->irq); ++ ncrp->delayed_kick = 0; ++ } else { ++ ncrp->delayed_kick = 1; ++ } ++ ncrp->pending_time_sensitive_messages = 0; ++ } ++} ++ ++/* Process incoming messages, and then flush outgoing metadata ++ * messages. We also try to unjam the xmit queue if any of the ++ * incoming messages would give us permission to send more stuff. */ ++/* This is given an IRQ-disable reference, and must dispose of it. */ ++static int nc2_poll(struct netchannel2_ring_pair *ncrp, int work_avail, ++ struct sk_buff_head *rx_queue) ++{ ++ int work_done; ++ ++ if (!ncrp->is_attached) { ++ napi_complete(&ncrp->napi); ++ enable_irq(ncrp->irq); ++ return 0; ++ } ++ ++ work_done = process_messages(ncrp, work_avail, rx_queue); ++ ++ flush_rings(ncrp); ++ ++ if (work_done < 0) ++ return work_avail; ++ else ++ return work_done; ++} ++ ++/* Like skb_queue_purge(), but use release_tx_packet() rather than ++ kfree_skb() */ ++void nc2_queue_purge(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff_head *queue) ++{ ++ struct sk_buff *skb; ++ ++ while (!skb_queue_empty(queue)) { ++ skb = skb_dequeue(queue); ++ release_tx_packet(ncrp, skb); ++ } ++} ++ ++/* struct net_device stop() method. */ ++static int nc2_stop(struct net_device *nd) ++{ ++ struct netchannel2 *nc = netdev_priv(nd); ++ ++ spin_lock_bh(&nc->rings.lock); ++ nc->stats.tx_dropped += skb_queue_len(&nc->pending_skbs); ++ nc2_queue_purge(&nc->rings, &nc->pending_skbs); ++ spin_unlock_bh(&nc->rings.lock); ++ ++ return 0; ++} ++ ++/* Kick a netchannel2 interface so that the poll() method runs ++ * soon. */ ++/* This has semi release-like semantics, so you can set flags ++ lock-free and be guaranteed that the poll() method will eventually ++ run and see the flag set, without doing any explicit locking. */ ++void nc2_kick(struct netchannel2_ring_pair *ncrp) ++{ ++ if (napi_schedule_prep(&ncrp->napi)) { ++ disable_irq_nosync(ncrp->irq); ++ __napi_schedule(&ncrp->napi); ++ } ++} ++ ++static int nc2_open(struct net_device *nd) ++{ ++ struct netchannel2 *nc = netdev_priv(nd); ++ ++ nc2_kick(&nc->rings); ++ return 0; ++} ++ ++/* Rad a mac address from an address in xenstore at @prefix/@node. ++ * Call not holding locks. Returns 0 on success or <0 on error. 
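read_mac_address() below pulls a colon-separated MAC out of xenstore with xenbus_scanf(). The same parse-and-validate step, written as ordinary userspace C with sscanf standing in for xenbus_scanf, looks like this (parse_mac() is a made-up name):

#include <stdio.h>

/* Parse "aa:bb:cc:dd:ee:ff" into six bytes; reject anything that is
   not exactly six hex fields of at most 0xff each. */
static int parse_mac(const char *str, unsigned char *addr)
{
        unsigned mac[6];
        int i;

        if (sscanf(str, "%x:%x:%x:%x:%x:%x",
                   &mac[0], &mac[1], &mac[2],
                   &mac[3], &mac[4], &mac[5]) != 6)
                return -1;
        for (i = 0; i < 6; i++) {
                if (mac[i] >= 0x100)
                        return -1;
                addr[i] = mac[i];
        }
        return 0;
}

int main(void)
{
        unsigned char addr[6];

        if (parse_mac("00:16:3e:01:02:03", addr) == 0)
                printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
                       addr[0], addr[1], addr[2],
                       addr[3], addr[4], addr[5]);
        return 0;
}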
*/ ++static int read_mac_address(const char *prefix, const char *node, ++ unsigned char *addr) ++{ ++ int err; ++ unsigned mac[6]; ++ int i; ++ ++ err = xenbus_scanf(XBT_NIL, prefix, node, ++ "%x:%x:%x:%x:%x:%x", ++ &mac[0], ++ &mac[1], ++ &mac[2], ++ &mac[3], ++ &mac[4], ++ &mac[5]); ++ if (err < 0) ++ return err; ++ if (err != 6) ++ return -EINVAL; ++ for (i = 0; i < 6; i++) { ++ if (mac[i] >= 0x100) ++ return -EINVAL; ++ addr[i] = mac[i]; ++ } ++ return 0; ++} ++ ++/* Release resources associated with a ring pair. It is assumed that ++ the ring pair has already been detached (which stops the IRQ and ++ un-pends the ring). */ ++void cleanup_ring_pair(struct netchannel2_ring_pair *ncrp) ++{ ++ BUG_ON(ncrp->prod_ring.sring); ++ BUG_ON(ncrp->cons_ring.sring); ++ ++ drop_pending_tx_packets(ncrp); ++ nc2_queue_purge(ncrp, &ncrp->release_on_flush_batcher); ++ if (ncrp->gref_pool != 0) ++ gnttab_free_grant_references(ncrp->gref_pool); ++ netif_napi_del(&ncrp->napi); ++} ++ ++int init_ring_pair(struct netchannel2_ring_pair *ncrp, ++ struct netchannel2 *nc) ++{ ++ unsigned x; ++ ++ ncrp->interface = nc; ++ spin_lock_init(&ncrp->lock); ++ ncrp->irq = -1; ++ ++ for (x = 0; x < NR_TX_PACKETS - 1; x++) ++ txp_set_next_free(ncrp->tx_packets + x, x + 1); ++ txp_set_next_free(ncrp->tx_packets + x, INVALID_TXP_INDEX); ++ ncrp->head_free_tx_packet = 0; ++ ++ skb_queue_head_init(&ncrp->pending_tx_queue); ++ skb_queue_head_init(&ncrp->release_on_flush_batcher); ++ ++ if (gnttab_alloc_grant_references(NR_TX_PACKETS, ++ &ncrp->gref_pool) < 0) ++ return -1; ++ ++ nc2_init_poller(ncrp); ++ ++ netif_napi_add(ncrp->interface->net_device, &ncrp->napi, ++ process_ring, 64); ++ napi_enable(&ncrp->napi); ++ ++ return 0; ++} ++ ++static struct net_device_stats *nc2_get_stats(struct net_device *nd) ++{ ++ struct netchannel2 *nc = netdev_priv(nd); ++ ++ return &nc->stats; ++} ++ ++static int nc2_change_mtu(struct net_device *nd, int mtu) ++{ ++ if (mtu > NETCHANNEL2_MAX_PACKET_BYTES) ++ return -EINVAL; ++ nd->mtu = mtu; ++ return 0; ++} ++ ++/* Create a new netchannel2 structure. Call with no locks held. ++ Returns NULL on error. The xenbus device must remain valid for as ++ long as the netchannel2 structure does. The core does not take out ++ any kind of reference count on it, but will refer to it throughout ++ the returned netchannel2's life. 
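nc2_new() below configures itself from a handful of xenstore keys under the device node: local-trusted, remote-trusted, filter-mac, max-bypasses, plus the mac and remote-mac addresses. A sketch of how a toolstack might populate them with libxenstore follows; the backend path shown is a guess at the usual vif2 layout, while the key names are exactly the ones the driver reads.

#include <stdio.h>
#include <string.h>
#include <xs.h>         /* libxenstore */

static int put(struct xs_handle *xs, const char *dir,
               const char *node, const char *val)
{
        char path[256];

        snprintf(path, sizeof(path), "%s/%s", dir, node);
        return xs_write(xs, XBT_NULL, path, val, strlen(val)) ? 0 : -1;
}

int main(void)
{
        /* Hypothetical backend node for a vif2 device of domain 5. */
        const char *dir = "/local/domain/0/backend/vif2/5/0";
        struct xs_handle *xs = xs_daemon_open();

        if (!xs)
                return 1;
        put(xs, dir, "local-trusted", "1");     /* the dom0 end is trusted */
        put(xs, dir, "remote-trusted", "0");
        put(xs, dir, "filter-mac", "1");
        put(xs, dir, "max-bypasses", "4");
        put(xs, dir, "mac", "00:16:3e:00:00:01");
        put(xs, dir, "remote-mac", "00:16:3e:00:00:02");
        xs_daemon_close(xs);
        return 0;
}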
*/ ++struct netchannel2 *nc2_new(struct xenbus_device *xd) ++{ ++ struct net_device *netdev; ++ struct netchannel2 *nc; ++ int err; ++ int local_trusted; ++ int remote_trusted; ++ int filter_mac; ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++ int max_bypasses; ++#endif ++ ++ if (!gnttab_subpage_grants_available()) { ++ printk(KERN_ERR "netchannel2 needs version 2 grant tables\n"); ++ return NULL; ++ } ++ ++ if (xenbus_scanf(XBT_NIL, xd->nodename, "local-trusted", ++ "%d", &local_trusted) != 1) { ++ printk(KERN_WARNING "Can't tell whether local endpoint is trusted; assuming it is.\n"); ++ local_trusted = 1; ++ } ++ ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++ max_bypasses = 0; ++ if (local_trusted) { ++ if (xenbus_scanf(XBT_NIL, xd->nodename, "max-bypasses", ++ "%d", &max_bypasses) != 1) { ++ printk(KERN_WARNING "Can't get maximum bypass count; assuming 0.\n"); ++ max_bypasses = 0; ++ } ++ } ++#endif ++ ++ if (xenbus_scanf(XBT_NIL, xd->nodename, "remote-trusted", ++ "%d", &remote_trusted) != 1) { ++ printk(KERN_WARNING "Can't tell whether local endpoint is trusted; assuming it isn't.\n"); ++ remote_trusted = 0; ++ } ++ ++ if (xenbus_scanf(XBT_NIL, xd->nodename, "filter-mac", ++ "%d", &filter_mac) != 1) { ++ if (remote_trusted) { ++ printk(KERN_WARNING "Can't tell whether to filter MAC addresses from remote domain; filtering off.\n"); ++ filter_mac = 0; ++ } else { ++ printk(KERN_WARNING "Can't tell whether to filter MAC addresses from remote domain; filtering on.\n"); ++ filter_mac = 1; ++ } ++ } ++ ++ netdev = alloc_etherdev(sizeof(*nc)); ++ if (netdev == NULL) ++ return NULL; ++ ++ nc = netdev_priv(netdev); ++ memset(nc, 0, sizeof(*nc)); ++ nc->magic = NETCHANNEL2_MAGIC; ++ nc->net_device = netdev; ++ nc->xenbus_device = xd; ++ ++ nc->remote_trusted = remote_trusted; ++ nc->local_trusted = local_trusted; ++ nc->rings.filter_mac = filter_mac; ++ ++ /* Default to RX csum on. */ ++ nc->use_rx_csum = 1; ++ ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++ INIT_LIST_HEAD(&nc->bypasses_a); ++ INIT_LIST_HEAD(&nc->bypasses_b); ++ nc2_init_incoming_bypass_suggestions(nc, ++ &nc->incoming_bypass_suggestions); ++ nc->max_bypasses = max_bypasses; ++#endif ++#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT ++ INIT_LIST_HEAD(&nc->alternate_rings); ++#endif ++ ++ skb_queue_head_init(&nc->pending_skbs); ++ if (init_ring_pair(&nc->rings, nc) < 0) { ++ nc2_release(nc); ++ return NULL; ++ } ++ ++ if (local_trusted) { ++ if (init_receive_map_mode() < 0) { ++ nc2_release(nc); ++ return NULL; ++ } ++ } ++ ++ netdev->open = nc2_open; ++ netdev->stop = nc2_stop; ++ netdev->hard_start_xmit = nc2_start_xmit; ++ netdev->get_stats = nc2_get_stats; ++ netdev->change_mtu = nc2_change_mtu; ++ ++ /* We need to hold the ring lock in order to send messages ++ anyway, so there's no point in Linux doing additional ++ synchronisation. */ ++ netdev->features = NETIF_F_LLTX; ++ ++ SET_NETDEV_DEV(netdev, &xd->dev); ++ SET_ETHTOOL_OPS(netdev, &nc2_ethtool_ops); ++ ++ err = read_mac_address(xd->nodename, "remote-mac", ++ nc->rings.remote_mac); ++ if (err == 0) ++ err = read_mac_address(xd->nodename, "mac", netdev->dev_addr); ++ if (err == 0) ++ err = register_netdev(netdev); ++ ++ if (err != 0) { ++ nc2_release(nc); ++ return NULL; ++ } ++ ++ return nc; ++} ++ ++/* Release a netchannel2 structure previously allocated with ++ * nc2_new(). Call with no locks held. The rings will be ++ * automatically detach if necessary. 
*/ ++void nc2_release(struct netchannel2 *nc) ++{ ++ netif_carrier_off(nc->net_device); ++ ++ unregister_netdev(nc->net_device); ++ ++ nc2_detach_rings(nc); ++ ++ /* Unregistering the net device stops any netdev methods from ++ running, and detaching the rings stops the napi methods, so ++ we're now the only thing accessing this netchannel2 ++ structure and we can tear it down with impunity. */ ++ ++ nc2_release_alt_rings(nc); ++ ++ cleanup_ring_pair(&nc->rings); ++ ++ nc2_queue_purge(&nc->rings, &nc->pending_skbs); ++ ++ release_bypasses(nc); ++ ++ free_netdev(nc->net_device); ++} ++ ++void _nc2_attach_rings(struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_sring_cons *cons_sring, ++ const volatile void *cons_payload, ++ size_t cons_size, ++ struct netchannel2_sring_prod *prod_sring, ++ void *prod_payload, ++ size_t prod_size, ++ domid_t otherend_id) ++{ ++ BUG_ON(prod_sring == NULL); ++ BUG_ON(cons_sring == NULL); ++ ++ ncrp->prod_ring.sring = prod_sring; ++ ncrp->prod_ring.payload_bytes = prod_size; ++ ncrp->prod_ring.prod_pvt = 0; ++ ncrp->prod_ring.payload = prod_payload; ++ ++ ncrp->cons_ring.sring = cons_sring; ++ ncrp->cons_ring.payload_bytes = cons_size; ++ ncrp->cons_ring.sring->prod_event = ncrp->cons_ring.sring->prod + 1; ++ ncrp->cons_ring.cons_pvt = 0; ++ ncrp->cons_ring.payload = cons_payload; ++ ++ ncrp->otherend_id = otherend_id; ++ ++ ncrp->is_attached = 1; ++ ++ ncrp->need_advertise_max_packets = 1; ++ ncrp->need_advertise_max_fragments_per_packet = 1; ++ ncrp->max_fragments_per_tx_packet = 1; ++} ++ ++/* Attach a netchannel2 structure to a ring pair. The endpoint is ++ also expected to set up an event channel after calling this before ++ using the interface. Returns 0 on success or <0 on error. */ ++int nc2_attach_rings(struct netchannel2 *nc, ++ struct netchannel2_sring_cons *cons_sring, ++ const volatile void *cons_payload, ++ size_t cons_size, ++ struct netchannel2_sring_prod *prod_sring, ++ void *prod_payload, ++ size_t prod_size, ++ domid_t otherend_id) ++{ ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++ int feature_bypass; ++ int max_bypass_pages; ++ ++ if (xenbus_scanf(XBT_NIL, nc->xenbus_device->otherend, ++ "feature-bypass", "%d", &feature_bypass) < 0) ++ feature_bypass = 0; ++ if (feature_bypass) { ++ if (xenbus_scanf(XBT_NIL, nc->xenbus_device->otherend, ++ "feature-bypass-max-pages", "%d", ++ &max_bypass_pages) < 0) { ++ printk(KERN_WARNING "other end claimed to support bypasses, but didn't expose max-pages?\n"); ++ /* Bypasses disabled for this ring. */ ++ nc->max_bypasses = 0; ++ } else { ++ nc->bypass_max_pages = max_bypass_pages; ++ } ++ } else { ++ nc->max_bypasses = 0; ++ } ++#endif ++ ++ spin_lock_bh(&nc->rings.lock); ++ _nc2_attach_rings(&nc->rings, cons_sring, cons_payload, cons_size, ++ prod_sring, prod_payload, prod_size, otherend_id); ++ ++ nc->need_advertise_offloads = 1; ++ ++ spin_unlock_bh(&nc->rings.lock); ++ ++ resume_receive_map_mode(); ++ ++ netif_carrier_on(nc->net_device); ++ ++ /* Kick it to get it going. */ ++ nc2_kick(&nc->rings); ++ ++ return 0; ++} ++ ++static void _detach_rings(struct netchannel2_ring_pair *ncrp) ++{ ++ spin_lock_bh(&ncrp->lock); ++ /* We need to release all of the pending transmission packets, ++ because they're never going to complete now that we've lost ++ the ring. */ ++ drop_pending_tx_packets(ncrp); ++ ++ disable_irq(ncrp->irq); ++ ++ BUG_ON(ncrp->nr_tx_packets_outstanding); ++ ncrp->max_tx_packets_outstanding = 0; ++ ++ /* No way of sending pending finish messages now; drop ++ * them. 
*/ ++ ncrp->pending_finish.prod = 0; ++ ncrp->pending_finish.cons = 0; ++ ++ ncrp->cons_ring.sring = NULL; ++ ncrp->prod_ring.sring = NULL; ++ ncrp->is_attached = 0; ++ ++ spin_unlock_bh(&ncrp->lock); ++ ++#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT ++ { ++ struct nc2_alternate_ring *nar; ++ ++ /* Walk the alternate rings list and detach all of ++ them as well. This is recursive, but it's only ++ ever going to recur one deep, so it's okay. */ ++ /* Don't need to worry about synchronisation because ++ the interface has been stopped. */ ++ if (ncrp == &ncrp->interface->rings) { ++ list_for_each_entry(nar, ++ &ncrp->interface->alternate_rings, ++ rings_by_interface) ++ _detach_rings(&nar->rings); ++ } ++ } ++#endif ++} ++ ++/* Detach from the rings. This includes unmapping them and stopping ++ the interrupt. */ ++/* Careful: the netdev methods may still be running at this point. */ ++/* This is not allowed to wait for the other end, because it might ++ have gone away (e.g. over suspend/resume). */ ++static void nc2_detach_ring(struct netchannel2_ring_pair *ncrp) ++{ ++ if (!ncrp->is_attached) ++ return; ++ nc2_stop_polling(ncrp); ++ napi_disable(&ncrp->napi); ++ _detach_rings(ncrp); ++} ++ ++/* Trivial wrapper around nc2_detach_ring(). Make the ring no longer ++ used. */ ++void nc2_detach_rings(struct netchannel2 *nc) ++{ ++ nc2_detach_ring(&nc->rings); ++ ++ /* Okay, all async access to the ring is stopped. Kill the ++ irqhandlers. (It might be better to do this from the ++ _detach_ring() functions, but you're not allowed to ++ free_irq() from interrupt context, and tasklets are close ++ enough to cause problems). */ ++ ++ if (nc->rings.irq >= 0) ++ unbind_from_irqhandler(nc->rings.irq, &nc->rings); ++ nc->rings.irq = -1; ++#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT ++ { ++ struct nc2_alternate_ring *ncr; ++ ++ list_for_each_entry(ncr, &nc->alternate_rings, ++ rings_by_interface) { ++ if (ncr->rings.irq >= 0) { ++ unbind_from_irqhandler(ncr->rings.irq, ++ &ncr->rings); ++ ncr->rings.irq = -1; ++ } ++ } ++ } ++#endif ++ ++ /* Disable all offloads */ ++ nc->net_device->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO); ++ nc->allow_tx_csum_offload = 0; ++ nc->rings.max_fragments_per_tx_packet = 1; ++ nc->allow_tso = 0; ++} ++ ++#if defined(CONFIG_XEN_NETDEV2_BACKEND) ++/* Connect to an event channel port in a remote domain. Returns 0 on ++ success or <0 on error. The port is automatically disconnected ++ when the channel is released or if the rings are detached. This ++ should not be called if the port is already open. */ ++int nc2_connect_evtchn(struct netchannel2 *nc, domid_t domid, ++ int evtchn) ++{ ++ int err; ++ ++ BUG_ON(nc->rings.irq >= 0); ++ ++ err = bind_interdomain_evtchn_to_irqhandler(domid, ++ evtchn, ++ nc2_int, ++ IRQF_SAMPLE_RANDOM, ++ "netchannel2", ++ &nc->rings); ++ if (err >= 0) { ++ nc->rings.irq = err; ++ nc->rings.evtchn = irq_to_evtchn_port(err); ++ return 0; ++ } else { ++ return err; ++ } ++} ++#endif ++ ++#if defined(CONFIG_XEN_NETDEV2_FRONTEND) ++/* Listen for incoming event channel connections from domain domid. ++ Similar semantics to nc2_connect_evtchn(). 
*/ ++int nc2_listen_evtchn(struct netchannel2 *nc, domid_t domid) ++{ ++ int err; ++ ++ BUG_ON(nc->rings.irq >= 0); ++ ++ err = bind_listening_port_to_irqhandler(domid, ++ nc2_int, ++ IRQF_SAMPLE_RANDOM, ++ "netchannel2", ++ &nc->rings); ++ if (err >= 0) { ++ nc->rings.irq = err; ++ nc->rings.evtchn = irq_to_evtchn_port(err); ++ return 0; ++ } else { ++ return err; ++ } ++} ++#endif ++ ++/* Find the local event channel port which was allocated by ++ * nc2_listen_evtchn() or nc2_connect_evtchn(). It is an error to ++ * call this when there is no event channel connected. */ ++int nc2_get_evtchn_port(struct netchannel2 *nc) ++{ ++ BUG_ON(nc->rings.irq < 0); ++ return nc->rings.evtchn; ++} ++ ++void nc2_suspend(struct netchannel2 *nc) ++{ ++ detach_all_bypasses(nc); ++ suspend_receive_map_mode(); ++} ++ ++/* @ncrp has been recently nc2_kick()ed. Do all of the necessary ++ stuff. */ ++static int process_ring(struct napi_struct *napi, ++ int work_avail) ++{ ++ struct netchannel2_ring_pair *ncrp = ++ container_of(napi, struct netchannel2_ring_pair, napi); ++ struct netchannel2 *nc = ncrp->interface; ++ struct sk_buff *skb; ++ int work_done; ++ struct sk_buff_head rx_queue; ++ ++ skb_queue_head_init(&rx_queue); ++ ++ spin_lock(&ncrp->lock); ++ ++ /* Pick up incoming messages. */ ++ work_done = nc2_poll(ncrp, work_avail, &rx_queue); ++ ++ /* Transmit pending packets. */ ++ if (!skb_queue_empty(&ncrp->pending_tx_queue)) { ++ skb = __skb_dequeue(&ncrp->pending_tx_queue); ++ do { ++ if (!nc2_really_start_xmit(ncrp, skb)) { ++ /* Requeue the packet so that we will try ++ when the ring is less busy */ ++ __skb_queue_head(&ncrp->pending_tx_queue, skb); ++ break; ++ } ++ skb = __skb_dequeue(&ncrp->pending_tx_queue); ++ } while (skb != NULL); ++ ++ flush_rings(ncrp); ++ ++ while ((skb = __skb_dequeue(&ncrp->release_on_flush_batcher))) ++ release_tx_packet(ncrp, skb); ++ } ++ ++ if (ncrp == &nc->rings && nc->is_stopped) { ++ /* If the other end has processed some messages, there ++ may be space on the ring for a delayed send from ++ earlier. Process it now. */ ++ while (1) { ++ skb = skb_peek_tail(&nc->pending_skbs); ++ if (!skb) ++ break; ++ switch (prepare_xmit_allocate_resources(nc, skb)) { ++ case PREP_XMIT_OKAY: ++ __skb_unlink(skb, &nc->pending_skbs); ++ queue_packet_to_interface(skb, ncrp); ++ break; ++ case PREP_XMIT_BUSY: ++ goto still_stuck; ++ case PREP_XMIT_DROP: ++ __skb_unlink(skb, &nc->pending_skbs); ++ release_tx_packet(ncrp, skb); ++ break; ++ } ++ } ++ if (skb_queue_empty(&nc->pending_skbs)) { ++ nc->is_stopped = 0; ++ netif_wake_queue(nc->net_device); ++ } ++still_stuck: ++ ; ++ } ++ ++ spin_unlock(&ncrp->lock); ++ ++ receive_pending_skbs(&rx_queue); ++ ++ return work_done; ++} +diff --git a/drivers/xen/netchannel2/netback2.c b/drivers/xen/netchannel2/netback2.c +new file mode 100644 +index 0000000..844f452 +--- /dev/null ++++ b/drivers/xen/netchannel2/netback2.c +@@ -0,0 +1,482 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "netchannel2_core.h" ++#include "netchannel2_endpoint.h" ++#include "netchannel2_uspace.h" ++ ++static atomic_t next_handle; ++/* A list of all currently-live netback2 interfaces. */ ++static LIST_HEAD(all_netbacks); ++/* A lock to protect the above list. 
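map_grants(), defined a little further down, expects the frontend to have published its ring geometry in xenstore: for each of b2f-ring, f2b-ring and control there is a "<prefix>-nr-pages" count and one "<prefix>-ref-N" grant reference per page, plus an event-channel port. A sketch of the producer side of that contract is below; publish_ring_sketch() is a made-up name, error handling is omitted, and the real frontend implementation is not shown in this hunk.

/* Illustrative sketch of the frontend half of the contract that
   map_grants() relies on.  grefs[] would come from granting the ring
   pages to the backend domain. */
static void publish_ring_sketch(struct xenbus_device *xd,
                                const char *prefix,
                                const grant_ref_t *grefs,
                                unsigned nr_pages)
{
        char buf[32];
        unsigned i;

        sprintf(buf, "%s-nr-pages", prefix);
        xenbus_printf(XBT_NIL, xd->nodename, buf, "%u", nr_pages);
        for (i = 0; i < nr_pages; i++) {
                sprintf(buf, "%s-ref-%u", prefix, i);
                xenbus_printf(XBT_NIL, xd->nodename, buf, "%u", grefs[i]);
        }
}

/* A frontend would call this once per ring before advertising itself,
   roughly:
        publish_ring_sketch(xd, "f2b-ring", f2b_grefs, nr_pages);
        publish_ring_sketch(xd, "b2f-ring", b2f_grefs, nr_pages);
        publish_ring_sketch(xd, "control", &control_gref, 1);
   and also write an "event-channel" key carrying the allocated port. */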
*/ ++static DEFINE_MUTEX(all_netbacks_lock); ++ ++#define NETBACK2_MAGIC 0xb5e99485 ++struct netback2 { ++ unsigned magic; ++ struct xenbus_device *xenbus_device; ++ ++ int handle; ++ struct list_head list; ++ ++ struct netchannel2 *chan; ++ ++ struct grant_mapping b2f_mapping; ++ struct grant_mapping f2b_mapping; ++ struct grant_mapping control_mapping; ++ ++ int attached; ++ ++ struct xenbus_watch shutdown_watch; ++ int have_shutdown_watch; ++}; ++ ++static struct netback2 *xenbus_device_to_nb2(struct xenbus_device *xd) ++{ ++ struct netback2 *nb = xd->dev.driver_data; ++ BUG_ON(nb->magic != NETBACK2_MAGIC); ++ return nb; ++} ++ ++/* Read a range of grants out of xenstore and map them in gm. Any ++ existing mapping in gm is released. Returns 0 on success or <0 on ++ error. On error, gm is preserved, and xenbus_dev_fatal() is ++ called. */ ++static int map_grants(struct netback2 *nd, const char *prefix, ++ struct grant_mapping *gm) ++{ ++ struct xenbus_device *xd = nd->xenbus_device; ++ int err; ++ char buf[32]; ++ int i; ++ unsigned nr_pages; ++ grant_ref_t grefs[MAX_GRANT_MAP_PAGES]; ++ ++ sprintf(buf, "%s-nr-pages", prefix); ++ err = xenbus_scanf(XBT_NIL, xd->otherend, buf, "%u", &nr_pages); ++ if (err == -ENOENT) { ++ nr_pages = 1; ++ } else if (err != 1) { ++ if (err < 0) { ++ xenbus_dev_fatal(xd, err, "reading %s", buf); ++ return err; ++ } else { ++ xenbus_dev_fatal(xd, err, "reading %s as integer", ++ buf); ++ return -EINVAL; ++ } ++ } ++ ++ for (i = 0; i < nr_pages; i++) { ++ sprintf(buf, "%s-ref-%d", prefix, i); ++ err = xenbus_scanf(XBT_NIL, xd->otherend, buf, "%u", ++ &grefs[i]); ++ if (err != 1) { ++ if (err < 0) { ++ xenbus_dev_fatal(xd, ++ err, ++ "reading gref %d from %s/%s", ++ i, ++ xd->otherend, ++ buf); ++ } else { ++ xenbus_dev_fatal(xd, ++ -EINVAL, ++ "expected an integer at %s/%s", ++ xd->otherend, ++ buf); ++ err = -EINVAL; ++ } ++ return err; ++ } ++ } ++ ++ err = nc2_map_grants(gm, grefs, nr_pages, xd->otherend_id); ++ if (err < 0) ++ xenbus_dev_fatal(xd, err, "mapping ring %s from %s", ++ prefix, xd->otherend); ++ return err; ++} ++ ++/* Undo the effects of attach_to_frontend */ ++static void detach_from_frontend(struct netback2 *nb) ++{ ++ if (!nb->attached) ++ return; ++ nc2_detach_rings(nb->chan); ++ nc2_unmap_grants(&nb->b2f_mapping); ++ nc2_unmap_grants(&nb->f2b_mapping); ++ nc2_unmap_grants(&nb->control_mapping); ++ nb->attached = 0; ++} ++ ++static int attach_to_frontend(struct netback2 *nd) ++{ ++ int err; ++ int evtchn; ++ struct xenbus_device *xd = nd->xenbus_device; ++ struct netchannel2 *nc = nd->chan; ++ struct netchannel2_backend_shared *nbs; ++ ++ if (nd->attached) ++ return 0; ++ ++ /* Attach the shared memory bits */ ++ err = map_grants(nd, "b2f-ring", &nd->b2f_mapping); ++ if (err) ++ return err; ++ err = map_grants(nd, "f2b-ring", &nd->f2b_mapping); ++ if (err) ++ return err; ++ err = map_grants(nd, "control", &nd->control_mapping); ++ if (err) ++ return err; ++ nbs = nd->control_mapping.mapping->addr; ++ err = nc2_attach_rings(nc, ++ &nbs->cons, ++ nd->f2b_mapping.mapping->addr, ++ nd->f2b_mapping.nr_pages * PAGE_SIZE, ++ &nbs->prod, ++ nd->b2f_mapping.mapping->addr, ++ nd->b2f_mapping.nr_pages * PAGE_SIZE, ++ xd->otherend_id); ++ if (err < 0) { ++ xenbus_dev_fatal(xd, err, "attaching to rings"); ++ return err; ++ } ++ ++ /* Connect the event channel. 
*/ ++ err = xenbus_scanf(XBT_NIL, xd->otherend, "event-channel", "%u", ++ &evtchn); ++ if (err < 0) { ++ xenbus_dev_fatal(xd, err, ++ "reading %s/event-channel or {t,r}x-sring-pages", ++ xd->otherend); ++ return err; ++ } ++ err = nc2_connect_evtchn(nd->chan, xd->otherend_id, evtchn); ++ if (err < 0) { ++ xenbus_dev_fatal(xd, err, "binding to event channel"); ++ return err; ++ } ++ ++ /* All done */ ++ nd->attached = 1; ++ ++ return 0; ++} ++ ++static void frontend_changed(struct xenbus_device *xd, ++ enum xenbus_state frontend_state) ++{ ++ struct netback2 *nb = xenbus_device_to_nb2(xd); ++ int err; ++ ++ switch (frontend_state) { ++ case XenbusStateInitialising: ++ /* If the frontend does a kexec following a crash, we ++ can end up bounced back here even though we're ++ attached. Try to recover by detaching from the old ++ rings. */ ++ /* (A normal shutdown, and even a normal kexec, would ++ * have gone through Closed first, so we'll already be ++ * detached, and this is pointless but harmless.) */ ++ detach_from_frontend(nb); ++ ++ /* Tell the frontend what sort of rings we're willing ++ to accept. */ ++ xenbus_printf(XBT_NIL, nb->xenbus_device->nodename, ++ "max-sring-pages", "%d", MAX_GRANT_MAP_PAGES); ++ ++#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT ++ xenbus_printf(XBT_NIL, nb->xenbus_device->nodename, ++ "feature-bypass", "1"); ++ xenbus_printf(XBT_NIL, nb->xenbus_device->nodename, ++ "feature-bypass-max-pages", "%d", ++ MAX_BYPASS_RING_PAGES_GRANTABLE); ++#endif ++ ++ /* Start the device bring-up bit of the state ++ * machine. */ ++ xenbus_switch_state(nb->xenbus_device, XenbusStateInitWait); ++ break; ++ ++ case XenbusStateInitWait: ++ /* Frontend doesn't use this state */ ++ xenbus_dev_fatal(xd, EINVAL, ++ "unexpected frontend state InitWait"); ++ break; ++ ++ case XenbusStateInitialised: ++ case XenbusStateConnected: ++ /* Frontend has advertised its rings to us */ ++ err = attach_to_frontend(nb); ++ if (err >= 0) ++ xenbus_switch_state(xd, XenbusStateConnected); ++ break; ++ ++ case XenbusStateClosing: ++ detach_from_frontend(nb); ++ xenbus_switch_state(xd, XenbusStateClosed); ++ break; ++ ++ case XenbusStateClosed: ++ detach_from_frontend(nb); ++ xenbus_switch_state(xd, XenbusStateClosed); ++ if (!xenbus_dev_is_online(xd)) ++ device_unregister(&xd->dev); ++ break; ++ ++ case XenbusStateUnknown: ++ detach_from_frontend(nb); ++ xenbus_switch_state(xd, XenbusStateClosed); ++ device_unregister(&xd->dev); ++ break; ++ ++ default: ++ /* Ignore transitions to unknown states */ ++ break; ++ } ++} ++ ++static int netback2_uevent(struct xenbus_device *xd, ++ struct kobj_uevent_env *env) ++{ ++ struct netback2 *nb = xenbus_device_to_nb2(xd); ++ ++ add_uevent_var(env, "vif=%s", nb->chan->net_device->name); ++ ++ return 0; ++} ++ ++static void netback2_shutdown(struct xenbus_device *xd) ++{ ++ xenbus_switch_state(xd, XenbusStateClosing); ++} ++ ++static void shutdown_watch_callback(struct xenbus_watch *watch, ++ const char **vec, ++ unsigned int len) ++{ ++ struct netback2 *nb = ++ container_of(watch, struct netback2, shutdown_watch); ++ char *type; ++ ++ type = xenbus_read(XBT_NIL, nb->xenbus_device->nodename, ++ "shutdown-request", NULL); ++ if (IS_ERR(type)) { ++ if (PTR_ERR(type) != -ENOENT) ++ printk(KERN_WARNING "Cannot read %s/%s: %ld\n", ++ nb->xenbus_device->nodename, "shutdown-request", ++ PTR_ERR(type)); ++ return; ++ } ++ if (strcmp(type, "force") == 0) { ++ detach_from_frontend(nb); ++ xenbus_switch_state(nb->xenbus_device, XenbusStateClosed); ++ } else if (strcmp(type, 
"normal") == 0) { ++ netback2_shutdown(nb->xenbus_device); ++ } else { ++ printk(KERN_WARNING "Unrecognised shutdown request %s from tools\n", ++ type); ++ } ++ xenbus_rm(XBT_NIL, nb->xenbus_device->nodename, "shutdown-request"); ++ kfree(type); ++} ++ ++static int netback2_probe(struct xenbus_device *xd, ++ const struct xenbus_device_id *id) ++{ ++ struct netback2 *nb; ++ ++ nb = kzalloc(sizeof(*nb), GFP_KERNEL); ++ if (nb == NULL) ++ goto err; ++ nb->magic = NETBACK2_MAGIC; ++ nb->xenbus_device = xd; ++ ++ nb->shutdown_watch.node = kasprintf(GFP_KERNEL, "%s/shutdown-request", ++ xd->nodename); ++ if (nb->shutdown_watch.node == NULL) ++ goto err; ++ nb->shutdown_watch.callback = shutdown_watch_callback; ++ if (register_xenbus_watch(&nb->shutdown_watch)) ++ goto err; ++ nb->have_shutdown_watch = 1; ++ ++ nb->chan = nc2_new(xd); ++ if (!nb->chan) ++ goto err; ++ ++ xd->dev.driver_data = nb; ++ ++ nb->handle = atomic_inc_return(&next_handle); ++ mutex_lock(&all_netbacks_lock); ++ list_add(&nb->list, &all_netbacks); ++ mutex_unlock(&all_netbacks_lock); ++ ++ kobject_uevent(&xd->dev.kobj, KOBJ_ONLINE); ++ ++ return 0; ++ ++err: ++ if (nb != NULL) { ++ if (nb->have_shutdown_watch) ++ unregister_xenbus_watch(&nb->shutdown_watch); ++ kfree(nb->shutdown_watch.node); ++ kfree(nb); ++ } ++ xenbus_dev_fatal(xd, ENOMEM, "probing netdev"); ++ return -ENOMEM; ++} ++ ++static int netback2_remove(struct xenbus_device *xd) ++{ ++ struct netback2 *nb = xenbus_device_to_nb2(xd); ++ kobject_uevent(&xd->dev.kobj, KOBJ_OFFLINE); ++ mutex_lock(&all_netbacks_lock); ++ list_del(&nb->list); ++ mutex_unlock(&all_netbacks_lock); ++ if (nb->chan != NULL) ++ nc2_release(nb->chan); ++ if (nb->have_shutdown_watch) ++ unregister_xenbus_watch(&nb->shutdown_watch); ++ kfree(nb->shutdown_watch.node); ++ nc2_unmap_grants(&nb->b2f_mapping); ++ nc2_unmap_grants(&nb->f2b_mapping); ++ nc2_unmap_grants(&nb->control_mapping); ++ kfree(nb); ++ return 0; ++} ++ ++static const struct xenbus_device_id netback2_ids[] = { ++ { "vif2" }, ++ { "" } ++}; ++ ++static struct xenbus_driver netback2 = { ++ .name = "vif2", ++ .ids = netback2_ids, ++ .probe = netback2_probe, ++ .remove = netback2_remove, ++ .otherend_changed = frontend_changed, ++ .uevent = netback2_uevent, ++}; ++ ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++static struct netback2 *find_netback_by_handle_locked(unsigned handle) ++{ ++ struct netback2 *nb; ++ ++ list_for_each_entry(nb, &all_netbacks, list) { ++ if (nb->handle == handle) ++ return nb; ++ } ++ return NULL; ++} ++ ++static struct netback2 *find_netback_by_remote_mac_locked(const char *mac) ++{ ++ struct netback2 *nb; ++ ++ list_for_each_entry(nb, &all_netbacks, list) { ++ if (!memcmp(nb->chan->rings.remote_mac, mac, ETH_ALEN)) ++ return nb; ++ } ++ return NULL; ++} ++ ++static long netchannel2_ioctl_establish_bypass(struct netchannel2_ioctl_establish_bypass __user *argsp) ++{ ++ struct netchannel2_ioctl_establish_bypass args; ++ struct netback2 *a, *b; ++ int res; ++ ++ if (copy_from_user(&args, argsp, sizeof(args))) ++ return -EFAULT; ++ ++ mutex_lock(&all_netbacks_lock); ++ a = find_netback_by_handle_locked(args.handle_a); ++ b = find_netback_by_handle_locked(args.handle_b); ++ if (a && b) ++ res = nc2_establish_bypass(a->chan, b->chan); ++ else ++ res = -EINVAL; ++ mutex_unlock(&all_netbacks_lock); ++ ++ return res; ++} ++ ++void nb2_handle_suggested_bypass(struct netchannel2 *a_chan, const char *mac_b) ++{ ++ struct netback2 *b; ++ mutex_lock(&all_netbacks_lock); ++ b = find_netback_by_remote_mac_locked(mac_b); ++ 
if (b != NULL) ++ nc2_establish_bypass(a_chan, b->chan); ++ mutex_unlock(&all_netbacks_lock); ++} ++ ++static long netchannel2_ioctl_destroy_bypass(struct netchannel2_ioctl_destroy_bypass __user *argsp) ++{ ++ struct netchannel2_ioctl_destroy_bypass args; ++ ++ if (copy_from_user(&args, argsp, sizeof(args))) ++ return -EFAULT; ++ ++ return nc2_destroy_bypass(args.handle); ++} ++#endif ++ ++static long misc_dev_unlocked_ioctl(struct file *filp, unsigned cmd, ++ unsigned long data) ++{ ++ switch (cmd) { ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++ case NETCHANNEL2_IOCTL_ESTABLISH_BYPASS: ++ return netchannel2_ioctl_establish_bypass( ++ (struct netchannel2_ioctl_establish_bypass __user *)data); ++ case NETCHANNEL2_IOCTL_DESTROY_BYPASS: ++ return netchannel2_ioctl_destroy_bypass( ++ (struct netchannel2_ioctl_destroy_bypass __user *)data); ++#endif ++ default: ++ return -EINVAL; ++ } ++} ++ ++static struct file_operations misc_dev_fops = { ++ .owner = THIS_MODULE, ++ .unlocked_ioctl = misc_dev_unlocked_ioctl ++}; ++ ++static struct miscdevice netback2_misc_dev = { ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "netback2", ++ .fops = &misc_dev_fops ++}; ++ ++int __init netback2_init(void) ++{ ++ int r; ++ ++ r = misc_register(&netback2_misc_dev); ++ if (r < 0) { ++ printk(KERN_ERR "Error %d registering control device.\n", ++ r); ++ return r; ++ } ++ r = xenbus_register_backend(&netback2); ++ if (r < 0) { ++ printk(KERN_ERR "error %d registering backend driver.\n", ++ r); ++ misc_deregister(&netback2_misc_dev); ++ } ++ return r; ++} +diff --git a/drivers/xen/netchannel2/netchan2.c b/drivers/xen/netchannel2/netchan2.c +new file mode 100644 +index 0000000..9ca6c91 +--- /dev/null ++++ b/drivers/xen/netchannel2/netchan2.c +@@ -0,0 +1,32 @@ ++#include ++#include ++#include "netchannel2_endpoint.h" ++ ++static int __init netchan2_init(void) ++{ ++ int r; ++ ++ r = nc2_init(); ++ if (r < 0) ++ return r; ++ r = netfront2_init(); ++ if (r < 0) ++ return r; ++ r = netback2_init(); ++ if (r < 0) ++ netfront2_exit(); ++ return r; ++} ++module_init(netchan2_init); ++ ++/* We can't unload if we're acting as a backend. */ ++#ifndef CONFIG_XEN_NETDEV2_BACKEND ++static void __exit netchan2_exit(void) ++{ ++ netfront2_exit(); ++ nc2_exit(); ++} ++module_exit(netchan2_exit); ++#endif ++ ++MODULE_LICENSE("GPL"); +diff --git a/drivers/xen/netchannel2/netchannel2_core.h b/drivers/xen/netchannel2/netchannel2_core.h +new file mode 100644 +index 0000000..7a62af4 +--- /dev/null ++++ b/drivers/xen/netchannel2/netchannel2_core.h +@@ -0,0 +1,797 @@ ++#ifndef NETCHANNEL2_CORE_H__ ++#define NETCHANNEL2_CORE_H__ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* After we send this number of frags, we request the other end to ++ * notify us when sending the corresponding finish packet message */ ++#define MAX_MAX_COUNT_FRAGS_NO_EVENT 192 ++ ++/* Very small packets (e.g. TCP pure acks) are sent inline in the ++ * ring, to avoid the hypercall overhead. This is the largest packet ++ * which will be sent small, in bytes. It should be big enough to ++ * cover the normal headers (i.e. ethernet + IP + TCP = 66 bytes) plus ++ * a little bit of slop for options etc. */ ++#define PACKET_PREFIX_SIZE 96 ++ ++/* How many packets can we have outstanding at any one time? This ++ * must be small enough that it won't be confused with an sk_buff ++ * pointer; see the txp_slot stuff later. 
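The txp_slot helpers defined just below store either a free-list index or an sk_buff pointer in the same unsigned long, discriminated purely by magnitude. A standalone model of the same encoding, with void * standing in for sk_buff * and demo_slot/DEMO_NR_SLOTS as made-up names:

#include <assert.h>
#include <stdio.h>

#define DEMO_NR_SLOTS 256UL

/* One word per slot: values <= DEMO_NR_SLOTS are free-list indices,
   anything larger is a pointer to an in-flight packet.  This works
   because no valid kernel (or heap) pointer is that small. */
struct demo_slot { unsigned long contents; };

static int slot_in_use(const struct demo_slot *s)
{
        return s->contents > DEMO_NR_SLOTS;
}

static void slot_set_ptr(struct demo_slot *s, void *pkt)
{
        s->contents = (unsigned long)pkt;
}

static void slot_set_next_free(struct demo_slot *s, unsigned long idx)
{
        s->contents = idx;
}

int main(void)
{
        struct demo_slot slot;
        int packet = 42;        /* stand-in for a real sk_buff */

        slot_set_next_free(&slot, 7);
        assert(!slot_in_use(&slot));
        slot_set_ptr(&slot, &packet);
        assert(slot_in_use(&slot));
        printf("encoding behaves as expected\n");
        return 0;
}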
*/ ++#define NR_TX_PACKETS 256 ++ ++/* A way of keeping track of a mapping of a bunch of grant references ++ into a contigous chunk of virtual address space. This is used for ++ things like multi-page rings. */ ++#define MAX_GRANT_MAP_PAGES 4 ++struct grant_mapping { ++ unsigned nr_pages; ++ grant_handle_t handles[MAX_GRANT_MAP_PAGES]; ++ struct vm_struct *mapping; ++}; ++ ++enum transmit_policy { ++ transmit_policy_unknown = 0, ++ transmit_policy_first = 0xf001, ++ transmit_policy_grant = transmit_policy_first, ++ transmit_policy_map, ++ transmit_policy_small, ++ transmit_policy_last = transmit_policy_small ++}; ++ ++/* When we send a packet message, we need to tag it with an ID. That ++ ID is an index into the TXP slot array. Each slot contains either ++ a pointer to an sk_buff (if it's in use), or the index of the next ++ free slot (if it isn't). A slot is in use if the contents is > ++ NR_TX_PACKETS, and free otherwise. */ ++struct txp_slot { ++ unsigned long __contents; ++}; ++ ++typedef uint32_t nc2_txp_index_t; ++ ++#define INVALID_TXP_INDEX ((nc2_txp_index_t)NR_TX_PACKETS) ++ ++static inline int txp_slot_in_use(struct txp_slot *slot) ++{ ++ if (slot->__contents <= NR_TX_PACKETS) ++ return 0; ++ else ++ return 1; ++} ++ ++static inline void txp_set_skb(struct txp_slot *slot, struct sk_buff *skb) ++{ ++ slot->__contents = (unsigned long)skb; ++} ++ ++static inline struct sk_buff *txp_get_skb(struct txp_slot *slot) ++{ ++ if (txp_slot_in_use(slot)) ++ return (struct sk_buff *)slot->__contents; ++ else ++ return NULL; ++} ++ ++static inline void txp_set_next_free(struct txp_slot *slot, ++ nc2_txp_index_t idx) ++{ ++ slot->__contents = idx; ++} ++ ++static inline nc2_txp_index_t txp_get_next_free(struct txp_slot *slot) ++{ ++ return (nc2_txp_index_t)slot->__contents; ++} ++ ++/* This goes in struct sk_buff::cb */ ++struct skb_cb_overlay { ++ struct txp_slot *tp; ++ unsigned nr_fragments; ++ grant_ref_t gref_pool; ++ enum transmit_policy policy; ++ uint8_t failed; ++ uint8_t expecting_finish; ++ uint8_t type; ++ uint16_t inline_prefix_size; ++}; ++ ++#define CASSERT(x) typedef unsigned __cassert_ ## __LINE__ [(x)-1] ++CASSERT(sizeof(struct skb_cb_overlay) <= sizeof(((struct sk_buff *)0)->cb)); ++ ++static inline struct skb_cb_overlay *get_skb_overlay(struct sk_buff *skb) ++{ ++ return (struct skb_cb_overlay *)skb->cb; ++} ++ ++struct nc2_alternate_ring; ++struct netchannel2; ++ ++#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS ++#define AUTOBYPASS_MAX_HOT_MACS 8 ++#define AUTOBYPASS_SUGG_QUEUE_SIZE 8 ++struct nc2_auto_bypass { ++ enum { ++ autobypass_state_normal, ++ autobypass_state_considering, ++ autobypass_state_debounce ++ } state; ++ uint32_t nr_bypass_packets; ++ uint64_t nr_non_bypass_packets; ++ unsigned long start_jiffies; ++ unsigned nr_hot_macs; ++ struct { ++ unsigned char mac[ETH_ALEN]; ++ /* This won't overflow because the autobypass period ++ is less than 65536. 
*/ ++ uint16_t count; ++ } hot_macs[AUTOBYPASS_MAX_HOT_MACS]; ++ unsigned suggestion_head; ++ unsigned suggestion_tail; ++ struct { ++ unsigned char mac[ETH_ALEN]; ++ } suggestions[AUTOBYPASS_SUGG_QUEUE_SIZE]; ++}; ++void nc2_received_bypass_candidate_packet(struct netchannel2 *nc, ++ struct sk_buff *skb); ++ ++struct nc2_bypass_autoteardown { ++ struct list_head autoteardown_list; ++ uint64_t nr_packets; ++ unsigned seen_count; ++}; ++ ++void nc2_register_bypass_for_autoteardown(struct nc2_alternate_ring *nar); ++void nc2_unregister_bypass_for_autoteardown(struct nc2_alternate_ring *nar); ++void nc2_shutdown_autoteardown(void); ++#else ++static inline void nc2_shutdown_autoteardown(void) ++{ ++} ++static inline void nc2_register_bypass_for_autoteardown(struct nc2_alternate_ring *nar) ++{ ++} ++static inline void nc2_unregister_bypass_for_autoteardown(struct nc2_alternate_ring *nar) ++{ ++} ++#endif ++ ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++#define NC2_BYPASS_SUGG_QUEUE_SIZE 8 ++struct nc2_incoming_bypass_suggestions { ++ spinlock_t lock; ++ ++ unsigned head; ++ unsigned tail; ++ ++ struct work_struct workitem; ++ ++ struct { ++ unsigned char mac[ETH_ALEN]; ++ } queue[NC2_BYPASS_SUGG_QUEUE_SIZE]; ++}; ++ ++void nc2_init_incoming_bypass_suggestions( ++ struct netchannel2 *nc, ++ struct nc2_incoming_bypass_suggestions *nibs); ++#endif ++ ++ ++/* Packets for which we need to send FINISH_PACKET messages for as ++ soon as possible. */ ++struct pending_finish_packets { ++#define MAX_PENDING_FINISH_PACKETS 256 ++ uint32_t ids[MAX_PENDING_FINISH_PACKETS]; ++ RING_IDX prod; ++ RING_IDX cons; ++}; ++ ++#define RX_GRANT_COPY_BATCH 32 ++struct hypercall_batcher { ++ unsigned nr_pending_gops; ++ gnttab_copy_t gops[RX_GRANT_COPY_BATCH]; ++ void *ctxt[RX_GRANT_COPY_BATCH]; ++}; ++ ++struct netchannel2_ring_pair { ++ struct netchannel2 *interface; ++ /* Main ring lock. Acquired from bottom halves. */ ++ spinlock_t lock; ++ ++ struct napi_struct napi; ++ ++ /* jiffies the last time the interrupt fired. Not ++ synchronised at all, because it doesn't usually matter if ++ it's a bit off. */ ++ unsigned last_event; ++ ++ /* Protected by the lock. Initialised at attach_ring() time ++ and de-initialised at detach_ring() time. */ ++ struct netchannel2_prod_ring prod_ring; ++ struct netchannel2_cons_ring cons_ring; ++ uint8_t is_attached; /* True if the rings are currently safe to ++ access. */ ++ ++ unsigned max_count_frags_no_event; ++ unsigned expected_finish_messages; ++ ++ struct timer_list polling_timer; ++ ++ domid_t otherend_id; ++ ++ grant_ref_t gref_pool; ++ ++ /* The IRQ corresponding to the event channel which is ++ connected to the other end. This only changes from the ++ xenbus state change handler. It is notified from lots of ++ other places. Fortunately, it's safe to notify on an irq ++ after it's been released, so the lack of synchronisation ++ doesn't matter. */ ++ int irq; ++ int evtchn; ++ ++ /* The MAC address of our peer. */ ++ unsigned char remote_mac[ETH_ALEN]; ++ ++ /* Set if we need to check the source MAC address on incoming ++ packets. */ ++ int filter_mac; ++ ++ /* A pool of free transmitted_packet structures, threaded on ++ the list member. Protected by the lock. */ ++ nc2_txp_index_t head_free_tx_packet; ++ ++ /* Total number of packets on the allocated list. Protected ++ by the lock. */ ++ unsigned nr_tx_packets_outstanding; ++ /* Maximum number of packets which the other end will allow us ++ to keep outstanding at one time. Valid whenever ++ is_attached is set. 
*/ ++ unsigned max_tx_packets_outstanding; ++ ++ /* Count number of frags that we have sent to the other side ++ When we reach a max value we request that the other end ++ send an event when sending the corresponding finish message */ ++ unsigned count_frags_no_event; ++ ++ /* Set if we need to send a SET_MAX_PACKETS message. ++ Protected by the lock. */ ++ uint8_t need_advertise_max_packets; ++ ++ /* Set if there are messages on the ring which are considered ++ time-sensitive, so that it's necessary to notify the remote ++ endpoint as soon as possible. */ ++ uint8_t pending_time_sensitive_messages; ++ ++ /* Set if we've previously suppressed a remote notification ++ because none of the messages pending at the time of the ++ flush were time-sensitive. The remote should be notified ++ as soon as the ring is flushed, even if the normal ++ filtering rules would suppress the event. */ ++ uint8_t delayed_kick; ++ ++ /* Set if we need to send a SET_MAX_FRAGMENTS_PER_PACKET ++ * message. */ ++ uint8_t need_advertise_max_fragments_per_packet; ++ ++ /* The maximum number of fragments which can be used in any ++ given packet. We have to linearise anything which is more ++ fragmented than this. */ ++ uint32_t max_fragments_per_tx_packet; ++ ++ /* A list of packet IDs which we need to return to the other ++ end as soon as there is space on the ring. Protected by ++ the lock. */ ++ struct pending_finish_packets pending_finish; ++ ++ /* transmitted_packet structures which are to be transmitted ++ next time the TX tasklet looks at this interface. ++ Protected by the lock. */ ++ struct sk_buff_head pending_tx_queue; ++ ++ /* Packets which we'll have finished transmitting as soon as ++ we flush the hypercall batcher. Protected by the lock. */ ++ struct sk_buff_head release_on_flush_batcher; ++ ++ struct hypercall_batcher pending_rx_hypercalls; ++ ++ /* A pre-allocated pool of TX packets. The ++ allocated_tx_packets and free_tx_packets linked lists ++ contain elements of this array, and it can also be directly ++ indexed by packet ID. Protected by the lock. */ ++ struct txp_slot tx_packets[NR_TX_PACKETS]; ++}; ++ ++struct netchannel2 { ++#define NETCHANNEL2_MAGIC 0x57c68c1d ++ unsigned magic; ++ ++ /* Set when the structure is created and never changed */ ++ struct net_device *net_device; ++ struct xenbus_device *xenbus_device; ++ ++ /* Set if we trust the remote endpoint. */ ++ int remote_trusted; ++ /* Set if the remote endpoint is expected to trust us. ++ There's no guarantee that this is actually correct, but ++ it's useful for optimisation. */ ++ int local_trusted; ++ ++#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT ++ /* Alternate rings for this interface. Protected by the ++ master rings lock. */ ++ struct list_head alternate_rings; ++ uint8_t need_aux_ring_state_machine; ++ ++ uint8_t pending_bypass_error; ++#endif ++ ++ struct netchannel2_ring_pair rings; ++ ++ /* Packets which we need to transmit soon */ ++ struct sk_buff_head pending_skbs; ++ ++ /* Task offload control. These are all protected by the ++ * lock. */ ++ /* Ethtool allows us to use RX checksumming */ ++ uint8_t use_rx_csum; ++ /* The remote endpoint allows us to use TX checksumming. ++ Whether we actually use TX checksumming is controlled by ++ the net device feature bits. */ ++ uint8_t allow_tx_csum_offload; ++ /* The remote endpoint allows us to use TSO for TCPv4. As for ++ checksumming, we only actually use the feature if the net ++ device says to. 
*/ ++ uint8_t allow_tso; ++ /* At some point in the past, we tried to tell the other end ++ what our current offload policy is and failed. Try again ++ as soon as possible. */ ++ uint8_t need_advertise_offloads; ++ ++ /* Flag to indicate that the interface is stopped ++ When the interface is stopped we need to run the tasklet ++ after we receive an interrupt so that we can wake it up */ ++ uint8_t is_stopped; ++ ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++ /* Bypass support. */ ++ /* There's some unadvertised bypass in one of the lists. */ ++ uint8_t need_advertise_bypasses; ++ uint8_t bypass_max_pages; ++ uint16_t max_bypasses; ++ uint16_t extant_bypasses; ++ struct list_head bypasses_a; ++ struct list_head bypasses_b; ++ ++ struct nc2_bypass *current_bypass_frontend; ++ struct nc2_incoming_bypass_suggestions incoming_bypass_suggestions; ++#endif ++ ++ /* Updates are protected by the lock. This can be read at any ++ * time without holding any locks, and the rest of Linux is ++ * expected to cope. */ ++ struct net_device_stats stats; ++ ++#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS ++ struct nc2_auto_bypass auto_bypass; ++#endif ++}; ++ ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++#define MAX_BYPASS_RING_PAGES_GRANTABLE 4 ++struct nc2_bypass_endpoint { ++ struct list_head list; /* Always ``valid'', but won't actually ++ be in any list if we're detached (it ++ gets set to the empty list). */ ++ struct netchannel2 *nc2; /* Valid provided detached isn't ++ * set */ ++ grant_ref_t incoming_grefs[MAX_BYPASS_RING_PAGES_GRANTABLE]; ++ grant_ref_t outgoing_grefs[MAX_BYPASS_RING_PAGES_GRANTABLE]; ++ grant_ref_t control_gref; ++ unsigned long incoming_pages[MAX_BYPASS_RING_PAGES_GRANTABLE]; ++ ++ uint8_t need_advertise; ++ uint8_t need_disable; ++ uint8_t disable_sent; ++ uint8_t disabled; ++ uint8_t need_detach; ++ uint8_t detach_sent; ++ uint8_t detached; ++}; ++ ++/* This is the representation of a bypass in the bypassed domain. */ ++struct nc2_bypass { ++ /* Cleared to an empty list if both endpoints are detached. */ ++ struct list_head list; ++ ++ /* Reference count. Being on the big list, threaded through ++ @list, counts as a single reference. 
*/ ++ atomic_t refcnt; ++ ++ struct nc2_bypass_endpoint ep_a; ++ struct nc2_bypass_endpoint ep_b; ++ unsigned long control_page; ++ unsigned nr_ring_pages; ++ ++ unsigned handle; ++ int evtchn_port; ++ ++ wait_queue_head_t detach_waitq; ++}; ++ ++int nc2_establish_bypass(struct netchannel2 *a, struct netchannel2 *b); ++int nc2_destroy_bypass(int handle); ++void _nc2_advertise_bypasses(struct netchannel2 *nc); ++static inline void nc2_advertise_bypasses(struct netchannel2 *nc) ++{ ++ if (nc->need_advertise_bypasses) ++ _nc2_advertise_bypasses(nc); ++} ++void nc2_handle_bypass_disabled(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_handle_bypass_detached(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_handle_bypass_frontend_ready(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_handle_bypass_disabled(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_handle_bypass_detached(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_handle_suggest_bypass(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void release_bypasses(struct netchannel2 *nc); ++void nb2_handle_suggested_bypass(struct netchannel2 *a_chan, ++ const char *mac_b); ++#else ++static inline void release_bypasses(struct netchannel2 *nc) ++{ ++} ++static inline void nc2_advertise_bypasses(struct netchannel2 *nc) ++{ ++} ++static inline void nc2_handle_bypass_frontend_ready(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++} ++static inline void nc2_handle_bypass_disabled(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++} ++static inline void nc2_handle_bypass_detached(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++} ++static inline void nc2_handle_suggest_bypass(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++} ++#endif ++ ++#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT ++#define MAX_BYPASS_RING_PAGES_MAPPABLE 4 ++/* This is the representation of a bypass from the point of view of ++ one of the endpoint domains. */ ++struct nc2_alternate_ring { ++ /* List of all alternate rings on a given interface. Dangles ++ * off of alternate_rings in struct netchannel2. Protected by ++ * the netchannel2 master ring lock. */ ++ struct list_head rings_by_interface; ++ /* The state of the alternate ring. This only ever goes ++ * forwards. It is protected by the auxiliary ring lock. */ ++ enum { ++ /* This is a frontend, it's just been allocated and ++ doesn't yet have a port. */ ++ nc2_alt_ring_frontend_preparing = 0xf001, ++ /* This is a frontend, it has a port but hasn't told ++ the parent yet. */ ++ nc2_alt_ring_frontend_send_ready_pending, ++ /* We've sent the FRONTEND_READY message and are ++ waiting for the backend to say it's ready. */ ++ nc2_alt_ring_frontend_sent_ready, ++ /* This is a backend. In theory, we know what port to ++ use, but we haven't tried to bind to it yet. */ ++ nc2_alt_ring_backend_preparing, ++ /* Running normally */ ++ nc2_alt_ring_ready, ++ /* Can't be used for more PACKETs, will disable as ++ soon as all FINISHes arrive. 
*/ ++ nc2_alt_ring_disabling, ++ /* All FINISHes arrived, waiting to send DISABLED */ ++ nc2_alt_ring_disabled_pending, ++ /* DISABLED sent. */ ++ nc2_alt_ring_disabled, ++ /* DETACH received */ ++ nc2_alt_ring_detaching, ++ /* Ring has been detached, waiting to send the ++ DETACHED message. */ ++ nc2_alt_ring_detached_pending ++ } state; ++ struct work_struct work_item; ++ struct work_struct detach_work_item; ++ ++ struct grant_mapping prod_mapper; ++ struct grant_mapping cons_mapper; ++ struct grant_mapping control_mapper; ++ ++ struct netchannel2_ring_pair rings; ++ ++ /* A lower bound on the number of times we've called ++ disable_irq() on the irq. The interrupt handler guarantees ++ to notify the eventq quickly if this increases. It ++ increases whenever there is work for the worker thread to ++ do. */ ++ atomic_t irq_disable_count; ++ wait_queue_head_t eventq; ++ uint32_t handle; ++ ++ struct netchannel2_msg_bypass_frontend frontend_setup_msg; ++ struct netchannel2_msg_bypass_backend backend_setup_msg; ++ uint32_t cons_grefs[MAX_BYPASS_RING_PAGES_MAPPABLE]; ++ uint32_t prod_grefs[MAX_BYPASS_RING_PAGES_MAPPABLE]; ++ ++#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS ++ struct nc2_bypass_autoteardown autoteardown; ++#endif ++}; ++ ++void nc2_handle_bypass_ready(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++int bypass_xmit_packet(struct netchannel2 *nc, ++ struct nc2_alternate_ring *ncr, ++ struct sk_buff *skb); ++void _nc2_alternate_ring_disable_finish(struct nc2_alternate_ring *ncr); ++static inline void nc2_alternate_ring_disable_finish(struct netchannel2_ring_pair *ncrp) ++{ ++ struct nc2_alternate_ring *nar; ++ nar = container_of(ncrp, struct nc2_alternate_ring, rings); ++ if (nar->state == nc2_alt_ring_disabling && ++ ncrp->nr_tx_packets_outstanding == 0) ++ _nc2_alternate_ring_disable_finish(nar); ++} ++void _nc2_crank_aux_ring_state_machine(struct netchannel2 *nc); ++static inline void nc2_crank_aux_ring_state_machine(struct netchannel2 *nc) ++{ ++ if (nc->need_aux_ring_state_machine) ++ _nc2_crank_aux_ring_state_machine(nc); ++} ++void nc2_release_alt_rings(struct netchannel2 *nc); ++void detach_all_bypasses(struct netchannel2 *nc); ++void nc2_handle_bypass_frontend(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_handle_bypass_backend(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_handle_bypass_disable(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_handle_bypass_detach(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_handle_bypass_ready(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_aux_ring_start_disable_sequence(struct nc2_alternate_ring *nar); ++void nc2_aux_ring_start_detach_sequence(struct nc2_alternate_ring *nar); ++#else ++static inline void detach_all_bypasses(struct netchannel2 *nc) ++{ ++} ++static inline void nc2_crank_aux_ring_state_machine(struct netchannel2 *nc) ++{ ++} ++static inline void nc2_alternate_ring_disable_finish(struct netchannel2_ring_pair *ncrp) ++{ ++} ++static inline void nc2_release_alt_rings(struct netchannel2 *nc) ++{ ++} ++static inline void nc2_handle_bypass_frontend(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++} ++static 
inline void nc2_handle_bypass_backend(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++} ++static inline void nc2_handle_bypass_disable(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++} ++static inline void nc2_handle_bypass_detach(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++} ++static inline void nc2_handle_bypass_ready(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++} ++#endif ++ ++#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS ++void _nc2_autobypass_make_suggestions(struct netchannel2 *nc); ++static inline void nc2_autobypass_make_suggestions(struct netchannel2 *nc) ++{ ++ if (nc->auto_bypass.suggestion_tail != nc->auto_bypass.suggestion_head) ++ _nc2_autobypass_make_suggestions(nc); ++} ++#else ++static inline void nc2_autobypass_make_suggestions(struct netchannel2 *nc) ++{ ++} ++#endif ++ ++static inline void flush_prepared_grant_copies(struct hypercall_batcher *hb, ++ void (*on_fail)(void *ctxt, ++ gnttab_copy_t *gop)) ++{ ++ unsigned x; ++ ++ if (hb->nr_pending_gops == 0) ++ return; ++ if (HYPERVISOR_grant_table_op(GNTTABOP_copy, hb->gops, ++ hb->nr_pending_gops)) ++ BUG(); ++ for (x = 0; x < hb->nr_pending_gops; x++) ++ if (hb->gops[x].status != GNTST_okay) ++ on_fail(hb->ctxt[x], &hb->gops[x]); ++ hb->nr_pending_gops = 0; ++} ++ ++static inline gnttab_copy_t *hypercall_batcher_grant_copy(struct hypercall_batcher *hb, ++ void *ctxt, ++ void (*on_fail)(void *, ++ gnttab_copy_t *gop)) ++{ ++ if (hb->nr_pending_gops == ARRAY_SIZE(hb->gops)) ++ flush_prepared_grant_copies(hb, on_fail); ++ hb->ctxt[hb->nr_pending_gops] = ctxt; ++ return &hb->gops[hb->nr_pending_gops++]; ++} ++ ++static inline void flush_hypercall_batcher(struct hypercall_batcher *hb, ++ void (*on_fail)(void *, ++ gnttab_copy_t *gop)) ++{ ++ flush_prepared_grant_copies(hb, on_fail); ++} ++ ++struct sk_buff *handle_receiver_copy_packet(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_packet *msg, ++ struct netchannel2_msg_hdr *hdr, ++ unsigned nr_frags, ++ unsigned frags_off); ++struct sk_buff *handle_receiver_map_packet(struct netchannel2 *nc, ++ struct netchannel2_msg_packet *msg, ++ struct netchannel2_msg_hdr *hdr, ++ unsigned nr_frags, ++ unsigned frags_off); ++ ++enum prepare_xmit_result { ++ PREP_XMIT_OKAY = 0, ++ PREP_XMIT_BUSY = -1, ++ PREP_XMIT_DROP = -2, ++}; ++ ++enum prepare_xmit_result prepare_xmit_allocate_small( ++ struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb); ++enum prepare_xmit_result prepare_xmit_allocate_grant( ++ struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb, ++ int use_subpage_grants); ++void xmit_grant(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb, ++ int use_subpage_grants, ++ volatile void *msg); ++ ++void queue_finish_packet_message(struct netchannel2_ring_pair *ncrp, ++ uint32_t id, uint8_t flags); ++ ++int allocate_txp_slot(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb); ++void release_txp_slot(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb); ++/* Releases the txp slot, the grant pool, and the skb */ ++void release_tx_packet(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb); ++ ++void fetch_fragment(struct netchannel2_ring_pair *ncrp, ++ unsigned idx, ++ struct netchannel2_fragment *frag, ++ unsigned off); ++ ++void pull_through(struct sk_buff *skb, unsigned 
count); ++ ++void nc2_kick(struct netchannel2_ring_pair *ncrp); ++ ++int nc2_map_grants(struct grant_mapping *gm, ++ const grant_ref_t *grefs, ++ unsigned nr_grefs, ++ domid_t remote_domain); ++void nc2_unmap_grants(struct grant_mapping *gm); ++ ++void _nc2_attach_rings(struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_sring_cons *cons_sring, ++ const volatile void *cons_payload, ++ size_t cons_size, ++ struct netchannel2_sring_prod *prod_sring, ++ void *prod_payload, ++ size_t prod_size, ++ domid_t otherend_id); ++void queue_packet_to_interface(struct sk_buff *skb, ++ struct netchannel2_ring_pair *ncrp); ++ ++unsigned get_transmitted_packet_msg_size(struct sk_buff *skb); ++int init_ring_pair(struct netchannel2_ring_pair *ncrp, ++ struct netchannel2 *nc); ++ ++irqreturn_t nc2_int(int irq, void *dev_id); ++ ++void cleanup_ring_pair(struct netchannel2_ring_pair *ncrp); ++void nc2_rscb_on_gntcopy_fail(void *ctxt, gnttab_copy_t *gop); ++ ++int init_receive_map_mode(void); ++void deinit_receive_map_mode(void); ++void suspend_receive_map_mode(void); ++void resume_receive_map_mode(void); ++ ++struct netchannel2 *nc2_get_interface_for_page(struct page *p); ++ ++int nc2_start_xmit(struct sk_buff *skb, struct net_device *dev); ++int nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb); ++enum prepare_xmit_result prepare_xmit_allocate_resources(struct netchannel2 *nc, ++ struct sk_buff *skb); ++void nc2_handle_finish_packet_msg(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_handle_set_max_packets_msg(struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void drop_pending_tx_packets(struct netchannel2_ring_pair *ncrp); ++ ++void send_finish_packet_messages(struct netchannel2_ring_pair *ncrp); ++void nc2_handle_packet_msg(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr, ++ struct sk_buff_head *pending_rx_queue); ++void advertise_max_packets(struct netchannel2_ring_pair *ncrp); ++void nc2_handle_set_max_fragments_per_packet(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void advertise_max_fragments_per_packet(struct netchannel2_ring_pair *ncrp); ++void receive_pending_skbs(struct sk_buff_head *rx_queue); ++void nc2_queue_purge(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff_head *queue); ++ ++void advertise_offloads(struct netchannel2 *nc); ++void nc2_handle_set_offload(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++extern struct ethtool_ops nc2_ethtool_ops; ++ ++void nc2_init_poller(struct netchannel2_ring_pair *ncrp); ++void nc2_start_polling(struct netchannel2_ring_pair *ncrp); ++void nc2_stop_polling(struct netchannel2_ring_pair *ncrp); ++ ++#endif /* !NETCHANNEL2_CORE_H__ */ +diff --git a/drivers/xen/netchannel2/netchannel2_endpoint.h b/drivers/xen/netchannel2/netchannel2_endpoint.h +new file mode 100644 +index 0000000..2ed9050 +--- /dev/null ++++ b/drivers/xen/netchannel2/netchannel2_endpoint.h +@@ -0,0 +1,63 @@ ++/* Interface between the endpoint implementations (netfront2.c, ++ netback2.c) and the netchannel2 core (chan.c and the various ++ transmission modes). 
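
The hypercall batcher declared in this header (struct hypercall_batcher together with hypercall_batcher_grant_copy() and flush_prepared_grant_copies() above) hands callers the next free slot, flushes automatically when the array fills, and keeps a per-entry context pointer so a failure callback can identify which request went wrong. A user-space sketch of the same pattern, with the grant-copy hypercall replaced by a fake submit function, follows; the fake_op and submit_batch names are inventions for this sketch.

    #include <stdio.h>

    #define BATCH_SIZE 32

    struct fake_op { int arg; int status; };   /* stands in for gnttab_copy_t */

    struct batcher {
            unsigned nr_pending;
            struct fake_op ops[BATCH_SIZE];
            void *ctxt[BATCH_SIZE];
    };

    /* Pretend to submit the whole batch in one "hypercall". */
    static void submit_batch(struct fake_op *ops, unsigned nr)
    {
            unsigned x;
            for (x = 0; x < nr; x++)
                    ops[x].status = 0;             /* everything succeeds here */
    }

    static void flush_batch(struct batcher *b,
                            void (*on_fail)(void *ctxt, struct fake_op *op))
    {
            unsigned x;
            if (b->nr_pending == 0)
                    return;
            submit_batch(b->ops, b->nr_pending);
            for (x = 0; x < b->nr_pending; x++)
                    if (b->ops[x].status != 0)
                            on_fail(b->ctxt[x], &b->ops[x]);
            b->nr_pending = 0;
    }

    /* Reserve the next slot, flushing first if the batch is already full. */
    static struct fake_op *batch_next_op(struct batcher *b, void *ctxt,
                                         void (*on_fail)(void *,
                                                         struct fake_op *))
    {
            if (b->nr_pending == BATCH_SIZE)
                    flush_batch(b, on_fail);
            b->ctxt[b->nr_pending] = ctxt;
            return &b->ops[b->nr_pending++];
    }

    static void report_failure(void *ctxt, struct fake_op *op)
    {
            printf("op for ctxt %p failed with status %d\n", ctxt, op->status);
    }

    int main(void)
    {
            struct batcher b = { .nr_pending = 0 };
            int token = 1;
            struct fake_op *op = batch_next_op(&b, &token, report_failure);
            op->arg = 99;                          /* fill in the request */
            flush_batch(&b, report_failure);       /* e.g. at end of RX work */
            return 0;
    }
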
*/ ++#ifndef NETCHANNEL2_ENDPOINT_H__ ++#define NETCHANNEL2_ENDPOINT_H__ ++ ++#include ++#include ++ ++struct netchannel2_sring_prod; ++struct netchannel2_sring_cons; ++struct netchannel2; ++struct xenbus_device; ++ ++struct netchannel2 *nc2_new(struct xenbus_device *xd); ++void nc2_release(struct netchannel2 *nc); ++ ++int nc2_attach_rings(struct netchannel2 *nc, ++ struct netchannel2_sring_cons *cons_sring, ++ const volatile void *cons_payload, ++ size_t cons_size, ++ struct netchannel2_sring_prod *prod_sring, ++ void *prod_payload, ++ size_t prod_size, ++ domid_t otherend_id); ++void nc2_detach_rings(struct netchannel2 *nc); ++#if defined(CONFIG_XEN_NETDEV2_FRONTEND) ++int nc2_listen_evtchn(struct netchannel2 *nc, domid_t dom); ++#endif ++#if defined(CONFIG_XEN_NETDEV2_BACKEND) ++int nc2_connect_evtchn(struct netchannel2 *nc, domid_t domid, ++ int evtchn); ++#endif ++int nc2_get_evtchn_port(struct netchannel2 *nc); ++void nc2_suspend(struct netchannel2 *nc); ++ ++void nc2_set_nr_tx_buffers(struct netchannel2 *nc, unsigned nr_buffers); ++ ++/* Interface which the endpoints provide to the core. */ ++#ifdef CONFIG_XEN_NETDEV2_FRONTEND ++int __init netfront2_init(void); ++void __exit netfront2_exit(void); ++#else ++static inline int netfront2_init(void) ++{ ++ return 0; ++} ++static inline void netfront2_exit(void) ++{ ++} ++#endif ++#ifdef CONFIG_XEN_NETDEV2_BACKEND ++int __init netback2_init(void); ++#else ++static inline int netback2_init(void) ++{ ++ return 0; ++} ++#endif ++int __init nc2_init(void); ++void __exit nc2_exit(void); ++ ++#endif /* NETCHANNEL2_ENDPOINT_H__ */ +diff --git a/drivers/xen/netchannel2/netchannel2_uspace.h b/drivers/xen/netchannel2/netchannel2_uspace.h +new file mode 100644 +index 0000000..64c4312 +--- /dev/null ++++ b/drivers/xen/netchannel2/netchannel2_uspace.h +@@ -0,0 +1,17 @@ ++#ifndef NETCHANNEL2_USPACE_H__ ++#define NETCHANNEL2_USPACE_H__ ++ ++#include ++ ++struct netchannel2_ioctl_establish_bypass { ++ unsigned handle_a; ++ unsigned handle_b; ++}; ++#define NETCHANNEL2_IOCTL_ESTABLISH_BYPASS _IOW('N', 0, struct netchannel2_ioctl_establish_bypass) ++ ++struct netchannel2_ioctl_destroy_bypass { ++ unsigned handle; ++}; ++#define NETCHANNEL2_IOCTL_DESTROY_BYPASS _IOW('N', 1, struct netchannel2_ioctl_destroy_bypass) ++ ++#endif /* !NETCHANNEL2_USPACE_H__ */ +diff --git a/drivers/xen/netchannel2/netfront2.c b/drivers/xen/netchannel2/netfront2.c +new file mode 100644 +index 0000000..13d94e4 +--- /dev/null ++++ b/drivers/xen/netchannel2/netfront2.c +@@ -0,0 +1,513 @@ ++#include ++#include ++#include ++#include ++#include ++ ++#include "netchannel2_core.h" ++#include "netchannel2_endpoint.h" ++ ++#define MAX_SRING_PAGES 4 ++ ++struct netfront2 { ++#define NETFRONT2_MAGIC 0x9268e704 ++ unsigned magic; ++ struct xenbus_device *xenbus_device; ++ ++ void *f2b_sring; ++ grant_ref_t f2b_grefs[MAX_SRING_PAGES]; ++ void *b2f_sring; ++ grant_ref_t b2f_grefs[MAX_SRING_PAGES]; ++ ++ struct netchannel2_frontend_shared *control_shared; ++ grant_ref_t control_shared_gref; ++ ++ int nr_sring_pages; ++ int sring_order; ++ ++ grant_ref_t rings_gref_pool; /* Some pre-allocated grant ++ references to cover the shared ++ rings. */ ++ ++ struct netchannel2 *chan; ++ ++ int attached; /* True if the shared rings are ready to go. 
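
For the bypass ioctls defined in netchannel2_uspace.h above, a user-space caller might look roughly like the sketch below. Only the structure layout and ioctl number come from the patch; the /dev/netchannel2 node name, the handle values, and what the establish ioctl returns are assumptions for the example.

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    /* Local copy of the definitions from netchannel2_uspace.h above. */
    struct netchannel2_ioctl_establish_bypass {
            unsigned handle_a;
            unsigned handle_b;
    };
    #define NETCHANNEL2_IOCTL_ESTABLISH_BYPASS \
            _IOW('N', 0, struct netchannel2_ioctl_establish_bypass)

    int main(void)
    {
            struct netchannel2_ioctl_establish_bypass est = {
                    .handle_a = 0,  /* first vif2 interface (example value) */
                    .handle_b = 1,  /* second vif2 interface (example value) */
            };
            int fd = open("/dev/netchannel2", O_RDWR); /* hypothetical node */
            int ret;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            ret = ioctl(fd, NETCHANNEL2_IOCTL_ESTABLISH_BYPASS, &est);
            if (ret < 0)
                    perror("establish bypass");
            else
                    printf("establish bypass returned %d\n", ret);
            close(fd);
            return 0;
    }

NETCHANNEL2_IOCTL_DESTROY_BYPASS would be issued the same way, passing the handle of an existing bypass in struct netchannel2_ioctl_destroy_bypass.
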
*/ ++}; ++ ++static struct netfront2 *xenbus_device_to_nf2(struct xenbus_device *xd) ++{ ++ struct netfront2 *work = xd->dev.driver_data; ++ BUG_ON(work->magic != NETFRONT2_MAGIC); ++ return work; ++} ++ ++/* Try to revoke a bunch of grant references and return the grefs to ++ the rings grefs pool. Any cleared grefs are set to 0. Returns 0 ++ on success or <0 on error. Ignores zero entries in the @grefs ++ list, and zeroes any entries which are successfully ended. */ ++static int ungrant_access_to_ring(struct netfront2 *nf, ++ grant_ref_t *grefs, ++ int nr_pages) ++{ ++ int i; ++ int succ; ++ int failed; ++ ++ failed = 0; ++ ++ for (i = 0; i < nr_pages; i++) { ++ if (grefs[i]) { ++ succ = gnttab_end_foreign_access_ref(grefs[i]); ++ if (!succ) { ++ /* XXX we can't recover when this ++ * happens. Try to do something ++ * vaguely plausible, but the device ++ * is pretty much doomed. */ ++ printk(KERN_WARNING "Failed to end access to gref %d\n", ++ i); ++ failed = 1; ++ continue; ++ } ++ gnttab_release_grant_reference(&nf->rings_gref_pool, ++ grefs[i]); ++ grefs[i] = 0; ++ } ++ } ++ ++ if (failed) ++ return -EBUSY; ++ else ++ return 0; ++} ++ ++/* Allocate and initialise grant references to cover a bunch of pages. ++ @ring should be in the direct-mapped region. The rings_gref_pool ++ on nf should contain at least @nr_pages references. ++ Already-populated slots in the @grefs list are left unchanged. */ ++static void grant_access_to_ring(struct netfront2 *nf, ++ domid_t otherend, ++ void *ring, ++ int *grefs, ++ int nr_pages) ++{ ++ void *p; ++ int i; ++ grant_ref_t ref; ++ ++ for (i = 0; i < nr_pages; i++) { ++ ++ if (grefs[i] != 0) ++ continue; ++ ++ p = (void *)((unsigned long)ring + PAGE_SIZE * i); ++ ++ ref = gnttab_claim_grant_reference(&nf->rings_gref_pool); ++ /* There should be enough grefs in the pool to handle ++ the rings. */ ++ BUG_ON(ref < 0); ++ gnttab_grant_foreign_access_ref(ref, ++ otherend, ++ virt_to_mfn(p), ++ 0); ++ grefs[i] = ref; ++ } ++} ++ ++/* Push an already-granted ring into xenstore. 
*/ ++static int publish_ring(struct xenbus_transaction xbt, ++ struct netfront2 *nf, ++ const char *prefix, ++ const int *grefs, ++ int nr_grefs) ++{ ++ int i; ++ char buf[32]; ++ int err; ++ ++ sprintf(buf, "%s-nr-pages", prefix); ++ err = xenbus_printf(xbt, nf->xenbus_device->nodename, buf, ++ "%u", nr_grefs); ++ if (err) ++ return err; ++ ++ for (i = 0; i < nr_grefs; i++) { ++ BUG_ON(grefs[i] == 0); ++ sprintf(buf, "%s-ref-%u", prefix, i); ++ err = xenbus_printf(xbt, nf->xenbus_device->nodename, ++ buf, "%u", grefs[i]); ++ if (err) ++ return err; ++ } ++ return 0; ++} ++ ++static int publish_rings(struct netfront2 *nf) ++{ ++ int err; ++ struct xenbus_transaction xbt; ++ const char *msg; ++ ++again: ++ err = xenbus_transaction_start(&xbt); ++ if (err) { ++ xenbus_dev_fatal(nf->xenbus_device, err, ++ "starting transaction"); ++ return err; ++ } ++ ++ err = publish_ring(xbt, nf, "f2b-ring", nf->f2b_grefs, ++ nf->nr_sring_pages); ++ if (err) { ++ msg = "publishing f2b-ring"; ++ goto abort; ++ } ++ err = publish_ring(xbt, nf, "b2f-ring", nf->b2f_grefs, ++ nf->nr_sring_pages); ++ if (err) { ++ msg = "publishing b2f-ring"; ++ goto abort; ++ } ++ err = publish_ring(xbt, nf, "control", &nf->control_shared_gref, 1); ++ if (err) { ++ msg = "publishing control"; ++ goto abort; ++ } ++ err = xenbus_printf(xbt, nf->xenbus_device->nodename, ++ "event-channel", "%u", ++ nc2_get_evtchn_port(nf->chan)); ++ if (err) { ++ msg = "publishing event channel"; ++ goto abort; ++ } ++ ++#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT ++ err = xenbus_printf(xbt, nf->xenbus_device->nodename, ++ "feature-bypass", "1"); ++ if (!err) ++ err = xenbus_printf(xbt, nf->xenbus_device->nodename, ++ "feature-bypass-max-pages", "%d", ++ MAX_BYPASS_RING_PAGES_MAPPABLE); ++ if (err) { ++ msg = "publishing bypass info"; ++ goto abort; ++ } ++#endif ++ ++ err = xenbus_transaction_end(xbt, 0); ++ if (err) { ++ if (err == -EAGAIN) ++ goto again; ++ xenbus_dev_fatal(nf->xenbus_device, err, ++ "completing transaction"); ++ } ++ ++ return err; ++ ++abort: ++ xenbus_transaction_end(xbt, 1); ++ xenbus_dev_fatal(nf->xenbus_device, err, msg); ++ return err; ++} ++ ++/* Release the rings. WARNING: This will leak memory if the other end ++ still has the rings mapped. There isn't really anything we can do ++ about that; the alternative (giving the other end access to ++ whatever Linux puts in the memory after we released it) is probably ++ worse. */ ++static void release_rings(struct netfront2 *nf) ++{ ++ int have_outstanding_grants; ++ ++ have_outstanding_grants = 0; ++ ++ if (nf->f2b_sring) { ++ if (ungrant_access_to_ring(nf, nf->f2b_grefs, ++ nf->nr_sring_pages) >= 0) { ++ free_pages((unsigned long)nf->f2b_sring, ++ nf->sring_order); ++ } else { ++ have_outstanding_grants = 1; ++ } ++ nf->f2b_sring = NULL; ++ } ++ ++ if (nf->b2f_sring) { ++ if (ungrant_access_to_ring(nf, nf->b2f_grefs, ++ nf->nr_sring_pages) >= 0) { ++ free_pages((unsigned long)nf->b2f_sring, ++ nf->sring_order); ++ } else { ++ have_outstanding_grants = 1; ++ } ++ nf->b2f_sring = NULL; ++ } ++ ++ if (nf->control_shared) { ++ if (ungrant_access_to_ring(nf, &nf->control_shared_gref, ++ 1) >= 0) { ++ free_page((unsigned long)nf->control_shared); ++ } else { ++ have_outstanding_grants = 1; ++ } ++ nf->control_shared = NULL; ++ } ++ ++ if (have_outstanding_grants != 0) { ++ printk(KERN_WARNING ++ "Released shared rings while the backend still had them mapped; leaking memory\n"); ++ } ++ ++ /* We can't release the gref pool if there are still ++ references outstanding against it. 
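
publish_ring() above writes one "<prefix>-nr-pages" key plus a "<prefix>-ref-<i>" key per shared page into the frontend's xenstore directory, and publish_rings() wraps the whole set in a transaction that is simply retried when xenbus_transaction_end() returns -EAGAIN. The sketch below just prints the key/value layout that results for one ring; the node name and grant reference values are made up.

    #include <stdio.h>

    typedef unsigned grant_ref_t;

    static void show_published_ring(const char *nodename, const char *prefix,
                                    const grant_ref_t *grefs, unsigned nr_grefs)
    {
            unsigned i;
            printf("%s/%s-nr-pages = %u\n", nodename, prefix, nr_grefs);
            for (i = 0; i < nr_grefs; i++)
                    printf("%s/%s-ref-%u = %u\n", nodename, prefix, i, grefs[i]);
    }

    int main(void)
    {
            grant_ref_t f2b[4] = { 8, 9, 10, 11 };
            grant_ref_t control = 12;

            show_published_ring("device/vif2/0", "f2b-ring", f2b, 4);
            show_published_ring("device/vif2/0", "control", &control, 1);
            return 0;
    }
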
*/ ++ if (!have_outstanding_grants) { ++ if (nf->rings_gref_pool) ++ gnttab_free_grant_references(nf->rings_gref_pool); ++ nf->rings_gref_pool = 0; ++ } ++ ++ nf->attached = 0; ++} ++ ++static int allocate_rings(struct netfront2 *nf, domid_t otherend) ++{ ++ int err; ++ int max_sring_pages; ++ int sring_order; ++ int nr_sring_pages; ++ size_t sring_size; ++ ++ /* Figure out how big our shared rings are going to be. */ ++ err = xenbus_scanf(XBT_NIL, nf->xenbus_device->otherend, ++ "max-sring-pages", "%d", &max_sring_pages); ++ if (err < 0) { ++ xenbus_dev_fatal(nf->xenbus_device, err, ++ "reading %s/max-sring-pages", ++ nf->xenbus_device->otherend); ++ return err; ++ } ++ if (max_sring_pages > MAX_SRING_PAGES) ++ max_sring_pages = MAX_SRING_PAGES; ++ sring_order = order_base_2(max_sring_pages); ++ nr_sring_pages = 1 << sring_order; ++ sring_size = nr_sring_pages * PAGE_SIZE; ++ ++ release_rings(nf); ++ ++ nf->nr_sring_pages = nr_sring_pages; ++ nf->sring_order = sring_order; ++ ++ nf->f2b_sring = (void *)__get_free_pages(GFP_KERNEL, sring_order); ++ if (!nf->f2b_sring) ++ return -ENOMEM; ++ memset(nf->f2b_sring, 0, sring_size); ++ ++ nf->b2f_sring = (void *)__get_free_pages(GFP_KERNEL, sring_order); ++ if (!nf->b2f_sring) ++ return -ENOMEM; ++ memset(nf->b2f_sring, 0, sring_size); ++ ++ nf->control_shared = (void *)get_zeroed_page(GFP_KERNEL); ++ if (!nf->control_shared) ++ return -ENOMEM; ++ ++ /* Pre-allocate enough grant references to be sure that we can ++ grant access to both rings without an error. */ ++ err = gnttab_alloc_grant_references(nr_sring_pages * 2 + 1, ++ &nf->rings_gref_pool); ++ if (err < 0) ++ return err; ++ ++ grant_access_to_ring(nf, ++ otherend, ++ nf->b2f_sring, ++ nf->b2f_grefs, ++ nr_sring_pages); ++ grant_access_to_ring(nf, ++ otherend, ++ nf->f2b_sring, ++ nf->f2b_grefs, ++ nr_sring_pages); ++ grant_access_to_ring(nf, ++ otherend, ++ nf->control_shared, ++ &nf->control_shared_gref, ++ 1); ++ err = nc2_listen_evtchn(nf->chan, otherend); ++ if (err < 0) ++ return err; ++ ++ nf->attached = 1; ++ ++ return 0; ++} ++ ++static void backend_changed(struct xenbus_device *xd, ++ enum xenbus_state backend_state) ++{ ++ struct netfront2 *nf = xenbus_device_to_nf2(xd); ++ int err; ++ ++ switch (backend_state) { ++ case XenbusStateInitialising: ++ /* Backend isn't ready yet, don't do anything. */ ++ break; ++ ++ case XenbusStateInitWait: ++ /* Backend has advertised the ring protocol. Allocate ++ the rings, and tell the backend about them. */ ++ ++ err = 0; ++ if (!nf->attached) ++ err = allocate_rings(nf, xd->otherend_id); ++ if (err < 0) { ++ xenbus_dev_fatal(xd, err, "allocating shared rings"); ++ break; ++ } ++ err = publish_rings(nf); ++ if (err >= 0) ++ xenbus_switch_state(xd, XenbusStateInitialised); ++ break; ++ ++ case XenbusStateInitialised: ++ /* Backend isn't supposed to use this state. 
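
The sizing logic at the top of allocate_rings() above clamps the backend's advertised max-sring-pages to the frontend's own MAX_SRING_PAGES, rounds up to a power of two by taking the base-2 order, and later reserves nr_sring_pages * 2 + 1 grant references (both rings plus the control page). The arithmetic, restated as a user-space sketch with a minimal order_base_2() stand-in and an assumed 4K page size:

    #include <stdio.h>

    #define MAX_SRING_PAGES 4
    #define PAGE_SIZE 4096

    /* Minimal stand-in for the kernel's order_base_2(): smallest order n
     * such that (1 << n) >= pages. */
    static int order_base_2(int pages)
    {
            int order = 0;
            while ((1 << order) < pages)
                    order++;
            return order;
    }

    int main(void)
    {
            int backend_max = 3;                   /* value read from xenstore */
            int pages = backend_max > MAX_SRING_PAGES ? MAX_SRING_PAGES
                                                      : backend_max;
            int order = order_base_2(pages);
            int nr_sring_pages = 1 << order;

            printf("order %d -> %d pages, %d bytes per ring, %d grefs needed\n",
                   order, nr_sring_pages, nr_sring_pages * PAGE_SIZE,
                   nr_sring_pages * 2 + 1);
            return 0;
    }
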
*/ ++ xenbus_dev_fatal(xd, EINVAL, ++ "unexpected backend state Initialised"); ++ break; ++ ++ case XenbusStateConnected: ++ /* All ready */ ++ err = nc2_attach_rings(nf->chan, ++ &nf->control_shared->cons, ++ nf->b2f_sring, ++ nf->nr_sring_pages * PAGE_SIZE, ++ &nf->control_shared->prod, ++ nf->f2b_sring, ++ nf->nr_sring_pages * PAGE_SIZE, ++ nf->xenbus_device->otherend_id); ++ if (err < 0) { ++ xenbus_dev_fatal(xd, err, ++ "failed to attach to rings"); ++ } else { ++ xenbus_switch_state(xd, XenbusStateConnected); ++ } ++ break; ++ ++ case XenbusStateClosing: ++ xenbus_switch_state(xd, XenbusStateClosing); ++ break; ++ ++ case XenbusStateClosed: ++ /* Tell the tools that it's safe to remove the device ++ from the bus. */ ++ xenbus_frontend_closed(xd); ++ /* Note that we don't release the rings here. This ++ means that if the backend moves to a different ++ domain, we won't be able to reconnect, but it also ++ limits the amount of memory which can be wasted in ++ the release_rings() leak if the backend is faulty ++ or malicious. It's not obvious which is more ++ useful, and so I choose the safer but less ++ featureful approach. */ ++ /* This is only a problem if you're using driver ++ domains and trying to recover from a driver error ++ by rebooting the backend domain. The rest of the ++ tools don't support that, so it's a bit ++ theoretical. The memory leaks aren't, though. */ ++ break; ++ ++ case XenbusStateUnknown: ++ /* The tools have removed the device area from the ++ store. Do nothing and rely on xenbus core to call ++ our remove method. */ ++ break; ++ ++ default: ++ /* Ignore transitions to unknown states */ ++ break; ++ } ++} ++ ++static int __devinit netfront_probe(struct xenbus_device *xd, ++ const struct xenbus_device_id *id) ++{ ++ struct netfront2 *nf; ++ ++ nf = kzalloc(sizeof(*nf), GFP_KERNEL); ++ if (nf == NULL) ++ goto err; ++ nf->magic = NETFRONT2_MAGIC; ++ nf->xenbus_device = xd; ++ nf->chan = nc2_new(xd); ++ if (nf->chan == NULL) ++ goto err; ++ ++ xd->dev.driver_data = nf; ++ ++ return 0; ++ ++err: ++ kfree(nf); ++ xenbus_dev_fatal(xd, ENOMEM, "probing netdev"); ++ return -ENOMEM; ++} ++ ++static int netfront_suspend(struct xenbus_device *xd) ++{ ++ /* We're about to suspend. Do the minimum amount of work to ++ make that safe. */ ++ struct netfront2 *nf = xenbus_device_to_nf2(xd); ++ ++ nc2_suspend(nf->chan); ++ ++ return 0; ++} ++ ++static int netfront_resume(struct xenbus_device *xd) ++{ ++ /* We've been suspended and come back. The rings are ++ therefore dead. Tear them down. */ ++ /* We rely on the normal xenbus state machine to bring them ++ back to life. 
*/ ++ struct netfront2 *nf = xenbus_device_to_nf2(xd); ++ ++ nc2_detach_rings(nf->chan); ++ release_rings(nf); ++ ++ return 0; ++} ++ ++static int __devexit netfront_remove(struct xenbus_device *xd) ++{ ++ struct netfront2 *nf = xenbus_device_to_nf2(xd); ++ if (nf->chan != NULL) ++ nc2_release(nf->chan); ++ release_rings(nf); ++ kfree(nf); ++ return 0; ++} ++ ++static const struct xenbus_device_id netfront_ids[] = { ++ { "vif2" }, ++ { "" } ++}; ++MODULE_ALIAS("xen:vif2"); ++ ++static struct xenbus_driver netfront2 = { ++ .name = "vif2", ++ .ids = netfront_ids, ++ .probe = netfront_probe, ++ .remove = __devexit_p(netfront_remove), ++ .otherend_changed = backend_changed, ++ .resume = netfront_resume, ++ .suspend = netfront_suspend, ++}; ++ ++int __init netfront2_init(void) ++{ ++ return xenbus_register_frontend(&netfront2); ++} ++ ++void __exit netfront2_exit(void) ++{ ++ xenbus_unregister_driver(&netfront2); ++} +diff --git a/drivers/xen/netchannel2/offload.c b/drivers/xen/netchannel2/offload.c +new file mode 100644 +index 0000000..ec256aa +--- /dev/null ++++ b/drivers/xen/netchannel2/offload.c +@@ -0,0 +1,181 @@ ++/* All the bits used to handle enabling and disabling the various ++ * offloads. */ ++#include ++#include ++#include "netchannel2_core.h" ++ ++static int nc2_set_tx_csum(struct net_device *nd, u32 val); ++static int nc2_set_sg(struct net_device *nd, u32 val); ++static int nc2_set_tso(struct net_device *nd, u32 val); ++ ++/* ---------------- Interface to the other domain ----------------------- */ ++void nc2_handle_set_offload(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct netchannel2_msg_set_offload msg; ++ if (hdr->size != sizeof(msg)) { ++ pr_debug("Strange sized offload message: %d\n", ++ hdr->size); ++ return; ++ } ++ if (ncrp != &nc->rings) { ++ pr_debug("Setting offloads on an ancillary ring!\n"); ++ return; ++ } ++ nc2_copy_from_ring(&nc->rings.cons_ring, &msg, hdr->size); ++ if (msg.csum != nc->allow_tx_csum_offload) { ++ nc->allow_tx_csum_offload = msg.csum; ++ nc2_set_tx_csum(nc->net_device, msg.csum); ++ /* Linux doesn't support scatter-gather mode without ++ TX csum offload. We therefore need to disable SG ++ support whenever the remote turns off csum support. ++ We also elect to enable SG support whenever the ++ remote turns on csum support, since that's more ++ likely to be useful than requiring the user to ++ manually enable it every time. */ ++ nc2_set_sg(nc->net_device, msg.csum); ++ } ++ ++ if (msg.tcpv4_segmentation_offload != nc->allow_tso) { ++ nc->allow_tso = msg.tcpv4_segmentation_offload; ++ nc2_set_tso(nc->net_device, msg.tcpv4_segmentation_offload); ++ } ++} ++ ++/* Tell the other end what sort of offloads it's allowed to use. */ ++void advertise_offloads(struct netchannel2 *nc) ++{ ++ struct netchannel2_msg_set_offload msg; ++ ++ memset(&msg, 0, sizeof(msg)); ++ ++ if (nc2_can_send_payload_bytes(&nc->rings.prod_ring, sizeof(msg))) { ++ msg.csum = nc->use_rx_csum; ++ /* We always claim to be able to accept TSO packets, ++ and don't provide any way of turning it off through ++ ethtool. We used to use the LRO flag, but that's ++ not quite right: receiving an LRO packet and ++ receiving a TSO one are subtly different, due to ++ the way they get packed into the skbuff ++ structure. 
*/ ++ msg.tcpv4_segmentation_offload = 1; ++ nc2_send_message(&nc->rings.prod_ring, ++ NETCHANNEL2_MSG_SET_OFFLOAD, ++ 0, &msg, sizeof(msg)); ++ nc->need_advertise_offloads = 0; ++ nc->rings.pending_time_sensitive_messages = 1; ++ } else { ++ nc->need_advertise_offloads = 1; ++ } ++} ++ ++/* Not really offload-related, but it interacts with checksum offload ++ and is easiest to do here. */ ++void nc2_handle_set_max_fragments_per_packet(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct netchannel2_msg_set_max_fragments_per_packet msg; ++ ++ if (hdr->size != sizeof(msg)) { ++ pr_debug("Set max fragments per packet message had strange size %d\n", ++ hdr->size); ++ return; ++ } ++ nc2_copy_from_ring(&ncrp->cons_ring, &msg, sizeof(msg)); ++ if (msg.max_frags_per_packet < 1) { ++ pr_debug("set max fragments per packet to %d?\n", ++ msg.max_frags_per_packet); ++ return; ++ } ++ if (ncrp == &nc->rings && ++ ncrp->max_fragments_per_tx_packet == 1 && ++ msg.max_frags_per_packet > 1) { ++ /* Turning on scatter-gather mode. Linux only ++ supports it if you've got TX csum offload, ++ though. */ ++ if (nc->net_device->features & NETIF_F_IP_CSUM) ++ nc->net_device->features |= NETIF_F_SG; ++ } ++ ncrp->max_fragments_per_tx_packet = msg.max_frags_per_packet; ++} ++ ++ ++ ++/* ---------------------- Ethtool interface ---------------------------- */ ++ ++static int nc2_set_rx_csum(struct net_device *nd, u32 val) ++{ ++ struct netchannel2 *nc = netdev_priv(nd); ++ ++ spin_lock_bh(&nc->rings.lock); ++ if (nc->use_rx_csum != val) { ++ nc->use_rx_csum = val; ++ nc->need_advertise_offloads = 1; ++ spin_unlock_bh(&nc->rings.lock); ++ nc2_kick(&nc->rings); ++ } else { ++ spin_unlock_bh(&nc->rings.lock); ++ } ++ ++ return 0; ++} ++ ++static u32 nc2_get_rx_csum(struct net_device *nd) ++{ ++ struct netchannel2 *nc = netdev_priv(nd); ++ return nc->use_rx_csum; ++} ++ ++static int nc2_set_tx_csum(struct net_device *nd, u32 val) ++{ ++ struct netchannel2 *nc = netdev_priv(nd); ++ ++ /* Can't turn on TX csum offload if the other end can't do RX ++ csum offload. */ ++ if (val != 0 && !nc->allow_tx_csum_offload) ++ return -EOPNOTSUPP; ++ return ethtool_op_set_tx_csum(nd, val); ++} ++ ++/* ethtool set_sg() handler. Linux makes sure that TX csum offload is ++ only enabled when scatter-gather mode is, so we don't have to worry ++ about that here. */ ++static int nc2_set_sg(struct net_device *nd, u32 val) ++{ ++ struct netchannel2 *nc = netdev_priv(nd); ++ ++ if (nc->rings.max_fragments_per_tx_packet <= 1) ++ return -EOPNOTSUPP; ++ ++ if (val) ++ nd->features |= NETIF_F_SG; ++ else ++ nd->features &= ~NETIF_F_SG; ++ return 0; ++} ++ ++static int nc2_set_tso(struct net_device *nd, u32 val) ++{ ++ struct netchannel2 *nc = netdev_priv(nd); ++ /* We only allow ourselves to use TSO if the other end's ++ allowed us to use sufficiently many fragments per ++ packet. 
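
The ethtool handlers in this file enforce a small dependency chain: TX checksum offload needs the peer's permission, scatter-gather needs more than one fragment per packet, and TSO (whose check follows just below) needs both the peer's permission and enough fragments for a maximally fragmented packet. Restated as standalone predicates, with MAX_SKB_FRAGS taken as 18 purely for illustration:

    #include <stdio.h>

    #define MAX_SKB_FRAGS 18                       /* assumed for the sketch */

    struct offload_state {
            int allow_tx_csum_offload;     /* peer can checksum on receive */
            int allow_tso;                 /* peer accepts TSO packets */
            unsigned max_fragments_per_tx_packet;
    };

    static int can_enable_tx_csum(const struct offload_state *s)
    {
            return s->allow_tx_csum_offload;
    }

    static int can_enable_sg(const struct offload_state *s)
    {
            return s->max_fragments_per_tx_packet > 1;
    }

    static int can_enable_tso(const struct offload_state *s)
    {
            return s->allow_tso &&
                    s->max_fragments_per_tx_packet >= MAX_SKB_FRAGS;
    }

    int main(void)
    {
            struct offload_state s = { 1, 1, 2 };
            printf("tx csum %d, sg %d, tso %d\n",
                   can_enable_tx_csum(&s), can_enable_sg(&s),
                   can_enable_tso(&s));
            return 0;
    }
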
*/ ++ if (val != 0 && ++ (!nc->allow_tso || ++ nc->rings.max_fragments_per_tx_packet < MAX_SKB_FRAGS)) ++ return -EOPNOTSUPP; ++ return ethtool_op_set_tso(nd, val); ++} ++ ++struct ethtool_ops nc2_ethtool_ops = { ++ .get_tx_csum = ethtool_op_get_tx_csum, ++ .set_tx_csum = nc2_set_tx_csum, ++ .get_rx_csum = nc2_get_rx_csum, ++ .set_rx_csum = nc2_set_rx_csum, ++ .get_sg = ethtool_op_get_sg, ++ .set_sg = nc2_set_sg, ++ .get_tso = ethtool_op_get_tso, ++ .set_tso = nc2_set_tso ++}; +diff --git a/drivers/xen/netchannel2/poll.c b/drivers/xen/netchannel2/poll.c +new file mode 100644 +index 0000000..42ca0d5 +--- /dev/null ++++ b/drivers/xen/netchannel2/poll.c +@@ -0,0 +1,59 @@ ++/* There are a couple of places where we try to minimise wakeups in ++ ways which work in the vast majority of cases, but occasionally ++ cause a needed event to be lost. Compensate for those with a 1Hz ++ ticker. The ticker runs whenever we have outstanding TX packets. ++ Once it's running, we never try to modify it, and instead just let ++ it run out. */ ++/* If we're relying on this timer for correctness then performance is ++ going to be absolutely dire, but it should be sufficient to avoid ++ outright deadlocks. */ ++#include ++#include ++#include "netchannel2_core.h" ++ ++#define TICKER_INTERVAL (HZ) ++ ++static void poll_timer(unsigned long arg) ++{ ++ struct netchannel2_ring_pair *ncrp = ++ (struct netchannel2_ring_pair *)arg; ++ ++ /* If the ring appears to be behaving ``normally'', increase ++ the number of messages which we're allowed to have ++ outstanding by some small amount. If it looks like we've ++ deadlocked, halve it. */ ++ /* Arbitrarily define ``normal'' to be at least one interrupt ++ every 100ms, and a small amount to be 10. */ ++ /* We don't synchronise against concurrent readers of ++ max_count_frags_no_event, because it doesn't matter too ++ much if it's slightly wrong. We don't need to worry about ++ concurrent writers, because this timer is the only thing ++ which can change it, and it's only ever run on one cpu at a ++ time. */ ++ if (jiffies - ncrp->last_event > HZ/10) ++ ncrp->max_count_frags_no_event /= 2; ++ else if (ncrp->max_count_frags_no_event + 10 <= ++ MAX_MAX_COUNT_FRAGS_NO_EVENT) ++ ncrp->max_count_frags_no_event += 10; ++ ++ if (ncrp->expected_finish_messages == 0) ++ return; ++ if (ncrp->cons_ring.sring->prod != ncrp->cons_ring.cons_pvt) ++ nc2_kick(ncrp); ++ nc2_start_polling(ncrp); ++} ++ ++void nc2_init_poller(struct netchannel2_ring_pair *ncrp) ++{ ++ setup_timer(&ncrp->polling_timer, poll_timer, (unsigned long)ncrp); ++} ++ ++void nc2_start_polling(struct netchannel2_ring_pair *ncrp) ++{ ++ mod_timer(&ncrp->polling_timer, jiffies + TICKER_INTERVAL); ++} ++ ++void nc2_stop_polling(struct netchannel2_ring_pair *ncrp) ++{ ++ del_timer_sync(&ncrp->polling_timer); ++} +diff --git a/drivers/xen/netchannel2/receiver_map.c b/drivers/xen/netchannel2/receiver_map.c +new file mode 100644 +index 0000000..82a8a4a +--- /dev/null ++++ b/drivers/xen/netchannel2/receiver_map.c +@@ -0,0 +1,786 @@ ++/* Support for mapping packets into the local domain, rather than ++ copying them or using pre-posted buffers. We only implement ++ receive-side support here; for transmit-side, we use the rscb.c ++ implementation. 
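
The 1Hz poller above adjusts max_count_frags_no_event in a slow-up, fast-down fashion: add 10 per tick while interrupts keep arriving, halve it if no event has been seen for over 100ms. A user-space model of that adjustment follows; the cap of 192 is an assumption, since the real MAX_MAX_COUNT_FRAGS_NO_EVENT is defined elsewhere in the patch.

    #include <stdio.h>

    #define HZ 100
    #define MAX_MAX_COUNT_FRAGS_NO_EVENT 192       /* assumed cap */

    static unsigned adjust_no_event_budget(unsigned budget,
                                           unsigned long jiffies,
                                           unsigned long last_event)
    {
            if (jiffies - last_event > HZ / 10)
                    return budget / 2;             /* looks wedged: back off */
            if (budget + 10 <= MAX_MAX_COUNT_FRAGS_NO_EVENT)
                    return budget + 10;            /* healthy: allow a bit more */
            return budget;
    }

    int main(void)
    {
            unsigned budget = 64;
            budget = adjust_no_event_budget(budget, 1000, 995); /* recent event */
            printf("after healthy tick: %u\n", budget);
            budget = adjust_no_event_budget(budget, 1000, 800); /* stalled */
            printf("after stalled tick: %u\n", budget);
            return 0;
    }
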
*/ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "netchannel2_core.h" ++ ++#define MAX_MAPPED_FRAGS 1024 ++#define MAX_MAPPED_PACKETS MAX_PENDING_FINISH_PACKETS ++#define SKB_MIN_PAYLOAD_SIZE 128 ++ ++static DEFINE_SPINLOCK(global_map_lock); ++static struct receive_mapper *receive_mapper; ++ ++/* How long do we leave the packets in the Linux stack before trying ++ to copy them, in jiffies? */ ++#define PACKET_TIMEOUT (HZ/2) ++ ++/* A slot into which we could map a fragment. */ ++struct rx_map_fragment { ++ struct list_head list; ++ struct rx_map_packet *packet; ++ grant_handle_t handle; /* 0 if the fragment isn't currently ++ * mapped */ ++ struct netchannel2_fragment nc_frag; ++}; ++ ++struct rx_map_packet { ++ struct list_head list; ++ struct list_head frags; ++ /* We take a reference for every mapped fragment associated ++ with the packet. When the refcnt goes to zero, the packet ++ is finished, and can be moved to the ++ finished_packets_list. */ ++ atomic_t refcnt; ++ unsigned id; ++ unsigned long expires; /* We expect Linux to have finished ++ with the packet by this time (in ++ jiffies), or we try to copy it. */ ++ struct netchannel2 *nc; ++ uint8_t flags; ++}; ++ ++struct receive_mapper { ++ struct page_foreign_tracker *tracker; ++ ++ struct page **pages; ++ ++ /* Nests inside the netchannel2 lock. The ++ finished_packets_lock nests inside this. */ ++ spinlock_t rm_lock; ++ ++ /* Packet fragments which we've mapped, or slots into which we ++ could map packets. The free list and count are protected ++ by @rm_lock. */ ++ struct rx_map_fragment frags[MAX_MAPPED_FRAGS]; ++ struct list_head free_frags; ++ ++ struct rx_map_packet packets[MAX_MAPPED_PACKETS]; ++ struct list_head free_packets; ++ struct list_head active_packets; ++ unsigned nr_free_packets; ++ ++ /* Packets which Linux has finished with but which we haven't ++ returned to the other endpoint yet. */ ++ spinlock_t finished_packets_lock; /* BH-safe leaf lock, ++ * acquired from the page ++ * free callback. Nests ++ * inside the rm_lock. */ ++ struct list_head finished_packets; ++ ++ struct tasklet_struct gc_tasklet; ++ ++ struct timer_list expire_timer; ++ ++ /* Set if we're trying to run the mapper down prior to ++ suspending the domain. 
*/ ++ uint8_t suspending; ++}; ++ ++static void suspend_receive_mapper(struct receive_mapper *rm); ++ ++static unsigned fragment_idx(const struct rx_map_fragment *frag) ++{ ++ return frag - receive_mapper->frags; ++} ++ ++static int alloc_rx_frags_for_packet(unsigned nr_frags, ++ struct rx_map_packet *packet) ++{ ++ struct rx_map_fragment *rmf; ++ unsigned x; ++ ++ INIT_LIST_HEAD(&packet->frags); ++ for (x = 0; x < nr_frags; x++) { ++ if (list_empty(&receive_mapper->free_frags)) ++ goto err; ++ rmf = list_entry(receive_mapper->free_frags.next, ++ struct rx_map_fragment, ++ list); ++ rmf->packet = packet; ++ rmf->handle = -1; ++ list_move(&rmf->list, &packet->frags); ++ } ++ return 0; ++ ++err: ++ list_splice_init(&packet->frags, &receive_mapper->free_frags); ++ return -EBUSY; ++} ++ ++static struct rx_map_packet *alloc_rx_packet(struct netchannel2 *nc, ++ unsigned nr_frags) ++{ ++ struct rx_map_packet *rmp; ++ ++ spin_lock(&receive_mapper->rm_lock); ++ if (list_empty(&receive_mapper->free_packets) || ++ receive_mapper->suspending) { ++ spin_unlock(&receive_mapper->rm_lock); ++ return NULL; ++ } ++ rmp = list_entry(receive_mapper->free_packets.next, ++ struct rx_map_packet, list); ++ ++ if (alloc_rx_frags_for_packet(nr_frags, rmp) < 0) { ++ spin_unlock(&receive_mapper->rm_lock); ++ return NULL; ++ } ++ list_del(&rmp->list); ++ atomic_set(&rmp->refcnt, nr_frags); ++ rmp->nc = nc; ++ receive_mapper->nr_free_packets--; ++ ++ spin_unlock(&receive_mapper->rm_lock); ++ ++ return rmp; ++} ++ ++struct grant_unmapper { ++ unsigned nr_gops; ++ gnttab_unmap_grant_ref_t gop_queue[32]; ++}; ++ ++static void do_unmaps(struct grant_unmapper *unmapper) ++{ ++ int ret; ++ unsigned x; ++ ++ if (unmapper->nr_gops != 0) { ++ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ++ unmapper->gop_queue, ++ unmapper->nr_gops); ++ BUG_ON(ret); ++ for (x = 0; x < unmapper->nr_gops; x++) { ++ set_phys_to_machine( ++ __pa(unmapper->gop_queue[x].host_addr) >> ++ PAGE_SHIFT, ++ INVALID_P2M_ENTRY); ++ } ++ } ++ unmapper->nr_gops = 0; ++} ++ ++static void grant_unmap(struct grant_unmapper *unmapper, ++ void *va, ++ int handle) ++{ ++ gnttab_unmap_grant_ref_t *gop; ++ if (unmapper->nr_gops == ARRAY_SIZE(unmapper->gop_queue)) ++ do_unmaps(unmapper); ++ gop = &unmapper->gop_queue[unmapper->nr_gops]; ++ gnttab_set_unmap_op(gop, (unsigned long)va, GNTMAP_host_map, handle); ++ unmapper->nr_gops++; ++} ++ ++/* A tasklet which is invoked shortly after a packet is released so ++ that we can send the FINISH_PACKET message. */ ++static void gc_tasklet(unsigned long _rm) ++{ ++ struct list_head packets; ++ struct rx_map_packet *packet; ++ struct rx_map_fragment *rx_frag; ++ struct list_head released_fragments; ++ unsigned nr_released_packets; ++ unsigned idx; ++ struct grant_unmapper unmapper; ++ struct page *page; ++ struct netchannel2 *locked_nc; ++ ++ INIT_LIST_HEAD(&packets); ++ ++ spin_lock(&receive_mapper->finished_packets_lock); ++ list_splice_init(&receive_mapper->finished_packets, &packets); ++ spin_unlock(&receive_mapper->finished_packets_lock); ++ ++ /* Unmap the fragments. 
*/ ++ unmapper.nr_gops = 0; ++ BUG_ON(packets.next == NULL); ++ list_for_each_entry(packet, &packets, list) { ++ BUG_ON(packet->list.next == NULL); ++ BUG_ON(atomic_read(&packet->refcnt) != 0); ++ BUG_ON(packet->frags.next == NULL); ++ list_for_each_entry(rx_frag, &packet->frags, list) { ++ BUG_ON(rx_frag->list.next == NULL); ++ if (rx_frag->handle == -1) ++ continue; ++ idx = fragment_idx(rx_frag); ++ page = receive_mapper->pages[idx]; ++ stop_tracking_page(page); ++ grant_unmap(&unmapper, page_address(page), ++ rx_frag->handle); ++ } ++ } ++ do_unmaps(&unmapper); ++ ++ /* Tell the other end that the packets are finished, and ++ accumulate the fragments into a local free list. */ ++ INIT_LIST_HEAD(&released_fragments); ++ nr_released_packets = 0; ++ ++ locked_nc = NULL; ++ list_for_each_entry(packet, &packets, list) { ++ if (locked_nc != packet->nc) { ++ if (locked_nc) { ++ spin_unlock(&locked_nc->rings.lock); ++ nc2_kick(&locked_nc->rings); ++ } ++ spin_lock(&packet->nc->rings.lock); ++ locked_nc = packet->nc; ++ } ++ BUG_ON(packet->frags.next == NULL); ++ list_for_each_entry(rx_frag, &packet->frags, list) { ++ BUG_ON(rx_frag->list.next == NULL); ++ idx = fragment_idx(rx_frag); ++ gnttab_reset_grant_page(receive_mapper->pages[idx]); ++ } ++ nr_released_packets++; ++ list_splice_init(&packet->frags, &released_fragments); ++ queue_finish_packet_message(&locked_nc->rings, packet->id, ++ packet->flags); ++ } ++ ++ if (locked_nc) { ++ spin_unlock(&locked_nc->rings.lock); ++ nc2_kick(&locked_nc->rings); ++ locked_nc = NULL; ++ ++ spin_lock(&receive_mapper->rm_lock); ++ list_splice(&packets, &receive_mapper->free_packets); ++ list_splice(&released_fragments, &receive_mapper->free_frags); ++ receive_mapper->nr_free_packets += nr_released_packets; ++ ++ /* Reprogram the expire timer. */ ++ if (!list_empty(&receive_mapper->active_packets)) { ++ mod_timer(&receive_mapper->expire_timer, ++ list_entry(receive_mapper->active_packets.next, ++ struct rx_map_packet, ++ list)->expires); ++ } ++ spin_unlock(&receive_mapper->rm_lock); ++ } ++} ++ ++/* Decrement the refcnt on @rmp and, if necessary, move it to the ++ finished packets list and schedule the GC tasklet. */ ++static void put_rx_map_packet(struct rx_map_packet *rmp) ++{ ++ if (atomic_dec_and_test(&rmp->refcnt)) { ++ /* Remove it from the active list. */ ++ spin_lock_bh(&receive_mapper->rm_lock); ++ list_del(&rmp->list); ++ spin_unlock_bh(&receive_mapper->rm_lock); ++ ++ /* Add it to the finished list. */ ++ spin_lock_bh(&receive_mapper->finished_packets_lock); ++ list_add_tail(&rmp->list, &receive_mapper->finished_packets); ++ spin_unlock_bh(&receive_mapper->finished_packets_lock); ++ ++ tasklet_schedule(&receive_mapper->gc_tasklet); ++ } ++} ++ ++ ++/* The page @page, which was previously part of a receiver-mapped SKB, ++ * has been released. If it was the last page involved in its SKB, ++ * the packet is finished and we can tell the other end that it's ++ * finished. ++ */ ++static void netchan2_page_release(struct page *page, unsigned order) ++{ ++ struct rx_map_fragment *frag; ++ struct rx_map_packet *rmp; ++ ++ BUG_ON(order != 0); ++ ++ frag = (struct rx_map_fragment *)page->mapping; ++ rmp = frag->packet; ++ ++ put_rx_map_packet(rmp); ++} ++ ++/* Unmap the packet, removing all other references to it. The caller ++ * should take an additional reference to the packet before calling ++ * this, to stop it disappearing underneath us. 
The only way of ++ * checking whether this succeeded is to look at the packet's ++ * reference count after it returns. ++ */ ++static void unmap_this_packet(struct rx_map_packet *rmp) ++{ ++ struct rx_map_fragment *rx_frag; ++ unsigned idx; ++ int r; ++ int cnt; ++ ++ /* Unmap every fragment in the packet. We don't fail the whole ++ function just because gnttab_copy_grant_page() failed, ++ because success or failure will be inferable from the ++ reference count on the packet (this makes it easier to ++ handle the case where some pages have already been copied, ++ for instance). */ ++ cnt = 0; ++ list_for_each_entry(rx_frag, &rmp->frags, list) { ++ idx = fragment_idx(rx_frag); ++ if (rx_frag->handle != -1) { ++ r = gnttab_copy_grant_page(rx_frag->handle, ++ &receive_mapper->pages[idx]); ++ if (r == 0) { ++ /* We copied the page, so it's not really ++ mapped any more. */ ++ rx_frag->handle = -1; ++ atomic_dec(&rmp->refcnt); ++ } ++ } ++ cnt++; ++ } ++ ++ /* Caller should hold a reference. */ ++ BUG_ON(atomic_read(&rmp->refcnt) == 0); ++} ++ ++static void unmap_all_packets(void) ++{ ++ struct rx_map_packet *rmp; ++ struct rx_map_packet *next; ++ struct list_head finished_packets; ++ int need_tasklet; ++ ++ INIT_LIST_HEAD(&finished_packets); ++ ++ spin_lock_bh(&receive_mapper->rm_lock); ++ ++ list_for_each_entry_safe(rmp, next, &receive_mapper->active_packets, ++ list) { ++ atomic_inc(&rmp->refcnt); ++ unmap_this_packet(rmp); ++ if (atomic_dec_and_test(&rmp->refcnt)) ++ list_move(&rmp->list, finished_packets.prev); ++ } ++ spin_unlock_bh(&receive_mapper->rm_lock); ++ ++ need_tasklet = !list_empty(&finished_packets); ++ ++ spin_lock_bh(&receive_mapper->finished_packets_lock); ++ list_splice(&finished_packets, receive_mapper->finished_packets.prev); ++ spin_unlock_bh(&receive_mapper->finished_packets_lock); ++ ++ if (need_tasklet) ++ tasklet_schedule(&receive_mapper->gc_tasklet); ++} ++ ++static void free_receive_mapper(struct receive_mapper *rm) ++{ ++ unsigned x; ++ ++ /* Get rid of any packets which are currently mapped. */ ++ suspend_receive_mapper(rm); ++ ++ /* Stop the expiry timer. We know it won't get requeued ++ * because there are no packets outstanding and rm->suspending ++ * is set (because of suspend_receive_mapper()). */ ++ del_timer_sync(&rm->expire_timer); ++ ++ /* Wait for any last instances of the tasklet to finish. */ ++ tasklet_kill(&rm->gc_tasklet); ++ ++ if (rm->pages != NULL) { ++ for (x = 0; x < MAX_MAPPED_FRAGS; x++) { ++ if (PageForeign(rm->pages[x])) ++ ClearPageForeign(rm->pages[x]); ++ rm->pages[x]->mapping = NULL; ++ } ++ free_empty_pages_and_pagevec(rm->pages, MAX_MAPPED_FRAGS); ++ } ++ if (rm->tracker != NULL) ++ free_page_foreign_tracker(rm->tracker); ++ kfree(rm); ++} ++ ++/* Timer invoked shortly after a packet expires, so that we can copy ++ the data and get it back from Linux. This is necessary if a packet ++ gets stuck in a socket RX queue somewhere, or you risk a ++ deadlock. 
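
The expiry handling described here (the timer body follows below) walks the active packets in expiry order and stops at the first one whose deadline is still in the future, using the kernel's wraparound-safe time_after() comparison. A user-space model of that test, with jiffies as a free-running unsigned counter:

    #include <stdio.h>

    #define PACKET_TIMEOUT (100 / 2)               /* HZ/2 with HZ = 100 */

    /* time_after(a, b) is true when a is later than b, even across the
     * point where the counter wraps. */
    static int time_after(unsigned long a, unsigned long b)
    {
            return (long)(b - a) < 0;
    }

    int main(void)
    {
            unsigned long jiffies = (unsigned long)-10;        /* about to wrap */
            unsigned long expires = jiffies + PACKET_TIMEOUT;  /* wraps past 0 */

            /* Not expired yet: the timer would be re-armed for @expires and
             * the scan stopped, since active packets sit in expiry order. */
            printf("still pending: %d\n", time_after(expires, jiffies));

            jiffies += PACKET_TIMEOUT + 1;
            /* Now expired: the packet would be unmapped or copied. */
            printf("still pending: %d\n", time_after(expires, jiffies));
            return 0;
    }
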
*/ ++static void expire_timer(unsigned long data) ++{ ++ struct rx_map_packet *rmp, *next; ++ struct list_head finished_packets; ++ int need_tasklet; ++ ++ INIT_LIST_HEAD(&finished_packets); ++ ++ spin_lock(&receive_mapper->rm_lock); ++ list_for_each_entry_safe(rmp, next, &receive_mapper->active_packets, ++ list) { ++ if (time_after(rmp->expires, jiffies)) { ++ mod_timer(&receive_mapper->expire_timer, rmp->expires); ++ break; ++ } ++ atomic_inc(&rmp->refcnt); ++ unmap_this_packet(rmp); ++ if (atomic_dec_and_test(&rmp->refcnt)) { ++ list_move(&rmp->list, finished_packets.prev); ++ } else { ++ /* Couldn't unmap the packet, either because ++ it's in use by real hardware or we've run ++ out of memory. Send the packet to the end ++ of the queue and update the expiry time so ++ that we try again later. */ ++ /* Note that this can make the active packet ++ list slightly out of order. Oh well; it ++ won't be by more than a few jiffies, and it ++ doesn't really matter that much. */ ++ rmp->expires = jiffies + PACKET_TIMEOUT; ++ list_move(&rmp->list, ++ receive_mapper->active_packets.prev); ++ } ++ } ++ spin_unlock(&receive_mapper->rm_lock); ++ ++ need_tasklet = !list_empty(&finished_packets); ++ ++ spin_lock(&receive_mapper->finished_packets_lock); ++ list_splice(&finished_packets, receive_mapper->finished_packets.prev); ++ spin_unlock(&receive_mapper->finished_packets_lock); ++ ++ if (need_tasklet) ++ tasklet_schedule(&receive_mapper->gc_tasklet); ++} ++ ++static struct receive_mapper *new_receive_mapper(void) ++{ ++ struct receive_mapper *rm; ++ unsigned x; ++ ++ rm = kzalloc(sizeof(*rm), GFP_KERNEL); ++ if (!rm) ++ goto err; ++ INIT_LIST_HEAD(&rm->free_frags); ++ INIT_LIST_HEAD(&rm->free_packets); ++ INIT_LIST_HEAD(&rm->active_packets); ++ INIT_LIST_HEAD(&rm->finished_packets); ++ spin_lock_init(&rm->rm_lock); ++ spin_lock_init(&rm->finished_packets_lock); ++ for (x = 0; x < MAX_MAPPED_FRAGS; x++) ++ list_add_tail(&rm->frags[x].list, &rm->free_frags); ++ for (x = 0; x < MAX_MAPPED_PACKETS; x++) ++ list_add_tail(&rm->packets[x].list, &rm->free_packets); ++ rm->nr_free_packets = MAX_MAPPED_PACKETS; ++ ++ setup_timer(&rm->expire_timer, expire_timer, 0); ++ tasklet_init(&rm->gc_tasklet, gc_tasklet, 0); ++ ++ rm->tracker = alloc_page_foreign_tracker(MAX_MAPPED_FRAGS); ++ if (!rm->tracker) ++ goto err; ++ rm->pages = alloc_empty_pages_and_pagevec(MAX_MAPPED_FRAGS); ++ if (!rm->pages) ++ goto err; ++ for (x = 0; x < MAX_MAPPED_FRAGS; x++) { ++ SetPageForeign(rm->pages[x], netchan2_page_release); ++ rm->pages[x]->mapping = (void *)&rm->frags[x]; ++ } ++ ++ return rm; ++ ++err: ++ if (rm != NULL) ++ free_receive_mapper(rm); ++ return NULL; ++} ++ ++static void attach_frag_to_skb(struct sk_buff *skb, ++ struct rx_map_fragment *frag) ++{ ++ unsigned idx; ++ struct skb_shared_info *shinfo; ++ skb_frag_t *sk_frag; ++ ++ shinfo = skb_shinfo(skb); ++ sk_frag = &shinfo->frags[shinfo->nr_frags]; ++ idx = fragment_idx(frag); ++ sk_frag->page = receive_mapper->pages[idx]; ++ sk_frag->page_offset = frag->nc_frag.off; ++ sk_frag->size = frag->nc_frag.size; ++ shinfo->nr_frags++; ++} ++ ++struct rx_plan { ++ int is_failed; ++ unsigned nr_mops; ++ gnttab_map_grant_ref_t mops[8]; ++ struct rx_map_fragment *frags[8]; ++}; ++ ++static void flush_grant_operations(struct rx_plan *rp) ++{ ++ unsigned x; ++ int ret; ++ gnttab_map_grant_ref_t *mop; ++ ++ if (rp->nr_mops == 0) ++ return; ++ if (!rp->is_failed) { ++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, ++ rp->mops, ++ rp->nr_mops); ++ BUG_ON(ret); ++ for (x = 
0; x < rp->nr_mops; x++) { ++ mop = &rp->mops[x]; ++ if (mop->status != 0) { ++ rp->is_failed = 1; ++ } else { ++ rp->frags[x]->handle = mop->handle; ++ set_phys_to_machine( ++ __pa(mop->host_addr) >> PAGE_SHIFT, ++ FOREIGN_FRAME(mop->dev_bus_addr >> ++ PAGE_SHIFT)); ++ } ++ } ++ } ++ rp->nr_mops = 0; ++} ++ ++static void map_fragment(struct rx_plan *rp, ++ struct rx_map_fragment *rx_frag, ++ struct netchannel2 *nc) ++{ ++ unsigned idx = fragment_idx(rx_frag); ++ gnttab_map_grant_ref_t *mop; ++ ++ if (rp->nr_mops == ARRAY_SIZE(rp->mops)) ++ flush_grant_operations(rp); ++ mop = &rp->mops[rp->nr_mops]; ++ gnttab_set_map_op(mop, ++ (unsigned long)page_address(receive_mapper->pages[idx]), ++ GNTMAP_host_map | GNTMAP_readonly, ++ rx_frag->nc_frag.receiver_map.gref, ++ nc->rings.otherend_id); ++ rp->frags[rp->nr_mops] = rx_frag; ++ rp->nr_mops++; ++} ++ ++/* Unmap a packet which has been half-mapped. */ ++static void unmap_partial_packet(struct rx_map_packet *rmp) ++{ ++ unsigned idx; ++ struct rx_map_fragment *rx_frag; ++ struct grant_unmapper unmapper; ++ ++ unmapper.nr_gops = 0; ++ list_for_each_entry(rx_frag, &rmp->frags, list) { ++ if (rx_frag->handle == -1) ++ continue; ++ idx = fragment_idx(rx_frag); ++ grant_unmap(&unmapper, ++ page_address(receive_mapper->pages[idx]), ++ rx_frag->handle); ++ } ++ do_unmaps(&unmapper); ++} ++ ++struct sk_buff *handle_receiver_map_packet(struct netchannel2 *nc, ++ struct netchannel2_msg_packet *msg, ++ struct netchannel2_msg_hdr *hdr, ++ unsigned nr_frags, ++ unsigned frags_off) ++{ ++ struct sk_buff *skb; ++ struct rx_map_fragment *rx_frag; ++ unsigned x; ++ unsigned len; ++ struct rx_map_packet *rmp; ++ unsigned idx; ++ struct rx_plan plan; ++ unsigned prefix_size; ++ ++ memset(&plan, 0, sizeof(plan)); ++ ++ rmp = alloc_rx_packet(nc, nr_frags); ++ if (rmp == NULL) ++ return NULL; ++ ++ if (msg->prefix_size < SKB_MIN_PAYLOAD_SIZE) ++ prefix_size = SKB_MIN_PAYLOAD_SIZE; ++ else ++ prefix_size = msg->prefix_size; ++ /* As in posted_buffers.c, we don't limit the total size of ++ the packet, because we don't need to allocate more memory ++ for very large packets. The prefix is safe because it's ++ only a 16 bit number. A 64k allocation won't always ++ succeed, but it's unlikely to trigger the OOM killer or ++ otherwise interfere with the normal operation of the local ++ domain. */ ++ skb = dev_alloc_skb(prefix_size + NET_IP_ALIGN); ++ if (skb == NULL) { ++ spin_lock(&receive_mapper->rm_lock); ++ list_splice(&rmp->frags, &receive_mapper->free_frags); ++ list_add(&rmp->list, &receive_mapper->free_packets); ++ receive_mapper->nr_free_packets++; ++ spin_unlock(&receive_mapper->rm_lock); ++ return NULL; ++ } ++ skb_reserve(skb, NET_IP_ALIGN); ++ ++ rmp->id = msg->id; ++ rmp->flags = msg->flags; ++ ++ rx_frag = list_entry(rmp->frags.next, struct rx_map_fragment, list); ++ for (x = 0; x < nr_frags; x++) { ++ fetch_fragment(&nc->rings, x, &rx_frag->nc_frag, frags_off); ++ if (rx_frag->nc_frag.size > PAGE_SIZE || ++ rx_frag->nc_frag.off >= PAGE_SIZE || ++ rx_frag->nc_frag.size + rx_frag->nc_frag.off > PAGE_SIZE) { ++ plan.is_failed = 1; ++ break; ++ } ++ map_fragment(&plan, rx_frag, nc); ++ rx_frag = list_entry(rx_frag->list.next, ++ struct rx_map_fragment, ++ list); ++ } ++ ++ flush_grant_operations(&plan); ++ if (plan.is_failed) ++ goto fail_and_unmap; ++ ++ /* Grab the prefix off of the ring. 
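++    The prefix bytes sit immediately after the fragment array in the ring message, hence the frags_off + nr_frags * sizeof(struct netchannel2_fragment) offset passed to the copy below.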
*/ ++ nc2_copy_from_ring_off(&nc->rings.cons_ring, ++ skb_put(skb, msg->prefix_size), ++ msg->prefix_size, ++ frags_off + ++ nr_frags * sizeof(struct netchannel2_fragment)); ++ ++ /* All fragments mapped, so we know that this is going to ++ work. Transfer the receive slots into the SKB. */ ++ len = 0; ++ list_for_each_entry(rx_frag, &rmp->frags, list) { ++ attach_frag_to_skb(skb, rx_frag); ++ idx = fragment_idx(rx_frag); ++ start_tracking_page(receive_mapper->tracker, ++ receive_mapper->pages[idx], ++ nc->rings.otherend_id, ++ rx_frag->nc_frag.receiver_map.gref, ++ idx, ++ nc); ++ len += rx_frag->nc_frag.size; ++ } ++ ++ skb->len += len; ++ skb->data_len += len; ++ skb->truesize += len; ++ ++ spin_lock(&receive_mapper->rm_lock); ++ list_add_tail(&rmp->list, &receive_mapper->active_packets); ++ rmp->expires = jiffies + PACKET_TIMEOUT; ++ if (rmp == list_entry(receive_mapper->active_packets.next, ++ struct rx_map_packet, ++ list)) ++ mod_timer(&receive_mapper->expire_timer, rmp->expires); ++ spin_unlock(&receive_mapper->rm_lock); ++ ++ if (skb_headlen(skb) < SKB_MIN_PAYLOAD_SIZE) ++ pull_through(skb, ++ SKB_MIN_PAYLOAD_SIZE - skb_headlen(skb)); ++ ++ return skb; ++ ++fail_and_unmap: ++ pr_debug("Failed to map received packet!\n"); ++ unmap_partial_packet(rmp); ++ ++ spin_lock(&receive_mapper->rm_lock); ++ list_splice(&rmp->frags, &receive_mapper->free_frags); ++ list_add_tail(&rmp->list, &receive_mapper->free_packets); ++ receive_mapper->nr_free_packets++; ++ spin_unlock(&receive_mapper->rm_lock); ++ ++ kfree_skb(skb); ++ return NULL; ++} ++ ++static void suspend_receive_mapper(struct receive_mapper *rm) ++{ ++ spin_lock_bh(&rm->rm_lock); ++ /* Stop any more packets coming in. */ ++ rm->suspending = 1; ++ ++ /* Wait for Linux to give back all of the SKBs which we've ++ given it. 
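++    The lock has to be dropped before sleeping (we can't msleep() with a spinlock held), which also lets the release path run and return packets to the free list; nr_free_packets is re-checked on every iteration.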
*/ ++ while (rm->nr_free_packets != MAX_MAPPED_PACKETS) { ++ spin_unlock_bh(&rm->rm_lock); ++ unmap_all_packets(); ++ msleep(100); ++ spin_lock_bh(&rm->rm_lock); ++ } ++ spin_unlock_bh(&rm->rm_lock); ++} ++ ++static void resume_receive_mapper(void) ++{ ++ spin_lock_bh(&receive_mapper->rm_lock); ++ receive_mapper->suspending = 0; ++ spin_unlock_bh(&receive_mapper->rm_lock); ++} ++ ++ ++int init_receive_map_mode(void) ++{ ++ struct receive_mapper *new_rm; ++ spin_lock(&global_map_lock); ++ while (receive_mapper == NULL) { ++ spin_unlock(&global_map_lock); ++ new_rm = new_receive_mapper(); ++ if (new_rm == NULL) ++ return -ENOMEM; ++ spin_lock(&global_map_lock); ++ if (receive_mapper == NULL) { ++ receive_mapper = new_rm; ++ } else { ++ spin_unlock(&global_map_lock); ++ free_receive_mapper(new_rm); ++ spin_lock(&global_map_lock); ++ } ++ } ++ spin_unlock(&global_map_lock); ++ return 0; ++} ++ ++void deinit_receive_map_mode(void) ++{ ++ if (!receive_mapper) ++ return; ++ BUG_ON(spin_is_locked(&global_map_lock)); ++ free_receive_mapper(receive_mapper); ++ receive_mapper = NULL; ++} ++ ++void suspend_receive_map_mode(void) ++{ ++ if (!receive_mapper) ++ return; ++ suspend_receive_mapper(receive_mapper); ++} ++ ++void resume_receive_map_mode(void) ++{ ++ if (!receive_mapper) ++ return; ++ resume_receive_mapper(); ++} ++ ++struct netchannel2 *nc2_get_interface_for_page(struct page *p) ++{ ++ BUG_ON(!page_is_tracked(p)); ++ if (!receive_mapper || ++ tracker_for_page(p) != receive_mapper->tracker) ++ return NULL; ++ return get_page_tracker_ctxt(p); ++} +diff --git a/drivers/xen/netchannel2/recv_packet.c b/drivers/xen/netchannel2/recv_packet.c +new file mode 100644 +index 0000000..2de8afa +--- /dev/null ++++ b/drivers/xen/netchannel2/recv_packet.c +@@ -0,0 +1,333 @@ ++/* Support for receiving individual packets, and all the stuff which ++ * goes with that. */ ++#include ++#include ++#include ++#include "netchannel2_core.h" ++ ++/* Send as many finish packet messages as will fit on the ring. */ ++void send_finish_packet_messages(struct netchannel2_ring_pair *ncrp) ++{ ++ struct pending_finish_packets *pfp = &ncrp->pending_finish; ++ struct netchannel2_msg_finish_packet msg; ++ RING_IDX cons; ++ ++ while (pfp->prod != pfp->cons && ++ nc2_can_send_payload_bytes(&ncrp->prod_ring, sizeof(msg))) { ++ cons = pfp->cons; ++ msg.id = pfp->ids[pfp->cons % MAX_PENDING_FINISH_PACKETS]; ++ pfp->cons++; ++ nc2_send_message(&ncrp->prod_ring, ++ NETCHANNEL2_MSG_FINISH_PACKET, ++ 0, ++ &msg, ++ sizeof(msg)); ++ } ++} ++ ++/* Add a packet ID to the finish packet queue. The caller should ++ arrange that send_finish_packet_messages is sent soon to flush the ++ requests out. */ ++void queue_finish_packet_message(struct netchannel2_ring_pair *ncrp, ++ uint32_t id, uint8_t flags) ++{ ++ struct pending_finish_packets *pfp = &ncrp->pending_finish; ++ RING_IDX prod; ++ ++ prod = pfp->prod; ++ pfp->ids[prod % MAX_PENDING_FINISH_PACKETS] = id; ++ pfp->prod++; ++ ++ if (flags & NC2_PACKET_FLAG_need_event) ++ ncrp->pending_time_sensitive_messages = 1; ++} ++ ++/* Handle a packet message from the other end. On success, queues the ++ new skb to the pending skb list. If the packet is invalid, it is ++ discarded without generating a FINISH message. */ ++/* Caution: this drops and re-acquires the ring lock. 
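++   (The lock is released around receive_pending_skbs() when the pending grant-copy batch is flushed, since delivering packets up the stack can generate ACKs which re-enter start_xmit().)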
*/ ++void nc2_handle_packet_msg(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr, ++ struct sk_buff_head *pending_rx_queue) ++{ ++ unsigned nr_frags; ++ struct netchannel2_msg_packet msg; ++ struct sk_buff *skb; ++ const unsigned frags_off = sizeof(msg); ++ unsigned frags_bytes; ++ ++ if (ncrp->pending_finish.prod - ncrp->pending_finish.cons == ++ MAX_PENDING_FINISH_PACKETS) { ++ pr_debug("Remote endpoint sent too many packets!\n"); ++ nc->stats.rx_errors++; ++ return; ++ } ++ ++ if (hdr->size < sizeof(msg)) { ++ pr_debug("Packet message too small (%d < %zd)\n", hdr->size, ++ sizeof(msg)); ++ nc->stats.rx_errors++; ++ return; ++ } ++ ++ if (hdr->size & 7) { ++ pr_debug("Packet size in ring not multiple of 8: %d\n", ++ hdr->size); ++ nc->stats.rx_errors++; ++ return; ++ } ++ ++ nc2_copy_from_ring(&ncrp->cons_ring, &msg, sizeof(msg)); ++ ++ if (msg.type != NC2_PACKET_TYPE_receiver_copy && ++ msg.type != NC2_PACKET_TYPE_small && ++ ncrp != &nc->rings) { ++ pr_debug("Received strange packet type %d on bypass ring.\n", ++ msg.type); ++ nc->stats.tx_errors++; ++ return; ++ } ++ ++ frags_bytes = hdr->size - sizeof(msg) - msg.prefix_size; ++ nr_frags = frags_bytes / sizeof(struct netchannel2_fragment); ++ ++ if (nr_frags > MAX_SKB_FRAGS) { ++ pr_debug("otherend misbehaving: %d frags > %ld\n", ++ nr_frags, MAX_SKB_FRAGS); ++ nc->stats.tx_errors++; ++ return; ++ } ++ ++ switch (msg.type) { ++ case NC2_PACKET_TYPE_small: ++ if (nr_frags != 0) { ++ /* Small packets, by definition, have no ++ * fragments */ ++ pr_debug("Received small packet with %d frags?\n", ++ nr_frags); ++ nc->stats.rx_errors++; ++ return; ++ } ++ /* Any of the receiver functions can handle small ++ packets as a trivial special case. Use receiver ++ copy, since that's the simplest. */ ++ skb = handle_receiver_copy_packet(nc, ncrp, &msg, hdr, ++ nr_frags, frags_off); ++ /* No finish message */ ++ break; ++ case NC2_PACKET_TYPE_receiver_copy: ++ skb = handle_receiver_copy_packet(nc, ncrp, &msg, hdr, ++ nr_frags, frags_off); ++ queue_finish_packet_message(ncrp, msg.id, msg.flags); ++ break; ++ case NC2_PACKET_TYPE_receiver_map: ++ if (!nc->local_trusted) { ++ /* The remote doesn't trust us, so they ++ shouldn't be sending us receiver-map ++ packets. Just treat it as an RSCB ++ packet. */ ++ skb = NULL; ++ } else { ++ skb = handle_receiver_map_packet(nc, &msg, hdr, ++ nr_frags, ++ frags_off); ++ /* Finish message will be sent when we unmap ++ * the packet. */ ++ } ++ if (skb == NULL) { ++ /* We can't currently map this skb. Use a ++ receiver copy instead. */ ++ skb = handle_receiver_copy_packet(nc, ncrp, &msg, hdr, ++ nr_frags, frags_off); ++ queue_finish_packet_message(ncrp, msg.id, msg.flags); ++ } ++ break; ++ default: ++ pr_debug("Unknown packet type %d\n", msg.type); ++ nc->stats.rx_errors++; ++ skb = NULL; ++ break; ++ } ++ if (skb != NULL) { ++ nc->stats.rx_bytes += skb->len; ++ nc->stats.rx_packets++; ++ skb->dev = nc->net_device; ++ ++ if (ncrp->filter_mac && ++ skb_headlen(skb) >= sizeof(struct ethhdr) && ++ memcmp(((struct ethhdr *)skb->data)->h_source, ++ ncrp->remote_mac, ++ ETH_ALEN)) { ++ /* We're in filter MACs mode and the source ++ MAC on this packet is wrong. Drop it. */ ++ /* (We know that any packet big enough to ++ contain an ethernet header at all will ++ contain it in the head space because we do ++ a pull_through at the end of the type ++ handler.) 
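++			   (Dropping on a source MAC mismatch is presumably meant to stop a peer on a filtered ring spoofing frames from other guests; such drops are counted as rx_missed_errors rather than rx_errors.)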
*/ ++ nc->stats.rx_missed_errors++; ++ goto err; ++ } ++ ++ switch (msg.flags & (NC2_PACKET_FLAG_data_validated | ++ NC2_PACKET_FLAG_csum_blank)) { ++ case 0: ++ skb->ip_summed = CHECKSUM_NONE; ++ break; ++ case NC2_PACKET_FLAG_data_validated: ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ break; ++ default: ++ /* csum_blank implies data_validated, so ++ csum_blank and csum_blank|data_validated ++ are equivalent. */ ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ if (msg.csum_offset + 2 > skb->len) { ++ /* Whoops. Assuming no bugs in our ++ receive methods, the other end just ++ requested checksum calculation ++ beyond the end of the packet. */ ++ if (net_ratelimit()) ++ dev_warn(&nc->net_device->dev, ++ "csum field too far through packet (%d, skb len %d, headlen %d)\n", ++ msg.csum_offset, skb->len, ++ skb_headlen(skb)); ++ goto err; ++ } ++ skb->csum_start = msg.csum_start + skb_headroom(skb); ++ skb->csum_offset = msg.csum_offset - msg.csum_start; ++ break; ++ } ++ ++#ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS ++ if (ncrp == &nc->rings) { ++ if (msg.flags & NC2_PACKET_FLAG_bypass_candidate) ++ nc2_received_bypass_candidate_packet(nc, skb); ++ else ++ nc->auto_bypass.nr_non_bypass_packets++; ++ } else { ++ container_of(ncrp, struct nc2_alternate_ring, rings)-> ++ autoteardown.nr_packets++; ++ } ++#endif ++ ++ switch (msg.segmentation_type) { ++ case NC2_PACKET_SEGMENTATION_TYPE_none: ++ break; ++ case NC2_PACKET_SEGMENTATION_TYPE_tcpv4: ++ if (msg.mss == 0) { ++ pr_debug("TSO request with mss == 0?\n"); ++ goto err; ++ } ++ skb_shinfo(skb)->gso_type = ++ SKB_GSO_TCPV4 | SKB_GSO_DODGY; ++ skb_shinfo(skb)->gso_size = msg.mss; ++ skb_shinfo(skb)->gso_segs = 0; ++ break; ++ default: ++ pr_debug("Unknown segmentation offload type %d!\n", ++ msg.segmentation_type); ++ goto err; ++ } ++ ++ __skb_queue_tail(pending_rx_queue, skb); ++ ++ if (ncrp->pending_rx_hypercalls.nr_pending_gops >= ++ RX_GRANT_COPY_BATCH) { ++ flush_prepared_grant_copies(&ncrp->pending_rx_hypercalls, ++ nc2_rscb_on_gntcopy_fail); ++ /* since receive could generate ACKs to the ++ start_xmit() function we need to release ++ the ring lock */ ++ spin_unlock(&ncrp->lock); ++ /* we should receive the packet as soon as the ++ copy is complete to benefit from cache ++ locality */ ++ receive_pending_skbs(pending_rx_queue); ++ spin_lock(&ncrp->lock); ++ ++ } ++ ++ } ++ return; ++ ++err: ++ /* If the receive succeeded part-way, there may be references ++ to the skb in the hypercall batcher. Flush them out before ++ we release it. This is a slow path, so we don't care that ++ much about performance. */ ++ flush_prepared_grant_copies(&ncrp->pending_rx_hypercalls, ++ nc2_rscb_on_gntcopy_fail); ++ ++ /* We may need to send a FINISH message here if this was a ++ receiver-map packet. That should be handled automatically ++ by the kfree_skb(). */ ++ kfree_skb(skb); ++ nc->stats.rx_errors++; ++ return; ++} ++ ++/* If there is space on the ring, tell the other end how many packets ++ its allowed to send at one time and clear the ++ need_advertise_max_packets flag. 
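++   The advertised limit is MAX_PENDING_FINISH_PACKETS, which matches the depth of our pending-finish queue, so a well-behaved peer can never send more packets than we have room to acknowledge.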
*/ ++void advertise_max_packets(struct netchannel2_ring_pair *ncrp) ++{ ++ struct netchannel2_msg_set_max_packets msg; ++ ++ if (!nc2_can_send_payload_bytes(&ncrp->prod_ring, sizeof(msg))) ++ return; ++ msg.max_outstanding_packets = MAX_PENDING_FINISH_PACKETS; ++ nc2_send_message(&ncrp->prod_ring, ++ NETCHANNEL2_MSG_SET_MAX_PACKETS, ++ 0, ++ &msg, ++ sizeof(msg)); ++ ncrp->need_advertise_max_packets = 0; ++ ncrp->pending_time_sensitive_messages = 1; ++} ++ ++void advertise_max_fragments_per_packet(struct netchannel2_ring_pair *ncrp) ++{ ++ struct netchannel2_msg_set_max_fragments_per_packet msg; ++ ++ if (!nc2_can_send_payload_bytes(&ncrp->prod_ring, sizeof(msg))) ++ return; ++ msg.max_frags_per_packet = MAX_SKB_FRAGS; ++ nc2_send_message(&ncrp->prod_ring, ++ NETCHANNEL2_MSG_SET_MAX_FRAGMENTS_PER_PACKET, ++ 0, ++ &msg, ++ sizeof(msg)); ++ ncrp->need_advertise_max_fragments_per_packet = 0; ++ ncrp->pending_time_sensitive_messages = 1; ++} ++ ++void receive_pending_skbs(struct sk_buff_head *pending_rx_queue) ++{ ++ struct sk_buff *skb; ++ struct skb_cb_overlay *sco; ++ while (!skb_queue_empty(pending_rx_queue)) { ++ skb = __skb_dequeue(pending_rx_queue); ++ sco = get_skb_overlay(skb); ++ if (unlikely(sco->failed)) ++ kfree_skb(skb); ++ else { ++ skb->protocol = eth_type_trans(skb, skb->dev); ++ netif_receive_skb(skb); ++ } ++ } ++} ++ ++ ++/* These don't really belong here, but it's as good a place as any. */ ++int __init nc2_init(void) ++{ ++ return 0; ++} ++ ++void __exit nc2_exit(void) ++{ ++ nc2_shutdown_autoteardown(); ++ deinit_receive_map_mode(); ++} +diff --git a/drivers/xen/netchannel2/rscb.c b/drivers/xen/netchannel2/rscb.c +new file mode 100644 +index 0000000..217cfc4 +--- /dev/null ++++ b/drivers/xen/netchannel2/rscb.c +@@ -0,0 +1,434 @@ ++/* Receiver-side copy buffer support */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "netchannel2_core.h" ++ ++/* -------------------------- Receive -------------------------------- */ ++ ++/* This is called whenever an RSCB grant copy fails. */ ++void nc2_rscb_on_gntcopy_fail(void *ctxt, gnttab_copy_t *gop) ++{ ++ struct sk_buff *skb = ctxt; ++ struct skb_cb_overlay *sco = get_skb_overlay(skb); ++ if (!sco->failed && net_ratelimit()) ++ printk(KERN_WARNING "Dropping RX packet because of copy error\n"); ++ sco->failed = 1; ++} ++ ++ ++/* Copy @size bytes from @offset in grant ref @gref against domain ++ @domid and shove them on the end of @skb. Fails if it the head ++ does not have enough space or if the copy would span multiple ++ pages. */ ++static int nc2_grant_copy(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb, ++ unsigned offset, ++ unsigned size, ++ grant_ref_t gref, ++ domid_t domid) ++{ ++ gnttab_copy_t *gop; ++ void *tail; ++ void *end; ++ ++ if (size > PAGE_SIZE) ++ return 0; ++ ++ tail = skb_tail_pointer(skb); ++ end = skb_end_pointer(skb); ++ ++ if (unlikely(size > (end-tail))) ++ return 0; ++ ++ if (unlikely(offset_in_page(tail) + size > PAGE_SIZE)) { ++ unsigned f1 = PAGE_SIZE - offset_in_page(tail); ++ /* Recursive, but only ever to depth 1, so okay */ ++ if (!nc2_grant_copy(ncrp, skb, offset, f1, gref, domid)) ++ return 0; ++ offset += f1; ++ size -= f1; ++ tail += f1; ++ } ++ ++ /* Copy this fragment into the header. 
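++    (Anything which would have crossed a page boundary in the head was peeled off by the recursive call above, so this grant copy always stays within a single destination page.)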
*/ ++ gop = hypercall_batcher_grant_copy(&ncrp->pending_rx_hypercalls, ++ skb, ++ nc2_rscb_on_gntcopy_fail); ++ gop->flags = GNTCOPY_source_gref; ++ gop->source.domid = domid; ++ gop->source.offset = offset; ++ gop->source.u.ref = gref; ++ gop->dest.domid = DOMID_SELF; ++ gop->dest.offset = offset_in_page(tail); ++ gop->dest.u.gmfn = virt_to_mfn(tail); ++ gop->len = size; ++ ++ skb_put(skb, size); ++ ++ return 1; ++} ++ ++/* We've received a receiver-copy packet message from the remote. ++ Parse it up, build an sk_buff, and return it. Returns NULL on ++ error. */ ++struct sk_buff *handle_receiver_copy_packet(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_packet *msg, ++ struct netchannel2_msg_hdr *hdr, ++ unsigned nr_frags, ++ unsigned frags_off) ++{ ++ struct netchannel2_fragment frag; ++ unsigned nr_bytes; ++ unsigned x; ++ struct sk_buff *skb; ++ unsigned skb_headsize; ++ int first_frag, first_frag_size; ++ gnttab_copy_t *gop; ++ struct skb_shared_info *shinfo; ++ struct page *new_page; ++ ++ if (msg->prefix_size > NETCHANNEL2_MAX_INLINE_BYTES) { ++ pr_debug("Inline prefix too big! (%d > %d)\n", ++ msg->prefix_size, NETCHANNEL2_MAX_INLINE_BYTES); ++ return NULL; ++ } ++ ++ /* Count the number of bytes in the packet. Be careful: the ++ other end can still access the packet on the ring, so the ++ size could change later. */ ++ nr_bytes = msg->prefix_size; ++ for (x = 0; x < nr_frags; x++) { ++ fetch_fragment(ncrp, x, &frag, frags_off); ++ nr_bytes += frag.size; ++ } ++ if (nr_bytes > NETCHANNEL2_MAX_PACKET_BYTES) { ++ pr_debug("Packet too big! (%d > %d)\n", nr_bytes, ++ NETCHANNEL2_MAX_PACKET_BYTES); ++ return NULL; ++ } ++ if (nr_bytes < 64) { ++ /* Linux sometimes has problems with very small SKBs. ++ Impose a minimum size of 64 bytes. */ ++ nr_bytes = 64; ++ } ++ ++ first_frag = 0; ++ if (nr_frags > 0) { ++ fetch_fragment(ncrp, 0, &frag, frags_off); ++ first_frag_size = frag.size; ++ first_frag = 1; ++ } else { ++ first_frag_size = 0; ++ first_frag = 0; ++ } ++ ++ /* We try to have both prefix and the first frag in the skb head ++ if they do not exceed the page size */ ++ skb_headsize = msg->prefix_size + first_frag_size + NET_IP_ALIGN; ++ if (skb_headsize > ++ ((PAGE_SIZE - sizeof(struct skb_shared_info) - NET_SKB_PAD) & ++ ~(SMP_CACHE_BYTES - 1))) { ++ skb_headsize = msg->prefix_size + NET_IP_ALIGN; ++ first_frag = 0; ++ } ++ ++ skb = dev_alloc_skb(skb_headsize); ++ if (!skb) { ++ /* Drop the packet. */ ++ pr_debug("Couldn't allocate a %d byte skb.\n", nr_bytes); ++ nc->stats.rx_dropped++; ++ return NULL; ++ } ++ ++ /* Arrange that the IP header is nicely aligned in memory. */ ++ skb_reserve(skb, NET_IP_ALIGN); ++ ++ /* The inline prefix should always fit in the SKB head. 
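++    (prefix_size was bounded by NETCHANNEL2_MAX_INLINE_BYTES above, and skb_headsize was chosen to cover at least the prefix plus NET_IP_ALIGN.)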
*/ ++ nc2_copy_from_ring_off(&ncrp->cons_ring, ++ skb_put(skb, msg->prefix_size), ++ msg->prefix_size, ++ frags_off + nr_frags * sizeof(frag)); ++ ++ /* copy first frag into skb head if it does not cross a ++ page boundary */ ++ if (first_frag == 1) { ++ fetch_fragment(ncrp, 0, &frag, frags_off); ++ if (!nc2_grant_copy(ncrp, skb, frag.off, frag.size, ++ frag.receiver_copy.gref, ++ ncrp->otherend_id)) { ++ get_skb_overlay(skb)->failed = 1; ++ return skb; ++ } ++ } ++ ++ shinfo = skb_shinfo(skb); ++ for (x = first_frag; x < nr_frags; x++) { ++ fetch_fragment(ncrp, x, &frag, frags_off); ++ ++ /* Allocate a new page for the fragment */ ++ new_page = alloc_page(GFP_ATOMIC); ++ if (!new_page) { ++ get_skb_overlay(skb)->failed = 1; ++ return skb; ++ } ++ ++ gop = hypercall_batcher_grant_copy(&ncrp->pending_rx_hypercalls, ++ skb, ++ nc2_rscb_on_gntcopy_fail); ++ gop->flags = GNTCOPY_source_gref; ++ gop->source.domid = ncrp->otherend_id; ++ gop->source.offset = frag.off; ++ gop->source.u.ref = frag.receiver_copy.gref; ++ gop->dest.domid = DOMID_SELF; ++ gop->dest.offset = 0; ++ gop->dest.u.gmfn = pfn_to_mfn(page_to_pfn(new_page)); ++ gop->len = frag.size; ++ ++ shinfo->frags[x-first_frag].page = new_page; ++ shinfo->frags[x-first_frag].page_offset = 0; ++ shinfo->frags[x-first_frag].size = frag.size; ++ shinfo->nr_frags++; ++ ++ skb->truesize += frag.size; ++ skb->data_len += frag.size; ++ skb->len += frag.size; ++ } ++ return skb; ++} ++ ++ ++ ++/* ------------------------------- Transmit ---------------------------- */ ++ ++struct grant_packet_plan { ++ volatile struct netchannel2_fragment *out_fragment; ++ grant_ref_t gref_pool; ++ int use_subpage_grants; ++ unsigned prefix_avail; ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++ int could_have_used_bypass; ++#endif ++}; ++ ++static inline int nfrags_skb(struct sk_buff *skb, int prefix_size) ++{ ++ unsigned long start_grant; ++ unsigned long end_grant; ++ ++ if (skb_headlen(skb) <= prefix_size) ++ return skb_shinfo(skb)->nr_frags; ++ ++ start_grant = ((unsigned long)skb->data + prefix_size) & ++ ~(PAGE_SIZE-1); ++ end_grant = ((unsigned long)skb->data + ++ skb_headlen(skb) + PAGE_SIZE - 1) & ++ ~(PAGE_SIZE-1); ++ return ((end_grant - start_grant) >> PAGE_SHIFT) ++ + skb_shinfo(skb)->nr_frags; ++} ++ ++enum prepare_xmit_result prepare_xmit_allocate_grant(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb, ++ int use_subpage_grants) ++{ ++ struct skb_cb_overlay *skb_co = get_skb_overlay(skb); ++ unsigned nr_fragments; ++ grant_ref_t gref_pool; ++ int err; ++ unsigned inline_prefix_size; ++ ++ if (allocate_txp_slot(ncrp, skb) < 0) ++ return PREP_XMIT_BUSY; ++ ++ if (use_subpage_grants) { ++ /* We're going to have to get the remote to issue a ++ grant copy hypercall anyway, so there's no real ++ benefit to shoving the headers inline. */ ++ /* (very small packets won't go through here, so ++ there's no chance that we could completely ++ eliminate the grant copy.) */ ++ inline_prefix_size = sizeof(struct ethhdr); ++ } else { ++ /* If we're going off-box (and we probably are, if the ++ remote is trusted), putting the header in the ring ++ potentially saves a TLB miss in the bridge, which ++ is worth doing. */ ++ inline_prefix_size = PACKET_PREFIX_SIZE; ++ if (skb_headlen(skb) < inline_prefix_size) ++ inline_prefix_size = skb_headlen(skb); ++ } ++ ++ if (skb_co->nr_fragments == 0) { ++ nr_fragments = nfrags_skb(skb, inline_prefix_size); ++ ++ /* No-fragments packets should be policy small, not ++ * policy grant. 
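++ * (transmit_policy() routes anything which fits entirely in the inline prefix down the small path, so hitting zero fragments here would indicate a bug in policy selection.)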
*/ ++ BUG_ON(nr_fragments == 0); ++ ++ if (nr_fragments > ncrp->max_fragments_per_tx_packet) { ++ if (skb_linearize(skb) < 0) ++ return PREP_XMIT_DROP; ++ nr_fragments = nfrags_skb(skb, inline_prefix_size); ++ if (nr_fragments > ncrp->max_fragments_per_tx_packet) ++ return PREP_XMIT_DROP; ++ } ++ ++ skb_co->nr_fragments = nr_fragments; ++ } ++ ++ /* Grab the grant references. */ ++ err = gnttab_suballoc_grant_references(skb_co->nr_fragments, ++ &ncrp->gref_pool, ++ &gref_pool); ++ if (err < 0) { ++ release_txp_slot(ncrp, skb); ++ /* Leave skb_co->nr_fragments set, so that we don't ++ have to recompute it next time around. */ ++ return PREP_XMIT_BUSY; ++ } ++ ++ skb_co->gref_pool = gref_pool; ++ skb_co->inline_prefix_size = inline_prefix_size; ++ ++ if (use_subpage_grants) ++ skb_co->type = NC2_PACKET_TYPE_receiver_copy; ++ else ++ skb_co->type = NC2_PACKET_TYPE_receiver_map; ++ ++ return PREP_XMIT_OKAY; ++} ++ ++static void prepare_subpage_grant(struct netchannel2_ring_pair *ncrp, ++ struct page *page, ++ unsigned off_in_page, ++ unsigned size, ++ struct grant_packet_plan *plan) ++{ ++ volatile struct netchannel2_fragment *frag; ++ domid_t trans_domid; ++ grant_ref_t trans_gref; ++ grant_ref_t gref; ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++ struct netchannel2 *orig_iface; ++#endif ++ ++ if (size <= plan->prefix_avail) { ++ /* This fragment is going to be inline -> nothing to ++ * do. */ ++ plan->prefix_avail -= size; ++ return; ++ } ++ if (plan->prefix_avail > 0) { ++ /* Part inline, part in payload. */ ++ size -= plan->prefix_avail; ++ off_in_page += plan->prefix_avail; ++ plan->prefix_avail = 0; ++ } ++ frag = plan->out_fragment; ++ gref = gnttab_claim_grant_reference(&plan->gref_pool); ++ frag->receiver_copy.gref = gref; ++ if (page_is_tracked(page)) { ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++ orig_iface = nc2_get_interface_for_page(page); ++ if (orig_iface && ++ orig_iface->extant_bypasses < orig_iface->max_bypasses) ++ plan->could_have_used_bypass = 1; ++#endif ++ lookup_tracker_page(page, &trans_domid, &trans_gref); ++ gnttab_grant_foreign_access_ref_trans(gref, ++ ncrp->otherend_id, ++ GTF_readonly, ++ trans_domid, ++ trans_gref); ++ } else if (plan->use_subpage_grants) { ++ gnttab_grant_foreign_access_ref_subpage(gref, ++ ncrp->otherend_id, ++ virt_to_mfn(page_address(page)), ++ GTF_readonly, ++ off_in_page, ++ size); ++ } else { ++ gnttab_grant_foreign_access_ref(gref, ++ ncrp->otherend_id, ++ virt_to_mfn(page_address(page)), ++ GTF_readonly); ++ } ++ frag->off = off_in_page; ++ frag->size = size; ++ plan->out_fragment++; ++} ++ ++static int grant_data_area(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb, ++ struct grant_packet_plan *plan) ++{ ++ void *ptr = skb->data; ++ unsigned len = skb_headlen(skb); ++ unsigned off; ++ unsigned this_time; ++ ++ for (off = 0; off < len; off += this_time) { ++ this_time = len - off; ++ if (this_time + offset_in_page(ptr + off) > PAGE_SIZE) ++ this_time = PAGE_SIZE - offset_in_page(ptr + off); ++ prepare_subpage_grant(ncrp, ++ virt_to_page(ptr + off), ++ offset_in_page(ptr + off), ++ this_time, ++ plan); ++ } ++ return 0; ++} ++ ++void xmit_grant(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb, ++ int use_subpage_grants, ++ volatile void *msg_buf) ++{ ++ volatile struct netchannel2_msg_packet *msg = msg_buf; ++ struct skb_cb_overlay *skb_co = get_skb_overlay(skb); ++ struct grant_packet_plan plan; ++ unsigned x; ++ struct skb_shared_info *shinfo; ++ skb_frag_t *frag; ++ ++ memset(&plan, 0, sizeof(plan)); ++ 
plan.use_subpage_grants = use_subpage_grants; ++ plan.prefix_avail = skb_co->inline_prefix_size; ++ plan.out_fragment = msg->frags; ++ plan.gref_pool = skb_co->gref_pool; ++ ++ ncrp->count_frags_no_event += skb_co->nr_fragments; ++ if (ncrp->count_frags_no_event >= ncrp->max_count_frags_no_event) { ++ msg->flags |= NC2_PACKET_FLAG_need_event; ++ ncrp->count_frags_no_event = 0; ++ } ++ ++ grant_data_area(ncrp, skb, &plan); ++ ++ shinfo = skb_shinfo(skb); ++ for (x = 0; x < shinfo->nr_frags; x++) { ++ frag = &shinfo->frags[x]; ++ prepare_subpage_grant(ncrp, ++ frag->page, ++ frag->page_offset, ++ frag->size, ++ &plan); ++ } ++ ++ skb_co->nr_fragments = plan.out_fragment - msg->frags; ++ ++#ifdef CONFIG_XEN_NETDEV2_BYPASSABLE ++ if (plan.could_have_used_bypass && ++ ncrp == &ncrp->interface->rings && ++ ncrp->interface->extant_bypasses < ncrp->interface->max_bypasses) ++ msg->flags |= NC2_PACKET_FLAG_bypass_candidate; ++#endif ++} ++ +diff --git a/drivers/xen/netchannel2/tools/destroy_bypass.c b/drivers/xen/netchannel2/tools/destroy_bypass.c +new file mode 100644 +index 0000000..93b82e0 +--- /dev/null ++++ b/drivers/xen/netchannel2/tools/destroy_bypass.c +@@ -0,0 +1,25 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "../netchannel2_uspace.h" ++ ++int ++main(int argc, char *argv[]) ++{ ++ int fd; ++ struct netchannel2_ioctl_destroy_bypass ioc; ++ int r; ++ ++ fd = open("/dev/netback2", O_RDWR); ++ if (fd < 0) ++ err(1, "openning /dev/netback2"); ++ ioc.handle = atoi(argv[1]); ++ ++ r = ioctl(fd, NETCHANNEL2_IOCTL_DESTROY_BYPASS, &ioc); ++ if (r < 0) ++ err(1, "destroying bypass"); ++ return 0; ++} +diff --git a/drivers/xen/netchannel2/tools/establish_bypass.c b/drivers/xen/netchannel2/tools/establish_bypass.c +new file mode 100644 +index 0000000..bdd326c +--- /dev/null ++++ b/drivers/xen/netchannel2/tools/establish_bypass.c +@@ -0,0 +1,31 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "../netchannel2_uspace.h" ++ ++int ++main(int argc, char *argv[]) ++{ ++ int fd; ++ unsigned a; ++ unsigned b; ++ struct netchannel2_ioctl_establish_bypass ioc; ++ int r; ++ ++ fd = open("/dev/netback2", O_RDWR); ++ if (fd < 0) ++ err(1, "openning /dev/netback2"); ++ a = atoi(argv[1]); ++ b = atoi(argv[2]); ++ ioc.handle_a = a; ++ ioc.handle_b = b; ++ ++ r = ioctl(fd, NETCHANNEL2_IOCTL_ESTABLISH_BYPASS, &ioc); ++ if (r < 0) ++ err(1, "establishing bypass"); ++ printf("%d\n", r); ++ return 0; ++} +diff --git a/drivers/xen/netchannel2/util.c b/drivers/xen/netchannel2/util.c +new file mode 100644 +index 0000000..1b2a909 +--- /dev/null ++++ b/drivers/xen/netchannel2/util.c +@@ -0,0 +1,244 @@ ++#include ++#include ++#include ++#include ++#ifdef CONFIG_XEN_NETDEV2_BACKEND ++#include ++#endif ++#include ++#include "netchannel2_core.h" ++ ++int allocate_txp_slot(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb) ++{ ++ struct skb_cb_overlay *skb_co = get_skb_overlay(skb); ++ struct txp_slot *tp; ++ ++ BUG_ON(skb_co->tp); ++ ++ if (ncrp->head_free_tx_packet == INVALID_TXP_INDEX || ++ ncrp->nr_tx_packets_outstanding == ++ ncrp->max_tx_packets_outstanding) { ++ return -1; ++ } ++ ++ tp = &ncrp->tx_packets[ncrp->head_free_tx_packet]; ++ ncrp->head_free_tx_packet = txp_get_next_free(tp); ++ ++ txp_set_skb(tp, skb); ++ skb_co->tp = tp; ++ ncrp->nr_tx_packets_outstanding++; ++ return 0; ++} ++ ++static void nc2_free_skb(struct netchannel2 *nc, ++ struct sk_buff *skb) ++{ ++ dev_kfree_skb(skb); ++} ++ ++void release_txp_slot(struct netchannel2_ring_pair 
*ncrp, ++ struct sk_buff *skb) ++{ ++ struct skb_cb_overlay *skb_co = get_skb_overlay(skb); ++ struct txp_slot *tp = skb_co->tp; ++ ++ BUG_ON(txp_get_skb(tp) != skb); ++ ++ /* Try to keep the free TX packet list in order as far as ++ * possible, since that gives slightly better cache behaviour. ++ * It's not worth spending a lot of effort getting this right, ++ * though, so just use a simple heuristic: if we're freeing a ++ * packet, and the previous packet is already free, chain this ++ * packet directly after it, rather than putting it at the ++ * head of the list. This isn't perfect by any means, but ++ * it's enough that you get nice long runs of contiguous ++ * packets in the free list, and that's all we really need. ++ * Runs much bigger than a cache line aren't really very ++ * useful, anyway. */ ++ if (tp != ncrp->tx_packets && !txp_slot_in_use(tp - 1)) { ++ txp_set_next_free(tp, txp_get_next_free(tp - 1)); ++ txp_set_next_free(tp - 1, tp - ncrp->tx_packets); ++ } else { ++ txp_set_next_free(tp, ncrp->head_free_tx_packet); ++ ncrp->head_free_tx_packet = tp - ncrp->tx_packets; ++ } ++ skb_co->tp = NULL; ++ ncrp->nr_tx_packets_outstanding--; ++} ++ ++void release_tx_packet(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb) ++{ ++ struct skb_cb_overlay *skb_co = get_skb_overlay(skb); ++ struct txp_slot *tp = skb_co->tp; ++ grant_ref_t gref; ++ int r; ++ unsigned cntr; ++ ++ if (skb_co->type == NC2_PACKET_TYPE_receiver_copy) { ++ while (1) { ++ r = gnttab_claim_grant_reference(&skb_co->gref_pool); ++ if (r == -ENOSPC) ++ break; ++ gref = (grant_ref_t)r; ++ /* It's a subpage grant reference, so Xen ++ guarantees to release it quickly. Sit and ++ wait for it to do so. */ ++ cntr = 0; ++ while (!gnttab_end_foreign_access_ref(gref)) { ++ cpu_relax(); ++ if (++cntr % 65536 == 0) ++ printk(KERN_WARNING "Having trouble ending gref %d for receiver copy.\n", ++ gref); ++ } ++ gnttab_release_grant_reference(&ncrp->gref_pool, gref); ++ } ++ } else if (skb_co->type == NC2_PACKET_TYPE_receiver_map) { ++ while (1) { ++ r = gnttab_claim_grant_reference(&skb_co->gref_pool); ++ if (r == -ENOSPC) ++ break; ++ gref = (grant_ref_t)r; ++ r = gnttab_end_foreign_access_ref(gref); ++ if (r == 0) { ++ printk(KERN_WARNING "Failed to end remote access to packet memory.\n"); ++ } else { ++ gnttab_release_grant_reference(&ncrp->gref_pool, ++ gref); ++ } ++ } ++ } else if (skb_co->gref_pool != 0) { ++ gnttab_subfree_grant_references(skb_co->gref_pool, ++ &ncrp->gref_pool); ++ } ++ ++ if (tp != NULL) ++ release_txp_slot(ncrp, skb); ++ ++ nc2_free_skb(ncrp->interface, skb); ++} ++ ++void fetch_fragment(struct netchannel2_ring_pair *ncrp, ++ unsigned idx, ++ struct netchannel2_fragment *frag, ++ unsigned off) ++{ ++ nc2_copy_from_ring_off(&ncrp->cons_ring, ++ frag, ++ sizeof(*frag), ++ off + idx * sizeof(*frag)); ++} ++ ++/* Copy @count bytes from the skb's data area into its head, updating ++ * the pointers as appropriate. The caller should ensure that there ++ * is actually enough space in the head. 
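++ * Fragments which are drained completely have their pages released and are then trimmed off the front of the fragment array.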
*/ ++void pull_through(struct sk_buff *skb, unsigned count) ++{ ++ unsigned frag = 0; ++ unsigned this_frag; ++ void *buf; ++ void *va; ++ ++ while (count != 0 && frag < skb_shinfo(skb)->nr_frags) { ++ this_frag = skb_shinfo(skb)->frags[frag].size; ++ if (this_frag > count) ++ this_frag = count; ++ va = page_address(skb_shinfo(skb)->frags[frag].page); ++ buf = skb->tail; ++ memcpy(buf, va + skb_shinfo(skb)->frags[frag].page_offset, ++ this_frag); ++ skb->tail += this_frag; ++ BUG_ON(skb->tail > skb->end); ++ skb_shinfo(skb)->frags[frag].size -= this_frag; ++ skb_shinfo(skb)->frags[frag].page_offset += this_frag; ++ skb->data_len -= this_frag; ++ count -= this_frag; ++ frag++; ++ } ++ for (frag = 0; ++ frag < skb_shinfo(skb)->nr_frags && ++ skb_shinfo(skb)->frags[frag].size == 0; ++ frag++) { ++ put_page(skb_shinfo(skb)->frags[frag].page); ++ } ++ skb_shinfo(skb)->nr_frags -= frag; ++ memmove(skb_shinfo(skb)->frags, ++ skb_shinfo(skb)->frags+frag, ++ sizeof(skb_shinfo(skb)->frags[0]) * ++ skb_shinfo(skb)->nr_frags); ++} ++ ++#ifdef CONFIG_XEN_NETDEV2_BACKEND ++ ++/* Zap a grant_mapping structure, releasing all mappings and the ++ reserved virtual address space. Prepare the grant_mapping for ++ re-use. */ ++void nc2_unmap_grants(struct grant_mapping *gm) ++{ ++ struct gnttab_unmap_grant_ref op[MAX_GRANT_MAP_PAGES]; ++ int i; ++ ++ if (gm->mapping == NULL) ++ return; ++ for (i = 0; i < gm->nr_pages; i++) { ++ gnttab_set_unmap_op(&op[i], ++ (unsigned long)gm->mapping->addr + ++ i * PAGE_SIZE, ++ GNTMAP_host_map, ++ gm->handles[i]); ++ } ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, op, i)) ++ BUG(); ++ free_vm_area(gm->mapping); ++ memset(gm, 0, sizeof(*gm)); ++} ++ ++int nc2_map_grants(struct grant_mapping *gm, ++ const grant_ref_t *grefs, ++ unsigned nr_grefs, ++ domid_t remote_domain) ++{ ++ struct grant_mapping work; ++ struct gnttab_map_grant_ref op[MAX_GRANT_MAP_PAGES]; ++ int i; ++ ++ memset(&work, 0, sizeof(work)); ++ ++ if (nr_grefs > MAX_GRANT_MAP_PAGES || nr_grefs == 0) ++ return -EINVAL; ++ ++ if (nr_grefs & (nr_grefs-1)) { ++ /* Must map a power-of-two number of pages. */ ++ return -EINVAL; ++ } ++ ++ work.nr_pages = nr_grefs; ++ work.mapping = alloc_vm_area(PAGE_SIZE * work.nr_pages); ++ if (!work.mapping) ++ return -ENOMEM; ++ for (i = 0; i < nr_grefs; i++) ++ gnttab_set_map_op(&op[i], ++ (unsigned long)work.mapping->addr + ++ i * PAGE_SIZE, ++ GNTMAP_host_map, ++ grefs[i], ++ remote_domain); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, nr_grefs)) ++ BUG(); ++ ++ for (i = 0; i < nr_grefs; i++) { ++ if (op[i].status) { ++ work.nr_pages = i; ++ nc2_unmap_grants(&work); ++ return -EFAULT; ++ } ++ work.handles[i] = op[i].handle; ++ } ++ ++ nc2_unmap_grants(gm); ++ *gm = work; ++ return 0; ++} ++#endif +diff --git a/drivers/xen/netchannel2/xmit_packet.c b/drivers/xen/netchannel2/xmit_packet.c +new file mode 100644 +index 0000000..6421d94 +--- /dev/null ++++ b/drivers/xen/netchannel2/xmit_packet.c +@@ -0,0 +1,373 @@ ++/* Things related to actually sending packet messages, and which is ++ shared across all transmit modes. */ ++#include ++#include ++#include "netchannel2_core.h" ++ ++/* We limit the number of transmitted packets which can be in flight ++ at any one time, as a somewhat paranoid safety catch. 
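++   The cap is enforced when the peer's SET_MAX_PACKETS message is handled: anything above MAX_TX_PACKETS is clamped rather than rejected.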
*/ ++#define MAX_TX_PACKETS MAX_PENDING_FINISH_PACKETS ++ ++static enum transmit_policy transmit_policy(struct netchannel2 *nc, ++ struct sk_buff *skb) ++{ ++ if (skb->len <= PACKET_PREFIX_SIZE && !skb_is_nonlinear(skb)) ++ return transmit_policy_small; ++ else if (nc->remote_trusted) ++ return transmit_policy_map; ++ else ++ return transmit_policy_grant; ++} ++ ++/* Allocate resources for a small packet. The entire thing will be ++ transmitted in the ring. This is only called for small, linear ++ SKBs. It always succeeds, but has an int return type for symmetry ++ with the other prepare_xmit_*() functions. */ ++enum prepare_xmit_result prepare_xmit_allocate_small( ++ struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb) ++{ ++ struct skb_cb_overlay *skb_co = get_skb_overlay(skb); ++ ++ BUG_ON(skb_is_nonlinear(skb)); ++ BUG_ON(skb->len > NETCHANNEL2_MAX_INLINE_BYTES); ++ ++ skb_co->type = NC2_PACKET_TYPE_small; ++ skb_co->gref_pool = 0; ++ skb_co->inline_prefix_size = skb->len; ++ ++ return PREP_XMIT_OKAY; ++} ++ ++/* Figure out how much space @tp will take up on the ring. */ ++unsigned get_transmitted_packet_msg_size(struct sk_buff *skb) ++{ ++ struct skb_cb_overlay *skb_co = get_skb_overlay(skb); ++ return (sizeof(struct netchannel2_msg_packet) + ++ sizeof(struct netchannel2_fragment) * skb_co->nr_fragments + ++ skb_co->inline_prefix_size + 7) & ~7; ++} ++ ++/* Do the minimum amount of work to be certain that when we come to ++ transmit this packet we won't run out of resources. This includes ++ figuring out how we're going to fragment the packet for ++ transmission, which buffers we're going to use, etc. Return <0 if ++ insufficient resources are available right now, or 0 if we ++ succeed. */ ++/* Careful: this may allocate e.g. a TXP slot and then discover that ++ it can't reserve ring space. In that case, the TXP remains ++ allocated. The expected case is that the caller will arrange for ++ us to retry the allocation later, in which case we'll pick up the ++ already-allocated buffers. */ ++enum prepare_xmit_result prepare_xmit_allocate_resources(struct netchannel2 *nc, ++ struct sk_buff *skb) ++{ ++ struct skb_cb_overlay *skb_co = get_skb_overlay(skb); ++ enum transmit_policy policy; ++ unsigned msg_size; ++ enum prepare_xmit_result r; ++ ++ if (skb_co->policy == transmit_policy_unknown) { ++ policy = transmit_policy(nc, skb); ++ switch (policy) { ++ case transmit_policy_small: ++ r = prepare_xmit_allocate_small(&nc->rings, skb); ++ break; ++ case transmit_policy_grant: ++ r = prepare_xmit_allocate_grant(&nc->rings, skb, 1); ++ break; ++ case transmit_policy_map: ++ r = prepare_xmit_allocate_grant(&nc->rings, skb, 0); ++ break; ++ default: ++ BUG(); ++ /* Shut the compiler up. 
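++			   (gcc cannot tell that BUG() never returns, so without this assignment r would look possibly uninitialised.)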
*/ ++ r = PREP_XMIT_BUSY; ++ } ++ if (r != PREP_XMIT_OKAY) ++ return r; ++ skb_co->policy = policy; ++ } ++ ++ msg_size = get_transmitted_packet_msg_size(skb); ++ if (nc2_reserve_payload_bytes(&nc->rings.prod_ring, msg_size)) ++ return PREP_XMIT_OKAY; ++ ++ return PREP_XMIT_BUSY; ++} ++ ++static void set_offload_flags(struct sk_buff *skb, ++ volatile struct netchannel2_msg_packet *msg) ++{ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ msg->flags |= ++ NC2_PACKET_FLAG_csum_blank | ++ NC2_PACKET_FLAG_data_validated; ++ msg->csum_start = skb->csum_start - (skb->data - skb->head); ++ msg->csum_offset = msg->csum_start + skb->csum_offset; ++ } ++ ++ if (skb->proto_data_valid) ++ msg->flags |= NC2_PACKET_FLAG_data_validated; ++ ++ if (skb_shinfo(skb)->gso_size != 0) { ++ msg->mss = skb_shinfo(skb)->gso_size; ++ msg->segmentation_type = NC2_PACKET_SEGMENTATION_TYPE_tcpv4; ++ } else { ++ msg->mss = 0; ++ msg->segmentation_type = NC2_PACKET_SEGMENTATION_TYPE_none; ++ } ++} ++ ++/* Transmit a packet which has previously been prepared with ++ prepare_xmit_allocate_resources(). */ ++/* Once this has been called, the ring must not be flushed until the ++ TX hypercall batcher is (assuming this ring has a hypercall ++ batcher). */ ++int nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp, ++ struct sk_buff *skb) ++{ ++ struct skb_cb_overlay *skb_co = get_skb_overlay(skb); ++ struct netchannel2 *nc = ncrp->interface; ++ unsigned msg_size; ++ volatile struct netchannel2_msg_packet *msg; ++ ++ msg_size = get_transmitted_packet_msg_size(skb); ++ /* Un-reserve the space we reserved for the packet. */ ++ BUG_ON(ncrp->prod_ring.reserve < msg_size); ++ ncrp->prod_ring.reserve -= msg_size; ++ if (!nc2_can_send_payload_bytes(&ncrp->prod_ring, msg_size)) { ++ /* Aw, crud. We had to transmit a PAD message at just ++ the wrong time, and our attempt to reserve ring ++ space failed. Delay transmiting this packet ++ Make sure we redo the space reserve */ ++ ncrp->prod_ring.reserve += msg_size; ++ return 0; ++ } ++ __nc2_avoid_ring_wrap(&ncrp->prod_ring, msg_size); ++ ++ /* Set up part of the message. We do the message header ++ itself and the inline prefix. The individual xmit_* ++ methods are responsible for the fragments. They may also ++ set some more msg flags. */ ++ msg = __nc2_get_message_ptr(&ncrp->prod_ring); ++ msg->hdr.type = NETCHANNEL2_MSG_PACKET; ++ msg->hdr.flags = 0; ++ msg->id = skb_co->tp - ncrp->tx_packets; ++ msg->type = skb_co->type; ++ msg->flags = 0; ++ msg->prefix_size = skb_co->inline_prefix_size; ++ ++ /* We cast away the volatile to avoid compiler warnings, and ++ then use barrier()s to discourage gcc from using msg->frags ++ in CSE or somesuch. It's kind of unlikely that it would, ++ but better to make sure. */ ++ barrier(); ++ memcpy((void *)(msg->frags + skb_co->nr_fragments), ++ skb->data, ++ skb_co->inline_prefix_size); ++ barrier(); ++ ++ set_offload_flags(skb, msg); ++ ++ switch (skb_co->policy) { ++ case transmit_policy_small: ++ /* Nothing to do */ ++ break; ++ case transmit_policy_grant: ++ xmit_grant(ncrp, skb, 1, msg); ++ break; ++ case transmit_policy_map: ++ xmit_grant(ncrp, skb, 0, msg); ++ break; ++ default: ++ BUG(); ++ } ++ ++ /* The transmission method may have decided not to use all the ++ fragments it reserved, which changes the message size. 
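++    (xmit_grant() rewrites skb_co->nr_fragments to the number of fragment slots it actually filled, so the size is recomputed here before the header is finalised.)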
*/ ++ msg_size = get_transmitted_packet_msg_size(skb); ++ msg->hdr.size = msg_size; ++ ++ ncrp->prod_ring.prod_pvt += msg_size; ++ ++ BUG_ON(ncrp->prod_ring.bytes_available < msg_size); ++ ++ ncrp->prod_ring.bytes_available -= msg_size; ++ ++ ncrp->pending_time_sensitive_messages = 1; ++ ++ if (skb_co->tp) { ++ ncrp->expected_finish_messages++; ++ /* We're now ready to accept a FINISH message for this ++ packet. */ ++ skb_co->expecting_finish = 1; ++ } else { ++ /* This packet doesn't need a FINISH message. Queue ++ it up to be released as soon as we flush the ++ hypercall batcher and the ring. */ ++ nc->stats.tx_bytes += skb->len; ++ nc->stats.tx_packets++; ++ __skb_queue_tail(&ncrp->release_on_flush_batcher, skb); ++ } ++ ++ return 1; ++} ++ ++/* Arrange that @skb will be sent on ring @ncrp soon. Assumes that ++ prepare_xmit_allocate_resources() has been successfully called on ++ @skb already. */ ++void queue_packet_to_interface(struct sk_buff *skb, ++ struct netchannel2_ring_pair *ncrp) ++{ ++ __skb_queue_tail(&ncrp->pending_tx_queue, skb); ++ if (ncrp->pending_tx_queue.qlen == 1) ++ nc2_kick(ncrp); ++} ++ ++int nc2_start_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct netchannel2 *nc = netdev_priv(dev); ++ struct skb_cb_overlay *sco = get_skb_overlay(skb); ++ int r; ++ ++ memset(sco, 0, sizeof(*sco)); ++ ++ spin_lock_bh(&nc->rings.lock); ++ ++ /* If we have a bypass suitable for this packet then we prefer ++ * that to the main ring pair. */ ++#ifdef CONFIG_XEN_NETDEV2_BYPASS_ENDPOINT ++ { ++ struct nc2_alternate_ring *ncr; ++ list_for_each_entry(ncr, &nc->alternate_rings, ++ rings_by_interface) { ++ if (bypass_xmit_packet(nc, ncr, skb)) { ++ spin_unlock_bh(&nc->rings.lock); ++ return NETDEV_TX_OK; ++ } ++ } ++ } ++#endif ++ ++ if (!nc->rings.is_attached) ++ goto out_drop; ++ ++ r = prepare_xmit_allocate_resources(nc, skb); ++ if (r != PREP_XMIT_OKAY) { ++ if (r == PREP_XMIT_BUSY) ++ goto out_busy; ++ else ++ goto out_drop; ++ } ++ queue_packet_to_interface(skb, &nc->rings); ++ spin_unlock_bh(&nc->rings.lock); ++ ++ return NETDEV_TX_OK; ++ ++out_drop: ++ spin_unlock_bh(&nc->rings.lock); ++ dev_kfree_skb(skb); ++ nc->stats.tx_dropped++; ++ return NETDEV_TX_OK; ++ ++out_busy: ++ /* Some more buffers may have arrived, so kick the worker ++ * thread to go and have a look. */ ++ nc2_kick(&nc->rings); ++ ++ __skb_queue_tail(&nc->pending_skbs, skb); ++ nc->is_stopped = 1; ++ netif_stop_queue(dev); ++ spin_unlock_bh(&nc->rings.lock); ++ return NETDEV_TX_OK; ++} ++ ++ ++void nc2_handle_finish_packet_msg(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct skb_cb_overlay *sco; ++ struct netchannel2_msg_finish_packet msg; ++ struct txp_slot *tp; ++ struct sk_buff *skb; ++ ++ if (hdr->size < sizeof(msg)) { ++ pr_debug("Packet finish message had strange size %d\n", ++ hdr->size); ++ return; ++ } ++ nc2_copy_from_ring(&ncrp->cons_ring, &msg, sizeof(msg)); ++ if (msg.id > NR_TX_PACKETS) { ++ pr_debug("Other end tried to end bad packet id %d\n", ++ msg.id); ++ return; ++ } ++ tp = &ncrp->tx_packets[msg.id]; ++ skb = txp_get_skb(tp); ++ if (!skb) { ++ pr_debug("Other end tried to end packet id %d which wasn't in use\n", ++ msg.id); ++ return; ++ } ++ sco = get_skb_overlay(skb); ++ /* Careful: if the remote is malicious, they may try to end a ++ packet after we allocate it but before we send it (e.g. if ++ we've had to back out because we didn't have enough ring ++ space). 
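++    expecting_finish is only set in nc2_really_start_xmit() once the PACKET message has actually been placed on the ring, so a FINISH for a not-yet-sent packet is caught here.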
*/ ++ if (!sco->expecting_finish) { ++ pr_debug("Other end finished packet before we sent it?\n"); ++ return; ++ } ++ nc->stats.tx_bytes += skb->len; ++ nc->stats.tx_packets++; ++ release_tx_packet(ncrp, skb); ++ ncrp->expected_finish_messages--; ++} ++ ++ ++/* ------------------------ Control-path operations ---------------------- */ ++void nc2_handle_set_max_packets_msg(struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct netchannel2_msg_set_max_packets msg; ++ ++ if (hdr->size != sizeof(msg)) { ++ pr_debug("Set max packets message had strange size %d\n", ++ hdr->size); ++ return; ++ } ++ if (ncrp->max_tx_packets_outstanding != 0) { ++ pr_debug("Other end tried to change number of outstanding packets from %d.\n", ++ ncrp->max_tx_packets_outstanding); ++ return; ++ } ++ nc2_copy_from_ring(&ncrp->cons_ring, &msg, sizeof(msg)); ++ /* Limit the number of outstanding packets to something sane. ++ This is a little bit paranoid (it should be safe to set ++ this arbitrarily high), but limiting it avoids nasty ++ surprises in untested configurations. */ ++ if (msg.max_outstanding_packets > MAX_TX_PACKETS) { ++ pr_debug("Other end tried to set max outstanding to %d, limiting to %d.\n", ++ msg.max_outstanding_packets, MAX_TX_PACKETS); ++ ncrp->max_tx_packets_outstanding = MAX_TX_PACKETS; ++ } else { ++ ncrp->max_tx_packets_outstanding = msg.max_outstanding_packets; ++ } ++} ++ ++/* Release all packets on the transmitted and pending_tx lists. */ ++void drop_pending_tx_packets(struct netchannel2_ring_pair *ncrp) ++{ ++ struct sk_buff *skb; ++ unsigned x; ++ ++ nc2_queue_purge(ncrp, &ncrp->pending_tx_queue); ++ for (x = 0; x < NR_TX_PACKETS; x++) { ++ skb = txp_get_skb(&ncrp->tx_packets[x]); ++ if (skb) ++ release_tx_packet(ncrp, skb); ++ } ++} ++ +diff --git a/include/xen/interface/io/netchannel2.h b/include/xen/interface/io/netchannel2.h +new file mode 100644 +index 0000000..528417c +--- /dev/null ++++ b/include/xen/interface/io/netchannel2.h +@@ -0,0 +1,342 @@ ++#ifndef __NETCHANNEL2_H__ ++#define __NETCHANNEL2_H__ ++ ++#include ++ ++/* Tell the other end how many packets its allowed to have ++ * simultaneously outstanding for transmission. An endpoint must not ++ * send PACKET messages which would take it over this limit. ++ * ++ * The SET_MAX_PACKETS message must be sent before any PACKET ++ * messages. It should only be sent once, unless the ring is ++ * disconnected and reconnected. ++ */ ++#define NETCHANNEL2_MSG_SET_MAX_PACKETS 1 ++struct netchannel2_msg_set_max_packets { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t max_outstanding_packets; ++}; ++ ++/* Pass a packet to the other end. The packet consists of a header, ++ * followed by a bunch of fragment descriptors, followed by an inline ++ * packet prefix. Every fragment descriptor in a packet must be the ++ * same type, and the type is determined by the header. The receiving ++ * endpoint should respond with a finished_packet message as soon as ++ * possible. The prefix may be no more than ++ * NETCHANNEL2_MAX_INLINE_BYTES. Packets may contain no more than ++ * NETCHANNEL2_MAX_PACKET_BYTES bytes of data, including all fragments ++ * and the prefix. ++ * ++ * If a SET_MAX_FRAGMENTS_PER_PACKET message has been received, the ++ * number of fragments in the packet should respect that limit. ++ * Otherwise, there should be at most one fragment in the packet ++ * (there may be zero if the entire packet fits in the inline prefix). 
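++ * ++ * For illustration only (this mirrors the receive-side arithmetic in ++ * nc2_handle_packet_msg() and adds nothing to the protocol), the ++ * fragment count is implied by the message size: ++ * ++ *   nr_frags = (hdr.size - sizeof(struct netchannel2_msg_packet) ++ *               - prefix_size) / sizeof(struct netchannel2_fragment);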
++ */ ++#define NETCHANNEL2_MSG_PACKET 2 ++#define NETCHANNEL2_MAX_PACKET_BYTES 65536 ++#define NETCHANNEL2_MAX_INLINE_BYTES 256 ++struct netchannel2_fragment { ++ uint16_t size; ++ /* The offset is always relative to the start of the page. ++ For pre_posted packet types, it is not relative to the ++ start of the buffer (although the fragment range will ++ obviously be within the buffer range). */ ++ uint16_t off; ++ union { ++ struct { ++ grant_ref_t gref; ++ } receiver_copy; ++ struct { ++ grant_ref_t gref; ++ } receiver_map; ++ }; ++}; ++struct netchannel2_msg_packet { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t id; /* Opaque ID which is echoed into the finished ++ packet message. */ ++ uint8_t type; ++ uint8_t flags; ++ uint8_t segmentation_type; ++ uint8_t pad; ++ uint16_t prefix_size; ++ uint16_t mss; ++ uint16_t csum_start; ++ uint16_t csum_offset; ++ /* Variable-size array. The number of elements is determined ++ by the size of the message. */ ++ struct netchannel2_fragment frags[0]; ++}; ++ ++/* TX csum offload. The transmitting domain has skipped a checksum ++ * calculation. Before forwarding the packet on, the receiving domain ++ * must first perform a 16 bit IP checksum on everything from ++ * csum_start to the end of the packet, and then write the result to ++ * an offset csum_offset in the packet. This should only be set if ++ * the transmitting domain has previously received a SET_OFFLOAD ++ * message with csum = 1. ++ */ ++#define NC2_PACKET_FLAG_csum_blank 1 ++/* RX csum offload. The transmitting domain has already validated the ++ * protocol-level checksum on this packet (i.e. TCP or UDP), so the ++ * receiving domain shouldn't bother. This does not tell you anything ++ * about the IP-level checksum. This can be set on any packet, ++ * regardless of any SET_OFFLOAD messages which may or may not have ++ * been sent. */ ++#define NC2_PACKET_FLAG_data_validated 2 ++/* If set, this flag indicates that this packet could have used a ++ * bypass if one had been available, and so it should be sent to the ++ * autobypass state machine. ++ */ ++#define NC2_PACKET_FLAG_bypass_candidate 4 ++/* If set, the transmitting domain requires an event urgently when ++ * this packet's finish message is sent. Otherwise, the event can be ++ * delayed. */ ++#define NC2_PACKET_FLAG_need_event 8 ++ ++/* The mechanism which should be used to receive the data part of ++ * a packet: ++ * ++ * receiver_copy -- The transmitting domain has granted the receiving ++ * domain access to the original RX buffers using ++ * copy-only grant references. The receiving domain ++ * should copy the data out of the buffers and issue ++ * a FINISH message. ++ * ++ * Due to backend bugs, it is in not safe to use this ++ * packet type except on bypass rings. ++ * ++ * receiver_map -- The transmitting domain has granted the receiving ++ * domain access to the original RX buffers using ++ * full (mappable) grant references. This can be ++ * treated the same way as receiver_copy, but the ++ * receiving domain also has the option of mapping ++ * the fragments, rather than copying them. If it ++ * decides to do so, it should ensure that the fragments ++ * will be unmapped in a reasonably timely fashion, ++ * and don't e.g. become stuck in a receive buffer ++ * somewhere. In general, anything longer than about ++ * a second is likely to cause problems. Once all ++ * grant references have been unmapper, the receiving ++ * domain should send a FINISH message. ++ * ++ * This packet type may not be used on bypass rings. 
++ * ++ * small -- The packet does not have any fragment descriptors ++ * (i.e. the entire thing is inline in the ring). The receiving ++ * domain should simply the copy the packet out of the ring ++ * into a locally allocated buffer. No FINISH message is required ++ * or allowed. ++ * ++ * This packet type may be used on any ring. ++ * ++ * All endpoints must be able to receive all packet types, but note ++ * that it is correct to treat receiver_map and small packets as ++ * receiver_copy ones. */ ++#define NC2_PACKET_TYPE_receiver_copy 1 ++#define NC2_PACKET_TYPE_receiver_map 3 ++#define NC2_PACKET_TYPE_small 4 ++ ++#define NC2_PACKET_SEGMENTATION_TYPE_none 0 ++#define NC2_PACKET_SEGMENTATION_TYPE_tcpv4 1 ++ ++/* Tell the other end that we're finished with a message it sent us, ++ and it can release the transmit buffers etc. This must be sent in ++ response to receiver_copy and receiver_map packets. It must not be ++ sent in response to pre_posted or small packets. */ ++#define NETCHANNEL2_MSG_FINISH_PACKET 3 ++struct netchannel2_msg_finish_packet { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t id; ++}; ++ ++/* Tell the other end what sort of offloads we're going to let it use. ++ * An endpoint must not use any offload unless it has been enabled ++ * by a previous SET_OFFLOAD message. */ ++/* Note that there is no acknowledgement for this message. This means ++ * that an endpoint can continue to receive PACKET messages which ++ * require offload support for some time after it disables task ++ * offloading. The endpoint is expected to handle this case correctly ++ * (which may just mean dropping the packet and returning a FINISH ++ * message, if appropriate). ++ */ ++#define NETCHANNEL2_MSG_SET_OFFLOAD 4 ++struct netchannel2_msg_set_offload { ++ struct netchannel2_msg_hdr hdr; ++ /* Checksum offload. If this is 0, the other end must ++ * calculate checksums before sending the packet. If it is 1, ++ * the other end does not have to perform the calculation. ++ */ ++ uint8_t csum; ++ /* Segmentation offload. If this is 0, the other end must not ++ * generate any packet messages with a segmentation type other ++ * than NC2_PACKET_SEGMENTATION_TYPE_none. If it is 1, the ++ * other end may also generate packets with a type of ++ * NC2_PACKET_SEGMENTATION_TYPE_tcpv4. ++ */ ++ uint8_t tcpv4_segmentation_offload; ++ uint16_t reserved; ++}; ++ ++/* Set the maximum number of fragments which can be used in any packet ++ * (not including the inline prefix). Until this is sent, there can ++ * be at most one such fragment per packet. The maximum must not be ++ * set to zero. */ ++/* Note that there is no acknowledgement for this message, and so if ++ * an endpoint tries to reduce the number of fragments then it may ++ * continue to recieve over-fragmented packets for some time. The ++ * receiving endpoint is expected to deal with this. ++ */ ++#define NETCHANNEL2_MSG_SET_MAX_FRAGMENTS_PER_PACKET 5 ++struct netchannel2_msg_set_max_fragments_per_packet { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t max_frags_per_packet; ++}; ++ ++/* Attach to a bypass ring as a frontend. The receiving domain should ++ * map the bypass ring (which will be in the sending domain's memory) ++ * and attach to it in the same as it attached to the original ring. ++ * This bypass ring will, once it's been successfully set up, be used ++ * for all packets destined for @remote_mac (excluding broadcasts). 
++ * ++ * @ring_domid indicates which domain allocated the ring pages, and ++ * hence which domain should be specified when grant mapping ++ * @control_gref, @prod_gref, and @cons_gref. It can be set to ++ * DOMID_SELF, in which case the domain ID of the domain sending the ++ * message should be used. ++ * ++ * @peer_domid indicates the domain ID of the domain on the other end ++ * of the ring. ++ * ++ * @handle gives a unique handle for the bypass which will be used in ++ * future messages. ++ * ++ * @peer_trusted is true if the peer should be trusted by the domain ++ * which sent the bypass message. ++ * ++ * @ring_pages gives the number of valid grefs in the @prod_grefs and ++ * @cons_grefs arrays. ++ * ++ * @is_backend_like indicates which ring attach the receiving domain ++ * should use. If @is_backend_like is set, the receiving domain ++ * should interpret the control area as a netchannel2_backend_shared. ++ * Otherwise, it's a netchannel2_frontend_shared. Also, a ++ * backend-like endpoint should receive an event channel from the peer ++ * domain, while a frontend-like one should send one. Once ++ * established, the ring is symmetrical. ++ * ++ * ++ * BYPASS messages can only be sent by a trusted endpoint. They may ++ * not be sent over bypass rings. ++ * ++ * No packets may be sent over the ring until a READY message is ++ * received. Until that point, all packets must be sent over the ++ * parent ring. ++ */ ++struct netchannel2_msg_bypass_common { ++ uint16_t ring_domid; ++ uint16_t peer_domid; ++ uint32_t handle; ++ ++ uint8_t remote_mac[6]; ++ uint8_t peer_trusted; ++ uint8_t ring_pages; ++ ++ uint32_t control_gref; ++ uint32_t pad; ++ ++ /* Followed by a run of @ring_pages uint32_t producer ring ++ grant references, then a run of @ring_pages uint32_t ++ consumer ring grant references */ ++}; ++ ++#define NETCHANNEL2_MSG_BYPASS_FRONTEND 9 ++struct netchannel2_msg_bypass_frontend { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t pad; ++ struct netchannel2_msg_bypass_common common; ++}; ++ ++#define NETCHANNEL2_MSG_BYPASS_BACKEND 10 ++struct netchannel2_msg_bypass_backend { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t port; ++ struct netchannel2_msg_bypass_common common; ++}; ++ ++#define NETCHANNEL2_MSG_BYPASS_FRONTEND_READY 11 ++struct netchannel2_msg_bypass_frontend_ready { ++ struct netchannel2_msg_hdr hdr; ++ int32_t port; ++}; ++ ++/* This message is sent on a bypass ring once the sending domain is ++ * ready to receive packets. Until it has been received, the bypass ++ * ring cannot be used to transmit packets. It may only be sent once. ++ * ++ * Note that it is valid to send packet messages before *sending* a ++ * BYPASS_READY message, provided a BYPASS_READY message has been ++ * *received*. ++ * ++ * This message can only be sent on a bypass ring. ++ */ ++#define NETCHANNEL2_MSG_BYPASS_READY 12 ++struct netchannel2_msg_bypass_ready { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t pad; ++}; ++ ++/* Disable an existing bypass. This is sent over the *parent* ring, ++ * in the same direction as the original BYPASS message, when the ++ * bypassed domain wishes to disable the ring. The receiving domain ++ * should stop sending PACKET messages over the ring, wait for FINISH ++ * messages for any outstanding PACKETs, and then acknowledge this ++ * message with a DISABLED message. ++ * ++ * This message may not be sent on bypass rings. 
++ */ ++#define NETCHANNEL2_MSG_BYPASS_DISABLE 13 ++struct netchannel2_msg_bypass_disable { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t handle; ++}; ++#define NETCHANNEL2_MSG_BYPASS_DISABLED 14 ++struct netchannel2_msg_bypass_disabled { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t handle; ++}; ++ ++/* Detach from an existing bypass. This is sent over the *parent* in ++ * the same direction as the original BYPASS message, when the ++ * bypassed domain wishes to destroy the ring. The receiving domain ++ * should immediately unmap the ring and respond with a DETACHED ++ * message. Any PACKET messages which haven't already received a ++ * FINISH message are dropped. ++ * ++ * During a normal shutdown, this message will be sent after DISABLED ++ * messages have been received from both endpoints. However, it can ++ * also be sent without a preceding DISABLE message if the other ++ * endpoint appears to be misbehaving or has crashed. ++ * ++ * This message may not be sent on bypass rings. ++ */ ++#define NETCHANNEL2_MSG_BYPASS_DETACH 15 ++struct netchannel2_msg_bypass_detach { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t handle; ++}; ++#define NETCHANNEL2_MSG_BYPASS_DETACHED 16 ++struct netchannel2_msg_bypass_detached { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t handle; ++}; ++ ++#define NETCHANNEL2_MSG_SUGGEST_BYPASS 17 ++struct netchannel2_msg_suggest_bypass { ++ struct netchannel2_msg_hdr hdr; ++ unsigned char mac[6]; ++ uint16_t pad1; ++ uint32_t pad2; ++}; ++ ++#endif /* !__NETCHANNEL2_H__ */ +diff --git a/include/xen/interface/io/uring.h b/include/xen/interface/io/uring.h +new file mode 100644 +index 0000000..f5c4738 +--- /dev/null ++++ b/include/xen/interface/io/uring.h +@@ -0,0 +1,426 @@ ++#ifndef __XEN_PUBLIC_IO_URING_H__ ++#define __XEN_PUBLIC_IO_URING_H__ ++ ++#include ++#include ++#include ++ ++typedef unsigned RING_IDX; ++ ++#define NETCHANNEL2_MSG_PAD 255 ++ ++/* The sring structures themselves. The _cons and _prod variants are ++ different views of the same bit of shared memory, and are supposed ++ to provide better checking of the expected use patterns. Fields in ++ the shared ring are owned by either the producer end or the ++ consumer end. If a field is owned by your end, the other end will ++ never modify it. If it's owned by the other end, the other end is ++ allowed to modify it whenever it likes, and you can never do so. ++ ++ Fields owned by the other end are always const (because you can't ++ change them). They're also volatile, because there are a bunch ++ of places where we go: ++ ++ local_x = sring->x; ++ validate(local_x); ++ use(local_x); ++ ++ and it would be very bad if the compiler turned that into: ++ ++ local_x = sring->x; ++ validate(sring->x); ++ use(local_x); ++ ++ because that contains a potential TOCTOU race (hard to exploit, but ++ still present). The compiler is only allowed to do that ++ optimisation because it knows that local_x == sring->x at the start ++ of the call to validate(), and it only knows that if it can reorder ++ the read of sring->x over the sequence point at the end of the ++ first statement. In other words, it can only do the bad ++ optimisation if it knows that reads of sring->x are side-effect ++ free. volatile stops it from making that assumption. ++ ++ We don't need a full memory barrier here, because it's sufficient ++ to copy the volatile data into stable guest-local storage, and ++ volatile achieves that. i.e. 
we don't need local_x to be precisely ++ sring->x, but we do need it to be a stable snapshot of some ++ previous valud of sring->x. ++ ++ Note that there are still plenty of other places where we *do* need ++ full barriers. volatile just deals with this one, specific, case. ++ ++ We could also deal with it by putting compiler barriers in all over ++ the place. The downside of that approach is that you need to put ++ the barrier()s in lots of different places (basically, everywhere ++ which needs to access these fields), and it's easy to forget one. ++ barrier()s also have somewhat heavier semantics than volatile ++ (because they prevent all reordering, rather than just reordering ++ on this one field), although that's pretty much irrelevant because ++ gcc usually treats pretty much any volatile access as a call to ++ barrier(). ++*/ ++ ++/* Messages are sent over sring pairs. Each sring in a pair provides ++ * a unidirectional byte stream which can generate events when either ++ * the producer or consumer pointers cross a particular threshold. ++ * ++ * We define both sring_prod and sring_cons structures. The two ++ * structures will always map onto the same physical bytes in memory, ++ * but they provide different views of that memory which are ++ * appropriate to either producers or consumers. ++ * ++ * Obviously, the endpoints need to agree on which end produces ++ * messages on which ring. The endpoint which provided the memory ++ * backing the ring always produces on the first sring, and the one ++ * which just mapped the ring produces on the second. By convention, ++ * these are known as the frontend and backend, respectively. ++ */ ++ ++/* For both rings, the producer (consumer) pointers point at the ++ * *next* byte which is going to be produced (consumed). An endpoint ++ * must generate an event on the event channel port if it moves the ++ * producer pointer (consumer pointer) across prod_event (cons_event). ++ * ++ * i.e if an endpoint ever updates a pointer so that the old pointer ++ * is strictly less than the event, and the new pointer is greater ++ * than or equal to the event then the remote must be notified. If ++ * the pointer overflows the ring, treat the new value as if it were ++ * (actual new value) + (1 << 32). ++ */ ++struct netchannel2_sring_prod { ++ RING_IDX prod; ++ volatile const RING_IDX cons; ++ volatile const RING_IDX prod_event; ++ RING_IDX cons_event; ++ unsigned char pad[48]; ++}; ++ ++struct netchannel2_sring_cons { ++ volatile const RING_IDX prod; ++ RING_IDX cons; ++ RING_IDX prod_event; ++ volatile const RING_IDX cons_event; ++ unsigned char pad[48]; ++}; ++ ++struct netchannel2_frontend_shared { ++ struct netchannel2_sring_prod prod; ++ struct netchannel2_sring_cons cons; ++}; ++ ++struct netchannel2_backend_shared { ++ struct netchannel2_sring_cons cons; ++ struct netchannel2_sring_prod prod; ++}; ++ ++struct netchannel2_prod_ring { ++ struct netchannel2_sring_prod *sring; ++ void *payload; ++ RING_IDX prod_pvt; ++ /* This is the number of bytes available after prod_pvt last ++ time we checked, minus the number of bytes which we've ++ consumed since then. It's used to a avoid a bunch of ++ memory barriers when checking for ring space. 
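++	   In other words it is intended as a conservative under-estimate
++	   of the true free space: the shared consumer pointer only ever
++	   moves forwards, so there is always at least this much room.
++	   nc2_can_send_payload_bytes() refreshes it from sring->cons
++	   when the cached value looks too small.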
*/ ++ unsigned bytes_available; ++ /* Number of bytes reserved by nc2_reserve_payload_bytes() */ ++ unsigned reserve; ++ size_t payload_bytes; ++}; ++ ++struct netchannel2_cons_ring { ++ struct netchannel2_sring_cons *sring; ++ const volatile void *payload; ++ RING_IDX cons_pvt; ++ size_t payload_bytes; ++}; ++ ++/* A message header. There is one of these at the start of every ++ * message. @type is one of the #define's below, and @size is the ++ * size of the message, including the header and any padding. ++ * size should be a multiple of 8 so we avoid unaligned memory copies. ++ * structs defining message formats should have sizes multiple of 8 ++ * bytes and should use paddding fields if needed. ++ */ ++struct netchannel2_msg_hdr { ++ uint8_t type; ++ uint8_t flags; ++ uint16_t size; ++}; ++ ++/* Copy some bytes from the shared ring to a stable local buffer, ++ * starting at the private consumer pointer. Does not update the ++ * private consumer pointer. ++ */ ++static inline void nc2_copy_from_ring_off(struct netchannel2_cons_ring *ring, ++ void *buf, ++ size_t nbytes, ++ unsigned off) ++{ ++ unsigned start, end; ++ ++ start = (ring->cons_pvt + off) & (ring->payload_bytes-1); ++ end = (ring->cons_pvt + nbytes + off) & (ring->payload_bytes-1); ++ /* We cast away the volatile modifier to get rid of an ++ irritating compiler warning, and compensate with a ++ barrier() at the end. */ ++ memcpy(buf, (const void *)ring->payload + start, nbytes); ++ barrier(); ++} ++ ++static inline void nc2_copy_from_ring(struct netchannel2_cons_ring *ring, ++ void *buf, ++ size_t nbytes) ++{ ++ nc2_copy_from_ring_off(ring, buf, nbytes, 0); ++} ++ ++ ++/* Copy some bytes to the shared ring, starting at the private ++ * producer pointer. Does not update the private pointer. ++ */ ++static inline void nc2_copy_to_ring_off(struct netchannel2_prod_ring *ring, ++ const void *src, ++ unsigned nr_bytes, ++ unsigned off) ++{ ++ unsigned start, end; ++ ++ start = (ring->prod_pvt + off) & (ring->payload_bytes-1); ++ end = (ring->prod_pvt + nr_bytes + off) & (ring->payload_bytes-1); ++ memcpy(ring->payload + start, src, nr_bytes); ++} ++ ++static inline void nc2_copy_to_ring(struct netchannel2_prod_ring *ring, ++ const void *src, ++ unsigned nr_bytes) ++{ ++ nc2_copy_to_ring_off(ring, src, nr_bytes, 0); ++} ++ ++static inline void __nc2_send_pad(struct netchannel2_prod_ring *ring, ++ unsigned nr_bytes) ++{ ++ struct netchannel2_msg_hdr msg; ++ msg.type = NETCHANNEL2_MSG_PAD; ++ msg.flags = 0; ++ msg.size = nr_bytes; ++ nc2_copy_to_ring(ring, &msg, sizeof(msg)); ++ ring->prod_pvt += nr_bytes; ++ ring->bytes_available -= nr_bytes; ++} ++ ++static inline int __nc2_ring_would_wrap(struct netchannel2_prod_ring *ring, ++ unsigned nr_bytes) ++{ ++ RING_IDX mask; ++ mask = ~(ring->payload_bytes - 1); ++ return (ring->prod_pvt & mask) != ((ring->prod_pvt + nr_bytes) & mask); ++} ++ ++static inline unsigned __nc2_pad_needed(struct netchannel2_prod_ring *ring) ++{ ++ return ring->payload_bytes - ++ (ring->prod_pvt & (ring->payload_bytes - 1)); ++} ++ ++static inline void __nc2_avoid_ring_wrap(struct netchannel2_prod_ring *ring, ++ unsigned nr_bytes) ++{ ++ if (!__nc2_ring_would_wrap(ring, nr_bytes)) ++ return; ++ __nc2_send_pad(ring, __nc2_pad_needed(ring)); ++ ++} ++ ++/* Prepare a message for the other end and place it on the shared ++ * ring, updating the private producer pointer. You need to call ++ * nc2_flush_messages() before the message is actually made visible to ++ * the other end. 
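++ *
++ * An illustrative sketch of the send path, given a
++ * struct netchannel2_prod_ring *ring (nc2_flush_ring() below is the
++ * header-level flush; notify_the_other_end() is a placeholder for
++ * whatever event-channel kick the caller uses, and is not provided
++ * by this header):
++ *
++ *	struct netchannel2_msg_set_offload m;
++ *
++ *	memset(&m, 0, sizeof(m));
++ *	m.csum = 1;
++ *	if (nc2_can_send_payload_bytes(ring, sizeof(m))) {
++ *		nc2_send_message(ring, NETCHANNEL2_MSG_SET_OFFLOAD,
++ *				 0, &m, sizeof(m));
++ *		if (nc2_flush_ring(ring))
++ *			notify_the_other_end();
++ *	}
++ *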
It is permissible to send several messages in a ++ * batch and only flush them once. ++ */ ++static inline void nc2_send_message(struct netchannel2_prod_ring *ring, ++ unsigned type, ++ unsigned flags, ++ const void *msg, ++ size_t size) ++{ ++ struct netchannel2_msg_hdr *hdr = (struct netchannel2_msg_hdr *)msg; ++ ++ __nc2_avoid_ring_wrap(ring, size); ++ ++ hdr->type = type; ++ hdr->flags = flags; ++ hdr->size = size; ++ ++ nc2_copy_to_ring(ring, msg, size); ++ ring->prod_pvt += size; ++ BUG_ON(ring->bytes_available < size); ++ ring->bytes_available -= size; ++} ++ ++static inline volatile void *__nc2_get_message_ptr(struct netchannel2_prod_ring *ncrp) ++{ ++ return (volatile void *)ncrp->payload + ++ (ncrp->prod_pvt & (ncrp->payload_bytes-1)); ++} ++ ++/* Copy the private producer pointer to the shared producer pointer, ++ * with a suitable memory barrier such that all messages placed on the ++ * ring are stable before we do the copy. This effectively pushes any ++ * messages which we've just sent out to the other end. Returns 1 if ++ * we need to notify the other end and 0 otherwise. ++ */ ++static inline int nc2_flush_ring(struct netchannel2_prod_ring *ring) ++{ ++ RING_IDX old_prod, new_prod; ++ ++ old_prod = ring->sring->prod; ++ new_prod = ring->prod_pvt; ++ ++ wmb(); ++ ++ ring->sring->prod = new_prod; ++ ++ /* We need the update to prod to happen before we read ++ * event. */ ++ mb(); ++ ++ /* We notify if the producer pointer moves across the event ++ * pointer. */ ++ if ((RING_IDX)(new_prod - ring->sring->prod_event) < ++ (RING_IDX)(new_prod - old_prod)) ++ return 1; ++ else ++ return 0; ++} ++ ++/* Copy the private consumer pointer to the shared consumer pointer, ++ * with a memory barrier so that any previous reads from the ring ++ * complete before the pointer is updated. This tells the other end ++ * that we're finished with the messages, and that it can re-use the ++ * ring space for more messages. Returns 1 if we need to notify the ++ * other end and 0 otherwise. ++ */ ++static inline int nc2_finish_messages(struct netchannel2_cons_ring *ring) ++{ ++ RING_IDX old_cons, new_cons; ++ ++ old_cons = ring->sring->cons; ++ new_cons = ring->cons_pvt; ++ ++ /* Need to finish reading from the ring before updating ++ cons */ ++ mb(); ++ ring->sring->cons = ring->cons_pvt; ++ ++ /* Need to publish our new consumer pointer before checking ++ event. */ ++ mb(); ++ if ((RING_IDX)(new_cons - ring->sring->cons_event) < ++ (RING_IDX)(new_cons - old_cons)) ++ return 1; ++ else ++ return 0; ++} ++ ++/* Check whether there are any unconsumed messages left on the shared ++ * ring. Returns 1 if there are, and 0 if there aren't. If there are ++ * no more messages, set the producer event so that we'll get a ++ * notification as soon as another one gets sent. It is assumed that ++ * all messages up to @prod have been processed, and none of the ones ++ * after it have been. */ ++static inline int nc2_final_check_for_messages(struct netchannel2_cons_ring *ring, ++ RING_IDX prod) ++{ ++ if (prod != ring->sring->prod) ++ return 1; ++ /* Request an event when more stuff gets poked on the ring. */ ++ ring->sring->prod_event = prod + 1; ++ ++ /* Publish event before final check for responses. */ ++ mb(); ++ if (prod != ring->sring->prod) ++ return 1; ++ else ++ return 0; ++} ++ ++/* Can we send a message with @nr_bytes payload bytes? Returns 1 if ++ * we can or 0 if we can't. If there isn't space right now, set the ++ * consumer event so that we'll get notified when space is ++ * available. 
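++ *
++ * Note that a return of 0 is more than just "no space right now": it
++ * also guarantees that the consumer event has been set, so the caller
++ * can simply back off and retry from its event handler instead of
++ * polling the ring.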
*/ ++static inline int nc2_can_send_payload_bytes(struct netchannel2_prod_ring *ring, ++ unsigned nr_bytes) ++{ ++ unsigned space; ++ RING_IDX cons; ++ BUG_ON(ring->bytes_available > ring->payload_bytes); ++ /* Times 2 because we might need to send a pad message */ ++ if (likely(ring->bytes_available > nr_bytes * 2 + ring->reserve)) ++ return 1; ++ if (__nc2_ring_would_wrap(ring, nr_bytes)) ++ nr_bytes += __nc2_pad_needed(ring); ++retry: ++ cons = ring->sring->cons; ++ space = ring->payload_bytes - (ring->prod_pvt - cons); ++ if (likely(space >= nr_bytes + ring->reserve)) { ++ /* We have enough space to send the message. */ ++ ++ /* Need to make sure that the read of cons happens ++ before any following memory writes. */ ++ mb(); ++ ++ ring->bytes_available = space; ++ ++ return 1; ++ } else { ++ /* Not enough space available. Set an event pointer ++ when cons changes. We need to be sure that the ++ @cons used here is the same as the cons used to ++ calculate @space above, and the volatile modifier ++ on sring->cons achieves that. */ ++ ring->sring->cons_event = cons + 1; ++ ++ /* Check whether more space became available while we ++ were messing about. */ ++ ++ /* Need the event pointer to be stable before we do ++ the check. */ ++ mb(); ++ if (unlikely(cons != ring->sring->cons)) { ++ /* Cons pointer changed. Try again. */ ++ goto retry; ++ } ++ ++ /* There definitely isn't space on the ring now, and ++ an event has been set such that we'll be notified ++ if more space becomes available. */ ++ /* XXX we get a notification as soon as any more space ++ becomes available. We could maybe optimise by ++ setting the event such that we only get notified ++ when we know that enough space is available. The ++ main complication is handling the case where you ++ try to send a message of size A, fail due to lack ++ of space, and then try to send one of size B, where ++ B < A. It's not clear whether you want to set the ++ event for A bytes or B bytes. The obvious answer ++ is B, but that means moving the event pointer ++ backwards, and it's not clear that that's always ++ safe. Always setting for a single byte is safe, so ++ stick with that for now. 
*/ ++ return 0; ++ } ++} ++ ++static inline int nc2_reserve_payload_bytes(struct netchannel2_prod_ring *ring, ++ unsigned nr_bytes) ++{ ++ if (nc2_can_send_payload_bytes(ring, nr_bytes)) { ++ ring->reserve += nr_bytes; ++ return 1; ++ } else { ++ return 0; ++ } ++} ++ ++#endif /* __XEN_PUBLIC_IO_URING_H__ */ diff --git a/master/netchannel2_vmq b/master/netchannel2_vmq new file mode 100644 index 0000000..4decf32 --- /dev/null +++ b/master/netchannel2_vmq @@ -0,0 +1,2828 @@ + + VMQ support for netchannel2 + + Signed-off-by: Steven Smith + +diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig +index 7d3033b..1d8ef04 100644 +--- a/drivers/xen/Kconfig ++++ b/drivers/xen/Kconfig +@@ -234,6 +234,11 @@ config XEN_NETDEV2_FRONTEND + depends on XEN_NETCHANNEL2 + default y + ++config XEN_NETDEV2_VMQ ++ bool "Net channel 2 support for multi-queue devices" ++ depends on XEN_NETDEV2_BACKEND && NET_VMQ ++ default y ++ + config XEN_NETDEV2_BYPASSABLE + bool "Net channel 2 bypassee support" + depends on XEN_NETDEV2_BACKEND +diff --git a/drivers/xen/netchannel2/Makefile b/drivers/xen/netchannel2/Makefile +index d779de5..16e566d 100644 +--- a/drivers/xen/netchannel2/Makefile ++++ b/drivers/xen/netchannel2/Makefile +@@ -1,7 +1,7 @@ + obj-$(CONFIG_XEN_NETCHANNEL2) += netchannel2.o + + netchannel2-objs := chan.o netchan2.o rscb.o util.o \ +- xmit_packet.o offload.o recv_packet.o poll.o \ ++ posted_buffers.o xmit_packet.o offload.o recv_packet.o poll.o \ + receiver_map.o + + ifeq ($(CONFIG_XEN_NETDEV2_BACKEND),y) +@@ -12,6 +12,10 @@ ifeq ($(CONFIG_XEN_NETDEV2_FRONTEND),y) + netchannel2-objs += netfront2.o + endif + ++ifeq ($(CONFIG_XEN_NETDEV2_VMQ),y) ++netchannel2-objs += vmq.o ++endif ++ + ifeq ($(CONFIG_XEN_NETDEV2_BYPASSABLE),y) + netchannel2-objs += bypassee.o + endif +diff --git a/drivers/xen/netchannel2/chan.c b/drivers/xen/netchannel2/chan.c +index 03913a3..89465e1 100644 +--- a/drivers/xen/netchannel2/chan.c ++++ b/drivers/xen/netchannel2/chan.c +@@ -13,6 +13,7 @@ + + #include "netchannel2_endpoint.h" + #include "netchannel2_core.h" ++#include "vmq.h" + + static int process_ring(struct napi_struct *napi, + int work_avail); +@@ -89,6 +90,15 @@ retry: + nc2_handle_set_max_fragments_per_packet(nc, ncrp, + &hdr); + break; ++ case NETCHANNEL2_MSG_POST_BUFFER: ++ nc2_handle_post_buffer(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_RETURN_POSTED_BUFFER: ++ nc2_handle_return_posted_buffer(nc, ncrp, &hdr); ++ break; ++ case NETCHANNEL2_MSG_SET_NR_POSTED_BUFFERS: ++ nc2_handle_set_nr_posted_buffers(nc, ncrp, &hdr); ++ break; + case NETCHANNEL2_MSG_BYPASS_FRONTEND: + nc2_handle_bypass_frontend(nc, ncrp, &hdr); + break; +@@ -172,8 +182,12 @@ static void flush_rings(struct netchannel2_ring_pair *ncrp) + advertise_max_fragments_per_packet(ncrp); + + if (ncrp == &nc->rings) { ++ nc2_replenish_rx_buffers(nc); ++ nc2_return_pending_posted_buffers(nc); + if (nc->need_advertise_offloads) + advertise_offloads(nc); ++ if (nc->need_advertise_tx_buffers) ++ nc2_advertise_tx_buffers(nc); + nc2_advertise_bypasses(nc); + nc2_crank_aux_ring_state_machine(nc); + nc2_autobypass_make_suggestions(nc); +@@ -454,6 +468,13 @@ struct netchannel2 *nc2_new(struct xenbus_device *xd) + nc2_release(nc); + return NULL; + } ++ INIT_LIST_HEAD(&nc->rx_buffers); ++ INIT_LIST_HEAD(&nc->unused_rx_buffers); ++ INIT_LIST_HEAD(&nc->unposted_rx_buffers); ++ INIT_LIST_HEAD(&nc->avail_tx_buffers); ++ nc->nr_avail_tx_buffers = 0; ++ INIT_LIST_HEAD(&nc->unused_tx_buffer_slots); ++ INIT_LIST_HEAD(&nc->pending_tx_buffer_return); + + if (local_trusted) 
{ + if (init_receive_map_mode() < 0) { +@@ -513,8 +534,13 @@ void nc2_release(struct netchannel2 *nc) + + nc2_queue_purge(&nc->rings, &nc->pending_skbs); + ++ /* Should have been released when we detached. */ ++ BUG_ON(nc->rx_buffer_structs); ++ + release_bypasses(nc); + ++ unprepare_tx_buffers(nc); ++ + free_netdev(nc->net_device); + } + +@@ -604,6 +630,9 @@ int nc2_attach_rings(struct netchannel2 *nc, + + static void _detach_rings(struct netchannel2_ring_pair *ncrp) + { ++ if (ncrp == &ncrp->interface->rings) ++ nc2_posted_buffer_rx_forget(ncrp->interface); ++ + spin_lock_bh(&ncrp->lock); + /* We need to release all of the pending transmission packets, + because they're never going to complete now that we've lost +@@ -782,6 +811,8 @@ static int process_ring(struct napi_struct *napi, + /* Pick up incoming messages. */ + work_done = nc2_poll(ncrp, work_avail, &rx_queue); + ++ do_vmq_work(nc); ++ + /* Transmit pending packets. */ + if (!skb_queue_empty(&ncrp->pending_tx_queue)) { + skb = __skb_dequeue(&ncrp->pending_tx_queue); +@@ -795,6 +826,17 @@ static int process_ring(struct napi_struct *napi, + skb = __skb_dequeue(&ncrp->pending_tx_queue); + } while (skb != NULL); + ++ /* If we've transmitted on the main ring then we may ++ have made use of the hypercall batcher. Flush it. ++ This must happen before we flush the rings, since ++ that's when the PACKET messages will be made ++ visible to the other end. */ ++ if (ncrp == &nc->rings) { ++ flush_hypercall_batcher(&nc->batcher, ++ nc2_posted_on_gntcopy_fail); ++ vmq_flush_unmap_hypercall(); ++ } ++ + flush_rings(ncrp); + + while ((skb = __skb_dequeue(&ncrp->release_on_flush_batcher))) +diff --git a/drivers/xen/netchannel2/netback2.c b/drivers/xen/netchannel2/netback2.c +index 844f452..4790834 100644 +--- a/drivers/xen/netchannel2/netback2.c ++++ b/drivers/xen/netchannel2/netback2.c +@@ -10,6 +10,13 @@ + #include "netchannel2_core.h" + #include "netchannel2_endpoint.h" + #include "netchannel2_uspace.h" ++#include "vmq.h" ++ ++#ifdef CONFIG_XEN_NETDEV2_VMQ ++#define NR_TX_BUFS (VMQ_MAX_BUFFERS+256) ++#else ++#define NR_TX_BUFS 256 ++#endif + + static atomic_t next_handle; + /* A list of all currently-live netback2 interfaces. */ +@@ -166,12 +173,21 @@ static int attach_to_frontend(struct netback2 *nd) + return err; + } + ++ nc2_vmq_connect(nc); ++ + /* All done */ + nd->attached = 1; + + return 0; + } + ++static void nb2_shutdown(struct netchannel2 *nc) ++{ ++ nc2_vmq_disconnect(nc); ++ ++ nc2_set_nr_tx_buffers(nc, 0); ++} ++ + static void frontend_changed(struct xenbus_device *xd, + enum xenbus_state frontend_state) + { +@@ -189,6 +205,8 @@ static void frontend_changed(struct xenbus_device *xd, + * detached, and this is pointless but harmless.) */ + detach_from_frontend(nb); + ++ nc2_set_nr_tx_buffers(nb->chan, NR_TX_BUFS); ++ + /* Tell the frontend what sort of rings we're willing + to accept. 
*/ + xenbus_printf(XBT_NIL, nb->xenbus_device->nodename, +@@ -222,6 +240,7 @@ static void frontend_changed(struct xenbus_device *xd, + break; + + case XenbusStateClosing: ++ nb2_shutdown(nb->chan); + detach_from_frontend(nb); + xenbus_switch_state(xd, XenbusStateClosed); + break; +@@ -257,6 +276,8 @@ static int netback2_uevent(struct xenbus_device *xd, + + static void netback2_shutdown(struct xenbus_device *xd) + { ++ struct netback2 *nb = xenbus_device_to_nb2(xd); ++ nb2_shutdown(nb->chan); + xenbus_switch_state(xd, XenbusStateClosing); + } + +diff --git a/drivers/xen/netchannel2/netchannel2_core.h b/drivers/xen/netchannel2/netchannel2_core.h +index 7a62af4..49dbca7 100644 +--- a/drivers/xen/netchannel2/netchannel2_core.h ++++ b/drivers/xen/netchannel2/netchannel2_core.h +@@ -7,6 +7,8 @@ + #include + #include + ++#include "vmq_def.h" ++ + /* After we send this number of frags, we request the other end to + * notify us when sending the corresponding finish packet message */ + #define MAX_MAX_COUNT_FRAGS_NO_EVENT 192 +@@ -23,6 +25,10 @@ + * pointer; see the txp_slot stuff later. */ + #define NR_TX_PACKETS 256 + ++/* No matter what the other end wants, we never post more than this ++ number of RX buffers to it. */ ++#define MAX_POSTED_BUFFERS (2048+256) ++ + /* A way of keeping track of a mapping of a bunch of grant references + into a contigous chunk of virtual address space. This is used for + things like multi-page rings. */ +@@ -37,7 +43,11 @@ enum transmit_policy { + transmit_policy_unknown = 0, + transmit_policy_first = 0xf001, + transmit_policy_grant = transmit_policy_first, ++ transmit_policy_post, + transmit_policy_map, ++#ifdef CONFIG_XEN_NETDEV2_VMQ ++ transmit_policy_vmq, ++#endif + transmit_policy_small, + transmit_policy_last = transmit_policy_small + }; +@@ -89,6 +99,8 @@ static inline nc2_txp_index_t txp_get_next_free(struct txp_slot *slot) + + /* This goes in struct sk_buff::cb */ + struct skb_cb_overlay { ++ struct list_head buffers; /* Only if we're using the posted ++ buffer strategy. */ + struct txp_slot *tp; + unsigned nr_fragments; + grant_ref_t gref_pool; +@@ -370,11 +382,72 @@ struct netchannel2 { + struct nc2_incoming_bypass_suggestions incoming_bypass_suggestions; + #endif + ++ /* Infrastructure for managing buffers which we've posted to ++ the other end. These are all protected by the lock. */ ++ /* A list of nx2_rx_buffer structures, threaded on list, which ++ we've posted to the other end. */ ++ struct list_head rx_buffers; ++ /* Buffers which we've allocated but not yet sent to the other ++ end. */ ++ struct list_head unposted_rx_buffers; ++ /* Buffers which are available but not yet allocated. */ ++ struct list_head unused_rx_buffers; ++ /* The number of buffers in the rx_buffers list. */ ++ unsigned nr_rx_buffers; ++ /* The maximum number of buffers which we can ever have ++ outstanding, and the size of the rx_buffer_structs ++ array. */ ++ unsigned max_nr_rx_buffers; ++ /* A bunch of nc2_rx_buffer structures which can be used for ++ RX buffers. */ ++ struct nc2_rx_buffer *rx_buffer_structs; ++ /* Set if we're sufficiently far through device shutdown that ++ posting more RX buffers would be a bad idea. */ ++ uint8_t dont_post_buffers; ++ ++ /* Infrastructure for managing buffers which the other end has ++ posted to us. Protected by the lock. */ ++ /* A list of nc2_tx_buffer structures, threaded on list, which ++ contains all tx buffers which have been posted by the ++ remote. 
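++	   Slots move here from unused_tx_buffer_slots when a
++	   POST_BUFFER message arrives, and are removed again when a
++	   packet is transmitted using the posted-buffer policy (see
++	   grab_tx_buffers() in posted_buffers.c).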
*/ ++ struct list_head avail_tx_buffers; ++ /* A list of nc2_tx_buffer structures which the other end ++ hasn't populated yet. */ ++ struct list_head unused_tx_buffer_slots; ++ /* A list of nc2_tx_buffer structures which we need to return ++ to the other end. */ ++ struct list_head pending_tx_buffer_return; ++ /* Some pre-allocated nc2_tx_buffer structures. We have to ++ pre-allocate, because we always need to be able to respond ++ to a POST_BUFFER message (up to some limit). */ ++ struct nc2_tx_buffer *tx_buffers; ++ /* Non-zero if we need to send the other end a ++ SET_NR_POSTED_BUFFERS message. */ ++ uint8_t need_advertise_tx_buffers; ++ /* Number of tx buffers. This is the actual number of slots ++ in the @tx_buffers array. */ ++ uint32_t nr_tx_buffers; ++ /* Number of available tx buffers. The length of the ++ * avail_tx_buffers list. */ ++ uint32_t nr_avail_tx_buffers; ++ /* ``Configured'' number of tx buffers. We only actually ++ allocate any TX buffers when the local interface is up, but ++ this is set to the desired number of buffers all the ++ time. */ ++ uint32_t configured_nr_tx_buffers; ++ + /* Updates are protected by the lock. This can be read at any + * time without holding any locks, and the rest of Linux is + * expected to cope. */ + struct net_device_stats stats; + ++ struct hypercall_batcher batcher; ++ ++#ifdef CONFIG_XEN_NETDEV2_VMQ ++ /* vmq data for supporting multi-queue devices */ ++ nc2_vmq_t vmq; ++#endif ++ + #ifdef CONFIG_XEN_NETDEV2_AUTOMATIC_BYPASS + struct nc2_auto_bypass auto_bypass; + #endif +@@ -681,11 +754,26 @@ struct sk_buff *handle_receiver_copy_packet(struct netchannel2 *nc, + struct netchannel2_msg_hdr *hdr, + unsigned nr_frags, + unsigned frags_off); ++struct sk_buff *handle_pre_posted_packet(struct netchannel2 *nc, ++ struct netchannel2_msg_packet *msg, ++ struct netchannel2_msg_hdr *hdr, ++ unsigned nr_frags, ++ unsigned frags_off); + struct sk_buff *handle_receiver_map_packet(struct netchannel2 *nc, + struct netchannel2_msg_packet *msg, + struct netchannel2_msg_hdr *hdr, + unsigned nr_frags, + unsigned frags_off); ++void nc2_handle_return_posted_buffer(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_handle_post_buffer(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_handle_set_nr_posted_buffers(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr); ++void nc2_advertise_tx_buffers(struct netchannel2 *nc); + + enum prepare_xmit_result { + PREP_XMIT_OKAY = 0, +@@ -705,9 +793,20 @@ void xmit_grant(struct netchannel2_ring_pair *ncrp, + int use_subpage_grants, + volatile void *msg); + ++int prepare_xmit_allocate_post(struct netchannel2 *nc, ++ struct sk_buff *skb); ++void xmit_post(struct netchannel2 *nc, ++ struct sk_buff *skb, ++ volatile void *msg); ++ ++void nc2_replenish_rx_buffers(struct netchannel2 *nc); ++ + void queue_finish_packet_message(struct netchannel2_ring_pair *ncrp, + uint32_t id, uint8_t flags); + ++void nc2_return_pending_posted_buffers(struct netchannel2 *nc); ++void nc2_posted_buffer_rx_forget(struct netchannel2 *nc); ++ + int allocate_txp_slot(struct netchannel2_ring_pair *ncrp, + struct sk_buff *skb); + void release_txp_slot(struct netchannel2_ring_pair *ncrp, +@@ -716,6 +815,8 @@ void release_txp_slot(struct netchannel2_ring_pair *ncrp, + void release_tx_packet(struct netchannel2_ring_pair *ncrp, + struct sk_buff *skb); + ++void unprepare_tx_buffers(struct 
netchannel2 *nc); ++ + void fetch_fragment(struct netchannel2_ring_pair *ncrp, + unsigned idx, + struct netchannel2_fragment *frag, +@@ -750,6 +851,7 @@ irqreturn_t nc2_int(int irq, void *dev_id); + + void cleanup_ring_pair(struct netchannel2_ring_pair *ncrp); + void nc2_rscb_on_gntcopy_fail(void *ctxt, gnttab_copy_t *gop); ++void nc2_posted_on_gntcopy_fail(void *ctxt, gnttab_copy_t *gop); + + int init_receive_map_mode(void); + void deinit_receive_map_mode(void); +diff --git a/drivers/xen/netchannel2/netfront2.c b/drivers/xen/netchannel2/netfront2.c +index 13d94e4..c618531 100644 +--- a/drivers/xen/netchannel2/netfront2.c ++++ b/drivers/xen/netchannel2/netfront2.c +@@ -356,6 +356,8 @@ static void backend_changed(struct xenbus_device *xd, + /* Backend has advertised the ring protocol. Allocate + the rings, and tell the backend about them. */ + ++ nc2_set_nr_tx_buffers(nf->chan, 0); ++ + err = 0; + if (!nf->attached) + err = allocate_rings(nf, xd->otherend_id); +diff --git a/drivers/xen/netchannel2/posted_buffer.h b/drivers/xen/netchannel2/posted_buffer.h +new file mode 100644 +index 0000000..38e733c +--- /dev/null ++++ b/drivers/xen/netchannel2/posted_buffer.h +@@ -0,0 +1,50 @@ ++/* Buffer management related bits, shared between vmq.c and ++ * posted_buffer.c */ ++#ifndef NC2_POSTED_BUFFER_H__ ++#define NC2_POSTED_BUFFER_H__ ++ ++/* A buffer which the other end has provided us which we can use to ++ transmit packets to it. */ ++struct nc2_tx_buffer { ++ struct list_head list; ++ uint32_t id; /* ID assigned by the remote endpoint. */ ++ grant_ref_t gref; ++ uint16_t off_in_page; ++ uint16_t size; ++ grant_handle_t grant_handle; ++}; ++ ++/* add a buffer to the pending list to be returned to the other end buffer */ ++static inline void return_tx_buffer(struct netchannel2 *nc, ++ struct nc2_tx_buffer *buffer) ++{ ++ list_add(&buffer->list, &nc->pending_tx_buffer_return); ++} ++ ++static inline struct nc2_tx_buffer *_get_tx_buffer(struct netchannel2 *nc) ++{ ++ struct nc2_tx_buffer *buffer; ++ struct list_head *entry = nc->avail_tx_buffers.next; ++ list_del(entry); ++ buffer = list_entry(entry, struct nc2_tx_buffer, list); ++ nc->nr_avail_tx_buffers--; ++ return buffer; ++} ++ ++/* recycle a posted buffer: return it to the list of available buffers */ ++static inline void recycle_tx_buffer(struct netchannel2 *nc, ++ struct nc2_tx_buffer *buffer) ++{ ++ list_add(&buffer->list, &nc->avail_tx_buffers); ++ nc->nr_avail_tx_buffers++; ++} ++ ++/* add a buffer slot to list of unused buffer slots after it has been ++ * returned to other end */ ++static inline void free_tx_buffer(struct netchannel2 *nc, ++ struct nc2_tx_buffer *buffer) ++{ ++ list_add(&buffer->list, &nc->unused_tx_buffer_slots); ++} ++ ++#endif /* !NC2_POSTED_BUFFER_H__ */ +diff --git a/drivers/xen/netchannel2/posted_buffers.c b/drivers/xen/netchannel2/posted_buffers.c +new file mode 100644 +index 0000000..74cba90 +--- /dev/null ++++ b/drivers/xen/netchannel2/posted_buffers.c +@@ -0,0 +1,763 @@ ++/* Support for receiver-posted buffers */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "netchannel2_endpoint.h" ++#include "netchannel2_core.h" ++#include "posted_buffer.h" ++ ++#define POSTED_BUFFER_SIZE PAGE_SIZE ++ ++/* A poison value to make certain buffer management errors more ++ * obvious. 
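++ *
++ * Buffer ids put on the wire are (array index + RX_BUFFER_BIAS);
++ * find_rx_buffer() subtracts the bias and range-checks the result, so
++ * a stray un-biased id such as 3 turns into a huge index and is
++ * rejected rather than silently hitting the wrong buffer.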
*/ ++#define RX_BUFFER_BIAS 0xbeef0000 ++ ++static void prepare_tx_buffers(struct netchannel2 *nc); ++ ++/* --------------------------- Receive -------------------------------- */ ++ ++/* A buffer which we have allocated for the other end to send us ++ packets in. */ ++struct nc2_rx_buffer { ++ struct list_head list; ++ void *buffer; ++ grant_ref_t gref; ++ uint8_t is_posted; /* Set if this buffer is available to the ++ other end. */ ++}; ++ ++/* The other end just sent us a buffer id. Convert it back to an ++ nc2_rx_buffer structure. Returns NULL if the id is invalid, or if ++ it isn't currently owned by the other end. */ ++static struct nc2_rx_buffer *find_rx_buffer(struct netchannel2 *nc, ++ uint32_t id) ++{ ++ struct nc2_rx_buffer *rxb; ++ id -= RX_BUFFER_BIAS; ++ if (id >= nc->max_nr_rx_buffers) ++ return NULL; ++ rxb = &nc->rx_buffer_structs[id]; ++ if (rxb->is_posted) ++ return rxb; ++ else ++ return NULL; ++} ++ ++/* Post a buffer to the other endpoint immediately. Assumes that the ++ caller has already checked that there is enough space available on ++ the ring. */ ++static void _nc2_post_buffer(struct netchannel2 *nc, ++ struct nc2_rx_buffer *rxb) ++{ ++ struct netchannel2_msg_post_buffer msg; ++ ++ BUG_ON(!nc->remote_trusted); ++ ++ msg.id = rxb - nc->rx_buffer_structs + RX_BUFFER_BIAS; ++ msg.gref = rxb->gref; ++ msg.off_in_page = offset_in_page(rxb->buffer); ++ msg.size = POSTED_BUFFER_SIZE; ++ ++ nc2_send_message(&nc->rings.prod_ring, NETCHANNEL2_MSG_POST_BUFFER, ++ 0, &msg, sizeof(msg)); ++} ++ ++/* Push out all pending buffer posts, until the ring becomes full or ++ we run out of buffers to post. Called under the lock. */ ++static void push_rx_buffer_posts(struct netchannel2 *nc) ++{ ++ struct nc2_rx_buffer *buf; ++ ++ while (!list_empty(&nc->unposted_rx_buffers) && ++ nc2_can_send_payload_bytes(&nc->rings.prod_ring, ++ sizeof(struct netchannel2_msg_post_buffer))) { ++ buf = list_entry(nc->unposted_rx_buffers.next, ++ struct nc2_rx_buffer, ++ list); ++ _nc2_post_buffer(nc, buf); ++ buf->is_posted = 1; ++ list_move(&buf->list, &nc->rx_buffers); ++ nc->nr_rx_buffers++; ++ ++ nc->rings.pending_time_sensitive_messages = 1; ++ } ++} ++ ++/* Allocate more RX buffers until we reach our target number of RX ++ buffers and post them to the other endpoint. Call under the ++ lock. */ ++void nc2_replenish_rx_buffers(struct netchannel2 *nc) ++{ ++ struct nc2_rx_buffer *rb; ++ ++ if (nc->dont_post_buffers || !nc->remote_trusted) ++ return; ++ ++ while (!list_empty(&nc->unused_rx_buffers)) { ++ rb = list_entry(nc->unused_rx_buffers.next, ++ struct nc2_rx_buffer, ++ list); ++ rb->buffer = (void *)__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ++ 0); ++ if (!rb->buffer) ++ break; ++ rb->gref = ++ gnttab_grant_foreign_access(nc->rings.otherend_id, ++ virt_to_mfn(rb->buffer), ++ 0); ++ if ((int)rb->gref < 0) { ++ free_page((unsigned long)rb->buffer); ++ break; ++ } ++ ++ list_move(&rb->list, &nc->unposted_rx_buffers); ++ } ++ ++ push_rx_buffer_posts(nc); ++} ++ ++/* The other endpoint has used @rxb to transmit part of the packet ++ which we're goign to represent by @skb. Attach it to the packet's ++ fragment list. The caller should make sure that @skb currently has ++ less than MAX_SKB_FRAGS in its shinfo area, and that @size and ++ @offset are appropriate for the buffer. @size gives the size of ++ the fragment, and @offset gives its offset relative to the start of ++ the receive buffer. */ ++/* This effectively transfers ownership of the buffer's page from @rxb ++ to @skb. 
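++   The caller is expected to have already revoked the grant on the
++   buffer: handle_pre_posted_packet() only calls this once
++   gnttab_end_foreign_access_ref() has succeeded, so the remote can
++   no longer touch the page by the time it is attached to the skb.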
*/ ++static void attach_buffer_to_skb(struct sk_buff *skb, ++ struct nc2_rx_buffer *rxb, ++ unsigned size, ++ unsigned offset) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ skb_frag_t *frag = &shinfo->frags[shinfo->nr_frags]; ++ ++ BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS); ++ ++ frag->page = virt_to_page(rxb->buffer); ++ frag->page_offset = offset_in_page(rxb->buffer) + offset; ++ frag->size = size; ++ skb->truesize += size; ++ skb->data_len += size; ++ skb->len += size; ++ ++ shinfo->nr_frags++; ++} ++ ++/* The other end has sent us a packet using pre-posted buffers. Parse ++ it up and return an skb representing the packet, or NULL on ++ error. */ ++struct sk_buff *handle_pre_posted_packet(struct netchannel2 *nc, ++ struct netchannel2_msg_packet *msg, ++ struct netchannel2_msg_hdr *hdr, ++ unsigned nr_frags, ++ unsigned frags_off) ++{ ++ struct netchannel2_fragment frag; ++ struct sk_buff *skb; ++ unsigned x; ++ struct nc2_rx_buffer *rxb; ++ int is_bad; ++ int dropped; ++ unsigned prefix_len; ++ ++#define SKB_MIN_PAYLOAD_SIZE 128 ++ ++ dropped = 0; ++ is_bad = 0; ++ if (msg->prefix_size < SKB_MIN_PAYLOAD_SIZE) ++ prefix_len = SKB_MIN_PAYLOAD_SIZE; ++ else ++ prefix_len = msg->prefix_size; ++ /* We don't enforce the MAX_PACKET_BYTES limit here. That's ++ okay, because the amount of memory which the other end can ++ cause us to allocate is still limited, which is all that's ++ really needed. */ ++ skb = dev_alloc_skb(prefix_len + NET_IP_ALIGN); ++ if (skb == NULL) { ++ is_bad = 1; ++ dropped = 1; ++ } else { ++ skb_reserve(skb, NET_IP_ALIGN); ++ nc2_copy_from_ring_off(&nc->rings.cons_ring, ++ skb_put(skb, msg->prefix_size), ++ msg->prefix_size, ++ frags_off + nr_frags * sizeof(frag)); ++ } ++ ++ for (x = 0; x < nr_frags; x++) { ++ fetch_fragment(&nc->rings, x, &frag, frags_off); ++ rxb = find_rx_buffer(nc, frag.pre_post.id); ++ if (rxb == NULL) { ++ pr_debug("RX in bad frag %d.\n", frag.pre_post.id); ++ is_bad = 1; ++ continue; ++ } ++ ++ if (!is_bad && ++ frag.size <= PAGE_SIZE && ++ frag.off < PAGE_SIZE && ++ frag.size + frag.off <= POSTED_BUFFER_SIZE && ++ gnttab_end_foreign_access_ref(rxb->gref)) { ++ gnttab_free_grant_reference(rxb->gref); ++ attach_buffer_to_skb(skb, rxb, frag.size, ++ frag.off); ++ ++ } else { ++ is_bad = 1; ++ gnttab_end_foreign_access(rxb->gref, ++ (unsigned long)rxb->buffer); ++ } ++ rxb->gref = 0; ++ rxb->buffer = NULL; ++ rxb->is_posted = 0; ++ nc->nr_rx_buffers--; ++ list_move(&rxb->list, &nc->unused_rx_buffers); ++ } ++ ++ if (is_bad) { ++ pr_debug("Received skb is bad!\n"); ++ if (skb) ++ kfree_skb(skb); ++ skb = NULL; ++ if (dropped) ++ nc->stats.rx_dropped++; ++ else ++ nc->stats.rx_errors++; ++ } else { ++ if (skb_headlen(skb) < SKB_MIN_PAYLOAD_SIZE) ++ pull_through(skb, ++ SKB_MIN_PAYLOAD_SIZE - skb_headlen(skb)); ++ } ++ ++ return skb; ++} ++ ++/* Release a single RX buffer and return it to the unused list. */ ++static void release_rx_buffer(struct netchannel2 *nc, ++ struct nc2_rx_buffer *rxb) ++{ ++ rxb->is_posted = 0; ++ gnttab_end_foreign_access(rxb->gref, ++ (unsigned long)rxb->buffer); ++ nc->nr_rx_buffers--; ++ list_move(&rxb->list, &nc->unused_rx_buffers); ++} ++ ++/* The other endpoint has finished with one of our RX buffers. Do ++ something suitable with it. 
*/ ++void nc2_handle_return_posted_buffer(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct netchannel2_msg_return_posted_buffer msg; ++ struct nc2_rx_buffer *rxb; ++ ++ if (hdr->size != sizeof(msg)) { ++ pr_debug("return rx buffer message wrong size %d != %zd\n", ++ hdr->size, sizeof(msg)); ++ return; ++ } ++ if (ncrp != &nc->rings) { ++ pr_debug("Return a posted buffer on an ancillary ring!\n"); ++ return; ++ } ++ nc2_copy_from_ring(&nc->rings.cons_ring, &msg, hdr->size); ++ rxb = find_rx_buffer(nc, msg.id); ++ if (!rxb) { ++ pr_debug("Other end returned buffer id %d which we didn't know about.\n", ++ msg.id); ++ return; ++ } ++ release_rx_buffer(nc, rxb); ++} ++ ++/* Tear down any remaining RX buffers. The caller should have done ++ something to make sure that the other end isn't going to try and ++ use them any more. */ ++void nc2_posted_buffer_rx_forget(struct netchannel2 *nc) ++{ ++ struct nc2_rx_buffer *rxb, *next; ++ ++ spin_lock_bh(&nc->rings.lock); ++ list_for_each_entry_safe(rxb, next, &nc->rx_buffers, list) ++ release_rx_buffer(nc, rxb); ++ list_for_each_entry_safe(rxb, next, &nc->unposted_rx_buffers, list) ++ release_rx_buffer(nc, rxb); ++ ++ BUG_ON(!list_empty(&nc->rx_buffers)); ++ BUG_ON(!list_empty(&nc->unposted_rx_buffers)); ++ ++ INIT_LIST_HEAD(&nc->unused_rx_buffers); ++ kfree(nc->rx_buffer_structs); ++ nc->rx_buffer_structs = NULL; ++ nc->max_nr_rx_buffers = 0; ++ spin_unlock_bh(&nc->rings.lock); ++} ++ ++void nc2_handle_set_nr_posted_buffers(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct netchannel2_msg_set_nr_posted_buffers msg; ++ struct nc2_rx_buffer *buffer_structs; ++ unsigned x; ++ unsigned nr_buffers; ++ ++ if (ncrp != &nc->rings) { ++ pr_debug("set_nr_posted_buffers on an ancillary ring!\n"); ++ return; ++ } ++ if (hdr->size != sizeof(msg)) { ++ pr_debug("set nr posted buffers message wrong size %d != %zd\n", ++ hdr->size, sizeof(msg)); ++ return; ++ } ++ if (nc->rx_buffer_structs != NULL) { ++ pr_debug("Other end tried to change posted buffer settings when they were already set.\n"); ++ return; ++ } ++ nc2_copy_from_ring(&nc->rings.cons_ring, &msg, hdr->size); ++ if (msg.nr_buffers <= MAX_POSTED_BUFFERS) { ++ nr_buffers = msg.nr_buffers; ++ } else { ++ pr_debug("remote recommended %d buffers, using %d\n", ++ msg.nr_buffers, MAX_POSTED_BUFFERS); ++ nr_buffers = MAX_POSTED_BUFFERS; ++ } ++ ++ buffer_structs = kzalloc(sizeof(struct nc2_rx_buffer) * nr_buffers, ++ GFP_ATOMIC); ++ if (buffer_structs == NULL) { ++ printk(KERN_WARNING "failed to allocate %d rx buffers", ++ nr_buffers); ++ return; ++ } ++ ++ for (x = 0; x < nr_buffers; x++) ++ list_add_tail(&buffer_structs[x].list, ++ &nc->unused_rx_buffers); ++ nc->max_nr_rx_buffers = nr_buffers; ++ nc->rx_buffer_structs = buffer_structs; ++ nc->dont_post_buffers = 0; ++} ++ ++ ++/* -------------------------- Transmit ------------------------------- */ ++ ++/* A representation of a packet which is halfway through being ++ prepared for transmission. */ ++struct post_packet_plan { ++ unsigned off_in_cur_buffer; ++ struct nc2_tx_buffer *cur_buffer; ++ ++ /* We assemble the next fragment in work_frag, and then copy ++ to output_frag once it's done. */ ++ struct netchannel2_fragment work_frag; ++ volatile struct netchannel2_fragment *output_frag; ++}; ++ ++/* A grant copy failed while we were transmitting a packet. That ++ indicates that the *receiving* domain gave us a bad RX buffer. 
++ We're too late to send them an error, so there isn't really ++ anything we can do to help them. Oh well, nevermind. */ ++void nc2_posted_on_gntcopy_fail(void *ctxt, ++ gnttab_copy_t *gop) ++{ ++ printk(KERN_WARNING "Grant copy failed for transmit; domain provided bad RX buffer (source %x, %x, %x, dest %x, %x, %x, len %x, flags %x, status %d).\n", ++ gop->source.u.ref, gop->source.domid, gop->source.offset, ++ gop->dest.u.ref, gop->dest.domid, gop->dest.offset, ++ gop->len, gop->flags, gop->status); ++} ++ ++/* Advance to the next transmit buffer/fragment in the packet. */ ++static void advance_to_next_buffer(struct post_packet_plan *plan) ++{ ++ BUG_ON(plan->off_in_cur_buffer < plan->cur_buffer->size); ++ plan->cur_buffer = list_entry(plan->cur_buffer->list.next, ++ struct nc2_tx_buffer, ++ list); ++ plan->off_in_cur_buffer = 0; ++ ++ *plan->output_frag = plan->work_frag; ++ plan->output_frag++; ++ memset(&plan->work_frag, 0, sizeof(plan->work_frag)); ++ plan->work_frag.pre_post.id = plan->cur_buffer->id; ++} ++ ++/* Schedule a copy from a range of bytes in a local page into the ++ packet we're building in @plan. This cannot cross page or TX ++ buffer boundaries. */ ++static void prepare_grant_copy(struct netchannel2 *nc, ++ struct post_packet_plan *plan, ++ struct page *page, ++ unsigned page_off, ++ unsigned count, ++ domid_t domid) ++{ ++ gnttab_copy_t *gop; ++ ++ /* XXX: We don't do any error checking on this grant copy. ++ That's okay. There are only two ways a grant copy can ++ fail: ++ ++ -- The source is bad. But the source is either in our ++ local memory (so must be good), or something we've ++ already mapped (so the grant reference must be good, and ++ must already be pinned so it can't go bad). Therefore, ++ the source must always be good, and we can't fail ++ because of a bad source. ++ ++ -- The destination is bad. This could happen if the ++ receiving domain sent us a bad page to use as an RX ++ buffer. In that case, we'll tell the receiving domain ++ that it received some data in a page when the page is ++ actually uninitialised. The worst case is that the ++ receiving domain ends up copying its own uninitialised ++ memory to its own userspace. That's not a problem for ++ us (because it can't see *our* uninitialised memory), ++ and if it's a problem for the receiving domain then it ++ should have been more careful about what memory it gave ++ us to use as RX buffers. ++ ++ Therefore, the lack of error checking is actually perfectly ++ safe. ++ ++ (Even if it isn't exactly great software engineering ++ practice.) ++ */ ++ gop = hypercall_batcher_grant_copy(&nc->batcher, ++ NULL, ++ nc2_posted_on_gntcopy_fail); ++ gop->flags = GNTCOPY_dest_gref; ++ if (page_is_tracked(page)) { ++ lookup_tracker_page(page, ++ &gop->source.domid, ++ &gop->source.u.ref); ++ gop->flags |= GNTCOPY_source_gref; ++ } else { ++ gop->source.domid = DOMID_SELF; ++ gop->source.u.gmfn = virt_to_mfn(page_address(page)); ++ } ++ gop->source.offset = page_off; ++ gop->dest.domid = domid; ++ gop->dest.offset = ++ plan->cur_buffer->off_in_page + plan->off_in_cur_buffer; ++ gop->dest.u.ref = plan->cur_buffer->gref; ++ gop->len = count; ++} ++ ++/* Add the bytes from @ptr to @ptr + @size to the packet we're ++ preparing in @plan. This cannot handle page-crossing local ++ buffers, but will correctly handle buffer-crossing operations. 
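++
++   For example (sizes purely illustrative): asking for 1500 bytes when
++   only 1000 remain in the current TX buffer produces one grant copy
++   of 1000 bytes, a call to advance_to_next_buffer() to close off the
++   current fragment descriptor, and then a second grant copy of 500
++   bytes starting at the beginning of the next buffer.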
*/ ++static void prepare_subpage_post(struct netchannel2 *nc, ++ struct page *page, ++ unsigned off_in_page, ++ unsigned size, ++ struct post_packet_plan *plan) ++{ ++ unsigned remaining_in_buffer; ++ unsigned this_time; ++ ++ BUG_ON(off_in_page + size > PAGE_SIZE); ++ while (size != 0) { ++ remaining_in_buffer = ++ plan->cur_buffer->size - ++ plan->off_in_cur_buffer; ++ if (remaining_in_buffer == 0) { ++ advance_to_next_buffer(plan); ++ remaining_in_buffer = plan->cur_buffer->size; ++ } ++ ++ this_time = size; ++ if (this_time > remaining_in_buffer) ++ this_time = remaining_in_buffer; ++ prepare_grant_copy(nc, ++ plan, ++ page, ++ off_in_page, ++ this_time, ++ nc->rings.otherend_id); ++ plan->work_frag.size += this_time; ++ plan->off_in_cur_buffer += this_time; ++ ++ size -= this_time; ++ off_in_page += this_time; ++ } ++} ++ ++/* Add @skb->data to @skb->tail to the packet which is being prepared ++ in @plan. */ ++static void prepare_data_area_post(struct netchannel2 *nc, struct sk_buff *skb, ++ struct post_packet_plan *plan) ++{ ++ void *ptr = skb->data; ++ unsigned len = skb_headlen(skb); ++ unsigned off; ++ unsigned this_time; ++ ++ for (off = 0; off < len; off += this_time) { ++ this_time = len; ++ if (this_time + offset_in_page(ptr + off) > PAGE_SIZE) ++ this_time = PAGE_SIZE - offset_in_page(ptr + off); ++ prepare_subpage_post(nc, ++ virt_to_page(ptr + off), ++ offset_in_page(ptr + off), ++ this_time, ++ plan); ++ } ++} ++ ++/* Allocate some TX buffers suitable for transmitting @skb out of ++ @nc's pool. The buffers are chained on @fragments. On success, ++ returns the number of buffers allocated. Returns -1 if ++ insufficient buffers are available, in which case no buffers are ++ allocated. We assume that the packet will be offset by ++ NET_IP_ALIGN bytes in the first fragment so that everything after ++ the ethernet header is properly aligned. 
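++   (In other words, the allocation target is skb->len + NET_IP_ALIGN
++   bytes, and on failure any buffers already claimed are spliced back
++   onto avail_tx_buffers before returning.)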
*/ ++static int grab_tx_buffers(struct netchannel2 *nc, ++ struct sk_buff *skb, ++ struct list_head *fragments) ++{ ++ unsigned bytes_to_transmit; ++ unsigned bytes_planned; ++ struct nc2_tx_buffer *current_buffer, *next; ++ int count; ++ ++ INIT_LIST_HEAD(fragments); ++ bytes_planned = 0; ++ bytes_to_transmit = skb->len + NET_IP_ALIGN; ++ count = 0; ++ list_for_each_entry_safe(current_buffer, next, &nc->avail_tx_buffers, ++ list) { ++ count++; ++ bytes_planned += current_buffer->size; ++ list_move(¤t_buffer->list, fragments); ++ if (bytes_planned >= bytes_to_transmit) { ++ BUG_ON(nc->nr_avail_tx_buffers < count); ++ nc->nr_avail_tx_buffers -= count; ++ return count; ++ } ++ } ++ BUG_ON(nc->nr_avail_tx_buffers != count); ++ list_splice_init(fragments, &nc->avail_tx_buffers); ++ return -1; ++} ++ ++int prepare_xmit_allocate_post(struct netchannel2 *nc, struct sk_buff *skb) ++{ ++ struct skb_cb_overlay *scb; ++ int nr_fragments; ++ ++ scb = get_skb_overlay(skb); ++ nr_fragments = grab_tx_buffers(nc, skb, &scb->buffers); ++ if (nr_fragments < 0) ++ return -1; ++ scb->nr_fragments = nr_fragments; ++ scb->type = NC2_PACKET_TYPE_pre_posted; ++ ++ return 0; ++} ++ ++void xmit_post(struct netchannel2 *nc, struct sk_buff *skb, ++ volatile void *msg_buf) ++{ ++ volatile struct netchannel2_msg_packet *msg = msg_buf; ++ struct skb_cb_overlay *scb; ++ struct skb_shared_info *shinfo; ++ skb_frag_t *frag; ++ unsigned x; ++ struct post_packet_plan plan; ++ ++ scb = get_skb_overlay(skb); ++ memset(&plan, 0, sizeof(plan)); ++ ++ plan.cur_buffer = list_entry(scb->buffers.next, ++ struct nc2_tx_buffer, ++ list); ++ plan.output_frag = msg->frags; ++ memset(&plan.work_frag, 0, sizeof(plan.work_frag)); ++ plan.work_frag.pre_post.id = plan.cur_buffer->id; ++ ++ /* Burn a couple of bytes at the start of the packet so as we ++ get better alignment in the body. */ ++ plan.work_frag.off = NET_IP_ALIGN; ++ plan.off_in_cur_buffer = NET_IP_ALIGN; ++ ++ prepare_data_area_post(nc, skb, &plan); ++ shinfo = skb_shinfo(skb); ++ for (x = 0; x < shinfo->nr_frags; x++) { ++ frag = &shinfo->frags[x]; ++ prepare_subpage_post(nc, ++ frag->page, ++ frag->page_offset, ++ frag->size, ++ &plan); ++ } ++ ++ *plan.output_frag = plan.work_frag; ++ ++ /* All of the buffer slots which have been used in ++ this packet are now available for the other end to ++ fill with new buffers. */ ++ list_splice(&scb->buffers, &nc->unused_tx_buffer_slots); ++} ++ ++/* The other endpoint has sent us a transmit buffer. Add it to the ++ list. Called under the lock. 
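++   The pool of unused_tx_buffer_slots is what enforces the limit we
++   advertised in SET_NR_POSTED_BUFFERS: once the remote has filled
++   every slot, further POST_BUFFER messages find the list empty and
++   are ignored.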
*/ ++void nc2_handle_post_buffer(struct netchannel2 *nc, ++ struct netchannel2_ring_pair *ncrp, ++ struct netchannel2_msg_hdr *hdr) ++{ ++ struct netchannel2_msg_post_buffer msg; ++ struct nc2_tx_buffer *txb; ++ ++ if (hdr->size != sizeof(msg)) { ++ pr_debug("Strange sized rx buffer post %d\n", hdr->size); ++ return; ++ } ++ if (ncrp != &nc->rings) { ++ pr_debug("Posted buffer on an ancillary ring!\n"); ++ return; ++ } ++ nc2_copy_from_ring(&nc->rings.cons_ring, &msg, sizeof(msg)); ++ if (list_empty(&nc->unused_tx_buffer_slots) || ++ msg.size > PAGE_SIZE || ++ msg.off_in_page > PAGE_SIZE || ++ msg.size + msg.off_in_page > PAGE_SIZE || ++ msg.size < 64) { ++ pr_debug("Other end posted too many buffers, or this buffer was strange (%d,%d)\n", ++ msg.off_in_page, msg.size); ++ return; ++ } ++ ++ txb = list_entry(nc->unused_tx_buffer_slots.next, ++ struct nc2_tx_buffer, ++ list); ++ txb->id = msg.id; ++ txb->gref = msg.gref; ++ txb->off_in_page = msg.off_in_page; ++ txb->size = msg.size; ++ ++ nc->nr_avail_tx_buffers++; ++ ++ list_move(&txb->list, &nc->avail_tx_buffers); ++} ++ ++/* Process the pending TX buffer return list and push as many as ++ possible onto the ring. Called under the lock. Does not ++ automatically flush the ring; that's the caller's ++ responsibility. */ ++void nc2_return_pending_posted_buffers(struct netchannel2 *nc) ++{ ++ struct netchannel2_msg_return_posted_buffer msg; ++ struct nc2_tx_buffer *txb; ++ ++ memset(&msg, 0, sizeof(msg)); ++ while (!list_empty(&nc->pending_tx_buffer_return) && ++ nc2_can_send_payload_bytes(&nc->rings.prod_ring, sizeof(msg))) { ++ txb = list_entry(nc->pending_tx_buffer_return.next, ++ struct nc2_tx_buffer, ++ list); ++ list_del(&txb->list); ++ free_tx_buffer(nc, txb); ++ msg.id = txb->id; ++ nc2_send_message(&nc->rings.prod_ring, ++ NETCHANNEL2_MSG_RETURN_POSTED_BUFFER, ++ 0, ++ &msg, ++ sizeof(&msg)); ++ } ++} ++ ++/* If there is space on the ring, tell the other end how many RX ++ buffers we want it to post (i.e. how many TX buffers we're allowed ++ to accept). Called under the lock. */ ++void nc2_advertise_tx_buffers(struct netchannel2 *nc) ++{ ++ struct netchannel2_msg_set_nr_posted_buffers msg; ++ ++ if (!nc2_can_send_payload_bytes(&nc->rings.prod_ring, sizeof(msg))) ++ return; ++ msg.nr_buffers = nc->nr_tx_buffers; ++ nc2_send_message(&nc->rings.prod_ring, ++ NETCHANNEL2_MSG_SET_NR_POSTED_BUFFERS, ++ 0, &msg, sizeof(msg)); ++ nc->need_advertise_tx_buffers = 0; ++ nc->rings.pending_time_sensitive_messages = 1; ++} ++ ++/* Set the target number of TX buffers. */ ++void nc2_set_nr_tx_buffers(struct netchannel2 *nc, unsigned nr_buffers) ++{ ++ int changed; ++ ++ spin_lock_bh(&nc->rings.lock); ++ changed = (nc->configured_nr_tx_buffers != nr_buffers); ++ nc->configured_nr_tx_buffers = nr_buffers; ++ spin_unlock_bh(&nc->rings.lock); ++ if (changed) ++ prepare_tx_buffers(nc); ++} ++ ++/* The local ethX interface just came up. Set up the TX buffers. */ ++static void prepare_tx_buffers(struct netchannel2 *nc) ++{ ++ struct nc2_tx_buffer *buffers; ++ unsigned x; ++ unsigned nr_buffers; ++ ++ nr_buffers = nc->configured_nr_tx_buffers; ++ if (nr_buffers == 0) { ++ /* Trying to shut down TX in posted buffers. 
*/ ++ unprepare_tx_buffers(nc); ++ return; ++ } ++ ++ buffers = kzalloc(sizeof(struct nc2_tx_buffer) * nr_buffers, ++ GFP_KERNEL); ++ if (buffers == NULL) { ++ printk(KERN_ERR "Cannot allocate %d tx buffer slots, posted tx disabled.\n", ++ nr_buffers); ++ return; ++ } ++ ++ spin_lock_bh(&nc->rings.lock); ++ ++ /* nc->tx_buffers should be NULL, because starting and ++ stopping the TX buffer management should alternate. */ ++ BUG_ON(nc->tx_buffers); ++ ++ INIT_LIST_HEAD(&nc->avail_tx_buffers); ++ nc->nr_avail_tx_buffers = 0; ++ for (x = 0; x < nr_buffers; x++) ++ list_add_tail(&buffers[x].list, &nc->unused_tx_buffer_slots); ++ nc->tx_buffers = buffers; ++ nc->nr_tx_buffers = nr_buffers; ++ nc->need_advertise_tx_buffers = 1; ++ spin_unlock_bh(&nc->rings.lock); ++} ++ ++/* The local ethX interface is goign down. Release the TX buffers ++ allocated by prepare_tx_buffers(). Note that the poll() method has ++ already been stopped, so messages posted by the other end will not ++ be processed. */ ++void unprepare_tx_buffers(struct netchannel2 *nc) ++{ ++ spin_lock_bh(&nc->rings.lock); ++ INIT_LIST_HEAD(&nc->pending_tx_buffer_return); ++ INIT_LIST_HEAD(&nc->unused_tx_buffer_slots); ++ INIT_LIST_HEAD(&nc->avail_tx_buffers); ++ nc->nr_tx_buffers = 0; ++ nc->nr_avail_tx_buffers = 0; ++ nc->need_advertise_tx_buffers = 1; ++ kfree(nc->tx_buffers); ++ nc->tx_buffers = NULL; ++ spin_unlock_bh(&nc->rings.lock); ++} +diff --git a/drivers/xen/netchannel2/recv_packet.c b/drivers/xen/netchannel2/recv_packet.c +index 2de8afa..27cd7fe 100644 +--- a/drivers/xen/netchannel2/recv_packet.c ++++ b/drivers/xen/netchannel2/recv_packet.c +@@ -121,6 +121,11 @@ void nc2_handle_packet_msg(struct netchannel2 *nc, + nr_frags, frags_off); + queue_finish_packet_message(ncrp, msg.id, msg.flags); + break; ++ case NC2_PACKET_TYPE_pre_posted: ++ skb = handle_pre_posted_packet(nc, &msg, hdr, nr_frags, ++ frags_off); ++ /* No finish message */ ++ break; + case NC2_PACKET_TYPE_receiver_map: + if (!nc->local_trusted) { + /* The remote doesn't trust us, so they +diff --git a/drivers/xen/netchannel2/util.c b/drivers/xen/netchannel2/util.c +index 1b2a909..bd281a8 100644 +--- a/drivers/xen/netchannel2/util.c ++++ b/drivers/xen/netchannel2/util.c +@@ -34,7 +34,13 @@ int allocate_txp_slot(struct netchannel2_ring_pair *ncrp, + static void nc2_free_skb(struct netchannel2 *nc, + struct sk_buff *skb) + { +- dev_kfree_skb(skb); ++#ifdef CONFIG_XEN_NETDEV2_VMQ ++ nc2_vmq_t *vmq = &nc->vmq; ++ if (get_skb_overlay(skb)->policy == transmit_policy_vmq) ++ skb_queue_tail(&vmq->dealloc_queue, skb); ++ else ++#endif ++ dev_kfree_skb(skb); + } + + void release_txp_slot(struct netchannel2_ring_pair *ncrp, +diff --git a/drivers/xen/netchannel2/vmq.c b/drivers/xen/netchannel2/vmq.c +new file mode 100644 +index 0000000..8fa6308 +--- /dev/null ++++ b/drivers/xen/netchannel2/vmq.c +@@ -0,0 +1,805 @@ ++/***************************************************************************** ++ * vmq.c ++ * ++ * Support multi-queue network devices. ++ * ++ * Copyright (c) 2008, Kaushik Kumar Ram, Rice University. ++ * Copyright (c) 2008, Jose Renato Santos, Hewlett-Packard Co. 
++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ * ++ */ ++/* This only implements the transmit half of the method; receive is ++ * handled by posted_buffers.c */ ++#include ++#include ++#include ++#include ++#include ++#include "netchannel2_core.h" ++ ++#include "posted_buffer.h" ++#include "vmq.h" ++ ++/* state of device queue when operating in vmq mode */ ++#define VMQ_QUEUE_DISABLED 0 ++#define VMQ_QUEUE_STARTING 1 ++#define VMQ_QUEUE_ENABLED 2 ++#define VMQ_QUEUE_CLOSING 3 ++ ++#define VMQ_MAX_UNMAP_OPS 256 ++struct vmq_unmap_grants { ++ unsigned n; ++ gnttab_unmap_grant_ref_t gop[VMQ_MAX_UNMAP_OPS]; ++}; ++typedef struct vmq_unmap_grants vmq_unmap_grants_t; ++ ++vmq_unmap_grants_t vmq_unmap_grants; ++ ++static inline void vmq_flush_unmap_grants(void) ++{ ++ if (vmq_unmap_grants.n == 0) ++ return; ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ++ vmq_unmap_grants.gop, ++ vmq_unmap_grants.n)) ++ BUG(); ++ vmq_unmap_grants.n = 0; ++} ++ ++static inline gnttab_unmap_grant_ref_t *vmq_next_unmap_gop(void) ++{ ++ if (vmq_unmap_grants.n == VMQ_MAX_UNMAP_OPS) ++ vmq_flush_unmap_grants(); ++ return &vmq_unmap_grants.gop[vmq_unmap_grants.n++]; ++} ++ ++void vmq_flush_unmap_hypercall(void) ++{ ++ vmq_flush_unmap_grants(); ++} ++ ++static inline unsigned long vmq_idx_to_pfn(nc2_vmq_t *vmq, unsigned int idx) ++{ ++ return page_to_pfn(vmq->pages[idx]); ++} ++ ++static inline unsigned long vmq_idx_to_kaddr(nc2_vmq_t *vmq, unsigned int idx) ++{ ++ return (unsigned long)pfn_to_kaddr(vmq_idx_to_pfn(vmq, idx)); ++} ++ ++/* get vmq idx from page struct */ ++static long nc2_vmq_page_index(struct page *page) ++{ ++ nc2_vmq_buf_t *vmq_buf; ++ vmq_buf = (nc2_vmq_buf_t *)page->mapping; ++ return vmq_buf - vmq_buf->nc->vmq.buffer; ++} ++ ++/* Read a physical device name from xenstore and ++ * returns a pointer to the associated net_device structure. ++ * Returns NULL on error. 
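++ * For example (the path shown is illustrative; the exact backend
++ * nodename depends on the toolstack), a xenstore entry such as
++ *
++ *   /local/domain/0/backend/vif2/1/0/pdev = "eth0"
++ *
++ * causes this function to return the net_device for eth0.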
*/ ++static struct net_device *read_pdev(struct xenbus_device *dev) ++{ ++ char *pdevstr; ++ struct net_device *pdev = NULL; ++ ++ pdevstr = xenbus_read(XBT_NIL, dev->nodename, "pdev", NULL); ++ if (IS_ERR(pdevstr)) ++ return NULL; ++ ++ if (pdevstr) ++ pdev = dev_get_by_name(&init_net, pdevstr); ++ ++ kfree(pdevstr); ++ ++ return pdev; ++} ++ ++static void nc2_vmq_page_release(struct page *page, unsigned int order) ++{ ++ printk(KERN_CRIT "%s: ERROR: Unexpected release of netchannel2 vmq page", ++ __func__); ++ BUG_ON(1); ++} ++ ++static inline int nc2_vmq_is_disabled(struct netchannel2 *nc) ++{ ++ return nc->vmq.vmq_state == VMQ_QUEUE_DISABLED; ++} ++ ++static inline int nc2_vmq_is_starting(struct netchannel2 *nc) ++{ ++ return nc->vmq.vmq_state == VMQ_QUEUE_STARTING; ++} ++ ++static inline int nc2_vmq_is_enabled(struct netchannel2 *nc) ++{ ++ return nc->vmq.vmq_state == VMQ_QUEUE_ENABLED; ++} ++ ++static inline int nc2_vmq_is_closing(struct netchannel2 *nc) ++{ ++ return nc->vmq.vmq_state == VMQ_QUEUE_CLOSING; ++} ++ ++static inline void nc2_vmq_enable(struct netchannel2 *nc) ++{ ++ nc2_vmq_t *vmq = &nc->vmq; ++ vmq_get(vmq); ++ vmq_enable_queue(vmq->pdev, vmq->vmq_id); ++ vmq->vmq_state = VMQ_QUEUE_ENABLED; ++} ++ ++void nc2_vmq_disconnect(struct netchannel2 *nc) ++{ ++ nc2_vmq_t *vmq = &nc->vmq; ++ ++ if (nc2_vmq_is_enabled(nc)) { ++ vmq_disable_queue(vmq->pdev, vmq->vmq_id); ++ vmq_free_queue(vmq->pdev, vmq->vmq_id); ++ vmq->vmq_state = VMQ_QUEUE_CLOSING; ++ /* wait until all buffers have been returned by dev driver */ ++ wait_event(vmq->waiting_to_free, ++ atomic_read(&vmq->refcnt) == 0); ++ return; ++ } ++ ++ if (nc2_vmq_is_starting(nc)) { ++ vmq_free_queue(vmq->pdev, vmq->vmq_id); ++ vmq->vmq_state = VMQ_QUEUE_CLOSING; ++ return; ++ } ++ ++} ++ ++ ++static void nc2_vmq_end_map_buffers(gnttab_map_grant_ref_t *mop, int count, ++ struct netchannel2 *nc, u16 *alloc_idx) ++{ ++ int i, err; ++ u16 idx; ++ unsigned int prod; ++ nc2_vmq_t *vmq = &nc->vmq; ++ ++ prod = vmq->mapped_pages_prod; ++ ++ for (i = 0; i < count; i++) { ++ idx = alloc_idx[i]; ++ ++ /* Check error status */ ++ err = mop->status; ++ if (likely(!err)) { ++ set_phys_to_machine( ++ __pa(vmq_idx_to_kaddr(vmq, idx)) ++ >> PAGE_SHIFT, ++ FOREIGN_FRAME(mop->dev_bus_addr ++ >> PAGE_SHIFT)); ++ /* Store the handle */ ++ vmq->buffer[idx].buf->grant_handle = mop->handle; ++ ++ /* Add it to the mapped pages list */ ++ vmq->mapped_pages[VMQ_IDX_MASK(prod++)] = idx; ++ mop++; ++ continue; ++ } ++ ++ /* Error mapping page: return posted buffer to other end. ++ * TODO: We might need an error field on the return buffer ++ * message */ ++ return_tx_buffer(nc, vmq->buffer[idx].buf); ++ ++ /* Add the page back to the free list */ ++ vmq->unmapped_pages[VMQ_IDX_MASK(vmq->unmapped_pages_prod++)] ++ = idx; ++ ++ mop++; ++ } ++ ++ smp_wmb(); ++ vmq->mapped_pages_prod = prod; ++ ++ return; ++} ++ ++/* Map guest buffers and place them in the mapped buffers list. The mapped ++ * pages in this list are used when allocating a skb (vmq_alloc_skb()). ++ */ ++static void nc2_vmq_map_buffers(struct netchannel2 *nc) ++{ ++ u16 idx; ++ int count = 0; ++ unsigned int cons; ++ int nbufs; ++ int buf_avail; ++ struct nc2_tx_buffer *buf; ++ struct nc2_vmq *vmq = &nc->vmq; ++ int n_mapped = nr_vmq_bufs(nc); ++ ++ ++ /* ++ * Putting hundreds of bytes on the stack is considered rude. ++ * Static works because a tasklet can only be on one CPU at any time. 
++ */ ++ static gnttab_map_grant_ref_t rx_map_ops[VMQ_MAX_BUFFERS]; ++ static u16 alloc_idx[VMQ_MAX_BUFFERS]; ++ ++ /* If there is at least VMQ_MIN_BUFFERS buffers, no work to do */ ++ if (n_mapped >= VMQ_MIN_BUFFERS) ++ return; ++ ++ /* Try to get VMQ_MAX_BUFFERS mapped buffers, if there are ++ sufficient buffers posted by the other end */ ++ nbufs = VMQ_MAX_BUFFERS - n_mapped; ++ buf_avail = nc->nr_avail_tx_buffers; ++ if (nbufs > buf_avail) ++ nbufs = buf_avail; ++ ++ /* Xen cannot handle more than 512 grant ops in a single hypercall */ ++ if (nbufs > 512) ++ nbufs = 512; ++ ++ /* give up if there are no buffers available */ ++ if (nbufs <= 0) ++ return; ++ ++ /* Note that we *should* have free pages to consume here ++ * and no checks are needed. ++ */ ++ cons = vmq->unmapped_pages_cons; ++ ++ while (count < nbufs) { ++ idx = vmq->unmapped_pages[VMQ_IDX_MASK(cons++)]; ++ buf = vmq->buffer[idx].buf = _get_tx_buffer(nc); ++ /* Setup grant map operation */ ++ gnttab_set_map_op(&rx_map_ops[count], ++ vmq_idx_to_kaddr(vmq, idx), ++ GNTMAP_host_map, ++ buf->gref, ++ nc->rings.otherend_id); ++ alloc_idx[count] = idx; ++ count++; ++ } ++ ++ vmq->unmapped_pages_cons = cons; ++ ++ /* Map all the pages */ ++ BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, ++ rx_map_ops, nbufs)); ++ ++ /* Finalize buffer mapping after checking if the grant operations ++ succeeded */ ++ nc2_vmq_end_map_buffers(rx_map_ops, nbufs, nc, alloc_idx); ++ ++ vmq->nbufs += nbufs; ++} ++ ++static void nc2_vmq_unmap_buf(struct netchannel2 *nc, ++ unsigned int idx, int recycle) ++{ ++ nc2_vmq_t *vmq = &nc->vmq; ++ unsigned long pfn; ++ gnttab_unmap_grant_ref_t *gop; ++ unsigned prod; ++ ++ pfn = vmq_idx_to_pfn(vmq, idx); ++ /* Already unmapped? */ ++ if (!phys_to_machine_mapping_valid(pfn)) ++ return; ++ ++ gop = vmq_next_unmap_gop(); ++ gnttab_set_unmap_op(gop, vmq_idx_to_kaddr(vmq, idx), ++ GNTMAP_host_map, ++ vmq->buffer[idx].buf->grant_handle); ++ ++ vmq->nbufs--; ++ ++ set_phys_to_machine(__pa(vmq_idx_to_kaddr(vmq, idx)) >> ++ PAGE_SHIFT, ++ INVALID_P2M_ENTRY); ++ /* Ready for next use. 
*/ ++ gnttab_reset_grant_page(vmq->pages[idx]); ++ /* Add the page back to the unmapped list */ ++ prod = vmq->unmapped_pages_prod; ++ vmq->unmapped_pages[VMQ_IDX_MASK(prod++)] = idx; ++ if (recycle) ++ recycle_tx_buffer(nc, vmq->buffer[idx].buf); ++ else ++ free_tx_buffer(nc, vmq->buffer[idx].buf); ++ smp_wmb(); ++ vmq->unmapped_pages_prod = prod; ++} ++ ++static void nc2_vmq_free_mapped_bufs(struct netchannel2 *nc) ++{ ++ nc2_vmq_t *vmq = &nc->vmq; ++ unsigned int idx; ++ unsigned prod, cons; ++ ++ /* The queue should be disabled before this function is called */ ++ BUG_ON(vmq->vmq_state == VMQ_QUEUE_ENABLED); ++ ++ cons = vmq->mapped_pages_cons; ++ prod = vmq->mapped_pages_prod; ++ smp_rmb(); ++ ++ while (cons != prod) { ++ idx = vmq->mapped_pages[VMQ_IDX_MASK(cons++)]; ++ nc2_vmq_unmap_buf(nc, idx, 1); ++ } ++ ++ vmq_flush_unmap_grants(); ++ ++ vmq->mapped_pages_cons = cons; ++ ++} ++ ++static void nc2_vmq_free_skb(struct sk_buff *skb) ++{ ++ struct netchannel2 *nc; ++ nc2_vmq_t *vmq; ++ unsigned int idx; ++ int nr_frags, i; ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ skb_frag_t *frags = shinfo->frags; ++ ++ nc = netdev_priv(skb->dev); ++ vmq = &nc->vmq; ++ ++ nr_frags = shinfo->nr_frags; ++ for (i = 0; i < nr_frags; i++) { ++ idx = nc2_vmq_page_index(frags[i].page); ++ nc2_vmq_unmap_buf(nc, idx, 1); ++ } ++ ++ vmq_flush_unmap_grants(); ++ ++ shinfo->frag_list = NULL; ++ shinfo->nr_frags = 0; ++ ++ /* Add the skb back to the free pool */ ++ skb_queue_tail(&vmq->free_skb_list, skb); ++} ++ ++/* Initialize the free socket buffer list */ ++static int vmq_init_free_skb_list(int n, struct sk_buff_head *free_skb_list) ++{ ++ int i; ++ struct sk_buff *skb; ++ ++ skb_queue_head_init(free_skb_list); ++ ++ for (i = 0; i < n; i++) { ++ skb = alloc_skb(VMQ_SKB_SIZE, GFP_ATOMIC); ++ if (!skb) { ++ printk("Netchannel2 vmq: Failed to allocate socket " ++ "buffer %d (max=%d)\n", i, (int)n); ++ goto error; ++ } ++ skb_queue_tail(free_skb_list, skb); ++ } ++ ++ return 0; ++error: ++ /* Free all the allocated buffers and return Error */ ++ while (!skb_queue_empty(free_skb_list)) ++ kfree_skb(skb_dequeue(free_skb_list)); ++ ++ return -1; ++} ++ ++/* Initialize vmq. 
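++   A minimal caller sketch (illustrative only; the actual call site
++   lives elsewhere in the netchannel2 setup path):
++
++	if (nc2_vmq_connect(nc))
++		pr_debug("vmq mode: RX arrives via vmq_netif_rx()\n");
++	else
++		pr_debug("no VMQ queue; using grant/posted-buffer RX\n");
++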
Return 1 if vmq is used and 0 otherwise */ ++int nc2_vmq_connect(struct netchannel2 *nc) ++{ ++ nc2_vmq_t *vmq = &nc->vmq; ++ struct page *page; ++ int q_id; ++ int size; ++ int i; ++ ++ vmq->vmq_mode = 0; ++ vmq->pdev = read_pdev(nc->xenbus_device); ++ ++ /* cannot use vmq mode if physical device not found */ ++ if (!vmq->pdev) ++ return 0; ++ ++ /* Allocate a RX queue */ ++ q_id = vmq_alloc_queue(vmq->pdev, VMQ_TYPE_RX); ++ if (q_id < 0) ++ /* Allocation failed, cannot use multi-queue */ ++ goto free_pdev; ++ ++ vmq->vmq_id = q_id; ++ ++ /* Set the size of the queue */ ++ size = vmq_get_maxsize(vmq->pdev); ++ if (size > VMQ_QUEUE_SIZE) ++ size = VMQ_QUEUE_SIZE; ++ if (vmq_set_size(vmq->pdev, q_id, size) < 0) { ++ /* Failure, free up the queue and return error */ ++ printk(KERN_ERR "%s: could not set queue size on net device\n", ++ __func__); ++ goto free_queue; ++ } ++ vmq->vmq_size = size; ++ ++ /* Set the mac address of the queue */ ++ if (vmq_set_mac(vmq->pdev, q_id, nc->rings.remote_mac) < 0) { ++ /* Failure, free up the queue and return error */ ++ printk(KERN_ERR "%s: could not set MAC address for net device queue\n", ++ __func__); ++ goto free_queue; ++ } ++ ++ vmq->pages = alloc_empty_pages_and_pagevec(VMQ_MAX_BUFFERS); ++ if (vmq->pages == NULL) { ++ printk(KERN_ERR "%s: out of memory\n", __func__); ++ goto free_queue; ++ } ++ ++ skb_queue_head_init(&vmq->dealloc_queue); ++ skb_queue_head_init(&vmq->rx_queue); ++ ++ if (vmq_init_free_skb_list(VMQ_MAX_BUFFERS, ++ &vmq->free_skb_list)) { ++ printk(KERN_ERR "%s: Could not allocate free socket buffers", ++ __func__); ++ goto free_pagevec; ++ } ++ ++ for (i = 0; i < VMQ_MAX_BUFFERS; i++) { ++ vmq->buffer[i].nc = nc; ++ page = vmq->pages[i]; ++ SetPageForeign(page, nc2_vmq_page_release); ++ page->mapping = (void *)&vmq->buffer[i]; ++ vmq->unmapped_pages[i] = i; ++ } ++ ++ vmq->unmapped_pages_prod = VMQ_MAX_BUFFERS; ++ vmq->unmapped_pages_cons = 0; ++ ++ vmq->mapped_pages_prod = 0; ++ vmq->mapped_pages_cons = 0; ++ ++ vmq->nbufs = 0; ++ vmq->vmq_mode = 1; ++ ++ /* Store the pointer to netchannel2 device in pdev */ ++ BUG_ON((vmq->pdev->vmq == NULL) || (vmq->pdev->vmq->queue == NULL)); ++ vmq->pdev->vmq->queue[q_id].guest = (void *)nc->net_device; ++ ++ atomic_set(&vmq->refcnt, 0); ++ init_waitqueue_head(&vmq->waiting_to_free); ++ ++ printk(KERN_INFO "Netchannel2 using vmq mode for guest %d\n", ++ nc->xenbus_device->otherend_id); ++ ++ vmq->vmq_state = VMQ_QUEUE_STARTING; ++ ++ return 1; /* Success */ ++ ++ ++free_pagevec: ++ free_empty_pages_and_pagevec(vmq->pages, VMQ_MAX_BUFFERS); ++free_queue: ++ vmq_free_queue(vmq->pdev, vmq->vmq_id); ++free_pdev: ++ dev_put(vmq->pdev); ++ vmq->pdev = NULL; ++ return 0; ++} ++ ++void nc2_vmq_shutdown(struct netchannel2 *nc) ++{ ++ nc2_vmq_t *vmq = &nc->vmq; ++ int i; ++ ++ if (!vmq->vmq_mode) ++ return; ++ ++ /* All posted bufs should have been returned */ ++ BUG_ON(nr_vmq_bufs(nc) != nr_vmq_mapped_bufs(nc)); ++ ++ /* free the mapped bufs */ ++ nc2_vmq_free_mapped_bufs(nc); ++ ++ /* Free the vmq pages */ ++ if (vmq->pages) { ++ for (i = 0; i < VMQ_MAX_BUFFERS; i++) { ++ if (PageForeign(vmq->pages[i])) ++ ClearPageForeign(vmq->pages[i]); ++ vmq->pages[i]->mapping = NULL; ++ } ++ free_empty_pages_and_pagevec(vmq->pages, VMQ_MAX_BUFFERS); ++ vmq->pages = NULL; ++ } ++ ++ while (!skb_queue_empty(&vmq->free_skb_list)) { ++ /* Free the socket buffer pool */ ++ kfree_skb(skb_dequeue(&vmq->free_skb_list)); ++ } ++ vmq->vmq_state = VMQ_QUEUE_DISABLED; ++ vmq->vmq_mode = 0; ++ ++ if (vmq->pdev) { ++ 
dev_put(vmq->pdev); ++ vmq->pdev = NULL; ++ } ++ ++ vmq_put(vmq); ++} ++ ++static int prepare_xmit_allocate_vmq(struct netchannel2 *nc, ++ struct sk_buff *skb) ++{ ++ unsigned msg_size; ++ ++ msg_size = get_transmitted_packet_msg_size(skb); ++ if (!nc2_reserve_payload_bytes(&nc->rings.prod_ring, msg_size)) ++ return -1; ++ return 0; ++} ++ ++void do_vmq_work(struct netchannel2 *nc) ++{ ++ nc2_vmq_t *vmq = &nc->vmq; ++ struct sk_buff *skb; ++ unsigned long flags; ++ ++ /* if not in vmq mode do nothing */ ++ if (!nc2_in_vmq_mode(nc)) ++ return; ++ ++ /* Map guest buffers for dedicated NIC RX queue if needed */ ++ if (nr_vmq_bufs(nc) < VMQ_MIN_BUFFERS) { ++ nc2_vmq_map_buffers(nc); ++ /* We delay enabling the queue until we have enough ++ posted buffers. Check if it is time to enable it */ ++ if (nc2_vmq_is_starting(nc) && ++ (nr_vmq_bufs(nc) >= VMQ_MIN_BUFFERS)) { ++ nc2_vmq_enable(nc); ++ } ++ } ++ ++ /* free vmq skb's returned by the physical device driver */ ++ while (!skb_queue_empty(&nc->vmq.dealloc_queue)) ++ nc2_vmq_free_skb(skb_dequeue(&nc->vmq.dealloc_queue)); ++ ++ /* complete vmq closing after all packets returned by physical ++ * device driver */ ++ ++ if (nc2_vmq_is_closing(nc) && ++ (nr_vmq_bufs(nc) == nr_vmq_mapped_bufs(nc))) { ++ nc->vmq.vmq_state = VMQ_QUEUE_DISABLED; ++ nc2_vmq_shutdown(nc); ++ } ++ ++ spin_lock_irqsave(&vmq->rx_queue.lock, flags); ++ while (!skb_queue_empty(&vmq->rx_queue)) { ++ skb = __skb_dequeue(&nc->vmq.rx_queue); ++ if (prepare_xmit_allocate_vmq(nc, skb) < 0) { ++ __skb_queue_head(&vmq->rx_queue, skb); ++ spin_unlock_irqrestore(&vmq->rx_queue.lock, flags); ++ return; ++ } ++ __skb_queue_tail(&nc->rings.pending_tx_queue, skb); ++ } ++ spin_unlock_irqrestore(&vmq->rx_queue.lock, flags); ++} ++ ++/* Return the netchannel2 device corresponding to the given queue in pdev */ ++static inline struct net_device *nc2_vmq_queue_to_vif(struct net_device *pdev, ++ int queue_id) ++{ ++ net_vmq_t *n_vmq; ++ vmq_queue_t *vmq_q; ++ ++ n_vmq = pdev->vmq; ++ BUG_ON(n_vmq == NULL); ++ vmq_q = &n_vmq->queue[queue_id]; ++ BUG_ON(vmq_q == NULL); ++ ++ return (struct net_device *)vmq_q->guest; ++} ++ ++/* Handle incoming vmq packet */ ++int vmq_netif_rx(struct sk_buff *skb, int queue_id) ++{ ++ struct skb_cb_overlay *skb_co = get_skb_overlay(skb); ++ struct net_device *dev; ++ struct netchannel2 *nc; ++ nc2_vmq_t *vmq; ++ ++ memset(skb_co, 0, sizeof(*skb_co)); ++ ++ skb_co->nr_fragments = skb_shinfo(skb)->nr_frags; ++ skb_co->type = NC2_PACKET_TYPE_pre_posted; ++ skb_co->policy = transmit_policy_vmq; ++ ++ /* get the netchannel2 interface corresponding to this queue */ ++ dev = nc2_vmq_queue_to_vif(skb->dev, queue_id); ++ nc = netdev_priv(dev); ++ vmq = &nc->vmq; ++ ++ /* replace source dev with destination dev */ ++ skb->dev = dev; ++ /* add skb to rx_queue */ ++ skb_queue_tail(&vmq->rx_queue, skb); ++ ++ /* Trigger thread excution to procees new packets */ ++ nc2_kick(&nc->rings); ++ ++ return 0; ++} ++EXPORT_SYMBOL(vmq_netif_rx); ++ ++ ++/* Allocate a socket buffer from the free list, get a guest posted ++ * buffer, attach it to the skb, and return it. 
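++ *
++ * A sketch of a multi-queue driver's RX refill path under this
++ * interface (illustrative; my_hw_post_rx_frags() is a hypothetical
++ * driver helper, and a NULL return simply postpones the refill):
++ *
++ *	skb = vmq_alloc_skb(netdev, queue_id, buf_len);
++ *	if (skb)
++ *		my_hw_post_rx_frags(hw_queue, skb_shinfo(skb)->frags,
++ *				    skb_shinfo(skb)->nr_frags);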
++ */ ++struct sk_buff *vmq_alloc_skb(struct net_device *netdevice, int queue_id, ++ unsigned int length) ++{ ++ struct sk_buff *skb; ++ struct netchannel2 *nc; ++ nc2_vmq_t *vmq; ++ unsigned int idx; ++ int nr_bufs, i; ++ unsigned int cons; ++ unsigned int prod; ++ ++ /* get the netchannel2 interface corresponding to this queue */ ++ nc = netdev_priv(nc2_vmq_queue_to_vif(netdevice, queue_id)); ++ ++ vmq = &nc->vmq; ++ ++ /* Get a free buffer from the pool */ ++ if (skb_queue_empty(&vmq->free_skb_list)) { ++ /* No buffers to allocate */ ++ return NULL; ++ } ++ ++ ++ skb = skb_dequeue(&vmq->free_skb_list); ++ BUG_ON(skb == NULL); ++ ++ nr_bufs = VMQ_NUM_BUFFERS(length); ++ ++ cons = vmq->mapped_pages_cons; ++ prod = vmq->mapped_pages_prod; ++ smp_rmb(); ++ ++ if (nr_bufs > (prod - cons)) ++ /* Not enough mapped buffers in the pool */ ++ goto kick_nc2; ++ ++ if (nr_bufs > MAX_SKB_FRAGS) ++ goto error; ++ ++ for (i = 0; i < nr_bufs; i++) { ++ idx = vmq->mapped_pages[VMQ_IDX_MASK(cons)]; ++ /* FIX ME: This can be simplified */ ++ skb_shinfo(skb)->frags[i].page = ++ virt_to_page(vmq_idx_to_kaddr(vmq, idx)); ++ skb_shinfo(skb)->frags[i].page_offset = 0; ++ skb_shinfo(skb)->frags[i].size = PAGE_SIZE; ++ skb_shinfo(skb)->nr_frags++; ++ skb->dev = netdevice; ++ cons++; ++ } ++ ++ vmq->mapped_pages_cons = cons; ++ ++ /* if number of buffers get low run tasklet to map more buffers */ ++ if (nr_vmq_bufs(nc) < VMQ_MIN_BUFFERS) ++ nc2_kick(&nc->rings); ++ ++ return skb; ++ ++kick_nc2: ++ /* kick netchannel2 interface to get any recently posted buffers */ ++ nc2_kick(&nc->rings); ++error: ++ /* Add the skb back to the free pool */ ++ skb_queue_tail(&vmq->free_skb_list, skb); ++ return NULL; ++} ++EXPORT_SYMBOL(vmq_alloc_skb); ++ ++/* Detach the guest pages and free the socket buffer */ ++void vmq_free_skb(struct sk_buff *skb, int queue_id) ++{ ++ struct net_device *dev; ++ struct netchannel2 *nc; ++ nc2_vmq_t *vmq; ++ ++ /* get the netchannel2 interface corresponding to this queue */ ++ dev = nc2_vmq_queue_to_vif(skb->dev, queue_id); ++ ++ nc = netdev_priv(dev); ++ vmq = &nc->vmq; ++ ++ /* Add skb to the dealloc queue */ ++ skb->dev = dev; ++ skb_queue_tail(&vmq->dealloc_queue, skb); ++ ++ /* kick netchannel2 interface */ ++ nc2_kick(&nc->rings); ++ ++} ++EXPORT_SYMBOL(vmq_free_skb); ++ ++int nc2_is_vmq_packet(struct netchannel2 *nc, struct sk_buff *skb) ++{ ++ int nr_frags; ++ long idx; ++ nc2_vmq_t *vmq = &nc->vmq; ++ ++ nr_frags = skb_shinfo(skb)->nr_frags; ++ if (vmq->vmq_mode && nr_frags && ++ PageForeign(skb_shinfo(skb)->frags[0].page)) { ++ idx = nc2_vmq_page_index(skb_shinfo(skb)->frags[0].page); ++ if ((idx >= 0) && (idx < VMQ_MAX_BUFFERS)) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/* Prepare to transmit a vmq packet */ ++void xmit_vmq(struct netchannel2 *nc, struct sk_buff *skb, ++ volatile void *msg_buf) ++{ ++ volatile struct netchannel2_msg_packet *msg = msg_buf; ++ volatile struct netchannel2_fragment *out_frag; ++ nc2_vmq_t *vmq = &nc->vmq; ++ skb_frag_t *frag; ++ struct nc2_tx_buffer *txbuf; ++ int nr_frags; ++ unsigned int idx; ++ unsigned x; ++ ++ nr_frags = skb_shinfo(skb)->nr_frags; ++ for (x = 0; x < nr_frags; x++) { ++ frag = &skb_shinfo(skb)->frags[x]; ++ out_frag = &msg->frags[x]; ++ ++ idx = nc2_vmq_page_index(frag->page); ++ txbuf = vmq->buffer[idx].buf; ++ out_frag->pre_post.id = txbuf->id; ++ out_frag->off = frag->page_offset; ++ out_frag->size = frag->size; ++ /* TODO: need to batch unmap grants */ ++ nc2_vmq_unmap_buf(nc, idx, 0); ++ } ++ ++ /* Avoid unmapping frags grants 
when skb is freed later */ ++ /* by nc2_vmq_free_skb() */ ++ skb_shinfo(skb)->nr_frags = 0; ++} ++ +diff --git a/drivers/xen/netchannel2/vmq.h b/drivers/xen/netchannel2/vmq.h +new file mode 100644 +index 0000000..18051d4 +--- /dev/null ++++ b/drivers/xen/netchannel2/vmq.h +@@ -0,0 +1,58 @@ ++#ifndef VMQ_H__ ++#define VMQ_H__ ++ ++#include "netchannel2_core.h" ++ ++#ifdef CONFIG_XEN_NETDEV2_VMQ ++ ++int nc2_vmq_connect(struct netchannel2 *nc); ++void nc2_vmq_disconnect(struct netchannel2 *nc); ++void do_vmq_work(struct netchannel2 *nc); ++int nc2_is_vmq_packet(struct netchannel2 *nc, struct sk_buff *skb); ++void xmit_vmq(struct netchannel2 *nc, struct sk_buff *skb, ++ volatile void *msg); ++void vmq_flush_unmap_hypercall(void); ++ ++#define vmq_get(_b) \ ++ atomic_inc(&(_b)->refcnt); ++ ++#define vmq_put(_b) \ ++ do { \ ++ if (atomic_dec_and_test(&(_b)->refcnt)) { \ ++ wake_up(&(_b)->waiting_to_free); \ ++ } \ ++ } while (0) ++ ++static inline int nr_vmq_mapped_bufs(struct netchannel2 *nc) ++{ ++ return nc->vmq.mapped_pages_prod - ++ nc->vmq.mapped_pages_cons; ++} ++ ++static inline int nr_vmq_bufs(struct netchannel2 *nc) ++{ ++ return nc->vmq.nbufs; ++} ++ ++static inline int nc2_in_vmq_mode(struct netchannel2 *nc) ++{ ++ return nc->vmq.vmq_mode; ++} ++ ++#else ++static inline int nc2_vmq_connect(struct netchannel2 *nc) ++{ ++ return 0; ++} ++static inline void nc2_vmq_disconnect(struct netchannel2 *nc) ++{ ++} ++static inline void do_vmq_work(struct netchannel2 *nc) ++{ ++} ++static inline void vmq_flush_unmap_hypercall(void) ++{ ++} ++#endif /* CONFIG_XEN_NETDEV2_VMQ */ ++ ++#endif /* !VMQ_H__ */ +diff --git a/drivers/xen/netchannel2/vmq_def.h b/drivers/xen/netchannel2/vmq_def.h +new file mode 100644 +index 0000000..5c34ac3 +--- /dev/null ++++ b/drivers/xen/netchannel2/vmq_def.h +@@ -0,0 +1,68 @@ ++#ifndef VMQ_DEF_H__ ++#define VMQ_DEF_H__ ++ ++ ++/* size of HW queue in VMQ device */ ++#define VMQ_QUEUE_SIZE 1024 ++ ++/* Mimimum amount of buffers needed for VMQ ++ * This is the lower water mark that triggers mapping more guest buffers ++ * Should be larger than the queue size to allow for in flight packets ++ */ ++#define VMQ_MIN_BUFFERS 1920 ++ ++/* Maximum amount of posted buffers which are reserved for VMQ ++ * Should be less than MAX_POSTED_BUFFERS. For now, the difference can be used ++ * for intra-node guest to guest traffic. When we map guest buffers we try to ++ * have VMQ_MAX_BUFFERS mapped. 
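++ * (A build-time check along these lines would enforce the constraints
++ * stated below; illustrative sketch, not part of this interface:
++ *
++ *	BUILD_BUG_ON(VMQ_MAX_BUFFERS & (VMQ_MAX_BUFFERS - 1));
++ *	BUILD_BUG_ON(VMQ_MIN_BUFFERS <= VMQ_QUEUE_SIZE);
++ *	BUILD_BUG_ON(VMQ_MAX_BUFFERS <= VMQ_MIN_BUFFERS);
++ * )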
++ * The difference (VMQ_MAX_BUFFERS-VMQ_MIN_BUFFERS)
++ * helps batch multiple grant map operations.
++ * VMQ_QUEUE_SIZE < VMQ_MIN_BUFFERS < VMQ_MAX_BUFFERS < MAX_POSTED_BUFFERS
++ * VMQ_MAX_BUFFERS must be a power of 2
++ */
++#define VMQ_MAX_BUFFERS 2048
++
++/* skb size is zero since packet data uses fragments */
++#define VMQ_SKB_SIZE 0
++
++#define VMQ_NUM_BUFFERS(len) (((len) + PAGE_SIZE - 1) / PAGE_SIZE)
++
++#define VMQ_IDX_MASK(_i) ((_i)&(VMQ_MAX_BUFFERS-1))
++
++typedef struct nc2_vmq_buf {
++	struct nc2_tx_buffer *buf;
++	struct netchannel2 *nc;
++} nc2_vmq_buf_t;
++
++typedef struct nc2_vmq {
++	struct net_device *pdev;	/* Pointer to physical device */
++	int vmq_mode;			/* indicate if vif is in vmq mode */
++	struct page **pages;		/* pages for mapping guest RX bufs */
++	struct sk_buff_head free_skb_list; /* Free socket buffer pool */
++	struct sk_buff_head dealloc_queue; /* list of skb's to be freed */
++	struct sk_buff_head rx_queue;	/* list of received packets */
++
++	/* guest mapped buffers */
++	nc2_vmq_buf_t buffer[VMQ_MAX_BUFFERS];
++
++	/* Ring with free pages available for mapping guest RX buffers */
++	u16 unmapped_pages[VMQ_MAX_BUFFERS];
++	unsigned int unmapped_pages_prod;
++	unsigned int unmapped_pages_cons;
++
++	/* Ring of mapped RX pages available for the vmq device */
++	u16 mapped_pages[VMQ_MAX_BUFFERS];
++	unsigned int mapped_pages_prod;
++	unsigned int mapped_pages_cons;
++
++	unsigned int nbufs;	/* number of vmq buffers: posted to */
++				/* HW queue or available to be posted */
++	int vmq_id;		/* Queue id */
++	int vmq_size;		/* Queue size */
++	int vmq_state;		/* queue state */
++
++	atomic_t refcnt;
++	wait_queue_head_t waiting_to_free;
++
++} nc2_vmq_t;
++
++#endif /* !VMQ_DEF_H__ */
+diff --git a/drivers/xen/netchannel2/xmit_packet.c b/drivers/xen/netchannel2/xmit_packet.c
+index 6421d94..6a60cd5 100644
+--- a/drivers/xen/netchannel2/xmit_packet.c
++++ b/drivers/xen/netchannel2/xmit_packet.c
+@@ -3,6 +3,12 @@
+ #include
+ #include
+ #include "netchannel2_core.h"
++#include "vmq.h"
++
++/* You don't normally want to transmit in posted buffers mode, because
++   grant mode is usually faster, but it's sometimes useful for testing
++   the VMQ receiver when you don't have VMQ-capable hardware. */
++#define PREFER_POSTED_BUFFERS 0
+
+ /* We limit the number of transmitted packets which can be in flight
+    at any one time, as a somewhat paranoid safety catch. */
+@@ -15,6 +21,16 @@ static enum transmit_policy transmit_policy(struct netchannel2 *nc,
+ 		return transmit_policy_small;
+ 	else if (nc->remote_trusted)
+ 		return transmit_policy_map;
++	else if (PREFER_POSTED_BUFFERS &&
++		 /* We approximate the number of buffers needed by
++		    skb_shinfo(skb)->nr_frags, which isn't entirely
++		    correct, but isn't that far off, either.  Getting
++		    it wrong just means we'll delay transmission
++		    waiting for more buffers when we should have gone
++		    ahead with the grant policy; not ideal, but hardly a
++		    disaster.
*/ ++ nc->nr_avail_tx_buffers > skb_shinfo(skb)->nr_frags) ++ return transmit_policy_post; + else + return transmit_policy_grant; + } +@@ -76,6 +92,9 @@ enum prepare_xmit_result prepare_xmit_allocate_resources(struct netchannel2 *nc, + case transmit_policy_grant: + r = prepare_xmit_allocate_grant(&nc->rings, skb, 1); + break; ++ case transmit_policy_post: ++ r = prepare_xmit_allocate_post(nc, skb); ++ break; + case transmit_policy_map: + r = prepare_xmit_allocate_grant(&nc->rings, skb, 0); + break; +@@ -171,12 +190,20 @@ int nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp, + set_offload_flags(skb, msg); + + switch (skb_co->policy) { ++#ifdef CONFIG_XEN_NETDEV2_VMQ ++ case transmit_policy_vmq: ++ xmit_vmq(nc, skb, msg); ++ break; ++#endif + case transmit_policy_small: + /* Nothing to do */ + break; + case transmit_policy_grant: + xmit_grant(ncrp, skb, 1, msg); + break; ++ case transmit_policy_post: ++ xmit_post(nc, skb, msg); ++ break; + case transmit_policy_map: + xmit_grant(ncrp, skb, 0, msg); + break; +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index 2b7b804..014dbac 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -753,6 +753,11 @@ struct net_device + #define GSO_MAX_SIZE 65536 + unsigned int gso_max_size; + ++#ifdef CONFIG_NET_VMQ ++ /* multi-queue for virtualization */ ++ struct net_vmq *vmq; ++#endif ++ + #ifdef CONFIG_DCBNL + /* Data Center Bridging netlink ops */ + struct dcbnl_rtnl_ops *dcbnl_ops; +diff --git a/include/linux/netvmq.h b/include/linux/netvmq.h +new file mode 100644 +index 0000000..cd69c5a +--- /dev/null ++++ b/include/linux/netvmq.h +@@ -0,0 +1,399 @@ ++/****************************************************************************** ++ * netvmq.h ++ * ++ * Interface between the I/O virtualization layer and multi-queue devices to ++ * enable direct data placement in guest memory ++ * ++ * Copyright (c) 2008, Jose Renato Santos, Hewlett-Packard Co. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ * ++ */ ++ ++/* ++ * This file defines the vmq API for Linux network device drivers ++ * to enable the use of multi-queue NICs for virtualization. 
++ * The goal is to enable network device drivers to dedicate ++ * each RX queue to a specific guest. This means network ++ * drivers should be able to allocate physical memory from ++ * the set of memory pages assigned to a specific guest. ++ * ++ * The interface between network device drivers and the virtualization ++ * layer has two components: ++ * 1) A set of functions implemented by the virtualization layer that ++ * can be called from new multi-queue network device drivers ++ * 2) A set of new functions implemented by the device drivers to support ++ * multi-queue ++ */ ++ ++#ifndef _NETVMQ_H ++#define _NETVMQ_H ++ ++#ifdef CONFIG_NET_VMQ ++ ++#include ++ ++/* status flags for vmq_queue struct */ ++/* allocated/free queue*/ ++#define _VMQ_queue_allocated (0) ++#define VMQ_queue_allocated (1U<<_VMQ_queue_allocated) ++ ++/* queue type. RX/TX */ ++#define _VMQ_queue_rx (1) ++#define VMQ_queue_rx (1U<<_VMQ_queue_rx) ++ ++/* enabled/disabled queue */ ++#define _VMQ_queue_enabled (2) ++#define VMQ_queue_enabled (1U<<_VMQ_queue_enabled) ++ ++/* queue type used to allocate or check number of available queues */ ++#define VMQ_TYPE_RX (1) ++#define VMQ_TYPE_TX (2) ++#define VMQ_TYPE_TX_RX (VMQ_TYPE_RX | VMQ_TYPE_TX) ++ ++ ++struct vmq_queue { ++ /* queue flags - VMQ_queue_* */ ++ unsigned int flags; ++ /* pointer to opaque struct with guest information */ ++ /* format is specific to the virtualization layer used */ ++ void *guest; ++ /* pointer to opaque struct in device driver */ ++ void *devqueue; ++}; ++typedef struct vmq_queue vmq_queue_t; ++ ++struct net_vmq { ++ /* pointer to device driver specific functions for multi-queue */ ++ ++ int (*avail_queues)(struct net_device *netdev, ++ unsigned int queue_type); ++ int (*alloc_queue)(struct net_device *netdev, ++ unsigned int queue_type); ++ int (*free_queue)(struct net_device *netdev, int queue); ++ int (*get_maxsize)(struct net_device *netdev); ++ int (*get_size)(struct net_device *netdev, int queue); ++ int (*set_size)(struct net_device *netdev, int queue, int size); ++ int (*set_mac)(struct net_device *netdev, int queue, u8 *mac_addr); ++ int (*set_vlan)(struct net_device *netdev, int queue, int vlan_id); ++ int (*enable)(struct net_device *netdev, int queue); ++ int (*disable)(struct net_device *netdev, int queue); ++ ++ /* maximum number of vm queues that device can allocate */ ++ int nvmq; ++ ++ /* Variable size Vector with queues info */ ++ /* nvmq defines the vector size */ ++ vmq_queue_t *queue; ++}; ++typedef struct net_vmq net_vmq_t; ++ ++/** ++ * alloc_vmq - Allocate net_vmq struct used for multi-queue devices ++ * @max_queue: Maximum number of queues that can be allocated ++ * for virtualization ++ */ ++static inline net_vmq_t *alloc_vmq(int max_queues) ++{ ++ net_vmq_t *vmq; ++ vmq = kzalloc(sizeof(net_vmq_t), GFP_KERNEL); ++ if (!vmq) ++ return NULL; ++ vmq->queue = kzalloc(max_queues * sizeof(vmq_queue_t), GFP_KERNEL); ++ if (!vmq->queue) { ++ kfree(vmq); ++ return NULL; ++ } ++ return vmq; ++} ++ ++/** ++ * free_vmq - Free net_vmq struct ++ * @vmq: pointer to net_vmq struct ++ */ ++static inline void free_vmq(net_vmq_t *vmq) ++{ ++ kfree(vmq->queue); ++ kfree(vmq); ++} ++ ++/*================================================================* ++ * 1) Functions provided by the virtualization layer to support * ++ * multi-queue devices. 
* ++ * Device drivers that support multi-queue should use these new * ++ * functions instead of the ones they replace * ++ *================================================================*/ ++ ++ ++/* vmq_alloc_skb : This function should be used instead of the usual ++ * netdev_alloc_skb() in order to post RX buffers to a RX queue ++ * dedicated to a guest. Queues not dedicated to a guest should ++ * use the reguler netdev_alloc_skb() function ++ * ++ * It will return buffers from memory belonging to a given guest ++ * The device driver should not try to change the data alignment ++ * or change the skb data pointer in any way. ++ * The function should already return an skb with the right alignment ++ * ++ * The device driver should be prepared to handle a NULL return value ++ * indicating no memory for that guest is currently available. In this case ++ * the device driver should only postpone the buffer allocation ++ * (probably until the next buffer is used by the device) and continue ++ * operating with the previously posted buffers ++ * ++ * netdev: network device allocating the skb ++ * queue: Queue id of a queue dedicated to a guest ++ * individual queues are identified by a integer in the ++ * the range [0, MAX-1]. Negative values are use to indicate error ++ * The maximum number of queues (MAX) is determined by the device ++ * ++ * length: size to allocate ++ */ ++struct sk_buff *vmq_alloc_skb(struct net_device *netdev, int queue, ++ unsigned int length); ++ ++ ++/* vmq_free_skb : Free an skb allocated with vmq_alloc_skb() ++ * ++ * skb: socket buffer to be freed ++ * qid: Queue id of a queue dedicated to a guest ++ * We could add a qid field in sk_buff struct and avoid passing it ++ * as a parameter in vm_free_skb() and vmq_netif_rx() ++ */ ++void vmq_free_skb(struct sk_buff *skb, int queue); ++ ++/* vmq_alloc_page : Allocate full pages from guest memory. ++ * This can only be used when the device MTU is larger than a page ++ * and multiple pages are neeeded to receive a packet. ++ * ++ * Similarly to vmq_alloc_skb(), ++ * the device driver should be prepared to handle a NULL return value ++ * indicating no memory for that guest is currently available. In this case ++ * the device driver should only postpone the buffer allocation ++ * (probably until the next buffer is used by the device) and continue ++ * operating with the previously posted buffers ++ * ++ * netdev: network device allocating the skb ++ * queue: Queue id of a queue dedicated to a guest ++ * individual queues are identified by a integer in the ++ * the range [0, MAX-1]. Negative values are use to indicate error ++ * The maximum number of queues (MAX) is determined by the device ++ */ ++struct page *vmq_alloc_page(struct net_device *netdev, int queue); ++ ++/* vmq_free_page : Free a guest page allocated with vmq_alloc_page() ++ * ++ * page: page to be freed ++ * queue: Queue id of a queue dedicated to a guest ++ */ ++void vmq_free_page(struct page *page, int queue); ++ ++/* ++ * vmq_netif_rx: This function is a replacement for the generic netif_rx() ++ * and allows packets received on a particular queue to be forwarded directly ++ * to a particular guest bypassing the regular network stack (bridge in xen). ++ * In Xen this function will be implemented by the Xen netback driver. 
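++ *
++ * Illustrative driver-side sequence (the exact sequencing is device
++ * specific; this is a sketch, not a requirement of the interface):
++ *
++ *	on buffer refill:   skb = vmq_alloc_skb(netdev, queue, len);
++ *	on RX completion:   vmq_netif_rx(skb, queue);
++ *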
++ * The use of this function by the driver is optional and may be configured
++ * using a kernel CONFIG option (CONFIG option to be defined)
++ *
++ * skb: Received socket buffer
++ * queue: Queue id of a queue dedicated to a guest
++ */
++int vmq_netif_rx(struct sk_buff *skb, int queue);
++
++/*==============================================================*
++ * 2) New device driver functions for multi-queue devices       *
++ *==============================================================*/
++
++/* vmq_avail_queues: Returns the number of available queues that can be
++ * allocated. It does not include already allocated queues or queues used
++ * for receive side scaling. It should return 0 when vmq_alloc_queue()
++ * would fail.
++ *
++ * netdev: network device
++ * queue_type: Queue type (VMQ_TYPE_*)
++ * RETURN VALUE:
++ * number of available queues;
++ * a negative value indicates error
++ */
++static inline int vmq_avail_queues(struct net_device *netdev,
++				   unsigned int queue_type)
++{
++	if (!netdev->vmq)
++		return -EINVAL;
++	return netdev->vmq->avail_queues(netdev, queue_type);
++}
++
++/* vmq_alloc_queue: allocate a queue
++ *
++ * netdev: network device
++ * queue_type: Queue type (VMQ_TYPE_*)
++ * RETURN VALUE:
++ * queue id of the allocated queue (an integer which cannot exceed or be
++ * equal to the maximum number of queues);
++ * a negative value indicates error
++ */
++static inline int vmq_alloc_queue(struct net_device *netdev,
++				  unsigned int queue_type)
++{
++	if (!netdev->vmq)
++		return -EINVAL;
++	return netdev->vmq->alloc_queue(netdev, queue_type);
++}
++
++/* vmq_free_queue: free a queue previously allocated with vmq_alloc_queue()
++ *
++ * netdev: network device
++ * queue: id of queue to be freed
++ * RETURN VALUE:
++ * a negative value indicates error;
++ * returns 0 on success
++ */
++static inline int vmq_free_queue(struct net_device *netdev, int queue)
++{
++	if (!netdev->vmq)
++		return -EINVAL;
++	return netdev->vmq->free_queue(netdev, queue);
++}
++
++/* vmq_get_maxsize: Get maximum size that can be set for a queue
++ * (max number of HW descriptors)
++ *
++ * netdev: network device
++ * RETURN VALUE:
++ * max size of a queue;
++ * a negative value indicates error
++ */
++static inline int vmq_get_maxsize(struct net_device *netdev)
++{
++	if (!netdev->vmq)
++		return -EINVAL;
++	return netdev->vmq->get_maxsize(netdev);
++}
++
++/* vmq_get_size: Get size of queue (number of HW descriptors)
++ *
++ * netdev: network device
++ * queue: queue id
++ * RETURN VALUE:
++ * size of queue;
++ * a negative value indicates error
++ */
++static inline int vmq_get_size(struct net_device *netdev, int queue)
++{
++	if (!netdev->vmq)
++		return -EINVAL;
++	return netdev->vmq->get_size(netdev, queue);
++}
++
++/* vmq_set_size: Set size of queue (number of HW descriptors)
++ * It can return an error if the size exceeds the maximum HW capability.
++ * We will probably need a function to return the maximum
++ * HW queue size, but we can live without it for now.
++ * netdev: network device
++ * queue: queue id
++ * size: Queue size (number of HW descriptors)
++ * RETURN VALUE:
++ * a negative value indicates error,
++ * returns 0 on success
++ */
++static inline int vmq_set_size(struct net_device *netdev, int queue, int size)
++{
++	if (!netdev->vmq)
++		return -EINVAL;
++	return netdev->vmq->set_size(netdev, queue, size);
++}
++
++/* vmq_set_mac: Set MAC address filter for a queue
++ *
++ * netdev: network device
++ * queue: queue id
++ * mac_addr: pointer to a 6 byte array with the MAC address
++ * MAC
address FF:FF:FF:FF:FF:FF is used to reset the filter ++ * RETURN VALUE: ++ * a negative value indicates error, ++ * returns 0 on success ++ */ ++static inline int vmq_set_mac(struct net_device *netdev, int queue, ++ u8 *mac_addr) ++{ ++ if (!netdev->vmq) ++ return -EINVAL; ++ return netdev->vmq->set_mac(netdev, queue, mac_addr); ++} ++ ++/* vmq_set_vlan: Set VLAN filter for a queue ++ * ++ * netdev: network device ++ * queue: queue id ++ * vlan_id: VLAN id ++ * The invalid VLAN id -1 is used to reset the VLAN filter ++ * RETURN VALUE: ++ * a negative value indicates error, ++ * returns 0 on success ++ */ ++static inline int vmq_set_vlan(struct net_device *netdev, int queue, ++ int vlan_id) ++{ ++ if (!netdev->vmq) ++ return -EINVAL; ++ return netdev->vmq->set_vlan(netdev, queue, vlan_id); ++} ++ ++/* vmq_enable_queue: Enable queue ++ * For receive queues this will trigger allocating and posting buffers ++ * ++ * netdev: network device ++ * queue: queue id ++ * RETURN VALUE: ++ * a negative value indicates error, ++ * returns 0 on success ++ */ ++static inline int vmq_enable_queue(struct net_device *netdev, int queue) ++{ ++ if (!netdev->vmq) ++ return -EINVAL; ++ return netdev->vmq->enable(netdev, queue); ++} ++ ++/* vmq_disable_queue: Disable queue ++ * This will flush all buffers in the queue and will free the respective ++ * skb's or fragment pages ++ * ++ * netdev: network device ++ * queue_id: queue id ++ * RETURN VALUE: ++ * a negative value indicates error, ++ * returns 0 on success ++ */ ++static inline int vmq_disable_queue(struct net_device *netdev, int queue) ++{ ++ if (!netdev->vmq) ++ return -EINVAL; ++ return netdev->vmq->disable(netdev, queue); ++} ++ ++#endif /* CONFIG_NET_VMQ */ ++ ++#endif /* _NETVMQ_H */ +diff --git a/include/xen/interface/io/netchannel2.h b/include/xen/interface/io/netchannel2.h +index 528417c..3f0aebc 100644 +--- a/include/xen/interface/io/netchannel2.h ++++ b/include/xen/interface/io/netchannel2.h +@@ -47,6 +47,11 @@ struct netchannel2_fragment { + grant_ref_t gref; + } receiver_copy; + struct { ++ /* The id of a buffer which previously posted ++ in a POST_BUFFER message. */ ++ uint32_t id; ++ } pre_post; ++ struct { + grant_ref_t gref; + } receiver_map; + }; +@@ -106,6 +111,13 @@ struct netchannel2_msg_packet { + * Due to backend bugs, it is in not safe to use this + * packet type except on bypass rings. + * ++ * pre_posted -- The transmitting domain has copied the packet to ++ * buffers which were previously provided in POST_BUFFER ++ * messages. No FINISH message is required, and it is ++ * an error to send one. ++ * ++ * This packet type may not be used on bypass rings. ++ * + * receiver_map -- The transmitting domain has granted the receiving + * domain access to the original RX buffers using + * full (mappable) grant references. This can be +@@ -134,6 +146,7 @@ struct netchannel2_msg_packet { + * that it is correct to treat receiver_map and small packets as + * receiver_copy ones. */ + #define NC2_PACKET_TYPE_receiver_copy 1 ++#define NC2_PACKET_TYPE_pre_posted 2 + #define NC2_PACKET_TYPE_receiver_map 3 + #define NC2_PACKET_TYPE_small 4 + +@@ -193,6 +206,64 @@ struct netchannel2_msg_set_max_fragments_per_packet { + uint32_t max_frags_per_packet; + }; + ++/* Provide a buffer to the other end. The buffer is initially empty. ++ * The other end is expected to either: ++ * ++ * -- Put some packet data in it, and return it as part of a ++ * pre_posted PACKET message, or ++ * -- Not do anything with it, and return it in a RETURN_BUFFER ++ * message. 
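++ *
++ * An illustrative exchange (the id and size values are arbitrary):
++ *
++ *	A -> B   POST_BUFFER            { id = 7, gref, off_in_page = 0, size = 1500 }
++ *	B -> A   PACKET (pre_posted)    frags[0].pre_post.id = 7
++ *
++ * or, if B decides not to use the buffer,
++ *
++ *	B -> A   RETURN_POSTED_BUFFER   { id = 7 }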
++ * ++ * The other end is allowed to hold on to the buffer for as long as it ++ * wants before returning the buffer. Buffers may be used out of ++ * order. ++ * ++ * This message cannot be sent unless the VM has received a ++ * SET_NR_POSTED_BUFFERS message. The total number of outstanding ++ * buffers must not exceed the limit specified in the ++ * SET_NR_POSTED_BUFFERS message. ++ * ++ * The grant reference should be a whole-page reference, and not a ++ * subpage reference, because the reeciving domain may need to map it ++ * in order to make the buffer available to hardware. The current ++ * Linux implementation doesn't do this, but a future version will. ++ */ ++#define NETCHANNEL2_MSG_POST_BUFFER 6 ++struct netchannel2_msg_post_buffer { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t id; ++ grant_ref_t gref; ++ uint16_t off_in_page; ++ uint16_t size; ++}; ++ ++/* The other end has decided not to use the buffer for some reason ++ * (usually because it's shutting down). The buffer is returned ++ * containing no data. ++ */ ++#define NETCHANNEL2_MSG_RETURN_POSTED_BUFFER 7 ++struct netchannel2_msg_return_posted_buffer { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t id; ++}; ++ ++/* The other end is allowing us to post up to @nr_buffers messages to ++ * us. If @nr_buffers is 0, the use of posted buffers is disabled. ++ * ++ * If there are buffers outstanding, a SET_NR_POSTED_BUFFERS message ++ * implicitly returns all of them, as if they had been returned with a ++ * run of RETURN_POSTED_BUFFER messages. This is true even if ++ * @nr_buffers is unchanged. ++ * ++ * @nr_buffers only ever provides an upper bound on the number of ++ * buffers posted; an endpoint may elect to post less than that. ++ */ ++#define NETCHANNEL2_MSG_SET_NR_POSTED_BUFFERS 8 ++struct netchannel2_msg_set_nr_posted_buffers { ++ struct netchannel2_msg_hdr hdr; ++ uint32_t nr_buffers; ++}; ++ + /* Attach to a bypass ring as a frontend. The receiving domain should + * map the bypass ring (which will be in the sending domain's memory) + * and attach to it in the same as it attached to the original ring. +diff --git a/net/Kconfig b/net/Kconfig +index 0732cb3..27f7d53 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -37,6 +37,12 @@ source "net/unix/Kconfig" + source "net/xfrm/Kconfig" + source "net/iucv/Kconfig" + ++config NET_VMQ ++ bool "Virtual-machine multi-queue support" ++ default n ++ help ++ Add support for the VMQ features of certain modern network cards. ++ + config INET + bool "TCP/IP networking" + ---help--- diff --git a/master/series b/master/series index dec84c8..27ea9db 100644 --- a/master/series +++ b/master/series @@ -302,6 +302,14 @@ oom-debugging oom-debug-me-harder sysrq-loglevel +# +# Netchannel 2 +# +netback_map_foreign +v2_grant_tables +netchannel2 +netchannel2_vmq + # # XenClient # @@ -313,6 +321,5 @@ bridge-carrier blktap2-pause-unpause blktap2-smp-map-unmap bridge-carrier-follows-prio0.patch -v2v-core bsg-add-global-sgio-mutex.patch itpm diff --git a/master/v2_grant_tables b/master/v2_grant_tables new file mode 100644 index 0000000..2d482ce --- /dev/null +++ b/master/v2_grant_tables @@ -0,0 +1,1021 @@ + + V2 grant tables + + Signed-off-by: Steven Smith + +diff --git a/drivers/xen/core/gnttab.c b/drivers/xen/core/gnttab.c +index aaf526d..1ef44d4 100644 +--- a/drivers/xen/core/gnttab.c ++++ b/drivers/xen/core/gnttab.c +@@ -53,19 +53,35 @@ + /* External tools reserve first few grant table entries. 
*/ + #define NR_RESERVED_ENTRIES 8 + #define GNTTAB_LIST_END 0xffffffff +-#define ENTRIES_PER_GRANT_FRAME (PAGE_SIZE / sizeof(grant_entry_t)) ++#define ENTRIES_PER_GRANT_FRAME (grant_table_version == 1 ? \ ++ (PAGE_SIZE / sizeof(grant_entry_v1_t)) : \ ++ (PAGE_SIZE / sizeof(grant_entry_v2_t))) ++ ++static void pending_free_timer(unsigned long ignore); + + static grant_ref_t **gnttab_list; + static unsigned int nr_grant_frames; + static unsigned int boot_max_nr_grant_frames; + static int gnttab_free_count; + static grant_ref_t gnttab_free_head; ++static grant_ref_t gnttab_pending_free_gref_head = GNTTAB_LIST_END; ++static LIST_HEAD(gnttab_pending_free_pages); ++static DEFINE_TIMER(gnttab_delayed_free_timer, pending_free_timer, 0, 0); ++static DEFINE_SPINLOCK(gnttab_pending_free_lock); + static DEFINE_SPINLOCK(gnttab_list_lock); + +-static struct grant_entry *shared; ++static union { ++ grant_entry_v1_t *v1; ++ grant_entry_v2_t *v2; ++ void *raw; ++} shared; ++ ++static grant_status_t *grstatus; + + static struct gnttab_free_callback *gnttab_free_callback_list; + ++static int grant_table_version; ++ + static int gnttab_expand(unsigned int req_entries); + + #define RPP (PAGE_SIZE / sizeof(grant_ref_t)) +@@ -74,6 +90,11 @@ static int gnttab_expand(unsigned int req_entries); + #define nr_freelist_frames(grant_frames) \ + (((grant_frames) * ENTRIES_PER_GRANT_FRAME + RPP - 1) / RPP) + ++#define SPP (PAGE_SIZE / sizeof(grant_status_t)) ++#define nr_status_frames(grant_frames) \ ++ (((grant_frames) * ENTRIES_PER_GRANT_FRAME + SPP - 1) / SPP) ++ ++ + static int get_free_entries(int count) + { + unsigned long flags; +@@ -152,11 +173,7 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, + if (unlikely((ref = get_free_entry()) < 0)) + return -ENOSPC; + +- shared[ref].frame = frame; +- shared[ref].domid = domid; +- wmb(); +- BUG_ON(flags & (GTF_accept_transfer | GTF_reading | GTF_writing)); +- shared[ref].flags = GTF_permit_access | flags; ++ gnttab_grant_foreign_access_ref(ref, domid, frame, flags); + + return ref; + } +@@ -165,53 +182,235 @@ EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); + void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags) + { +- shared[ref].frame = frame; +- shared[ref].domid = domid; +- wmb(); +- BUG_ON(flags & (GTF_accept_transfer | GTF_reading | GTF_writing)); +- shared[ref].flags = GTF_permit_access | flags; ++ BUG_ON(flags & (GTF_accept_transfer | GTF_reading | ++ GTF_writing | GTF_sub_page)); ++ if (grant_table_version == 1) { ++ shared.v1[ref].frame = frame; ++ shared.v1[ref].domid = domid; ++ wmb(); ++ shared.v1[ref].flags = GTF_permit_access | flags; ++ } else { ++ shared.v2[ref].frame = frame; ++ shared.v2[ref].hdr.domid = domid; ++ wmb(); ++ shared.v2[ref].hdr.flags = GTF_permit_access | flags; ++ } + } + EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref); + ++int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame, ++ int flags, unsigned page_off, ++ unsigned length) ++{ ++ int ref; ++ ++ if (unlikely((ref = get_free_entry()) < 0)) ++ return -ENOSPC; ++ ++ gnttab_grant_foreign_access_ref_subpage(ref, domid, frame, flags, ++ page_off, length); ++ ++ return ref; ++} ++EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage); ++ ++void gnttab_grant_foreign_access_ref_subpage(grant_ref_t ref, domid_t domid, ++ unsigned long frame, int flags, ++ unsigned page_off, ++ unsigned length) ++{ ++ BUG_ON(flags & (GTF_accept_transfer | GTF_reading | ++ GTF_writing | GTF_sub_page | 
GTF_permit_access)); ++ BUG_ON(grant_table_version == 1); ++ shared.v2[ref].sub_page.frame = frame; ++ shared.v2[ref].sub_page.page_off = page_off; ++ shared.v2[ref].sub_page.length = length; ++ shared.v2[ref].hdr.domid = domid; ++ wmb(); ++ shared.v2[ref].hdr.flags = GTF_permit_access | GTF_sub_page | flags; ++} ++EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref_subpage); ++ ++int gnttab_subpage_grants_available(void) ++{ ++ return grant_table_version == 2; ++} ++EXPORT_SYMBOL_GPL(gnttab_subpage_grants_available); ++ ++int gnttab_grant_foreign_access_trans(domid_t domid, int flags, ++ domid_t trans_domid, ++ grant_ref_t trans_gref) ++{ ++ int ref; ++ ++ if (unlikely((ref = get_free_entry()) < 0)) ++ return -ENOSPC; ++ ++ gnttab_grant_foreign_access_ref_trans(ref, domid, flags, ++ trans_domid, trans_gref); ++ ++ return ref; ++} ++EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans); ++ ++void gnttab_grant_foreign_access_ref_trans(grant_ref_t ref, domid_t domid, ++ int flags, ++ domid_t trans_domid, ++ grant_ref_t trans_gref) ++{ ++ BUG_ON(flags & (GTF_accept_transfer | GTF_reading | ++ GTF_writing | GTF_sub_page | GTF_permit_access)); ++ BUG_ON(grant_table_version == 1); ++ shared.v2[ref].transitive.trans_domid = trans_domid; ++ shared.v2[ref].transitive.gref = trans_gref; ++ shared.v2[ref].hdr.domid = domid; ++ wmb(); ++ shared.v2[ref].hdr.flags = GTF_permit_access | GTF_transitive | flags; ++} ++EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref_trans); + + int gnttab_query_foreign_access(grant_ref_t ref) + { + u16 nflags; + +- nflags = shared[ref].flags; ++ if (grant_table_version == 1) ++ nflags = shared.v1[ref].flags; ++ else ++ nflags = grstatus[ref]; + +- return (nflags & (GTF_reading|GTF_writing)); ++ return (nflags & (GTF_reading|GTF_writing)); + } + EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); + +-int gnttab_end_foreign_access_ref(grant_ref_t ref) ++static int _gnttab_end_foreign_access_ref(grant_ref_t ref) + { +- u16 flags, nflags; ++ u16 flags, nflags; ++ u16 *pflags; ++ ++ if (grant_table_version == 1) { ++ pflags = &shared.v1[ref].flags; ++ nflags = *pflags; ++ do { ++ if ((flags = nflags) & (GTF_reading|GTF_writing)) { ++ return 0; ++ } ++ } while ((nflags = synch_cmpxchg_subword(pflags, flags, 0)) != ++ flags); ++ return 1; ++ } else { ++ shared.v2[ref].hdr.flags = 0; ++ mb(); ++ if (grstatus[ref] & (GTF_reading|GTF_writing)) { ++ return 0; ++ } else { ++ /* The read of grstatus needs to have acquire ++ semantics. On x86, reads already have ++ that, and we just need to protect against ++ compiler reorderings. On other ++ architectures we may need a full ++ barrier. */ ++#ifdef CONFIG_X86 ++ barrier(); ++#else ++ mb(); ++#endif ++ return 1; ++ } ++ } ++} + +- nflags = shared[ref].flags; +- do { +- if ((flags = nflags) & (GTF_reading|GTF_writing)) { +- printk(KERN_DEBUG "WARNING: g.e. still in use!\n"); +- return 0; ++int gnttab_end_foreign_access_ref(grant_ref_t gref) ++{ ++ int res; ++ res = _gnttab_end_foreign_access_ref(gref); ++ if (res == 0) ++ printk(KERN_DEBUG "WARNING: g.e. still in use!\n"); ++ return res; ++} ++EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); ++ ++static void pending_free_timer(unsigned long ignore) ++{ ++ grant_ref_t gref, next_gref; ++ grant_ref_t prev; /* The last gref which we failed to release, ++ or GNTTAB_LIST_END if there is no such ++ gref. 
*/ ++ int need_mod_timer; ++ struct page *page, *next_page; ++ ++ spin_lock(&gnttab_pending_free_lock); ++ prev = GNTTAB_LIST_END; ++ for (gref = gnttab_pending_free_gref_head; ++ gref != GNTTAB_LIST_END; ++ gref = next_gref) { ++ next_gref = gnttab_entry(gref); ++ if (_gnttab_end_foreign_access_ref(gref)) { ++ put_free_entry(gref); ++ if (prev != GNTTAB_LIST_END) ++ gnttab_entry(prev) = next_gref; ++ else ++ gnttab_pending_free_gref_head = next_gref; ++ } else { ++ prev = gref; + } +- } while ((nflags = synch_cmpxchg_subword(&shared[ref].flags, flags, 0)) != +- flags); ++ } ++ list_for_each_entry_safe(page, next_page, ++ &gnttab_pending_free_pages, lru) { ++ gref = page->index; ++ if (_gnttab_end_foreign_access_ref(gref)) { ++ list_del(&page->lru); ++ put_free_entry(gref); ++ /* The page hasn't been used in this domain ++ for more than a second, so it's probably ++ cold. */ ++ if (put_page_testzero(page)) { ++#ifdef MODULE ++ __free_page(page); ++#else ++ free_cold_page(page); ++#endif ++ } ++ } ++ } ++ ++ need_mod_timer = ++ (gnttab_pending_free_gref_head != GNTTAB_LIST_END) || ++ !list_empty(&gnttab_pending_free_pages); ++ spin_unlock(&gnttab_pending_free_lock); + +- return 1; ++ if (need_mod_timer) ++ mod_timer(&gnttab_delayed_free_timer, jiffies + HZ); + } +-EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); + + void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page) + { +- if (gnttab_end_foreign_access_ref(ref)) { ++ int need_mod_timer; ++ struct page *page_struct; ++ ++ if (_gnttab_end_foreign_access_ref(ref)) { + put_free_entry(ref); + if (page != 0) + free_page(page); + } else { +- /* XXX This needs to be fixed so that the ref and page are +- placed on a list to be freed up later. */ +- printk(KERN_DEBUG +- "WARNING: leaking g.e. and page still in use!\n"); ++ spin_lock_bh(&gnttab_pending_free_lock); ++ if (page == 0) { ++ if (gnttab_pending_free_gref_head == GNTTAB_LIST_END) ++ need_mod_timer = 1; ++ else ++ need_mod_timer = 0; ++ gnttab_entry(ref) = gnttab_pending_free_gref_head; ++ gnttab_pending_free_gref_head = ref; ++ } else { ++ need_mod_timer = ++ list_empty(&gnttab_pending_free_pages); ++ page_struct = virt_to_page((void *)page); ++ page_struct->index = ref; ++ list_add_tail(&page_struct->lru, ++ &gnttab_pending_free_pages); ++ } ++ spin_unlock_bh(&gnttab_pending_free_lock); ++ if (need_mod_timer) ++ mod_timer(&gnttab_delayed_free_timer, jiffies + HZ); + } + } + EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); +@@ -229,42 +428,58 @@ int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn) + EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer); + + void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid, +- unsigned long pfn) ++ unsigned long pfn) + { +- shared[ref].frame = pfn; +- shared[ref].domid = domid; +- wmb(); +- shared[ref].flags = GTF_accept_transfer; ++ if (grant_table_version == 1) { ++ shared.v1[ref].frame = pfn; ++ shared.v1[ref].domid = domid; ++ wmb(); ++ shared.v1[ref].flags = GTF_accept_transfer; ++ } else { ++ shared.v2[ref].frame = pfn; ++ shared.v2[ref].hdr.domid = domid; ++ wmb(); ++ shared.v2[ref].hdr.flags = GTF_accept_transfer; ++ } + } + EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref); + + unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) + { +- unsigned long frame; +- u16 flags; +- +- /* +- * If a transfer is not even yet started, try to reclaim the grant +- * reference and return failure (== 0). 
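
   Illustrative sketch (not part of the patch): what the deferred-free
   path above buys a caller.  Previously a grant the peer still had
   mapped was leaked with a warning; now gnttab_end_foreign_access()
   queues the reference (and the page, if one is given) and
   pending_free_timer() retries once a second until the peer lets go.
   Passing page == 0 still works and defers only the grant reference.
   The helper name is hypothetical.

        /* Tear down a previously shared page.  Safe to call even while
           the remote domain still has the grant mapped: the gref and
           the page are handed over to the grant-table core and
           reclaimed by pending_free_timer() once they fall idle. */
        static void unshare_page(grant_ref_t ref, unsigned long page)
        {
                gnttab_end_foreign_access(ref, page);
                /* Do not touch @page after this point; it is either
                   freed already or owned by the pending-free list. */
        }
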
+- */ +- while (!((flags = shared[ref].flags) & GTF_transfer_committed)) { +- if (synch_cmpxchg_subword(&shared[ref].flags, flags, 0) == flags) +- return 0; +- cpu_relax(); +- } +- +- /* If a transfer is in progress then wait until it is completed. */ +- while (!(flags & GTF_transfer_completed)) { +- flags = shared[ref].flags; +- cpu_relax(); +- } +- +- /* Read the frame number /after/ reading completion status. */ +- rmb(); +- frame = shared[ref].frame; +- BUG_ON(frame == 0); +- +- return frame; ++ unsigned long frame; ++ u16 flags; ++ u16 *pflags; ++ ++ if (grant_table_version == 1) ++ pflags = &shared.v1[ref].flags; ++ else ++ pflags = &shared.v2[ref].hdr.flags; ++ ++ /* ++ * If a transfer is not even yet started, try to reclaim the grant ++ * reference and return failure (== 0). ++ */ ++ while (!((flags = *pflags) & GTF_transfer_committed)) { ++ if (synch_cmpxchg_subword(pflags, flags, 0) == flags) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* If a transfer is in progress then wait until it is completed. */ ++ while (!(flags & GTF_transfer_completed)) { ++ flags = *pflags; ++ cpu_relax(); ++ } ++ ++ /* Read the frame number /after/ reading completion status. */ ++ rmb(); ++ if (grant_table_version == 1) ++ frame = shared.v1[ref].frame; ++ else ++ frame = shared.v2[ref].frame; ++ BUG_ON(frame == 0); ++ ++ return frame; + } + EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref); + +@@ -303,6 +518,41 @@ void gnttab_free_grant_references(grant_ref_t head) + } + EXPORT_SYMBOL_GPL(gnttab_free_grant_references); + ++int gnttab_suballoc_grant_references(u16 count, grant_ref_t *old_head, ++ grant_ref_t *new_head) ++{ ++ grant_ref_t cursor; ++ unsigned nr_allocated; ++ ++ *new_head = cursor = *old_head; ++ if (cursor == GNTTAB_LIST_END) ++ return -ENOSPC; ++ nr_allocated = 1; ++ while (nr_allocated < count) { ++ cursor = gnttab_entry(cursor); ++ if (cursor == GNTTAB_LIST_END) ++ return -ENOSPC; ++ nr_allocated++; ++ } ++ *old_head = gnttab_entry(cursor); ++ gnttab_entry(cursor) = GNTTAB_LIST_END; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(gnttab_suballoc_grant_references); ++ ++void gnttab_subfree_grant_references(grant_ref_t head, grant_ref_t *pool) ++{ ++ grant_ref_t cursor; ++ ++ for (cursor = head; ++ gnttab_entry(cursor) != GNTTAB_LIST_END; ++ cursor = gnttab_entry(cursor)) ++ ; ++ gnttab_entry(cursor) = *pool; ++ *pool = head; ++} ++EXPORT_SYMBOL_GPL(gnttab_subfree_grant_references); ++ + int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) + { + int h = get_free_entries(count); +@@ -435,6 +685,30 @@ static inline unsigned int max_nr_grant_frames(void) + return xen_max; + } + ++static void gnttab_request_version(void) ++{ ++ int rc; ++ struct gnttab_set_version gsv; ++ ++ gsv.version = 2; ++ rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); ++ if (rc == 0) { ++ grant_table_version = 2; ++ printk(KERN_NOTICE "Using V2 grant tables.\n"); ++ } else { ++ if (grant_table_version == 2) { ++ /* If we've already used version 2 features, ++ but then suddenly discover that they're not ++ available (e.g. migrating to an older ++ version of Xen), almost unbounded badness ++ can happen. 
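
   Illustrative sketch (not part of the patch): carving a fixed quota
   of grant references out of an existing private pool with the new
   suballoc/subfree helpers, then handing it back.  Pool names and
   sizes are made up and error handling is abbreviated; individual
   references are still taken from either pool with
   gnttab_claim_grant_reference() as before.

        #include <xen/gnttab.h>

        #define MAIN_POOL_SIZE  256     /* made-up sizes */
        #define RING_QUOTA      16

        static grant_ref_t main_pool;   /* head of the driver's private
                                           free list */

        static int setup_ring_pool(grant_ref_t *ring_pool)
        {
                int err;

                err = gnttab_alloc_grant_references(MAIN_POOL_SIZE,
                                                    &main_pool);
                if (err < 0)
                        return err;

                /* Detach RING_QUOTA references from main_pool onto
                   *ring_pool. */
                err = gnttab_suballoc_grant_references(RING_QUOTA,
                                                       &main_pool,
                                                       ring_pool);
                if (err < 0) {
                        gnttab_free_grant_references(main_pool);
                        return err;
                }
                return 0;
        }

        static void teardown_ring_pool(grant_ref_t *ring_pool)
        {
                /* Splice the sub-pool back into main_pool ... */
                gnttab_subfree_grant_references(*ring_pool, &main_pool);
                /* ... and hand everything back to the global free list. */
                gnttab_free_grant_references(main_pool);
        }
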
*/ ++ panic("we need grant tables version 2, but only version 1 is available"); ++ } ++ grant_table_version = 1; ++ printk(KERN_WARNING "Using legacy V1 grant tables; upgrade to a newer version of Xen.\n"); ++ } ++} ++ + #ifdef CONFIG_XEN + + static DEFINE_SEQLOCK(gnttab_dma_lock); +@@ -450,6 +724,16 @@ static int map_pte_fn(pte_t *pte, struct page *pmd_page, + return 0; + } + ++static int map_pte_fn_status(pte_t *pte, struct page *pmd_page, ++ unsigned long addr, void *data) ++{ ++ uint64_t **frames = (uint64_t **)data; ++ ++ set_pte_at(&init_mm, addr, pte, pfn_pte_ma((*frames)[0], PAGE_KERNEL)); ++ (*frames)++; ++ return 0; ++} ++ + #ifdef CONFIG_PM_SLEEP + static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +@@ -467,45 +751,97 @@ void *arch_gnttab_alloc_shared(unsigned long *frames) + BUG_ON(area == NULL); + return area->addr; + } ++ ++void *arch_gnttab_alloc_status(uint64_t *frames) ++{ ++ struct vm_struct *area; ++ area = alloc_vm_area(PAGE_SIZE * ++ nr_status_frames(boot_max_nr_grant_frames)); ++ BUG_ON(area == NULL); ++ return area->addr; ++} + #endif /* CONFIG_X86 */ + + static int gnttab_map(unsigned int start_idx, unsigned int end_idx) + { +- struct gnttab_setup_table setup; +- unsigned long *frames; +- unsigned int nr_gframes = end_idx + 1; +- int rc; +- +- frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); +- if (!frames) +- return -ENOMEM; +- +- setup.dom = DOMID_SELF; +- setup.nr_frames = nr_gframes; +- set_xen_guest_handle(setup.frame_list, frames); +- +- rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1); +- if (rc == -ENOSYS) { +- kfree(frames); +- return -ENOSYS; +- } +- +- BUG_ON(rc || setup.status); +- +- if (shared == NULL) +- shared = arch_gnttab_alloc_shared(frames); ++ struct gnttab_setup_table setup; ++ unsigned long *gframes; ++ uint64_t *sframes; ++ unsigned int nr_gframes = end_idx + 1; ++ unsigned int nr_sframes; ++ int rc; ++ ++ BUG_ON(grant_table_version == 0); ++ ++ gframes = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); ++ if (!gframes) ++ return -ENOMEM; ++ ++ setup.dom = DOMID_SELF; ++ setup.nr_frames = nr_gframes; ++ set_xen_guest_handle(setup.frame_list, gframes); ++ ++ rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1); ++ if (rc == -ENOSYS) { ++ kfree(gframes); ++ return -ENOSYS; ++ } ++ ++ BUG_ON(rc || setup.status); ++ ++ if (shared.raw == NULL) ++ shared.raw = arch_gnttab_alloc_shared(gframes); ++ ++ if (grant_table_version > 1) { ++ struct gnttab_get_status_frames getframes; ++ ++ nr_sframes= nr_status_frames(nr_gframes); ++ ++ sframes = kmalloc(nr_sframes * sizeof(uint64_t), ++ GFP_ATOMIC); ++ if (!sframes) { ++ kfree(gframes); ++ return -ENOMEM; ++ } ++ getframes.dom = DOMID_SELF; ++ getframes.nr_frames = nr_sframes; ++ getframes.frame_list = (unsigned long)sframes; ++ ++ rc = HYPERVISOR_grant_table_op(GNTTABOP_get_status_frames, ++ &getframes, 1); ++ if (rc == -ENOSYS) { ++ kfree(gframes); ++ kfree(sframes); ++ return -ENOSYS; ++ } ++ ++ BUG_ON(rc || getframes.status); ++ ++ if (grstatus == NULL) ++ grstatus = arch_gnttab_alloc_status(sframes); ++ } + + #ifdef CONFIG_X86 +- rc = apply_to_page_range(&init_mm, (unsigned long)shared, +- PAGE_SIZE * nr_gframes, +- map_pte_fn, &frames); +- BUG_ON(rc); +- frames -= nr_gframes; /* adjust after map_pte_fn() */ ++ rc = apply_to_page_range(&init_mm, (unsigned long)shared.raw, ++ PAGE_SIZE * nr_gframes, ++ map_pte_fn, &gframes); ++ BUG_ON(rc); ++ gframes -= nr_gframes; /* adjust after map_pte_fn() */ ++ ++ if 
(grant_table_version > 1) { ++ rc = apply_to_page_range(&init_mm, (unsigned long)grstatus, ++ PAGE_SIZE * nr_sframes, ++ map_pte_fn_status, &sframes); ++ BUG_ON(rc); ++ sframes -= nr_sframes; /* adjust after map_pte_fn() */ ++ } + #endif /* CONFIG_X86 */ + +- kfree(frames); ++ kfree(gframes); ++ if (grant_table_version > 1) ++ kfree(sframes); + +- return 0; ++ return 0; + } + + static void gnttab_page_free(struct page *page, unsigned int order) +@@ -710,6 +1046,7 @@ EXPORT_SYMBOL(gnttab_post_map_adjust); + + static int gnttab_resume(struct sys_device *dev) + { ++ gnttab_request_version(); + if (max_nr_grant_frames() < nr_grant_frames) + return -ENOSYS; + return gnttab_map(0, nr_grant_frames - 1); +@@ -720,10 +1057,13 @@ static int gnttab_resume(struct sys_device *dev) + #ifdef CONFIG_X86 + static int gnttab_suspend(struct sys_device *dev, pm_message_t state) + { +- apply_to_page_range(&init_mm, (unsigned long)shared, +- PAGE_SIZE * nr_grant_frames, +- unmap_pte_fn, NULL); +- return 0; ++ apply_to_page_range(&init_mm, (unsigned long)shared.raw, ++ PAGE_SIZE * nr_grant_frames, ++ unmap_pte_fn, NULL); ++ apply_to_page_range(&init_mm, (unsigned long)grstatus, ++ PAGE_SIZE * nr_status_frames(nr_grant_frames), ++ unmap_pte_fn, NULL); ++ return 0; + } + #else + #define gnttab_suspend NULL +@@ -745,7 +1085,8 @@ static struct sys_device device_gnttab = { + + #include + +-static unsigned long resume_frames; ++static unsigned long resume_frames_gnttab; ++static unsigned long resume_frames_status; + + static int gnttab_map(unsigned int start_idx, unsigned int end_idx) + { +@@ -759,9 +1100,26 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) + xatp.domid = DOMID_SELF; + xatp.idx = i; + xatp.space = XENMAPSPACE_grant_table; +- xatp.gpfn = (resume_frames >> PAGE_SHIFT) + i; +- if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) +- BUG(); ++ xatp.gpfn = (resume_frames_gnttab >> PAGE_SHIFT) + i; ++ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) ++ BUG(); ++ } while (i-- > start_idx); ++ ++ return 0; ++} ++ ++static int gnttab_map_status(unsigned int start_idx, unsigned int end_idx) ++{ ++ struct xen_add_to_physmap xatp; ++ unsigned int i = end_idx; ++ ++ do { ++ xatp.domid = DOMID_SELF; ++ xatp.idx = i | XENMAPIDX_grant_table_status; ++ xatp.space = XENMAPSPACE_grant_table; ++ xatp.gpfn = (resume_frames_status >> PAGE_SHIFT) + i; ++ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) ++ BUG(); + } while (i-- > start_idx); + + return 0; +@@ -769,24 +1127,46 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) + + int gnttab_resume(void) + { +- unsigned int max_nr_gframes, nr_gframes; +- +- nr_gframes = nr_grant_frames; +- max_nr_gframes = max_nr_grant_frames(); +- if (max_nr_gframes < nr_gframes) +- return -ENOSYS; +- +- if (!resume_frames) { +- resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); +- shared = ioremap(resume_frames, PAGE_SIZE * max_nr_gframes); +- if (shared == NULL) { +- printk("error to ioremap gnttab share frames\n"); +- return -1; +- } ++ unsigned int max_nr_gframes, nr_gframes; ++ unsigned int max_nr_sframes, nr_sframes; ++ ++ gnttab_request_version(); ++ ++ nr_gframes = nr_grant_frames; ++ max_nr_gframes = max_nr_grant_frames(); ++ if (max_nr_gframes < nr_gframes) ++ return -ENOSYS; ++ ++ if (!resume_frames_gnttab) { ++ resume_frames_gnttab = ++ alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); ++ shared.raw = ioremap(resume_frames_gnttab, ++ PAGE_SIZE * max_nr_gframes); ++ if (shared.raw == NULL) { ++ printk("error to ioremap 
gnttab share frames\n"); ++ return -1; ++ } + } + + gnttab_map(0, nr_gframes - 1); + ++ if (grant_table_version > 1) { ++ nr_sframes = nr_status_frames(nr_gframes); ++ max_nr_sframes = nr_status_frames(max_nr_gframes); ++ if (!resume_frames_status) { ++ resume_frames_status = ++ alloc_xen_mmio(PAGE_SIZE * max_nr_sframes); ++ grstatus = ioremap(resume_frames_status, ++ PAGE_SIZE * max_nr_sframes); ++ if (grstatus == NULL) { ++ printk("error ioremap()ing gnttab status frames\n"); ++ return -1; ++ } ++ } ++ ++ gnttab_map_status(0, nr_sframes - 1); ++ } ++ + return 0; + } + +diff --git a/include/xen/gnttab.h b/include/xen/gnttab.h +index bde65fd..b4610d9 100644 +--- a/include/xen/gnttab.h ++++ b/include/xen/gnttab.h +@@ -54,6 +54,22 @@ struct gnttab_free_callback { + int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, + int flags); + ++int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame, ++ int flags, unsigned page_off, ++ unsigned length); ++ ++void gnttab_grant_foreign_access_ref_trans(grant_ref_t ref, domid_t domid, ++ int flags, ++ domid_t trans_domid, ++ grant_ref_t trans_gref); ++ ++/* ++ * Are sub-page grants available on this version of Xen? Returns 1 if ++ * they are, and 0 if they're not. ++ */ ++int gnttab_subpage_grants_available(void); ++ ++ + /* + * End access through the given grant reference, iff the grant entry is no + * longer in use. Return 1 if the grant entry was freed, 0 if it is still in +@@ -81,10 +97,15 @@ int gnttab_query_foreign_access(grant_ref_t ref); + */ + int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head); + ++int gnttab_suballoc_grant_references(u16 count, grant_ref_t *old_head, ++ grant_ref_t *new_head); ++ + void gnttab_free_grant_reference(grant_ref_t ref); + + void gnttab_free_grant_references(grant_ref_t head); + ++void gnttab_subfree_grant_references(grant_ref_t head, grant_ref_t *pool); ++ + int gnttab_empty_grant_references(const grant_ref_t *pprivate_head); + + int gnttab_claim_grant_reference(grant_ref_t *pprivate_head); +@@ -98,6 +119,14 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); + + void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags); ++void gnttab_grant_foreign_access_ref_subpage(grant_ref_t ref, domid_t domid, ++ unsigned long frame, int flags, ++ unsigned page_off, ++ unsigned length); ++void gnttab_grant_foreign_access_ref_trans(grant_ref_t ref, domid_t domid, ++ int flags, ++ domid_t trans_domid, ++ grant_ref_t trans_gref); + + void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, + unsigned long pfn); +diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h +index c5c2044..20c4915 100644 +--- a/include/xen/interface/grant_table.h ++++ b/include/xen/interface/grant_table.h +@@ -84,12 +84,22 @@ + */ + + /* ++ * Reference to a grant entry in a specified domain's grant table. ++ */ ++typedef uint32_t grant_ref_t; ++ ++/* + * A grant table comprises a packed array of grant entries in one or more + * page frames shared between Xen and a guest. + * [XEN]: This field is written by Xen and read by the sharing guest. + * [GST]: This field is written by the guest and read by Xen. + */ +-struct grant_entry { ++ ++/* ++ * Version 1 of the grant table entry structure is maintained purely ++ * for backwards compatibility. New guests should use version 2. ++ */ ++struct grant_entry_v1 { + /* GTF_xxx: various type and flag information. 
[XEN,GST] */ + uint16_t flags; + /* The domain being granted foreign privileges. [GST] */ +@@ -100,7 +110,7 @@ struct grant_entry { + */ + uint32_t frame; + }; +-typedef struct grant_entry grant_entry_t; ++typedef struct grant_entry_v1 grant_entry_v1_t; + + /* + * Type of grant entry. +@@ -108,10 +118,13 @@ typedef struct grant_entry grant_entry_t; + * GTF_permit_access: Allow @domid to map/access @frame. + * GTF_accept_transfer: Allow @domid to transfer ownership of one page frame + * to this guest. Xen writes the page number to @frame. ++ * GTF_transitive: Allow @domid to transitively access a subrange of ++ * @trans_grant in @trans_domid. No mappings are allowed. + */ + #define GTF_invalid (0U<<0) + #define GTF_permit_access (1U<<0) + #define GTF_accept_transfer (2U<<0) ++#define GTF_transitive (3U<<0) + #define GTF_type_mask (3U<<0) + + /* +@@ -120,6 +133,9 @@ typedef struct grant_entry grant_entry_t; + * GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN] + * GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN] + * GTF_PAT, GTF_PWT, GTF_PCD: (x86) cache attribute flags for the grant [GST] ++ * GTF_sub_page: Grant access to only a subrange of the page. @domid ++ * will only be allowed to copy from the grant, and not ++ * map it. [GST] + */ + #define _GTF_readonly (2) + #define GTF_readonly (1U<<_GTF_readonly) +@@ -133,6 +149,8 @@ typedef struct grant_entry grant_entry_t; + #define GTF_PCD (1U<<_GTF_PCD) + #define _GTF_PAT (7) + #define GTF_PAT (1U<<_GTF_PAT) ++#define _GTF_sub_page (8) ++#define GTF_sub_page (1U<<_GTF_sub_page) + + /* + * Subflags for GTF_accept_transfer: +@@ -149,15 +167,76 @@ typedef struct grant_entry grant_entry_t; + #define _GTF_transfer_completed (3) + #define GTF_transfer_completed (1U<<_GTF_transfer_completed) + ++/* ++ * Version 2 grant table entries. These fulfil the same role as ++ * version 1 entries, but can represent more complicated operations. ++ * Any given domain will have either a version 1 or a version 2 table, ++ * and every entry in the table will be the same version. ++ * ++ * The interface by which domains use grant references does not depend ++ * on the grant table version in use by the other domain. ++ */ + +-/*********************************** +- * GRANT TABLE QUERIES AND USES ++/* ++ * Version 1 and version 2 grant entries share a common prefix. The ++ * fields of the prefix are documented as part of struct ++ * grant_entry_v1. + */ ++struct grant_entry_header { ++ uint16_t flags; ++ domid_t domid; ++}; ++typedef struct grant_entry_header grant_entry_header_t; + + /* +- * Reference to a grant entry in a specified domain's grant table. ++ * Version 2 of the grant entry structure. ++ */ ++struct grant_entry_v2 { ++ grant_entry_header_t hdr; ++ union { ++ /* ++ * The frame to which we are granting access. This field has ++ * the same meaning as the grant_entry_v1 field of the same ++ * name. ++ */ ++ uint32_t frame; ++ ++ /* ++ * If the grant type is GTF_grant_access and GTF_sub_page is ++ * set, @domid is allowed to access bytes ++ * [@page_off,@page_off+@length) in frame @frame. ++ */ ++ struct { ++ uint32_t frame; ++ uint16_t page_off; ++ uint16_t length; ++ } sub_page; ++ ++ /* ++ * If the grant is GTF_transitive, @domid is allowed to use ++ * the grant @gref in domain @trans_domid, as if it was the ++ * local domain. Obviously, the transitive access must be ++ * compatible with the original grant. ++ * ++ * The current version of Xen does not allow transitive grants ++ * to be mapped. 
++ */ ++ struct { ++ domid_t trans_domid; ++ uint16_t pad0; ++ grant_ref_t gref; ++ } transitive; ++ ++ uint32_t __spacer[3]; /* Pad to a power of two */ ++ }; ++}; ++typedef struct grant_entry_v2 grant_entry_v2_t; ++ ++typedef uint16_t grant_status_t; ++ ++/*********************************** ++ * GRANT TABLE QUERIES AND USES + */ +-typedef uint32_t grant_ref_t; + + /* + * Handle to track a mapping created via a grant reference. +@@ -365,6 +444,46 @@ struct gnttab_unmap_and_replace { + typedef struct gnttab_unmap_and_replace gnttab_unmap_and_replace_t; + DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t); + ++/* ++ * GNTTABOP_set_version: Request a particular version of the grant ++ * table shared table structure. This operation can only be performed ++ * once in any given domain. It must be performed before any grants ++ * are activated; otherwise, the domain will be stuck with version 1. ++ * The only defined versions are 1 and 2. ++ */ ++#define GNTTABOP_set_version 8 ++struct gnttab_set_version { ++ /* IN parameters */ ++ uint32_t version; ++}; ++DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_set_version); ++typedef struct gnttab_set_version gnttab_set_version_t; ++DEFINE_XEN_GUEST_HANDLE(gnttab_set_version_t); ++ ++/* ++ * GNTTABOP_get_status_frames: Get the list of frames used to store grant ++ * status for . In grant format version 2, the status is separated ++ * from the other shared grant fields to allow more efficient synchronization ++ * using barriers instead of atomic cmpexch operations. ++ * specify the size of vector . ++ * The frame addresses are returned in the . ++ * Only addresses are returned, even if the table is larger. ++ * NOTES: ++ * 1. may be specified as DOMID_SELF. ++ * 2. Only a sufficiently-privileged domain may specify != DOMID_SELF. ++ */ ++#define GNTTABOP_get_status_frames 9 ++struct gnttab_get_status_frames { ++ /* IN parameters. */ ++ uint32_t nr_frames; ++ domid_t dom; ++ /* OUT parameters. */ ++ int16_t status; /* GNTST_* */ ++ uint64_t frame_list; ++}; ++DEFINE_XEN_GUEST_HANDLE_STRUCT(gnttab_get_status_frames); ++typedef struct gnttab_get_status_frames gnttab_get_status_frames_t; ++DEFINE_XEN_GUEST_HANDLE(gnttab_get_status_frames_t); + + /* + * Bitfield values for gnttab_map_grant_ref.flags.
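
   Illustrative sketch (not part of the patch): forwarding access with
   a transitive grant.  A grant that this domain received from
   @src_dom can be re-exported to @dst_dom without copying the data;
   @dst_dom can only use it with grant-copy, since transitive grants
   cannot be mapped.  gnttab_subpage_grants_available() doubles as a
   v2 check here (it simply tests grant_table_version == 2); the
   helper name is hypothetical.

        #include <xen/gnttab.h>

        /* Re-export a grant we received from @src_dom (reference
           @src_gref in @src_dom's table) so that @dst_dom can
           grant-copy from it.  Returns the new grant reference,
           -ENOSPC if our table is full, or -ENOSYS on a v1 table. */
        static int forward_grant(domid_t dst_dom, domid_t src_dom,
                                 grant_ref_t src_gref)
        {
                if (!gnttab_subpage_grants_available())
                        return -ENOSYS; /* transitive grants need v2 */

                return gnttab_grant_foreign_access_trans(dst_dom, 0,
                                                         src_dom, src_gref);
        }
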
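
   For reference, the sizing assumptions behind ENTRIES_PER_GRANT_FRAME,
   SPP and nr_status_frames() earlier in this patch, assuming 4 KiB
   pages.  The checks below are illustrative only and not part of the
   patch.

        #include <linux/kernel.h>
        #include <xen/interface/grant_table.h>

        static inline void grant_layout_checks(void)
        {
                /* A v1 entry is 8 bytes, so one 4 KiB frame holds
                   512 entries. */
                BUILD_BUG_ON(sizeof(grant_entry_v1_t) != 8);

                /* A v2 entry is padded to 16 bytes, halving the
                   per-frame entry count to 256. */
                BUILD_BUG_ON(sizeof(grant_entry_v2_t) != 16);

                /* Status words are 2 bytes, so one status frame covers
                   2048 entries, i.e. one status frame per eight v2
                   grant frames. */
                BUILD_BUG_ON(sizeof(grant_status_t) != 2);
        }
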