direct-io.hg
changeset 379:f5c2415f65e3
bitkeeper revision 1.179.1.1 (3e9ee075wJmtFBkJEk-QAC5VB7htXg)
network.c, dev.c, vif.h, sched.h, network.h, TODO:
Fixed network rings so we can have out-of-order responses. This made it possible to fix local packet delivery. However, the virtual firewall/router stuff needs urgent redesigning.
author   | kaf24@scramble.cl.cam.ac.uk
date     | Thu Apr 17 17:12:21 2003 +0000 (2003-04-17)
parents  | 1585992989d0
children | 5e482605e7d8
files    | xen/TODO xen/common/network.c xen/include/hypervisor-ifs/network.h xen/include/xeno/sched.h xen/include/xeno/vif.h xen/net/dev.c xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/network/network.c
line diff
1.1 --- a/xen/TODO Thu Apr 17 12:26:14 2003 +0000 1.2 +++ b/xen/TODO Thu Apr 17 17:12:21 2003 +0000 1.3 @@ -7,20 +7,7 @@ longer-term goals. 1.4 -- Keir (16/3/03) 1.5 1.6 1.7 -1. FIX HANDLING OF NETWORK RINGS 1.8 --------------------------------- 1.9 -Handling of the transmit rings is currently very broken (for example, 1.10 -sending an inter-domain packet will wedge the hypervisor). This is 1.11 -because we may handle packets out of order (eg. inter-domain packets 1.12 -are handled eagerly, while packets for real interfaces are queued), 1.13 -but our current ring design really assumes in-order handling. 1.14 - 1.15 -A neat fix will be to allow responses to be queued in a different 1.16 -order to requests, just as we already do with block-device 1.17 -rings. We'll need to add an opaque identifier to ring entries, 1.18 -allowing matching of requests and responses, but that's about it. 1.19 - 1.20 -2. ACCURATE TIMERS AND WALL-CLOCK TIME 1.21 +1. ACCURATE TIMERS AND WALL-CLOCK TIME 1.22 -------------------------------------- 1.23 Currently our long-term timebase free runs on CPU0, with no external 1.24 calibration. We should run ntpd on domain 0 and allow this to warp 1.25 @@ -28,7 +15,7 @@ Xen's timebase. Once this is done, we ca 1.26 not worry about relative drift (since they'll all get sync'ed 1.27 periodically by ntp). 1.28 1.29 -3. ASSIGNING DOMAINS TO PROCESSORS 1.30 +2. ASSIGNING DOMAINS TO PROCESSORS 1.31 ---------------------------------- 1.32 More intelligent assignment of domains to processors. In 1.33 particular, we don't play well with hyperthreading: we will assign 1.34 @@ -40,17 +27,17 @@ relationships between processors in the 1.35 siblings in the same package). We then use this to balance domains 1.36 across packages, and across virtual processors within a package. 1.37 1.38 -4. PROPER DESTRUCTION OF DOMAINS 1.39 --------------------------------- 1.40 -Currently we do not free resources when destroying a domain. This is 1.41 -because they may be tied up in subsystems, and there is no way of 1.42 -pulling them back in a safe manner. 1.43 +3. DOMAIN 0 MANAGEMENT DAEMON 1.44 +----------------------------- 1.45 +A better control daemon is required for domain 0, which keeps proper 1.46 +track of machine resources and can make sensible policy choices. This 1.47 +may require support in Xen; for example, notifications (eg. DOMn is 1.48 +killed), and requests (eg. can DOMn allocate x frames of memory?). 1.49 1.50 -The fix is probably to reference count resources and automatically 1.51 -free them when the count reaches zero. We may get away with one count 1.52 -per domain (for all its resources). When this reaches zero we know it 1.53 -is safe to free everything: block-device rings, network rings, and all 1.54 -the rest. 1.55 +4. SANE NETWORK ROUTING 1.56 +----------------------- 1.57 +The current virtual firewall/router is completely broken. Needs a new 1.58 +design and implementation! 1.59 1.60 5. NETWORK CHECKSUM OFFLOAD 1.61 --------------------------- 1.62 @@ -60,14 +47,7 @@ indicate, on transmit, which packets nee 1.63 receive, which packets have been checked out as okay. We can steal 1.64 Linux's interface, which is entirely sane given NIC limitations. 1.65 1.66 -6. DOMAIN 0 MANAGEMENT DAEMON 1.67 ------------------------------ 1.68 -A better control daemon is required for domain 0, which keeps proper 1.69 -track of machine resources and can make sensible policy choices. This 1.70 -may require support in Xen; for example, notifications (eg. 
DOMn is 1.71 -killed), and requests (eg. can DOMn allocate x frames of memory?). 1.72 - 1.73 -7. MODULE SUPPORT FOR XEN 1.74 +6. MODULE SUPPORT FOR XEN 1.75 ------------------------- 1.76 Network and blkdev drivers are bloating Xen. At some point we want to 1.77 build drivers as modules, stick them in a cheesy ramfs, then relocate 1.78 @@ -79,7 +59,7 @@ which drivers to load. 1.79 Most of the hard stuff (relocating and the like) is done for us by 1.80 Linux's module system. 1.81 1.82 -8. NEW DESIGN FEATURES 1.83 +7. NEW DESIGN FEATURES 1.84 ---------------------- 1.85 This includes the last-chance page cache, and the unified buffer cache. 1.86
2.1 --- a/xen/common/network.c Thu Apr 17 12:26:14 2003 +0000 2.2 +++ b/xen/common/network.c Thu Apr 17 17:12:21 2003 +0000 2.3 @@ -5,7 +5,7 @@ 2.4 * with the virtual interfaces (vifs) and the virtual firewall/router through 2.5 * the use of rules. 2.6 * 2.7 - * Copyright (c) 2002, A K Warfield and K A Fraser 2.8 + * Copyright (c) 2002-2003, A K Warfield and K A Fraser 2.9 */ 2.10 2.11 #include <hypervisor-ifs/network.h> 2.12 @@ -67,7 +67,8 @@ net_vif_t *create_net_vif(int domain) 2.13 2.14 shadow_ring = kmalloc(sizeof(net_shadow_ring_t), GFP_KERNEL); 2.15 if ( shadow_ring == NULL ) goto fail; 2.16 - 2.17 + memset(shadow_ring, 0, sizeof(*shadow_ring)); 2.18 + 2.19 shadow_ring->rx_ring = kmalloc(RX_RING_SIZE 2.20 * sizeof(rx_shadow_entry_t), GFP_KERNEL); 2.21 shadow_ring->tx_ring = kmalloc(TX_RING_SIZE 2.22 @@ -75,9 +76,6 @@ net_vif_t *create_net_vif(int domain) 2.23 if ( (shadow_ring->rx_ring == NULL) || (shadow_ring->tx_ring == NULL) ) 2.24 goto fail; 2.25 2.26 - shadow_ring->rx_prod = shadow_ring->rx_cons = shadow_ring->rx_idx = 0; 2.27 - shadow_ring->tx_prod = shadow_ring->tx_cons = shadow_ring->tx_idx = 0; 2.28 - 2.29 /* 2.30 * Fill in the new vif struct. Note that, while the vif's refcnt is 2.31 * non-zero, we hold a reference to the task structure. 2.32 @@ -121,7 +119,7 @@ void destroy_net_vif(net_vif_t *vif) 2.33 /* Return any outstanding receive buffers to the guest OS. */ 2.34 spin_lock_irqsave(&p->page_lock, flags); 2.35 for ( i = vif->shadow_ring->rx_idx; 2.36 - i != vif->shadow_ring->rx_prod; 2.37 + i != vif->shadow_ring->rx_req_cons; 2.38 i = ((i+1) & (RX_RING_SIZE-1)) ) 2.39 { 2.40 rx_shadow_entry_t *rx = vif->shadow_ring->rx_ring + i; 2.41 @@ -263,7 +261,7 @@ void add_default_net_rule(int vif_id, u3 2.42 memset(&new_rule, 0, sizeof(net_rule_t)); 2.43 new_rule.dst_addr = ipaddr; 2.44 new_rule.dst_addr_mask = 0xffffffff; 2.45 - new_rule.src_interface = VIF_PHYSICAL_INTERFACE; 2.46 + new_rule.src_interface = VIF_ANY_INTERFACE; 2.47 new_rule.dst_interface = vif_id; 2.48 new_rule.action = NETWORK_ACTION_ACCEPT; 2.49 new_rule.proto = NETWORK_PROTO_ANY; 2.50 @@ -319,9 +317,8 @@ void print_net_rule_list() 2.51 * Apply the rules to this skbuff and return the vif id that it is bound for. 2.52 * If there is no match, VIF_DROP is returned. 2.53 */ 2.54 - 2.55 -int net_find_rule(u8 nproto, u8 tproto, u32 src_addr, u32 dst_addr, u16 src_port, u16 dst_port, 2.56 - int src_vif) 2.57 +int net_find_rule(u8 nproto, u8 tproto, u32 src_addr, u32 dst_addr, 2.58 + u16 src_port, u16 dst_port, int src_vif) 2.59 { 2.60 net_rule_ent_t *ent; 2.61 int dest = VIF_DROP; 2.62 @@ -330,7 +327,7 @@ int net_find_rule(u8 nproto, u8 tproto, 2.63 2.64 ent = net_rule_list; 2.65 2.66 - while (ent) 2.67 + while ( ent != NULL ) 2.68 { 2.69 if ( ((ent->r.src_interface == src_vif) 2.70 || (ent->r.src_interface == VIF_ANY_INTERFACE)) && 2.71 @@ -351,12 +348,19 @@ int net_find_rule(u8 nproto, u8 tproto, 2.72 (tproto == IPPROTO_UDP))) 2.73 ) 2.74 { 2.75 - break; 2.76 + /* 2.77 + * XXX FFS! We keep going to find the "best" rule. Where best 2.78 + * corresponds to vaguely sane routing of a packet. We need a less 2.79 + * shafted model for aour "virtual firewall/router" methinks! 
2.80 + */ 2.81 + if ( dest < 0 ) 2.82 + dest = ent->r.dst_interface; 2.83 + if ( dest >= 0 ) 2.84 + break; 2.85 } 2.86 ent = ent->next; 2.87 } 2.88 2.89 - if (ent) (dest = ent->r.dst_interface); 2.90 read_unlock(&net_rule_lock); 2.91 return dest; 2.92 } 2.93 @@ -423,6 +427,7 @@ int __net_get_target_vif(u8 *data, unsig 2.94 return target; 2.95 2.96 drop: 2.97 + printk("VIF%d: pkt to drop!\n", src_vif); 2.98 return VIF_DROP; 2.99 } 2.100
3.1 --- a/xen/include/hypervisor-ifs/network.h Thu Apr 17 12:26:14 2003 +0000 3.2 +++ b/xen/include/hypervisor-ifs/network.h Thu Apr 17 17:12:21 2003 +0000 3.3 @@ -14,50 +14,70 @@ 3.4 3.5 #include <linux/types.h> 3.6 3.7 -typedef struct tx_entry_st { 3.8 - unsigned long addr; /* machine address of packet (IN VAR) */ 3.9 - unsigned short size; /* in bytes (IN VAR) */ 3.10 - unsigned char status; /* per descriptor status (OUT VAR) */ 3.11 - unsigned char _unused; 3.12 + 3.13 +typedef struct tx_req_entry_st 3.14 +{ 3.15 + unsigned long id; 3.16 + unsigned long addr; /* machine address of packet */ 3.17 + unsigned short size; /* packet size in bytes */ 3.18 +} tx_req_entry_t; 3.19 + 3.20 +typedef struct tx_resp_entry_st 3.21 +{ 3.22 + unsigned long id; 3.23 + unsigned char status; 3.24 +} tx_resp_entry_t; 3.25 + 3.26 +typedef union tx_entry_st 3.27 +{ 3.28 + tx_req_entry_t req; 3.29 + tx_resp_entry_t resp; 3.30 } tx_entry_t; 3.31 3.32 -typedef struct rx_entry_st { 3.33 - unsigned long addr; /* machine address of PTE to swizzle (IN VAR) */ 3.34 - unsigned short size; /* in bytes (OUT VAR) */ 3.35 - unsigned char status; /* per descriptor status (OUT VAR) */ 3.36 - unsigned char offset; /* offset in page of received pkt (OUT VAR) */ 3.37 + 3.38 +typedef struct rx_req_entry_st 3.39 +{ 3.40 + unsigned long id; 3.41 + unsigned long addr; /* machine address of PTE to swizzle */ 3.42 +} rx_req_entry_t; 3.43 + 3.44 +typedef struct rx_resp_entry_st 3.45 +{ 3.46 + unsigned long id; 3.47 + unsigned short size; /* received packet size in bytes */ 3.48 + unsigned char status; /* per descriptor status */ 3.49 + unsigned char offset; /* offset in page of received pkt */ 3.50 +} rx_resp_entry_t; 3.51 + 3.52 +typedef union rx_entry_st 3.53 +{ 3.54 + rx_req_entry_t req; 3.55 + rx_resp_entry_t resp; 3.56 } rx_entry_t; 3.57 3.58 + 3.59 #define TX_RING_SIZE 256 3.60 #define RX_RING_SIZE 256 3.61 -typedef struct net_ring_st { 3.62 + 3.63 +typedef struct net_ring_st 3.64 +{ 3.65 /* 3.66 - * Guest OS places packets into ring at tx_prod. 3.67 - * Hypervisor removes at tx_cons. 3.68 - * Ring is empty when tx_prod == tx_cons. 3.69 - * Guest OS receives a DOMAIN_EVENT_NET_TX when tx_cons passes tx_event. 3.70 - * Hypervisor may be prodded whenever tx_prod is updated, but this is 3.71 - * only necessary when tx_cons == old_tx_prod (ie. transmitter stalled). 3.72 + * Guest OS places packets into ring at tx_req_prod. 3.73 + * Guest OS receives DOMAIN_EVENT_NET_TX when tx_resp_prod passes tx_event. 3.74 */ 3.75 tx_entry_t *tx_ring; 3.76 - unsigned int tx_prod, tx_cons, tx_event; 3.77 + unsigned int tx_req_prod, tx_resp_prod, tx_event; 3.78 3.79 /* 3.80 - * Guest OS places empty buffers into ring at rx_prod. 3.81 - * Hypervisor fills buffers as rx_cons. 3.82 - * Ring is empty when rx_prod == rx_cons. 3.83 - * Guest OS receives a DOMAIN_EVENT_NET_RX when rx_cons passes rx_event. 3.84 - * Hypervisor may be prodded whenever rx_prod is updated, but this is 3.85 - * only necessary when rx_cons == old_rx_prod (ie. receiver stalled). 3.86 + * Guest OS places empty buffers into ring at rx_req_prod. 3.87 + * Guest OS receives DOMAIN_EVENT_NET_RX when rx_rssp_prod passes rx_event. 3.88 */ 3.89 rx_entry_t *rx_ring; 3.90 - unsigned int rx_prod, rx_cons, rx_event; 3.91 + unsigned int rx_req_prod, rx_resp_prod, rx_event; 3.92 } net_ring_t; 3.93 3.94 -/* Specify base of per-domain array. Get returned free slot in the array. 
*/ 3.95 -/*net_ring_t *create_net_vif(int domain);*/ 3.96 - 3.97 -/* Packet routing/filtering code follows: 3.98 +/* 3.99 + * Packet routing/filtering code follows: 3.100 */ 3.101 3.102 #define NETWORK_ACTION_ACCEPT 0 3.103 @@ -89,7 +109,7 @@ typedef struct net_rule_st 3.104 typedef struct vif_query_st 3.105 { 3.106 unsigned int domain; 3.107 - char *buf; // where to put the reply -- guest virtual address 3.108 + char *buf; /* reply buffer -- guest virtual address */ 3.109 } vif_query_t; 3.110 3.111 /* Network trap operations and associated structure.
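The network.h hunk above is the heart of the change: the single in-order descriptor format is replaced by paired request/response structures that share ring slots. A condensed restatement of the new tx descriptors follows (the rx side is analogous); the field layout is taken straight from the diff, while the comments are editorial.

```c
/* Condensed from the network.h hunk above.  Requests and responses are
 * distinct structures overlaid on the same ring slots via a union, and
 * every request carries an opaque 'id' that Xen copies into the eventual
 * response -- so responses can be returned out of request order and
 * still be matched to the buffer they complete. */
typedef struct tx_req_entry_st
{
    unsigned long  id;      /* opaque token, echoed back in the response */
    unsigned long  addr;    /* machine address of packet                 */
    unsigned short size;    /* packet size in bytes                      */
} tx_req_entry_t;

typedef struct tx_resp_entry_st
{
    unsigned long  id;      /* copied from the matching request          */
    unsigned char  status;  /* per-descriptor status                     */
} tx_resp_entry_t;

typedef union tx_entry_st
{
    tx_req_entry_t  req;    /* written by the guest at tx_req_prod       */
    tx_resp_entry_t resp;   /* written by Xen at tx_resp_prod            */
} tx_entry_t;
```

With this layout the old shared tx_cons/rx_cons indices disappear: each party only ever advances its own producer (tx_req_prod for the guest, tx_resp_prod for Xen), which is what allows the block-device-style out-of-order completion described in the commit message.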
4.1 --- a/xen/include/xeno/sched.h Thu Apr 17 12:26:14 2003 +0000 4.2 +++ b/xen/include/xeno/sched.h Thu Apr 17 17:12:21 2003 +0000 4.3 @@ -50,7 +50,7 @@ extern struct mm_struct init_mm; 4.4 } 4.5 4.6 #define _HYP_EVENT_NEED_RESCHED 0 4.7 -#define _HYP_EVENT_NET_RX 1 4.8 +#define _HYP_EVENT_NET 1 4.9 #define _HYP_EVENT_DIE 2 4.10 4.11 #define PF_DONEFPUINIT 0x1 /* Has the FPU been initialised for this task? */
5.1 --- a/xen/include/xeno/vif.h Thu Apr 17 12:26:14 2003 +0000 5.2 +++ b/xen/include/xeno/vif.h Thu Apr 17 17:12:21 2003 +0000 5.3 @@ -3,7 +3,7 @@ 5.4 * This is the hypervisor end of the network code. The net_ring structure 5.5 * stored in each vif is placed on a shared page to interact with the guest VM. 5.6 * 5.7 - * Copyright (c) 2002, A K Warfield and K A Fraser 5.8 + * Copyright (c) 2002-2003, A K Warfield and K A Fraser 5.9 */ 5.10 5.11 /* virtual network interface struct and associated defines. */ 5.12 @@ -25,45 +25,51 @@ 5.13 * TX_RING_SIZE and RX_RING_SIZE are defined in the shared network.h. 5.14 */ 5.15 5.16 -typedef struct rx_shadow_entry_st { 5.17 +typedef struct rx_shadow_entry_st 5.18 +{ 5.19 + unsigned long id; 5.20 + /* IN vars */ 5.21 unsigned long addr; 5.22 + /* OUT vars */ 5.23 unsigned short size; 5.24 unsigned char status; 5.25 unsigned char offset; 5.26 + /* PRIVATE vars */ 5.27 unsigned long flush_count; 5.28 } rx_shadow_entry_t; 5.29 5.30 -typedef struct tx_shadow_entry_st { 5.31 +typedef struct tx_shadow_entry_st 5.32 +{ 5.33 + unsigned long id; 5.34 + /* IN vars */ 5.35 void *header; 5.36 unsigned long payload; 5.37 unsigned short size; 5.38 + /* OUT vars */ 5.39 unsigned char status; 5.40 - unsigned char _unused; 5.41 } tx_shadow_entry_t; 5.42 5.43 typedef struct net_shadow_ring_st { 5.44 rx_shadow_entry_t *rx_ring; 5.45 + unsigned int rx_prod; /* More buffers for filling go here. */ 5.46 + unsigned int rx_idx; /* Next buffer to fill is here. */ 5.47 + unsigned int rx_cons; /* Next buffer to create response for is here. */ 5.48 + 5.49 tx_shadow_entry_t *tx_ring; 5.50 - 5.51 /* 5.52 - * Private copy of producer. Follows guest OS version, but never 5.53 - * catches up with our consumer index. 5.54 + * These cannot be derived from shared variables, as not all packets 5.55 + * will end up on the shadow ring (eg. locally delivered packets). 5.56 */ 5.57 - unsigned int rx_prod; 5.58 - /* Points at next buffer to be filled by NIC. Chases rx_prod. */ 5.59 - unsigned int rx_idx; 5.60 - /* Points at next buffer to be returned to the guest OS. Chases rx_idx. */ 5.61 - unsigned int rx_cons; 5.62 + unsigned int tx_prod; /* More packets for sending go here. */ 5.63 + unsigned int tx_idx; /* Next packet to send is here. */ 5.64 + unsigned int tx_transmitted_prod; /* Next packet to finish transmission. */ 5.65 + unsigned int tx_cons; /* Next packet to create response for is here. */ 5.66 5.67 - /* 5.68 - * Private copy of producer. Follows guest OS version, but never 5.69 - * catches up with our consumer index. 5.70 - */ 5.71 - unsigned int tx_prod; 5.72 - /* Points at next buffer to be scheduled. Chases tx_prod. */ 5.73 - unsigned int tx_idx; 5.74 - /* Points at next buffer to be returned to the guest OS. Chases tx_idx. */ 5.75 - unsigned int tx_cons; 5.76 + /* Indexes into shared ring. */ 5.77 + unsigned int rx_req_cons; 5.78 + unsigned int rx_resp_prod; /* private version of shared variable */ 5.79 + unsigned int tx_req_cons; 5.80 + unsigned int tx_resp_prod; /* private version of shared variable */ 5.81 } net_shadow_ring_t; 5.82 5.83 typedef struct net_vif_st {
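All of the shadow-ring and shared-ring indices above wrap using power-of-two masking rather than modulo arithmetic; the ADD macros appear at the top of the dev.c hunk that follows. A minimal illustration, with the one-step INC form written out as an assumption (its definition is not part of this changeset's context lines):

```c
#define TX_RING_SIZE 256   /* from network.h above; must be a power of two */
#define RX_RING_SIZE 256

/* ADD macros as they appear in the dev.c hunk below. */
#define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
#define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))

/* Assumed: the INC forms used throughout the diff are the one-step case. */
#define TX_RING_INC(_i) TX_RING_ADD(_i, 1)
#define RX_RING_INC(_i) RX_RING_ADD(_i, 1)

/* The do_net_update() loops below use the same masking to stop one slot
 * short of the response producer, e.g.:
 *     ((shadow_ring->tx_resp_prod - i) & (TX_RING_SIZE-1)) != 1
 * so the number of outstanding, unresponded requests can never wrap the ring. */
```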
6.1 --- a/xen/net/dev.c Thu Apr 17 12:26:14 2003 +0000 6.2 +++ b/xen/net/dev.c Thu Apr 17 17:12:21 2003 +0000 6.3 @@ -49,6 +49,15 @@ 6.4 #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1)) 6.5 #define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1)) 6.6 6.7 +static void make_tx_response(net_vif_t *vif, 6.8 + unsigned long id, 6.9 + unsigned char st); 6.10 +static void make_rx_response(net_vif_t *vif, 6.11 + unsigned long id, 6.12 + unsigned short size, 6.13 + unsigned char st, 6.14 + unsigned char off); 6.15 + 6.16 struct net_device *the_dev = NULL; 6.17 6.18 /* 6.19 @@ -482,6 +491,49 @@ illegal_highdma(struct net_device *dev, 6.20 6.21 struct netif_rx_stats netdev_rx_stat[NR_CPUS]; 6.22 6.23 +/* 6.24 + * update_shared_ring(void) 6.25 + * 6.26 + * This replaces flush_rx_queue as the guest event handler to move packets 6.27 + * queued in the guest ring up to the guest. Really, the packet is already 6.28 + * there, it was page flipped in deliver_packet, but this moves the ring 6.29 + * descriptor across from the shadow ring and increments the pointers. 6.30 + */ 6.31 +void update_shared_ring(void) 6.32 +{ 6.33 + rx_shadow_entry_t *rx; 6.34 + tx_shadow_entry_t *tx; 6.35 + net_ring_t *net_ring; 6.36 + net_shadow_ring_t *shadow_ring; 6.37 + net_vif_t *vif; 6.38 + struct list_head *ent; 6.39 + 6.40 + clear_bit(_HYP_EVENT_NET, ¤t->hyp_events); 6.41 + 6.42 + list_for_each(ent, ¤t->net_vifs) 6.43 + { 6.44 + vif = list_entry(ent, net_vif_t, dom_list); 6.45 + net_ring = vif->net_ring; 6.46 + shadow_ring = vif->shadow_ring; 6.47 + 6.48 + while ( shadow_ring->rx_cons != shadow_ring->rx_idx ) 6.49 + { 6.50 + rx = shadow_ring->rx_ring + shadow_ring->rx_cons; 6.51 + if ( rx->flush_count == tlb_flush_count[smp_processor_id()] ) 6.52 + __flush_tlb(); 6.53 + shadow_ring->rx_cons = RX_RING_INC(shadow_ring->rx_cons); 6.54 + make_rx_response(vif, rx->id, rx->size, rx->status, rx->offset); 6.55 + } 6.56 + 6.57 + while ( shadow_ring->tx_cons != shadow_ring->tx_transmitted_prod ) 6.58 + { 6.59 + tx = shadow_ring->tx_ring + shadow_ring->tx_cons; 6.60 + shadow_ring->tx_cons = RX_RING_INC(shadow_ring->tx_cons); 6.61 + make_tx_response(vif, tx->id, tx->status); 6.62 + } 6.63 + } 6.64 +} 6.65 + 6.66 void deliver_packet(struct sk_buff *skb, net_vif_t *vif) 6.67 { 6.68 net_shadow_ring_t *shadow_ring; 6.69 @@ -489,7 +541,6 @@ void deliver_packet(struct sk_buff *skb, 6.70 unsigned long *g_pte; 6.71 struct pfn_info *g_pfn, *h_pfn; 6.72 unsigned int i; 6.73 - unsigned long flags; 6.74 6.75 memset(skb->mac.ethernet->h_dest, 0, ETH_ALEN); 6.76 if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP ) 6.77 @@ -501,17 +552,13 @@ void deliver_packet(struct sk_buff *skb, 6.78 6.79 rx = shadow_ring->rx_ring + i; 6.80 6.81 - if ( rx->status != RING_STATUS_OK ) 6.82 - { 6.83 - DPRINTK("Bad buffer in deliver_packet()\n"); 6.84 - goto inc_and_out; 6.85 - } 6.86 + ASSERT(rx->status == RING_STATUS_OK); 6.87 + ASSERT(skb->len <= PAGE_SIZE); 6.88 6.89 - ASSERT(skb->len <= PAGE_SIZE); 6.90 rx->size = skb->len; 6.91 rx->offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK); 6.92 6.93 - spin_lock_irqsave(&vif->domain->page_lock, flags); 6.94 + spin_lock(&vif->domain->page_lock); 6.95 6.96 g_pte = map_domain_mem(rx->addr); 6.97 6.98 @@ -541,12 +588,11 @@ void deliver_packet(struct sk_buff *skb, 6.99 list_del(&g_pfn->list); 6.100 list_add(&h_pfn->list, &vif->domain->pg_head); 6.101 6.102 - spin_unlock_irqrestore(&vif->domain->page_lock, flags); 6.103 + spin_unlock(&vif->domain->page_lock); 6.104 6.105 /* Our skbuff now points 
at the guest's old frame. */ 6.106 skb->pf = g_pfn; 6.107 6.108 - inc_and_out: 6.109 smp_wmb(); /* updates must happen before releasing the descriptor. */ 6.110 shadow_ring->rx_idx = RX_RING_INC(i); 6.111 } 6.112 @@ -595,11 +641,11 @@ int netif_rx(struct sk_buff *skb) 6.113 if ( skb->dst_vif == VIF_UNKNOWN_INTERFACE ) 6.114 skb->dst_vif = __net_get_target_vif(skb->data, skb->len, skb->src_vif); 6.115 6.116 - read_lock_irqsave(&sys_vif_lock, flags); 6.117 + read_lock(&sys_vif_lock); 6.118 if ( (skb->dst_vif <= VIF_PHYSICAL_INTERFACE) || 6.119 ((vif = sys_vif_list[skb->dst_vif]) == NULL) ) 6.120 { 6.121 - read_unlock_irqrestore(&sys_vif_lock, flags); 6.122 + read_unlock(&sys_vif_lock); 6.123 netdev_rx_stat[this_cpu].dropped++; 6.124 unmap_domain_mem(skb->head); 6.125 kfree_skb(skb); 6.126 @@ -608,10 +654,10 @@ int netif_rx(struct sk_buff *skb) 6.127 } 6.128 6.129 get_vif(vif); 6.130 - read_unlock_irqrestore(&sys_vif_lock, flags); 6.131 + read_unlock(&sys_vif_lock); 6.132 6.133 deliver_packet(skb, vif); 6.134 - cpu_mask = mark_hyp_event(vif->domain, _HYP_EVENT_NET_RX); 6.135 + cpu_mask = mark_hyp_event(vif->domain, _HYP_EVENT_NET); 6.136 put_vif(vif); 6.137 6.138 unmap_domain_mem(skb->head); 6.139 @@ -676,10 +722,8 @@ static void add_to_net_schedule_list_tai 6.140 /* Destructor function for tx skbs. */ 6.141 static void tx_skb_release(struct sk_buff *skb) 6.142 { 6.143 - int i, send = 0; 6.144 + int i; 6.145 net_vif_t *vif = sys_vif_list[skb->src_vif]; 6.146 - unsigned int idx; 6.147 - tx_shadow_entry_t *tx; 6.148 unsigned long cpu_mask, flags; 6.149 6.150 spin_lock_irqsave(&vif->domain->page_lock, flags); 6.151 @@ -692,51 +736,10 @@ static void tx_skb_release(struct sk_buf 6.152 6.153 skb_shinfo(skb)->nr_frags = 0; 6.154 6.155 - /* This would mean that the guest OS has fiddled with our index. */ 6.156 - if ( vif->shadow_ring->tx_cons != vif->net_ring->tx_cons ) 6.157 - DPRINTK("Shadow and shared rings out of sync (%d/%d)\n", 6.158 - vif->shadow_ring->tx_cons, vif->net_ring->tx_cons); 6.159 - 6.160 - /* 6.161 - * XXX This assumes that, per vif, SKBs are processed in-order! 6.162 - * Also assumes no concurrency. This is safe because each vif 6.163 - * maps to one NIC. This is executed in NIC interrupt code, so we have 6.164 - * mutual exclusion from do_IRQ(). 6.165 - */ 6.166 - 6.167 - smp_wmb(); /* make sure any status updates occur before inc'ing tx_cons. */ 6.168 - 6.169 - /* Skip over a sequence of bad descriptors, plus the first good one. */ 6.170 - do { 6.171 - idx = vif->shadow_ring->tx_cons; 6.172 - /* There must be at least one good descriptor outstanding. */ 6.173 - if ( idx == vif->shadow_ring->tx_idx ) BUG(); 6.174 - tx = &vif->shadow_ring->tx_ring[idx]; 6.175 - vif->shadow_ring->tx_cons = TX_RING_INC(idx); 6.176 - if ( vif->shadow_ring->tx_cons == vif->net_ring->tx_event ) send = 1; 6.177 - } while ( tx->status != RING_STATUS_OK ); 6.178 - 6.179 - /* Now skip over any more bad descriptors, up to the next good one. */ 6.180 - do { 6.181 - idx = vif->shadow_ring->tx_cons; 6.182 - tx = &vif->shadow_ring->tx_ring[idx]; 6.183 - /* Carry on until we find a good descriptor, or reach scheduler idx. */ 6.184 - if ( (idx == vif->shadow_ring->tx_idx) || 6.185 - (tx->status == RING_STATUS_OK) ) 6.186 - break; 6.187 - vif->shadow_ring->tx_cons = TX_RING_INC(idx); 6.188 - if ( vif->shadow_ring->tx_cons == vif->net_ring->tx_event ) send = 1; 6.189 - } while ( 1 ); 6.190 - 6.191 - /* Update shared consumer index to the new private value. 
*/ 6.192 - vif->net_ring->tx_cons = vif->shadow_ring->tx_cons; 6.193 - 6.194 - /* Send a transmit event if requested. */ 6.195 - if ( send ) 6.196 - { 6.197 - cpu_mask = mark_guest_event(vif->domain, _EVENT_NET_TX); 6.198 - guest_event_notify(cpu_mask); 6.199 - } 6.200 + vif->shadow_ring->tx_transmitted_prod = 6.201 + TX_RING_INC(vif->shadow_ring->tx_transmitted_prod); 6.202 + cpu_mask = mark_hyp_event(vif->domain, _HYP_EVENT_NET); 6.203 + hyp_event_notify(cpu_mask); 6.204 6.205 put_vif(vif); 6.206 } 6.207 @@ -765,27 +768,22 @@ static void net_tx_action(unsigned long 6.208 continue; 6.209 } 6.210 6.211 + if ( (skb = alloc_skb_nodata(GFP_ATOMIC)) == NULL ) 6.212 + { 6.213 + printk("Out of memory in net_tx_action()!\n"); 6.214 + add_to_net_schedule_list_tail(vif); 6.215 + put_vif(vif); 6.216 + break; 6.217 + } 6.218 + 6.219 /* Pick an entry from the transmit queue. */ 6.220 tx = &vif->shadow_ring->tx_ring[vif->shadow_ring->tx_idx]; 6.221 vif->shadow_ring->tx_idx = TX_RING_INC(vif->shadow_ring->tx_idx); 6.222 if ( vif->shadow_ring->tx_idx != vif->shadow_ring->tx_prod ) 6.223 add_to_net_schedule_list_tail(vif); 6.224 6.225 - /* Check the chosen entry is good. */ 6.226 - if ( tx->status != RING_STATUS_OK ) 6.227 - { 6.228 - put_vif(vif); 6.229 - continue; 6.230 - } 6.231 + ASSERT(tx->status == RING_STATUS_OK); 6.232 6.233 - if ( (skb = alloc_skb_nodata(GFP_ATOMIC)) == NULL ) 6.234 - { 6.235 - printk("Out of memory in net_tx_action()!\n"); 6.236 - tx->status = RING_STATUS_BAD_PAGE; 6.237 - put_vif(vif); 6.238 - break; 6.239 - } 6.240 - 6.241 skb->destructor = tx_skb_release; 6.242 6.243 skb->head = skb->data = tx->header; 6.244 @@ -828,57 +826,6 @@ static inline void maybe_schedule_tx_act 6.245 6.246 6.247 /* 6.248 - * update_shared_ring(void) 6.249 - * 6.250 - * This replaces flush_rx_queue as the guest event handler to move packets 6.251 - * queued in the guest ring up to the guest. Really, the packet is already 6.252 - * there, it was page flipped in deliver_packet, but this moves the ring 6.253 - * descriptor across from the shadow ring and increments the pointers. 6.254 - */ 6.255 - 6.256 -void update_shared_ring(void) 6.257 -{ 6.258 - rx_shadow_entry_t *rx; 6.259 - shared_info_t *s = current->shared_info; 6.260 - net_ring_t *net_ring; 6.261 - net_shadow_ring_t *shadow_ring; 6.262 - net_vif_t *vif; 6.263 - struct list_head *ent; 6.264 - 6.265 - clear_bit(_HYP_EVENT_NET_RX, ¤t->hyp_events); 6.266 - 6.267 - list_for_each(ent, ¤t->net_vifs) 6.268 - { 6.269 - vif = list_entry(ent, net_vif_t, dom_list); 6.270 - net_ring = vif->net_ring; 6.271 - shadow_ring = vif->shadow_ring; 6.272 - 6.273 - /* This would mean that the guest OS has fiddled with our index. 
*/ 6.274 - if ( shadow_ring->rx_cons != net_ring->rx_cons ) 6.275 - DPRINTK("Shadow and shared rings out of sync (%d/%d)\n", 6.276 - shadow_ring->rx_cons, net_ring->rx_cons); 6.277 - 6.278 - while ( shadow_ring->rx_cons != shadow_ring->rx_idx ) 6.279 - { 6.280 - rx = shadow_ring->rx_ring + shadow_ring->rx_cons; 6.281 - copy_to_user(net_ring->rx_ring + shadow_ring->rx_cons, rx, 6.282 - sizeof(rx_entry_t)); 6.283 - 6.284 - if ( rx->flush_count == tlb_flush_count[smp_processor_id()] ) 6.285 - __flush_tlb(); 6.286 - 6.287 - smp_wmb(); /* copy descriptor before inc'ing rx_cons */ 6.288 - shadow_ring->rx_cons = RX_RING_INC(shadow_ring->rx_cons); 6.289 - 6.290 - if ( shadow_ring->rx_cons == net_ring->rx_event ) 6.291 - set_bit(_EVENT_NET_RX, &s->events); 6.292 - } 6.293 - net_ring->rx_cons = shadow_ring->rx_cons; 6.294 - } 6.295 -} 6.296 - 6.297 - 6.298 -/* 6.299 * We need this ioctl for efficient implementation of the 6.300 * if_indextoname() function required by the IPv6 API. Without 6.301 * it, we would have to search all the interfaces to find a 6.302 @@ -1847,10 +1794,10 @@ long do_net_update(void) 6.303 net_ring_t *net_ring; 6.304 net_shadow_ring_t *shadow_ring; 6.305 net_vif_t *current_vif; 6.306 - unsigned int i; 6.307 + unsigned int i, j; 6.308 struct sk_buff *skb; 6.309 - tx_entry_t tx; 6.310 - rx_shadow_entry_t *rx; 6.311 + tx_req_entry_t tx; 6.312 + rx_req_entry_t rx; 6.313 unsigned long pfn; 6.314 struct pfn_info *page; 6.315 unsigned long *g_pte; 6.316 @@ -1873,31 +1820,32 @@ long do_net_update(void) 6.317 * new producer index, but take care not to catch up with our own 6.318 * consumer index. 6.319 */ 6.320 - for ( i = shadow_ring->tx_prod; 6.321 - (i != net_ring->tx_prod) && 6.322 - (((shadow_ring->tx_cons-i) & (TX_RING_SIZE-1)) != 1); 6.323 + j = shadow_ring->tx_prod; 6.324 + for ( i = shadow_ring->tx_req_cons; 6.325 + (i != net_ring->tx_req_prod) && 6.326 + (((shadow_ring->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1); 6.327 i = TX_RING_INC(i) ) 6.328 { 6.329 - if ( copy_from_user(&tx, net_ring->tx_ring+i, sizeof(tx)) ) 6.330 + if ( copy_from_user(&tx, &net_ring->tx_ring[i].req, sizeof(tx)) ) 6.331 { 6.332 DPRINTK("Bad copy_from_user for tx net descriptor\n"); 6.333 - shadow_ring->tx_ring[i].status = RING_STATUS_ERR_CFU; 6.334 + make_tx_response(current_vif, tx.id, RING_STATUS_ERR_CFU); 6.335 continue; 6.336 } 6.337 6.338 - shadow_ring->tx_ring[i].size = tx.size; 6.339 - shadow_ring->tx_ring[i].status = RING_STATUS_BAD_PAGE; 6.340 - 6.341 - if ( tx.size < PKT_PROT_LEN ) 6.342 + if ( (tx.size < PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) ) 6.343 { 6.344 - DPRINTK("Runt packet %d\n", tx.size); 6.345 + DPRINTK("Bad packet size: %d\n", tx.size); 6.346 + make_tx_response(current_vif, tx.id, RING_STATUS_BAD_PAGE); 6.347 continue; 6.348 } 6.349 6.350 + /* No crossing a page boundary as the payload mustn't fragment. 
*/ 6.351 if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE ) 6.352 { 6.353 DPRINTK("tx.addr: %lx, size: %u, end: %lu\n", 6.354 tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size); 6.355 + make_tx_response(current_vif, tx.id, RING_STATUS_BAD_PAGE); 6.356 continue; 6.357 } 6.358 6.359 @@ -1909,6 +1857,7 @@ long do_net_update(void) 6.360 { 6.361 DPRINTK("Bad page frame\n"); 6.362 spin_unlock_irq(¤t->page_lock); 6.363 + make_tx_response(current_vif, tx.id, RING_STATUS_BAD_PAGE); 6.364 continue; 6.365 } 6.366 6.367 @@ -1917,45 +1866,61 @@ long do_net_update(void) 6.368 protocol = __constant_htons( 6.369 init_tx_header(g_data, tx.size, the_dev)); 6.370 if ( protocol == 0 ) 6.371 + { 6.372 + make_tx_response(current_vif, tx.id, RING_STATUS_BAD_PAGE); 6.373 goto tx_unmap_and_continue; 6.374 + } 6.375 6.376 target = __net_get_target_vif(g_data, tx.size, current_vif->id); 6.377 6.378 if ( target > VIF_PHYSICAL_INTERFACE ) 6.379 { 6.380 /* Local delivery */ 6.381 - if ( (skb = dev_alloc_skb(tx.size)) == NULL ) 6.382 + if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL ) 6.383 + { 6.384 + make_tx_response(current_vif, tx.id, RING_STATUS_BAD_PAGE); 6.385 goto tx_unmap_and_continue; 6.386 - 6.387 - skb->destructor = tx_skb_release; 6.388 - get_vif(current_vif); 6.389 - 6.390 - shadow_ring->tx_ring[i].status = RING_STATUS_OK; 6.391 + } 6.392 6.393 skb->src_vif = current_vif->id; 6.394 skb->dst_vif = target; 6.395 - skb->protocol = protocol; 6.396 - 6.397 + skb->protocol = protocol; 6.398 + 6.399 + /* 6.400 + * We don't need a well-formed skb as netif_rx will fill these 6.401 + * fields in as necessary. All we actually need is the right 6.402 + * page offset in skb->data, and the right length in skb->len. 6.403 + * Note that the correct address/length *excludes* link header. 
6.404 + */ 6.405 skb->head = (u8 *)map_domain_mem( 6.406 ((skb->pf - frame_table) << PAGE_SHIFT)); 6.407 - skb->data = skb->head + 16; 6.408 - skb_reserve(skb,2); 6.409 + skb->data = skb->head + 18; 6.410 memcpy(skb->data, g_data, tx.size); 6.411 - skb->len = tx.size; 6.412 + skb->data += ETH_HLEN; 6.413 + skb->len = tx.size - ETH_HLEN; 6.414 unmap_domain_mem(skb->head); 6.415 - skb->data += ETH_HLEN; 6.416 + 6.417 (void)netif_rx(skb); 6.418 + 6.419 + make_tx_response(current_vif, tx.id, RING_STATUS_OK); 6.420 } 6.421 else if ( target == VIF_PHYSICAL_INTERFACE ) 6.422 { 6.423 - shadow_ring->tx_ring[i].header = 6.424 + shadow_ring->tx_ring[j].id = tx.id; 6.425 + shadow_ring->tx_ring[j].size = tx.size; 6.426 + shadow_ring->tx_ring[j].status = RING_STATUS_OK; 6.427 + shadow_ring->tx_ring[j].header = 6.428 kmem_cache_alloc(net_header_cachep, GFP_KERNEL); 6.429 - if ( shadow_ring->tx_ring[i].header == NULL ) 6.430 + if ( shadow_ring->tx_ring[j].header == NULL ) 6.431 + { 6.432 + make_tx_response(current_vif, tx.id, RING_STATUS_OK); 6.433 goto tx_unmap_and_continue; 6.434 - memcpy(shadow_ring->tx_ring[i].header, g_data, PKT_PROT_LEN); 6.435 - shadow_ring->tx_ring[i].payload = tx.addr + PKT_PROT_LEN; 6.436 - shadow_ring->tx_ring[i].status = RING_STATUS_OK; 6.437 + } 6.438 + 6.439 + memcpy(shadow_ring->tx_ring[j].header, g_data, PKT_PROT_LEN); 6.440 + shadow_ring->tx_ring[j].payload = tx.addr + PKT_PROT_LEN; 6.441 get_page_tot(page); 6.442 + j = TX_RING_INC(j); 6.443 } 6.444 6.445 tx_unmap_and_continue: 6.446 @@ -1963,10 +1928,12 @@ long do_net_update(void) 6.447 spin_unlock_irq(¤t->page_lock); 6.448 } 6.449 6.450 - if ( shadow_ring->tx_prod != i ) 6.451 + shadow_ring->tx_req_cons = i; 6.452 + 6.453 + if ( shadow_ring->tx_prod != j ) 6.454 { 6.455 smp_mb(); /* Let other CPUs see new descriptors first. */ 6.456 - shadow_ring->tx_prod = i; 6.457 + shadow_ring->tx_prod = j; 6.458 add_to_net_schedule_list_tail(current_vif); 6.459 maybe_schedule_tx_action(); 6.460 } 6.461 @@ -1980,29 +1947,23 @@ long do_net_update(void) 6.462 * new producer index, but take care not to catch up with our own 6.463 * consumer index. 6.464 */ 6.465 - for ( i = shadow_ring->rx_prod; 6.466 - (i != net_ring->rx_prod) && 6.467 - (((shadow_ring->rx_cons-i) & (RX_RING_SIZE-1)) != 1); 6.468 + j = shadow_ring->rx_prod; 6.469 + for ( i = shadow_ring->rx_req_cons; 6.470 + (i != net_ring->rx_req_prod) && 6.471 + (((shadow_ring->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1); 6.472 i = RX_RING_INC(i) ) 6.473 { 6.474 - /* 6.475 - * This copy assumes that rx_shadow_entry_t is an extension of 6.476 - * rx_net_entry_t extra fields must be tacked on to the end. 
6.477 - */ 6.478 - if ( copy_from_user(shadow_ring->rx_ring+i, net_ring->rx_ring+i, 6.479 - sizeof (rx_entry_t) ) ) 6.480 + if ( copy_from_user(&rx, &net_ring->rx_ring[i].req, sizeof(rx)) ) 6.481 { 6.482 - DPRINTK("Bad copy_from_user for rx ring\n"); 6.483 - shadow_ring->rx_ring[i].status = RING_STATUS_ERR_CFU; 6.484 + DPRINTK("Bad copy_from_user for rx net descriptor\n"); 6.485 + make_rx_response(current_vif, 6.486 + rx.id, 0, RING_STATUS_ERR_CFU, 0); 6.487 continue; 6.488 - } 6.489 + } 6.490 6.491 - rx = shadow_ring->rx_ring + i; 6.492 - pfn = rx->addr >> PAGE_SHIFT; 6.493 + pfn = rx.addr >> PAGE_SHIFT; 6.494 page = frame_table + pfn; 6.495 6.496 - shadow_ring->rx_ring[i].status = RING_STATUS_BAD_PAGE; 6.497 - 6.498 spin_lock_irq(¤t->page_lock); 6.499 if ( (pfn >= max_page) || 6.500 (page->flags != (PGT_l1_page_table | current->domain)) ) 6.501 @@ -2010,14 +1971,18 @@ long do_net_update(void) 6.502 DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n", 6.503 current->domain, pfn, max_page, page->flags); 6.504 spin_unlock_irq(¤t->page_lock); 6.505 + make_rx_response(current_vif, 6.506 + rx.id, 0, RING_STATUS_BAD_PAGE, 0); 6.507 continue; 6.508 } 6.509 6.510 - g_pte = map_domain_mem(rx->addr); 6.511 + g_pte = map_domain_mem(rx.addr); 6.512 6.513 if ( !(*g_pte & _PAGE_PRESENT) ) 6.514 { 6.515 - DPRINTK("Inavlid PTE passed down (not present)\n"); 6.516 + DPRINTK("Invalid PTE passed down (not present)\n"); 6.517 + make_rx_response(current_vif, 6.518 + rx.id, 0, RING_STATUS_BAD_PAGE, 0); 6.519 goto rx_unmap_and_continue; 6.520 } 6.521 6.522 @@ -2027,25 +1992,32 @@ long do_net_update(void) 6.523 { 6.524 DPRINTK("RX page mapped multple times (%d/%d/%08x)\n", 6.525 page->type_count, page->tot_count, page->flags); 6.526 - 6.527 + make_rx_response(current_vif, 6.528 + rx.id, 0, RING_STATUS_BAD_PAGE, 0); 6.529 goto rx_unmap_and_continue; 6.530 } 6.531 6.532 /* The pte they passed was good, so take it away from them. */ 6.533 - shadow_ring->rx_ring[i].status = RING_STATUS_OK; 6.534 *g_pte &= ~_PAGE_PRESENT; 6.535 page->flags = (page->flags & ~PG_type_mask) | PGT_net_rx_buf; 6.536 - rx->flush_count = tlb_flush_count[smp_processor_id()]; 6.537 + shadow_ring->rx_ring[j].id = rx.id; 6.538 + shadow_ring->rx_ring[j].addr = rx.addr; 6.539 + shadow_ring->rx_ring[j].status = RING_STATUS_OK; 6.540 + shadow_ring->rx_ring[j].flush_count = 6.541 + tlb_flush_count[smp_processor_id()]; 6.542 + j = RX_RING_INC(j); 6.543 6.544 rx_unmap_and_continue: 6.545 unmap_domain_mem(g_pte); 6.546 spin_unlock_irq(¤t->page_lock); 6.547 } 6.548 6.549 - if ( shadow_ring->rx_prod != i ) 6.550 + shadow_ring->rx_req_cons = i; 6.551 + 6.552 + if ( shadow_ring->rx_prod != j ) 6.553 { 6.554 smp_mb(); /* Let other CPUs see new descriptors first. */ 6.555 - shadow_ring->rx_prod = i; 6.556 + shadow_ring->rx_prod = j; 6.557 } 6.558 } 6.559 6.560 @@ -2053,6 +2025,58 @@ long do_net_update(void) 6.561 } 6.562 6.563 6.564 +static void make_tx_response(net_vif_t *vif, 6.565 + unsigned long id, 6.566 + unsigned char st) 6.567 +{ 6.568 + unsigned long flags; 6.569 + net_shadow_ring_t *shadow = vif->shadow_ring; 6.570 + unsigned int pos; 6.571 + tx_resp_entry_t *resp, privresp; 6.572 + 6.573 + /* Place on the response ring for the relevant domain. 
*/ 6.574 + local_irq_save(flags); 6.575 + pos = shadow->tx_resp_prod; 6.576 + resp = &vif->net_ring->tx_ring[pos].resp; 6.577 + privresp.id = id; 6.578 + privresp.status = st; 6.579 + copy_to_user(resp, &privresp, sizeof(privresp)); 6.580 + pos = TX_RING_INC(pos); 6.581 + shadow->tx_resp_prod = vif->net_ring->tx_resp_prod = pos; 6.582 + if ( pos == vif->net_ring->rx_event ) 6.583 + set_bit(_EVENT_NET_TX, ¤t->shared_info->events); 6.584 + local_irq_restore(flags); 6.585 +} 6.586 + 6.587 + 6.588 +static void make_rx_response(net_vif_t *vif, 6.589 + unsigned long id, 6.590 + unsigned short size, 6.591 + unsigned char st, 6.592 + unsigned char off) 6.593 +{ 6.594 + unsigned long flags; 6.595 + net_shadow_ring_t *shadow = vif->shadow_ring; 6.596 + unsigned int pos; 6.597 + rx_resp_entry_t *resp, privresp; 6.598 + 6.599 + /* Place on the response ring for the relevant domain. */ 6.600 + local_irq_save(flags); 6.601 + pos = shadow->rx_resp_prod; 6.602 + resp = &vif->net_ring->rx_ring[pos].resp; 6.603 + privresp.id = id; 6.604 + privresp.size = size; 6.605 + privresp.status = st; 6.606 + privresp.offset = off; 6.607 + copy_to_user(resp, &privresp, sizeof(privresp)); 6.608 + pos = RX_RING_INC(pos); 6.609 + shadow->rx_resp_prod = vif->net_ring->rx_resp_prod = pos; 6.610 + if ( pos == vif->net_ring->rx_event ) 6.611 + set_bit(_EVENT_NET_RX, ¤t->shared_info->events); 6.612 + local_irq_restore(flags); 6.613 +} 6.614 + 6.615 + 6.616 int setup_network_devices(void) 6.617 { 6.618 int ret;
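The new make_tx_response()/make_rx_response() helpers above centralise descriptor completion. The essential pattern is outlined below as a hypothetical complete_tx(); the real helpers additionally go through copy_to_user() for the shared slot, run with interrupts disabled, and carry size/offset fields in the rx case.

```c
/* Outline only (hypothetical name): write the response into the shared
 * slot at the private response producer, publish the new producer value
 * to the guest, and raise an event only when the producer reaches the
 * index the guest asked to be notified at. */
static void complete_tx(net_vif_t *vif, unsigned long id, unsigned char st)
{
    unsigned int pos = vif->shadow_ring->tx_resp_prod;

    vif->net_ring->tx_ring[pos].resp.id     = id;  /* echo the request id */
    vif->net_ring->tx_ring[pos].resp.status = st;

    pos = TX_RING_INC(pos);
    vif->shadow_ring->tx_resp_prod = vif->net_ring->tx_resp_prod = pos;

    if ( pos == vif->net_ring->tx_event )          /* guest-chosen notify point */
        set_bit(_EVENT_NET_TX, &current->shared_info->events);
}
```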
7.1 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/network/network.c Thu Apr 17 12:26:14 2003 +0000 7.2 +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/network/network.c Thu Apr 17 17:12:21 2003 +0000 7.3 @@ -3,7 +3,7 @@ 7.4 * 7.5 * Virtual network driver for XenoLinux. 7.6 * 7.7 - * Copyright (c) 2002, K A Fraser 7.8 + * Copyright (c) 2002-2003, K A Fraser 7.9 */ 7.10 7.11 #include <linux/config.h> 7.12 @@ -47,21 +47,14 @@ static void cleanup_module(void); 7.13 7.14 static struct list_head dev_list; 7.15 7.16 -/* 7.17 - * RX RING: RX_IDX <= rx_cons <= rx_prod 7.18 - * TX RING: TX_IDX <= tx_cons <= tx_prod 7.19 - * (*_IDX allocated privately here, *_cons & *_prod shared with hypervisor) 7.20 - */ 7.21 struct net_private 7.22 { 7.23 struct list_head list; 7.24 struct net_device *dev; 7.25 7.26 struct net_device_stats stats; 7.27 - struct sk_buff **tx_skb_ring; 7.28 - struct sk_buff **rx_skb_ring; 7.29 atomic_t tx_entries; 7.30 - unsigned int rx_idx, tx_idx, tx_full; 7.31 + unsigned int rx_resp_cons, tx_resp_cons, tx_full; 7.32 net_ring_t *net_ring; 7.33 spinlock_t tx_lock; 7.34 }; 7.35 @@ -71,10 +64,10 @@ static void dbg_network_int(int irq, voi 7.36 { 7.37 struct net_device *dev = (struct net_device *)dev_id; 7.38 struct net_private *np = dev->priv; 7.39 - printk(KERN_ALERT "tx_full = %d, tx_entries = %d, tx_idx = %d," 7.40 - " tx_cons = %d, tx_prod = %d, tx_event = %d, state=%d\n", 7.41 - np->tx_full, atomic_read(&np->tx_entries), np->tx_idx, 7.42 - np->net_ring->tx_cons, np->net_ring->tx_prod, 7.43 + printk(KERN_ALERT "tx_full = %d, tx_entries = %d, tx_resp_cons = %d," 7.44 + " tx_req_prod = %d, tx_resp_prod = %d, tx_event = %d, state=%d\n", 7.45 + np->tx_full, atomic_read(&np->tx_entries), np->tx_resp_cons, 7.46 + np->net_ring->tx_req_prod, np->net_ring->tx_resp_prod, 7.47 np->net_ring->tx_event, 7.48 test_bit(__LINK_STATE_XOFF, &dev->state)); 7.49 } 7.50 @@ -85,29 +78,17 @@ static int network_open(struct net_devic 7.51 struct net_private *np = dev->priv; 7.52 int error = 0; 7.53 7.54 - np->rx_idx = np->tx_idx = np->tx_full = 0; 7.55 - 7.56 + np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; 7.57 memset(&np->stats, 0, sizeof(np->stats)); 7.58 - 7.59 spin_lock_init(&np->tx_lock); 7.60 - 7.61 atomic_set(&np->tx_entries, 0); 7.62 + memset(np->net_ring, 0, sizeof(*np->net_ring)); 7.63 7.64 - np->net_ring->tx_prod = np->net_ring->tx_cons = np->net_ring->tx_event = 0; 7.65 - np->net_ring->rx_prod = np->net_ring->rx_cons = np->net_ring->rx_event = 0; 7.66 - np->net_ring->tx_ring = NULL; 7.67 - np->net_ring->rx_ring = NULL; 7.68 - 7.69 - np->tx_skb_ring = kmalloc(TX_RING_SIZE * sizeof(struct sk_buff *), 7.70 - GFP_KERNEL); 7.71 - np->rx_skb_ring = kmalloc(RX_RING_SIZE * sizeof(struct sk_buff *), 7.72 - GFP_KERNEL); 7.73 np->net_ring->tx_ring = kmalloc(TX_RING_SIZE * sizeof(tx_entry_t), 7.74 GFP_KERNEL); 7.75 np->net_ring->rx_ring = kmalloc(RX_RING_SIZE * sizeof(rx_entry_t), 7.76 GFP_KERNEL); 7.77 - if ( (np->tx_skb_ring == NULL) || (np->rx_skb_ring == NULL) || 7.78 - (np->net_ring->tx_ring == NULL) || (np->net_ring->rx_ring == NULL) ) 7.79 + if ( (np->net_ring->tx_ring == NULL) || (np->net_ring->rx_ring == NULL) ) 7.80 { 7.81 printk(KERN_WARNING "%s; Could not allocate ring memory\n", dev->name); 7.82 error = -ENOBUFS; 7.83 @@ -156,8 +137,6 @@ static int network_open(struct net_devic 7.84 fail: 7.85 if ( np->net_ring->rx_ring ) kfree(np->net_ring->rx_ring); 7.86 if ( np->net_ring->tx_ring ) kfree(np->net_ring->tx_ring); 7.87 - if ( np->rx_skb_ring ) kfree(np->rx_skb_ring); 7.88 
- if ( np->tx_skb_ring ) kfree(np->tx_skb_ring); 7.89 kfree(np); 7.90 return error; 7.91 } 7.92 @@ -169,28 +148,29 @@ static void network_tx_buf_gc(struct net 7.93 struct net_private *np = dev->priv; 7.94 struct sk_buff *skb; 7.95 unsigned long flags; 7.96 - unsigned int cons; 7.97 + unsigned int prod; 7.98 + tx_entry_t *tx_ring = np->net_ring->tx_ring; 7.99 7.100 spin_lock_irqsave(&np->tx_lock, flags); 7.101 7.102 do { 7.103 - cons = np->net_ring->tx_cons; 7.104 + prod = np->net_ring->tx_resp_prod; 7.105 7.106 - for ( i = np->tx_idx; i != cons; i = TX_RING_INC(i) ) 7.107 + for ( i = np->tx_resp_cons; i != prod; i = TX_RING_INC(i) ) 7.108 { 7.109 - skb = np->tx_skb_ring[i]; 7.110 + skb = (struct sk_buff *)tx_ring[i].resp.id; 7.111 dev_kfree_skb_any(skb); 7.112 atomic_dec(&np->tx_entries); 7.113 } 7.114 7.115 - np->tx_idx = i; 7.116 + np->tx_resp_cons = prod; 7.117 7.118 /* Set a new event, then check for race with update of tx_cons. */ 7.119 np->net_ring->tx_event = 7.120 - TX_RING_ADD(cons, (atomic_read(&np->tx_entries)>>1) + 1); 7.121 + TX_RING_ADD(prod, (atomic_read(&np->tx_entries)>>1) + 1); 7.122 smp_mb(); 7.123 } 7.124 - while ( cons != np->net_ring->tx_cons ); 7.125 + while ( prod != np->net_ring->tx_resp_prod ); 7.126 7.127 if ( np->tx_full && (atomic_read(&np->tx_entries) < TX_MAX_ENTRIES) ) 7.128 { 7.129 @@ -201,21 +181,13 @@ static void network_tx_buf_gc(struct net 7.130 spin_unlock_irqrestore(&np->tx_lock, flags); 7.131 } 7.132 7.133 -inline unsigned long get_ppte(unsigned long addr) 7.134 +inline pte_t *get_ppte(void *addr) 7.135 { 7.136 - unsigned long ppte; 7.137 - pgd_t *pgd; pmd_t *pmd; pte_t *ptep; 7.138 - pgd = pgd_offset_k(addr); 7.139 - 7.140 - if ( pgd_none(*pgd) || pgd_bad(*pgd) ) BUG(); 7.141 - 7.142 - pmd = pmd_offset(pgd, addr); 7.143 - if ( pmd_none(*pmd) || pmd_bad(*pmd) ) BUG(); 7.144 - 7.145 - ptep = pte_offset(pmd, addr); 7.146 - ppte = (unsigned long)phys_to_machine(virt_to_phys(ptep)); 7.147 - 7.148 - return ppte; 7.149 + pgd_t *pgd; pmd_t *pmd; pte_t *pte; 7.150 + pgd = pgd_offset_k( (unsigned long)addr); 7.151 + pmd = pmd_offset(pgd, (unsigned long)addr); 7.152 + pte = pte_offset(pmd, (unsigned long)addr); 7.153 + return pte; 7.154 } 7.155 7.156 static void network_alloc_rx_buffers(struct net_device *dev) 7.157 @@ -223,21 +195,21 @@ static void network_alloc_rx_buffers(str 7.158 unsigned int i; 7.159 struct net_private *np = dev->priv; 7.160 struct sk_buff *skb; 7.161 - unsigned int end = RX_RING_ADD(np->rx_idx, RX_MAX_ENTRIES); 7.162 + unsigned int end = RX_RING_ADD(np->rx_resp_cons, RX_MAX_ENTRIES); 7.163 7.164 - for ( i = np->net_ring->rx_prod; i != end; i = RX_RING_INC(i) ) 7.165 + for ( i = np->net_ring->rx_req_prod; i != end; i = RX_RING_INC(i) ) 7.166 { 7.167 skb = dev_alloc_skb(RX_BUF_SIZE); 7.168 if ( skb == NULL ) break; 7.169 skb->dev = dev; 7.170 - np->rx_skb_ring[i] = skb; 7.171 - np->net_ring->rx_ring[i].addr = get_ppte((unsigned long)skb->head); 7.172 - np->net_ring->rx_ring[i].size = RX_BUF_SIZE - 16; /* arbitrary */ 7.173 + np->net_ring->rx_ring[i].req.id = (unsigned long)skb; 7.174 + np->net_ring->rx_ring[i].req.addr = 7.175 + virt_to_machine(get_ppte(skb->head)); 7.176 } 7.177 7.178 - np->net_ring->rx_prod = i; 7.179 + np->net_ring->rx_req_prod = i; 7.180 7.181 - np->net_ring->rx_event = RX_RING_INC(np->rx_idx); 7.182 + np->net_ring->rx_event = RX_RING_INC(np->rx_resp_cons); 7.183 7.184 /* 7.185 * We may have allocated buffers which have entries outstanding in 7.186 @@ -254,9 +226,11 @@ static void network_free_rx_buffers(stru 7.187 
struct net_private *np = dev->priv; 7.188 struct sk_buff *skb; 7.189 7.190 - for ( i = np->rx_idx; i != np->net_ring->rx_prod; i = RX_RING_INC(i) ) 7.191 + for ( i = np->rx_resp_cons; 7.192 + i != np->net_ring->rx_req_prod; 7.193 + i = RX_RING_INC(i) ) 7.194 { 7.195 - skb = np->rx_skb_ring[i]; 7.196 + skb = (struct sk_buff *)np->net_ring->rx_ring[i].req.id; 7.197 dev_kfree_skb_any(skb); 7.198 } 7.199 } 7.200 @@ -272,7 +246,7 @@ static int network_start_xmit(struct sk_ 7.201 netif_stop_queue(dev); 7.202 return -ENOBUFS; 7.203 } 7.204 - i = np->net_ring->tx_prod; 7.205 + i = np->net_ring->tx_req_prod; 7.206 7.207 if ( (((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >= PAGE_SIZE ) 7.208 { 7.209 @@ -284,11 +258,11 @@ static int network_start_xmit(struct sk_ 7.210 skb = new_skb; 7.211 } 7.212 7.213 - np->tx_skb_ring[i] = skb; 7.214 - np->net_ring->tx_ring[i].addr = 7.215 - (unsigned long)phys_to_machine(virt_to_phys(skb->data)); 7.216 - np->net_ring->tx_ring[i].size = skb->len; 7.217 - np->net_ring->tx_prod = TX_RING_INC(i); 7.218 + np->net_ring->tx_ring[i].req.id = (unsigned long)skb; 7.219 + np->net_ring->tx_ring[i].req.addr = 7.220 + phys_to_machine(virt_to_phys(skb->data)); 7.221 + np->net_ring->tx_ring[i].req.size = skb->len; 7.222 + np->net_ring->tx_req_prod = TX_RING_INC(i); 7.223 atomic_inc(&np->tx_entries); 7.224 7.225 np->stats.tx_bytes += skb->len; 7.226 @@ -316,13 +290,15 @@ static void network_rx_int(int irq, void 7.227 struct net_device *dev = (struct net_device *)dev_id; 7.228 struct net_private *np = dev->priv; 7.229 struct sk_buff *skb; 7.230 - rx_entry_t *rx; 7.231 + rx_resp_entry_t *rx; 7.232 7.233 again: 7.234 - for ( i = np->rx_idx; i != np->net_ring->rx_cons; i = RX_RING_INC(i) ) 7.235 + for ( i = np->rx_resp_cons; 7.236 + i != np->net_ring->rx_resp_prod; 7.237 + i = RX_RING_INC(i) ) 7.238 { 7.239 - rx = &np->net_ring->rx_ring[i]; 7.240 - skb = np->rx_skb_ring[i]; 7.241 + rx = &np->net_ring->rx_ring[i].resp; 7.242 + skb = (struct sk_buff *)rx->id; 7.243 7.244 if ( rx->status != RING_STATUS_OK ) 7.245 { 7.246 @@ -341,8 +317,7 @@ static void network_rx_int(int irq, void 7.247 skb_shinfo(skb)->frag_list = NULL; 7.248 7.249 phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] = 7.250 - (*(unsigned long *)phys_to_virt(machine_to_phys(rx->addr)) 7.251 - ) >> PAGE_SHIFT; 7.252 + (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT; 7.253 7.254 if ( rx->offset < 16 ) 7.255 { 7.256 @@ -353,23 +328,23 @@ static void network_rx_int(int irq, void 7.257 7.258 skb_reserve(skb, rx->offset - 16); 7.259 7.260 - skb_put(skb, np->net_ring->rx_ring[i].size); 7.261 + skb_put(skb, rx->size); 7.262 skb->protocol = eth_type_trans(skb, dev); 7.263 7.264 np->stats.rx_packets++; 7.265 7.266 - np->stats.rx_bytes += np->net_ring->rx_ring[i].size; 7.267 + np->stats.rx_bytes += rx->size; 7.268 netif_rx(skb); 7.269 dev->last_rx = jiffies; 7.270 } 7.271 7.272 - np->rx_idx = i; 7.273 + np->rx_resp_cons = i; 7.274 7.275 network_alloc_rx_buffers(dev); 7.276 7.277 /* Deal with hypervisor racing our resetting of rx_event. 
*/ 7.278 smp_mb(); 7.279 - if ( np->net_ring->rx_cons != i ) goto again; 7.280 + if ( np->net_ring->rx_resp_prod != i ) goto again; 7.281 } 7.282 7.283 7.284 @@ -382,8 +357,6 @@ static void network_tx_int(int irq, void 7.285 7.286 int network_close(struct net_device *dev) 7.287 { 7.288 - struct net_private *np = dev->priv; 7.289 - 7.290 netif_stop_queue(dev); 7.291 7.292 free_irq(NET_RX_IRQ, dev); 7.293 @@ -401,9 +374,6 @@ int network_close(struct net_device *dev 7.294 kfree(np->net_ring->tx_ring); 7.295 #endif 7.296 7.297 - kfree(np->rx_skb_ring); 7.298 - kfree(np->tx_skb_ring); 7.299 - 7.300 MOD_DEC_USE_COUNT; 7.301 7.302 return 0;
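On the guest side, the driver no longer needs its private tx_skb_ring/rx_skb_ring lookup arrays: as the hunk above shows, the skb pointer itself is stored in the request's opaque id and handed back in the response. A condensed sketch of that round trip for transmit (drawn from network_start_xmit() and network_tx_buf_gc() above, with error handling and locking omitted):

```c
/* Post a transmit request: the skb pointer doubles as the opaque id. */
i = np->net_ring->tx_req_prod;
np->net_ring->tx_ring[i].req.id   = (unsigned long)skb;
np->net_ring->tx_ring[i].req.addr = phys_to_machine(virt_to_phys(skb->data));
np->net_ring->tx_ring[i].req.size = skb->len;
np->net_ring->tx_req_prod = TX_RING_INC(i);

/* Later, reclaim completed transmits by walking the response ring and
 * recovering each skb directly from the id field. */
for ( i = np->tx_resp_cons; i != np->net_ring->tx_resp_prod; i = TX_RING_INC(i) )
{
    skb = (struct sk_buff *)np->net_ring->tx_ring[i].resp.id;
    dev_kfree_skb_any(skb);
}
np->tx_resp_cons = i;
```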