direct-io.hg

changeset 379:f5c2415f65e3

bitkeeper revision 1.179.1.1 (3e9ee075wJmtFBkJEk-QAC5VB7htXg)

network.c, dev.c, vif.h, sched.h, network.h, TODO:
Fixed network rings so we can have out-of-order responses. This made it possible to fix local packet delivery. However, the virtual firewall/router stuff needs urgent redesigning.
author kaf24@scramble.cl.cam.ac.uk
date Thu Apr 17 17:12:21 2003 +0000 (2003-04-17)
parents 1585992989d0
children 5e482605e7d8
files xen/TODO xen/common/network.c xen/include/hypervisor-ifs/network.h xen/include/xeno/sched.h xen/include/xeno/vif.h xen/net/dev.c xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/network/network.c
line diff
     1.1 --- a/xen/TODO	Thu Apr 17 12:26:14 2003 +0000
     1.2 +++ b/xen/TODO	Thu Apr 17 17:12:21 2003 +0000
     1.3 @@ -7,20 +7,7 @@ longer-term goals.
     1.4   -- Keir (16/3/03)
     1.5  
     1.6  
     1.7 -1. FIX HANDLING OF NETWORK RINGS
     1.8 ---------------------------------
     1.9 -Handling of the transmit rings is currently very broken (for example,
    1.10 -sending an inter-domain packet will wedge the hypervisor). This is
    1.11 -because we may handle packets out of order (eg. inter-domain packets
    1.12 -are handled eagerly, while packets for real interfaces are queued),
    1.13 -but our current ring design really assumes in-order handling.
    1.14 -
    1.15 -A neat fix will be to allow responses to be queued in a different
    1.16 -order to requests, just as we already do with block-device
    1.17 -rings. We'll need to add an opaque identifier to ring entries,
    1.18 -allowing matching of requests and responses, but that's about it.
    1.19 -
    1.20 -2. ACCURATE TIMERS AND WALL-CLOCK TIME
    1.21 +1. ACCURATE TIMERS AND WALL-CLOCK TIME
    1.22  --------------------------------------
    1.23  Currently our long-term timebase free runs on CPU0, with no external
    1.24  calibration. We should run ntpd on domain 0 and allow this to warp
    1.25 @@ -28,7 +15,7 @@ Xen's timebase. Once this is done, we ca
    1.26  not worry about relative drift (since they'll all get sync'ed
    1.27  periodically by ntp).
    1.28  
    1.29 -3. ASSIGNING DOMAINS TO PROCESSORS
    1.30 +2. ASSIGNING DOMAINS TO PROCESSORS
    1.31  ----------------------------------
    1.32  More intelligent assignment of domains to processors. In
    1.33  particular, we don't play well with hyperthreading: we will assign
    1.34 @@ -40,17 +27,17 @@ relationships between processors in the 
    1.35  siblings in the same package). We then use this to balance domains
    1.36  across packages, and across virtual processors within a package.
    1.37  
    1.38 -4. PROPER DESTRUCTION OF DOMAINS
    1.39 ---------------------------------
    1.40 -Currently we do not free resources when destroying a domain. This is
    1.41 -because they may be tied up in subsystems, and there is no way of
    1.42 -pulling them back in a safe manner.
    1.43 +3. DOMAIN 0 MANAGEMENT DAEMON
    1.44 +-----------------------------
    1.45 +A better control daemon is required for domain 0, which keeps proper
    1.46 +track of machine resources and can make sensible policy choices. This
    1.47 +may require support in Xen; for example, notifications (eg. DOMn is
    1.48 +killed), and requests (eg. can DOMn allocate x frames of memory?).
    1.49  
    1.50 -The fix is probably to reference count resources and automatically
    1.51 -free them when the count reaches zero. We may get away with one count
    1.52 -per domain (for all its resources). When this reaches zero we know it
    1.53 -is safe to free everything: block-device rings, network rings, and all
    1.54 -the rest.
    1.55 +4. SANE NETWORK ROUTING
    1.56 +-----------------------
    1.57 +The current virtual firewall/router is completely broken. Needs a new
    1.58 +design and implementation!
    1.59  
    1.60  5. NETWORK CHECKSUM OFFLOAD 
    1.61  --------------------------- 
    1.62 @@ -60,14 +47,7 @@ indicate, on transmit, which packets nee
    1.63  receive, which packets have been checked out as okay. We can steal
    1.64  Linux's interface, which is entirely sane given NIC limitations.
    1.65  
    1.66 -6. DOMAIN 0 MANAGEMENT DAEMON
    1.67 ------------------------------
    1.68 -A better control daemon is required for domain 0, which keeps proper
    1.69 -track of machine resources and can make sensible policy choices. This
    1.70 -may require support in Xen; for example, notifications (eg. DOMn is
    1.71 -killed), and requests (eg. can DOMn allocate x frames of memory?).
    1.72 -
    1.73 -7. MODULE SUPPORT FOR XEN
    1.74 +6. MODULE SUPPORT FOR XEN
    1.75  -------------------------
    1.76  Network and blkdev drivers are bloating Xen. At some point we want to
    1.77  build drivers as modules, stick them in a cheesy ramfs, then relocate
    1.78 @@ -79,7 +59,7 @@ which drivers to load.
    1.79  Most of the hard stuff (relocating and the like) is done for us by
    1.80  Linux's module system.
    1.81  
    1.82 -8. NEW DESIGN FEATURES
    1.83 +7. NEW DESIGN FEATURES
    1.84  ----------------------
    1.85  This includes the last-chance page cache, and the unified buffer cache.
    1.86  
     2.1 --- a/xen/common/network.c	Thu Apr 17 12:26:14 2003 +0000
     2.2 +++ b/xen/common/network.c	Thu Apr 17 17:12:21 2003 +0000
     2.3 @@ -5,7 +5,7 @@
     2.4   * with the virtual interfaces (vifs) and the virtual firewall/router through
     2.5   * the use of rules.
     2.6   *
     2.7 - * Copyright (c) 2002, A K Warfield and K A Fraser
     2.8 + * Copyright (c) 2002-2003, A K Warfield and K A Fraser
     2.9   */
    2.10  
    2.11  #include <hypervisor-ifs/network.h>
    2.12 @@ -67,7 +67,8 @@ net_vif_t *create_net_vif(int domain)
    2.13  
    2.14      shadow_ring = kmalloc(sizeof(net_shadow_ring_t), GFP_KERNEL);
    2.15      if ( shadow_ring == NULL ) goto fail;
    2.16 -    
    2.17 +    memset(shadow_ring, 0, sizeof(*shadow_ring));
    2.18 +
    2.19      shadow_ring->rx_ring = kmalloc(RX_RING_SIZE
    2.20                      * sizeof(rx_shadow_entry_t), GFP_KERNEL);
    2.21      shadow_ring->tx_ring = kmalloc(TX_RING_SIZE
    2.22 @@ -75,9 +76,6 @@ net_vif_t *create_net_vif(int domain)
    2.23      if ( (shadow_ring->rx_ring == NULL) || (shadow_ring->tx_ring == NULL) )
    2.24              goto fail;
    2.25  
    2.26 -    shadow_ring->rx_prod = shadow_ring->rx_cons = shadow_ring->rx_idx = 0;
    2.27 -    shadow_ring->tx_prod = shadow_ring->tx_cons = shadow_ring->tx_idx = 0;
    2.28 -    
    2.29      /*
    2.30       * Fill in the new vif struct. Note that, while the vif's refcnt is
    2.31       * non-zero, we hold a reference to the task structure.
    2.32 @@ -121,7 +119,7 @@ void destroy_net_vif(net_vif_t *vif)
    2.33      /* Return any outstanding receive buffers to the guest OS. */
    2.34      spin_lock_irqsave(&p->page_lock, flags);
    2.35      for ( i  = vif->shadow_ring->rx_idx; 
    2.36 -          i != vif->shadow_ring->rx_prod; 
    2.37 +          i != vif->shadow_ring->rx_req_cons;
    2.38            i  = ((i+1) & (RX_RING_SIZE-1)) )
    2.39      {
    2.40          rx_shadow_entry_t *rx = vif->shadow_ring->rx_ring + i;
    2.41 @@ -263,7 +261,7 @@ void add_default_net_rule(int vif_id, u3
    2.42      memset(&new_rule, 0, sizeof(net_rule_t));
    2.43      new_rule.dst_addr = ipaddr;
    2.44      new_rule.dst_addr_mask = 0xffffffff;
    2.45 -    new_rule.src_interface = VIF_PHYSICAL_INTERFACE;
    2.46 +    new_rule.src_interface = VIF_ANY_INTERFACE;
    2.47      new_rule.dst_interface = vif_id;
    2.48      new_rule.action = NETWORK_ACTION_ACCEPT;
    2.49      new_rule.proto = NETWORK_PROTO_ANY;
    2.50 @@ -319,9 +317,8 @@ void print_net_rule_list()
    2.51   * Apply the rules to this skbuff and return the vif id that it is bound for.
    2.52   * If there is no match, VIF_DROP is returned.
    2.53   */
    2.54 -
    2.55 -int net_find_rule(u8 nproto, u8 tproto, u32 src_addr, u32 dst_addr, u16 src_port, u16 dst_port, 
    2.56 -                  int src_vif)
    2.57 +int net_find_rule(u8 nproto, u8 tproto, u32 src_addr, u32 dst_addr, 
    2.58 +                  u16 src_port, u16 dst_port, int src_vif)
    2.59  {
    2.60      net_rule_ent_t *ent;
    2.61      int dest = VIF_DROP;
    2.62 @@ -330,7 +327,7 @@ int net_find_rule(u8 nproto, u8 tproto, 
    2.63      
    2.64      ent = net_rule_list;
    2.65      
    2.66 -    while (ent)
    2.67 +    while ( ent != NULL )
    2.68      {
    2.69          if ( ((ent->r.src_interface == src_vif)
    2.70                || (ent->r.src_interface == VIF_ANY_INTERFACE)) &&
    2.71 @@ -351,12 +348,19 @@ int net_find_rule(u8 nproto, u8 tproto, 
    2.72                 (tproto == IPPROTO_UDP)))
    2.73             )
    2.74          {
    2.75 -            break;
    2.76 +            /*
    2.77 +             * XXX FFS! We keep going to find the "best" rule. Where best 
    2.78 +             * corresponds to vaguely sane routing of a packet. We need a less 
     2.79 +             * shafted model for our "virtual firewall/router" methinks!
    2.80 +             */
    2.81 +            if ( dest < 0 )
    2.82 +                dest = ent->r.dst_interface;
    2.83 +            if ( dest >= 0 )
    2.84 +                break;
    2.85          }
    2.86          ent = ent->next;
    2.87      }
    2.88  
    2.89 -    if (ent) (dest = ent->r.dst_interface);
    2.90      read_unlock(&net_rule_lock);
    2.91      return dest;
    2.92  }
    2.93 @@ -423,6 +427,7 @@ int __net_get_target_vif(u8 *data, unsig
    2.94      return target;
    2.95      
    2.96   drop:
    2.97 +    printk("VIF%d: pkt to drop!\n", src_vif);
    2.98      return VIF_DROP;
    2.99  }
   2.100  
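
The rule walk in net_find_rule() above matches a packet against masked destination addresses and source interfaces, and (per the new XXX comment) keeps scanning until it finds a rule with a usable destination. Below is a stripped-down, user-space sketch of that masked matching; the VIF_ANY_INTERFACE and VIF_DROP values are stand-ins, and ports, protocols and the "best rule" refinement are deliberately omitted.

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative stand-ins for the Xen constants (values are assumptions). */
    #define VIF_ANY_INTERFACE (-2)
    #define VIF_DROP          (-3)

    typedef struct net_rule {
        uint32_t dst_addr, dst_addr_mask;
        int      src_interface, dst_interface;
    } net_rule_t;

    /* Return the destination vif for a packet, or VIF_DROP if no rule matches.
     * Mirrors only the masked-address/interface test in net_find_rule(). */
    static int find_dest(const net_rule_t *rules, int n, int src_vif, uint32_t dst_addr)
    {
        for (int i = 0; i < n; i++) {
            const net_rule_t *r = &rules[i];
            if ((r->src_interface == src_vif || r->src_interface == VIF_ANY_INTERFACE) &&
                ((r->dst_addr & r->dst_addr_mask) == (dst_addr & r->dst_addr_mask)))
                return r->dst_interface;
        }
        return VIF_DROP;
    }

    int main(void)
    {
        /* One default rule per vif, as installed by add_default_net_rule(). */
        net_rule_t rules[] = {
            { 0x0a000001, 0xffffffff, VIF_ANY_INTERFACE, 0 },  /* 10.0.0.1 -> vif 0 */
            { 0x0a000002, 0xffffffff, VIF_ANY_INTERFACE, 1 },  /* 10.0.0.2 -> vif 1 */
        };
        printf("10.0.0.2 routes to vif %d\n", find_dest(rules, 2, 0, 0x0a000002));
        printf("10.0.0.9 routes to vif %d\n", find_dest(rules, 2, 0, 0x0a000009));
        return 0;
    }
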
     3.1 --- a/xen/include/hypervisor-ifs/network.h	Thu Apr 17 12:26:14 2003 +0000
     3.2 +++ b/xen/include/hypervisor-ifs/network.h	Thu Apr 17 17:12:21 2003 +0000
     3.3 @@ -14,50 +14,70 @@
     3.4  
     3.5  #include <linux/types.h>
     3.6  
     3.7 -typedef struct tx_entry_st {
     3.8 -    unsigned long  addr;   /* machine address of packet (IN VAR) */
     3.9 -    unsigned short size;   /* in bytes (IN VAR) */
    3.10 -    unsigned char  status; /* per descriptor status (OUT VAR) */
    3.11 -    unsigned char  _unused;
    3.12 +
    3.13 +typedef struct tx_req_entry_st
    3.14 +{
    3.15 +    unsigned long  id;
    3.16 +    unsigned long  addr;   /* machine address of packet */
    3.17 +    unsigned short size;   /* packet size in bytes */
    3.18 +} tx_req_entry_t;
    3.19 +
    3.20 +typedef struct tx_resp_entry_st
    3.21 +{
    3.22 +    unsigned long  id;
    3.23 +    unsigned char  status;
    3.24 +} tx_resp_entry_t;
    3.25 +
    3.26 +typedef union tx_entry_st
    3.27 +{
    3.28 +    tx_req_entry_t  req;
    3.29 +    tx_resp_entry_t resp;
    3.30  } tx_entry_t;
    3.31  
    3.32 -typedef struct rx_entry_st {
    3.33 -    unsigned long  addr;   /* machine address of PTE to swizzle (IN VAR) */
    3.34 -    unsigned short size;   /* in bytes (OUT VAR) */
    3.35 -    unsigned char  status; /* per descriptor status (OUT VAR) */
    3.36 -    unsigned char  offset; /* offset in page of received pkt (OUT VAR) */
    3.37 +
    3.38 +typedef struct rx_req_entry_st
    3.39 +{
    3.40 +    unsigned long  id;
    3.41 +    unsigned long  addr;   /* machine address of PTE to swizzle */
    3.42 +} rx_req_entry_t;
    3.43 +
    3.44 +typedef struct rx_resp_entry_st
    3.45 +{
    3.46 +    unsigned long  id;
    3.47 +    unsigned short size;   /* received packet size in bytes */
    3.48 +    unsigned char  status; /* per descriptor status */
    3.49 +    unsigned char  offset; /* offset in page of received pkt */
    3.50 +} rx_resp_entry_t;
    3.51 +
    3.52 +typedef union rx_entry_st
    3.53 +{
    3.54 +    rx_req_entry_t  req;
    3.55 +    rx_resp_entry_t resp;
    3.56  } rx_entry_t;
    3.57  
    3.58 +
    3.59  #define TX_RING_SIZE 256
    3.60  #define RX_RING_SIZE 256
    3.61 -typedef struct net_ring_st {
    3.62 +
    3.63 +typedef struct net_ring_st
    3.64 +{
    3.65      /*
    3.66 -     * Guest OS places packets into ring at tx_prod.
    3.67 -     * Hypervisor removes at tx_cons.
    3.68 -     * Ring is empty when tx_prod == tx_cons.
    3.69 -     * Guest OS receives a DOMAIN_EVENT_NET_TX when tx_cons passes tx_event.
    3.70 -     * Hypervisor may be prodded whenever tx_prod is updated, but this is
    3.71 -     * only necessary when tx_cons == old_tx_prod (ie. transmitter stalled).
    3.72 +     * Guest OS places packets into ring at tx_req_prod.
    3.73 +     * Guest OS receives DOMAIN_EVENT_NET_TX when tx_resp_prod passes tx_event.
    3.74       */
    3.75      tx_entry_t	*tx_ring;
    3.76 -    unsigned int tx_prod, tx_cons, tx_event;
    3.77 +    unsigned int tx_req_prod, tx_resp_prod, tx_event;
    3.78  
    3.79      /*
    3.80 -     * Guest OS places empty buffers into ring at rx_prod.
    3.81 -     * Hypervisor fills buffers as rx_cons.
    3.82 -     * Ring is empty when rx_prod == rx_cons.
    3.83 -     * Guest OS receives a DOMAIN_EVENT_NET_RX when rx_cons passes rx_event.
    3.84 -     * Hypervisor may be prodded whenever rx_prod is updated, but this is
    3.85 -     * only necessary when rx_cons == old_rx_prod (ie. receiver stalled).
    3.86 +     * Guest OS places empty buffers into ring at rx_req_prod.
     3.87 +     * Guest OS receives DOMAIN_EVENT_NET_RX when rx_resp_prod passes rx_event.
    3.88       */
    3.89      rx_entry_t	*rx_ring;
    3.90 -    unsigned int rx_prod, rx_cons, rx_event;
    3.91 +    unsigned int rx_req_prod, rx_resp_prod, rx_event;
    3.92  } net_ring_t;
    3.93  
    3.94 -/* Specify base of per-domain array. Get returned free slot in the array. */
    3.95 -/*net_ring_t *create_net_vif(int domain);*/
    3.96 -
    3.97 -/* Packet routing/filtering code follows:
    3.98 +/*
    3.99 + * Packet routing/filtering code follows:
   3.100   */
   3.101  
   3.102  #define NETWORK_ACTION_ACCEPT   0
   3.103 @@ -89,7 +109,7 @@ typedef struct net_rule_st
   3.104  typedef struct vif_query_st
   3.105  {
   3.106      unsigned int    domain;
   3.107 -    char            *buf;   // where to put the reply -- guest virtual address
   3.108 +    char            *buf;   /* reply buffer -- guest virtual address */
   3.109  } vif_query_t;
   3.110  
   3.111  /* Network trap operations and associated structure. 
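
The request/response split above is the heart of this changeset: each request carries an opaque id, responses are written back into the shared ring carrying the same id, and because matching is by id rather than by position the hypervisor may complete packets out of order. A minimal user-space model of the idea follows; the field names echo the header, but the ring size, ids and statuses are arbitrary, and this is a sketch of the scheme, not the actual Xen interface.

    #include <stdio.h>

    #define RING_SIZE 8                        /* illustrative; Xen uses 256 */
    #define RING_INC(_i) (((_i) + 1) & (RING_SIZE - 1))

    /* Hypothetical request/response pair mirroring the id-keyed layout above. */
    typedef struct { unsigned long id; unsigned long addr; unsigned short size; } req_t;
    typedef struct { unsigned long id; unsigned char status; } resp_t;
    typedef union  { req_t req; resp_t resp; } entry_t;

    int main(void)
    {
        entry_t ring[RING_SIZE];
        unsigned int req_prod = 0, resp_prod = 0;

        /* Guest side: queue three requests, each tagged with an opaque id. */
        for (unsigned long id = 100; id < 103; id++) {
            ring[req_prod].req.id   = id;
            ring[req_prod].req.addr = 0x1000 * id;     /* dummy machine address */
            ring[req_prod].req.size = 64;
            req_prod = RING_INC(req_prod);
        }

        /* "Hypervisor" side: complete the packets out of order; each slot is
         * reused for a response, and the id says which request it answers. */
        unsigned long done_order[3] = { 102, 100, 101 };
        for (int n = 0; n < 3; n++) {
            ring[resp_prod].resp.id     = done_order[n];
            ring[resp_prod].resp.status = 0;           /* i.e. RING_STATUS_OK */
            resp_prod = RING_INC(resp_prod);
        }

        /* Guest side: consume responses, matching on id rather than position. */
        for (unsigned int i = 0; i != resp_prod; i = RING_INC(i))
            printf("request id %lu completed, status %u\n",
                   ring[i].resp.id, (unsigned)ring[i].resp.status);
        return 0;
    }
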
     4.1 --- a/xen/include/xeno/sched.h	Thu Apr 17 12:26:14 2003 +0000
     4.2 +++ b/xen/include/xeno/sched.h	Thu Apr 17 17:12:21 2003 +0000
     4.3 @@ -50,7 +50,7 @@ extern struct mm_struct init_mm;
     4.4  }
     4.5  
     4.6  #define _HYP_EVENT_NEED_RESCHED 0
     4.7 -#define _HYP_EVENT_NET_RX       1
     4.8 +#define _HYP_EVENT_NET          1
     4.9  #define _HYP_EVENT_DIE          2
    4.10  
    4.11  #define PF_DONEFPUINIT  0x1  /* Has the FPU been initialised for this task? */
     5.1 --- a/xen/include/xeno/vif.h	Thu Apr 17 12:26:14 2003 +0000
     5.2 +++ b/xen/include/xeno/vif.h	Thu Apr 17 17:12:21 2003 +0000
     5.3 @@ -3,7 +3,7 @@
     5.4   * This is the hypervisor end of the network code.  The net_ring structure
     5.5   * stored in each vif is placed on a shared page to interact with the guest VM.
     5.6   *
     5.7 - * Copyright (c) 2002, A K Warfield and K A Fraser
     5.8 + * Copyright (c) 2002-2003, A K Warfield and K A Fraser
     5.9   */
    5.10  
    5.11  /* virtual network interface struct and associated defines. */
    5.12 @@ -25,45 +25,51 @@
    5.13   * TX_RING_SIZE and RX_RING_SIZE are defined in the shared network.h.
    5.14   */
    5.15  
    5.16 -typedef struct rx_shadow_entry_st {
    5.17 +typedef struct rx_shadow_entry_st 
    5.18 +{
    5.19 +    unsigned long  id;
    5.20 +    /* IN vars */
    5.21      unsigned long  addr;
    5.22 +    /* OUT vars */
    5.23      unsigned short size;
    5.24      unsigned char  status;
    5.25      unsigned char  offset;
    5.26 +    /* PRIVATE vars */
    5.27      unsigned long  flush_count;
    5.28  } rx_shadow_entry_t;
    5.29  
    5.30 -typedef struct tx_shadow_entry_st {
    5.31 +typedef struct tx_shadow_entry_st 
    5.32 +{
    5.33 +    unsigned long  id;
    5.34 +    /* IN vars */
    5.35      void          *header;
    5.36      unsigned long  payload;
    5.37      unsigned short size;
    5.38 +    /* OUT vars */
    5.39      unsigned char  status;
    5.40 -    unsigned char  _unused;
    5.41  } tx_shadow_entry_t;
    5.42  
    5.43  typedef struct net_shadow_ring_st {
    5.44      rx_shadow_entry_t *rx_ring;
    5.45 +    unsigned int rx_prod;  /* More buffers for filling go here. */
    5.46 +    unsigned int rx_idx;   /* Next buffer to fill is here. */
    5.47 +    unsigned int rx_cons;  /* Next buffer to create response for is here. */
    5.48 +
    5.49      tx_shadow_entry_t *tx_ring;
    5.50 -
    5.51      /*
    5.52 -     * Private copy of producer. Follows guest OS version, but never
    5.53 -     * catches up with our consumer index.
    5.54 +     * These cannot be derived from shared variables, as not all packets
    5.55 +     * will end up on the shadow ring (eg. locally delivered packets).
    5.56       */
    5.57 -    unsigned int rx_prod;
    5.58 -    /* Points at next buffer to be filled by NIC. Chases rx_prod. */
    5.59 -    unsigned int rx_idx;
    5.60 -    /* Points at next buffer to be returned to the guest OS. Chases rx_idx. */
    5.61 -    unsigned int rx_cons;
    5.62 +    unsigned int tx_prod;  /* More packets for sending go here. */
    5.63 +    unsigned int tx_idx;   /* Next packet to send is here. */
    5.64 +    unsigned int tx_transmitted_prod; /* Next packet to finish transmission. */
    5.65 +    unsigned int tx_cons;  /* Next packet to create response for is here. */
    5.66  
    5.67 -    /*
    5.68 -     * Private copy of producer. Follows guest OS version, but never
    5.69 -     * catches up with our consumer index.
    5.70 -     */
    5.71 -    unsigned int tx_prod;
    5.72 -    /* Points at next buffer to be scheduled. Chases tx_prod. */
    5.73 -    unsigned int tx_idx;
    5.74 -    /* Points at next buffer to be returned to the guest OS. Chases tx_idx. */
    5.75 -    unsigned int tx_cons;
    5.76 +    /* Indexes into shared ring. */
    5.77 +    unsigned int rx_req_cons;
    5.78 +    unsigned int rx_resp_prod; /* private version of shared variable */
    5.79 +    unsigned int tx_req_cons;
    5.80 +    unsigned int tx_resp_prod; /* private version of shared variable */
    5.81  } net_shadow_ring_t;
    5.82  
    5.83  typedef struct net_vif_st {
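
All of the shadow-ring indices above chase one another around power-of-two rings, so the code relies on masked arithmetic both to advance an index and to measure the distance between two indices, and it keeps one slot in hand so that index equality always means "empty" rather than "full". A small stand-alone sketch of that arithmetic, reusing the TX_RING_INC-style macros from dev.c with arbitrary index values:

    #include <stdio.h>

    #define TX_RING_SIZE 256                          /* power of two, as in Xen */
    #define TX_RING_INC(_i) (((_i) + 1) & (TX_RING_SIZE - 1))

    /* Wrap-safe count of slots in use between consumer c and producer p. */
    static unsigned int pending(unsigned int p, unsigned int c)
    {
        return (p - c) & (TX_RING_SIZE - 1);
    }

    int main(void)
    {
        /* Producer has wrapped past the end of the ring; the mask still gives
         * the right occupancy. */
        printf("pending(3, 250) = %u\n", pending(3, 250));          /* 9 */

        /* The do_net_update() loop guard stops one slot short of the response
         * producer: the usual way of keeping a full ring distinguishable from
         * an empty one. Index values here are invented. */
        unsigned int req_cons = 0, req_prod = 10, resp_prod = 4, i;
        for (i = req_cons;
             (i != req_prod) && (((resp_prod - i) & (TX_RING_SIZE - 1)) != 1);
             i = TX_RING_INC(i))
            ;
        printf("stopped copying requests at slot %u of %u\n", i, req_prod);
        return 0;
    }
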
     6.1 --- a/xen/net/dev.c	Thu Apr 17 12:26:14 2003 +0000
     6.2 +++ b/xen/net/dev.c	Thu Apr 17 17:12:21 2003 +0000
     6.3 @@ -49,6 +49,15 @@
     6.4  #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
     6.5  #define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
     6.6  
     6.7 +static void make_tx_response(net_vif_t *vif, 
     6.8 +                             unsigned long id, 
     6.9 +                             unsigned char st);
    6.10 +static void make_rx_response(net_vif_t     *vif, 
    6.11 +                             unsigned long  id, 
    6.12 +                             unsigned short size,
    6.13 +                             unsigned char  st,
    6.14 +                             unsigned char  off);
    6.15 +
    6.16  struct net_device *the_dev = NULL;
    6.17  
    6.18  /*
    6.19 @@ -482,6 +491,49 @@ illegal_highdma(struct net_device *dev, 
    6.20  
    6.21  struct netif_rx_stats netdev_rx_stat[NR_CPUS];
    6.22  
    6.23 +/*
    6.24 + * update_shared_ring(void)
    6.25 + * 
    6.26 + * This replaces flush_rx_queue as the guest event handler to move packets
    6.27 + * queued in the guest ring up to the guest.  Really, the packet is already
    6.28 + * there, it was page flipped in deliver_packet, but this moves the ring
    6.29 + * descriptor across from the shadow ring and increments the pointers.
    6.30 + */
    6.31 +void update_shared_ring(void)
    6.32 +{
    6.33 +    rx_shadow_entry_t *rx;
    6.34 +    tx_shadow_entry_t *tx;
    6.35 +    net_ring_t *net_ring;
    6.36 +    net_shadow_ring_t *shadow_ring;
    6.37 +    net_vif_t *vif;
    6.38 +    struct list_head *ent;
    6.39 +
    6.40 +    clear_bit(_HYP_EVENT_NET, &current->hyp_events);
    6.41 +
    6.42 +    list_for_each(ent, &current->net_vifs)
    6.43 +    {
    6.44 +        vif = list_entry(ent, net_vif_t, dom_list);
    6.45 +        net_ring    = vif->net_ring;
    6.46 +        shadow_ring = vif->shadow_ring;
    6.47 +
    6.48 +        while ( shadow_ring->rx_cons != shadow_ring->rx_idx )
    6.49 +        {
    6.50 +            rx = shadow_ring->rx_ring + shadow_ring->rx_cons;
    6.51 +            if ( rx->flush_count == tlb_flush_count[smp_processor_id()] )
    6.52 +                __flush_tlb();
    6.53 +            shadow_ring->rx_cons = RX_RING_INC(shadow_ring->rx_cons);
    6.54 +            make_rx_response(vif, rx->id, rx->size, rx->status, rx->offset);
    6.55 +        }
    6.56 +
    6.57 +        while ( shadow_ring->tx_cons != shadow_ring->tx_transmitted_prod )
    6.58 +        {
    6.59 +            tx = shadow_ring->tx_ring + shadow_ring->tx_cons;
    6.60 +            shadow_ring->tx_cons = RX_RING_INC(shadow_ring->tx_cons);
    6.61 +            make_tx_response(vif, tx->id, tx->status);
    6.62 +        }
    6.63 +    }
    6.64 +}
    6.65 +
    6.66  void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
    6.67  {
    6.68      net_shadow_ring_t *shadow_ring;
    6.69 @@ -489,7 +541,6 @@ void deliver_packet(struct sk_buff *skb,
    6.70      unsigned long *g_pte; 
    6.71      struct pfn_info *g_pfn, *h_pfn;
    6.72      unsigned int i; 
    6.73 -    unsigned long flags;
    6.74  
    6.75      memset(skb->mac.ethernet->h_dest, 0, ETH_ALEN);
    6.76      if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
    6.77 @@ -501,17 +552,13 @@ void deliver_packet(struct sk_buff *skb,
    6.78  
    6.79      rx = shadow_ring->rx_ring + i;
    6.80  
    6.81 -    if ( rx->status != RING_STATUS_OK )
    6.82 -    {
    6.83 -        DPRINTK("Bad buffer in deliver_packet()\n");
    6.84 -        goto inc_and_out;
    6.85 -    }
    6.86 +    ASSERT(rx->status == RING_STATUS_OK);
    6.87 +    ASSERT(skb->len <= PAGE_SIZE);
    6.88  
    6.89 -    ASSERT(skb->len <= PAGE_SIZE);
    6.90      rx->size   = skb->len;
    6.91      rx->offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK);
    6.92  
    6.93 -    spin_lock_irqsave(&vif->domain->page_lock, flags);
    6.94 +    spin_lock(&vif->domain->page_lock);
    6.95  
    6.96      g_pte = map_domain_mem(rx->addr);
    6.97  
    6.98 @@ -541,12 +588,11 @@ void deliver_packet(struct sk_buff *skb,
    6.99      list_del(&g_pfn->list);
   6.100      list_add(&h_pfn->list, &vif->domain->pg_head);
   6.101  
   6.102 -    spin_unlock_irqrestore(&vif->domain->page_lock, flags);
   6.103 +    spin_unlock(&vif->domain->page_lock);
   6.104      
   6.105      /* Our skbuff now points at the guest's old frame. */
   6.106      skb->pf = g_pfn;
   6.107  
   6.108 - inc_and_out:        
   6.109      smp_wmb(); /* updates must happen before releasing the descriptor. */
   6.110      shadow_ring->rx_idx = RX_RING_INC(i);
   6.111  }
   6.112 @@ -595,11 +641,11 @@ int netif_rx(struct sk_buff *skb)
   6.113      if ( skb->dst_vif == VIF_UNKNOWN_INTERFACE )
   6.114          skb->dst_vif = __net_get_target_vif(skb->data, skb->len, skb->src_vif);
   6.115          
   6.116 -    read_lock_irqsave(&sys_vif_lock, flags);
   6.117 +    read_lock(&sys_vif_lock);
   6.118      if ( (skb->dst_vif <= VIF_PHYSICAL_INTERFACE) ||
   6.119           ((vif = sys_vif_list[skb->dst_vif]) == NULL) )
   6.120      {
   6.121 -        read_unlock_irqrestore(&sys_vif_lock, flags);
   6.122 +        read_unlock(&sys_vif_lock);
   6.123          netdev_rx_stat[this_cpu].dropped++;
   6.124          unmap_domain_mem(skb->head);
   6.125          kfree_skb(skb);
   6.126 @@ -608,10 +654,10 @@ int netif_rx(struct sk_buff *skb)
   6.127      }
   6.128  
   6.129      get_vif(vif);
   6.130 -    read_unlock_irqrestore(&sys_vif_lock, flags);
   6.131 +    read_unlock(&sys_vif_lock);
   6.132  
   6.133      deliver_packet(skb, vif);
   6.134 -    cpu_mask = mark_hyp_event(vif->domain, _HYP_EVENT_NET_RX);
   6.135 +    cpu_mask = mark_hyp_event(vif->domain, _HYP_EVENT_NET);
   6.136      put_vif(vif);
   6.137  
   6.138      unmap_domain_mem(skb->head);
   6.139 @@ -676,10 +722,8 @@ static void add_to_net_schedule_list_tai
   6.140  /* Destructor function for tx skbs. */
   6.141  static void tx_skb_release(struct sk_buff *skb)
   6.142  {
   6.143 -    int i, send = 0;
   6.144 +    int i;
   6.145      net_vif_t *vif = sys_vif_list[skb->src_vif];
   6.146 -    unsigned int idx;
   6.147 -    tx_shadow_entry_t *tx;
   6.148      unsigned long cpu_mask, flags;
   6.149      
   6.150      spin_lock_irqsave(&vif->domain->page_lock, flags);
   6.151 @@ -692,51 +736,10 @@ static void tx_skb_release(struct sk_buf
   6.152  
   6.153      skb_shinfo(skb)->nr_frags = 0; 
   6.154  
   6.155 -    /* This would mean that the guest OS has fiddled with our index. */
   6.156 -    if ( vif->shadow_ring->tx_cons != vif->net_ring->tx_cons )
   6.157 -        DPRINTK("Shadow and shared rings out of sync (%d/%d)\n",
   6.158 -                vif->shadow_ring->tx_cons, vif->net_ring->tx_cons);
   6.159 -
   6.160 -    /*
   6.161 -     * XXX This assumes that, per vif, SKBs are processed in-order!
   6.162 -     * Also assumes no concurrency. This is safe because each vif
   6.163 -     * maps to one NIC. This is executed in NIC interrupt code, so we have
   6.164 -     * mutual exclusion from do_IRQ().
   6.165 -     */
   6.166 -
   6.167 -    smp_wmb(); /* make sure any status updates occur before inc'ing tx_cons. */
   6.168 -
   6.169 -    /* Skip over a sequence of bad descriptors, plus the first good one. */
   6.170 -    do {
   6.171 -        idx = vif->shadow_ring->tx_cons;
   6.172 -        /* There must be at least one good descriptor outstanding. */
   6.173 -        if ( idx == vif->shadow_ring->tx_idx ) BUG();
   6.174 -        tx  = &vif->shadow_ring->tx_ring[idx];
   6.175 -        vif->shadow_ring->tx_cons = TX_RING_INC(idx);
   6.176 -        if ( vif->shadow_ring->tx_cons == vif->net_ring->tx_event ) send = 1;
   6.177 -    } while ( tx->status != RING_STATUS_OK );
   6.178 -
   6.179 -    /* Now skip over any more bad descriptors, up to the next good one. */
   6.180 -    do {
   6.181 -        idx = vif->shadow_ring->tx_cons;
   6.182 -        tx  = &vif->shadow_ring->tx_ring[idx];
   6.183 -        /* Carry on until we find a good descriptor, or reach scheduler idx. */
   6.184 -        if ( (idx == vif->shadow_ring->tx_idx) || 
   6.185 -             (tx->status == RING_STATUS_OK) )
   6.186 -            break;
   6.187 -        vif->shadow_ring->tx_cons = TX_RING_INC(idx);
   6.188 -        if ( vif->shadow_ring->tx_cons == vif->net_ring->tx_event ) send = 1;
   6.189 -    } while ( 1 );
   6.190 -
   6.191 -    /* Update shared consumer index to the new private value. */
   6.192 -    vif->net_ring->tx_cons = vif->shadow_ring->tx_cons;
   6.193 -
   6.194 -    /* Send a transmit event if requested. */
   6.195 -    if ( send )
   6.196 -    {
   6.197 -        cpu_mask = mark_guest_event(vif->domain, _EVENT_NET_TX);
   6.198 -        guest_event_notify(cpu_mask);
   6.199 -    }
   6.200 +    vif->shadow_ring->tx_transmitted_prod =
   6.201 +        TX_RING_INC(vif->shadow_ring->tx_transmitted_prod);
   6.202 +    cpu_mask = mark_hyp_event(vif->domain, _HYP_EVENT_NET);
   6.203 +    hyp_event_notify(cpu_mask);    
   6.204  
   6.205      put_vif(vif);
   6.206  }
   6.207 @@ -765,27 +768,22 @@ static void net_tx_action(unsigned long 
   6.208              continue;
   6.209          }
   6.210  
   6.211 +        if ( (skb = alloc_skb_nodata(GFP_ATOMIC)) == NULL )
   6.212 +        {
   6.213 +            printk("Out of memory in net_tx_action()!\n");
   6.214 +            add_to_net_schedule_list_tail(vif);
   6.215 +            put_vif(vif);
   6.216 +            break;
   6.217 +        }
   6.218 +        
   6.219          /* Pick an entry from the transmit queue. */
   6.220          tx = &vif->shadow_ring->tx_ring[vif->shadow_ring->tx_idx];
   6.221          vif->shadow_ring->tx_idx = TX_RING_INC(vif->shadow_ring->tx_idx);
   6.222          if ( vif->shadow_ring->tx_idx != vif->shadow_ring->tx_prod )
   6.223              add_to_net_schedule_list_tail(vif);
   6.224  
   6.225 -        /* Check the chosen entry is good. */
   6.226 -        if ( tx->status != RING_STATUS_OK ) 
   6.227 -        {
   6.228 -            put_vif(vif);
   6.229 -            continue;
   6.230 -        }
   6.231 +        ASSERT(tx->status == RING_STATUS_OK);
   6.232  
   6.233 -        if ( (skb = alloc_skb_nodata(GFP_ATOMIC)) == NULL )
   6.234 -        {
   6.235 -            printk("Out of memory in net_tx_action()!\n");
   6.236 -            tx->status = RING_STATUS_BAD_PAGE;
   6.237 -            put_vif(vif);
   6.238 -            break;
   6.239 -        }
   6.240 -        
   6.241          skb->destructor = tx_skb_release;
   6.242          
   6.243          skb->head = skb->data = tx->header;
   6.244 @@ -828,57 +826,6 @@ static inline void maybe_schedule_tx_act
   6.245  
   6.246  
   6.247  /*
   6.248 - * update_shared_ring(void)
   6.249 - * 
   6.250 - * This replaces flush_rx_queue as the guest event handler to move packets
   6.251 - * queued in the guest ring up to the guest.  Really, the packet is already
   6.252 - * there, it was page flipped in deliver_packet, but this moves the ring
   6.253 - * descriptor across from the shadow ring and increments the pointers.
   6.254 - */
   6.255 -
   6.256 -void update_shared_ring(void)
   6.257 -{
   6.258 -    rx_shadow_entry_t *rx;
   6.259 -    shared_info_t *s = current->shared_info;
   6.260 -    net_ring_t *net_ring;
   6.261 -    net_shadow_ring_t *shadow_ring;
   6.262 -    net_vif_t *vif;
   6.263 -    struct list_head *ent;
   6.264 -
   6.265 -    clear_bit(_HYP_EVENT_NET_RX, &current->hyp_events);
   6.266 -
   6.267 -    list_for_each(ent, &current->net_vifs)
   6.268 -    {
   6.269 -        vif = list_entry(ent, net_vif_t, dom_list);
   6.270 -        net_ring    = vif->net_ring;
   6.271 -        shadow_ring = vif->shadow_ring;
   6.272 -
   6.273 -        /* This would mean that the guest OS has fiddled with our index. */
   6.274 -        if ( shadow_ring->rx_cons != net_ring->rx_cons )
   6.275 -            DPRINTK("Shadow and shared rings out of sync (%d/%d)\n",
   6.276 -                    shadow_ring->rx_cons, net_ring->rx_cons);
   6.277 -
   6.278 -        while ( shadow_ring->rx_cons != shadow_ring->rx_idx )
   6.279 -        {
   6.280 -            rx = shadow_ring->rx_ring + shadow_ring->rx_cons;
   6.281 -            copy_to_user(net_ring->rx_ring + shadow_ring->rx_cons, rx, 
   6.282 -                         sizeof(rx_entry_t));
   6.283 -
   6.284 -            if ( rx->flush_count == tlb_flush_count[smp_processor_id()] )
   6.285 -                __flush_tlb();
   6.286 -
   6.287 -            smp_wmb(); /* copy descriptor before inc'ing rx_cons */
   6.288 -            shadow_ring->rx_cons = RX_RING_INC(shadow_ring->rx_cons);
   6.289 -
   6.290 -            if ( shadow_ring->rx_cons == net_ring->rx_event )
   6.291 -                set_bit(_EVENT_NET_RX, &s->events);
   6.292 -        }
   6.293 -        net_ring->rx_cons = shadow_ring->rx_cons;
   6.294 -    }
   6.295 -}
   6.296 -
   6.297 -
   6.298 -/*
   6.299   *	We need this ioctl for efficient implementation of the
   6.300   *	if_indextoname() function required by the IPv6 API.  Without
   6.301   *	it, we would have to search all the interfaces to find a
   6.302 @@ -1847,10 +1794,10 @@ long do_net_update(void)
   6.303      net_ring_t *net_ring;
   6.304      net_shadow_ring_t *shadow_ring;
   6.305      net_vif_t *current_vif;
   6.306 -    unsigned int i;
   6.307 +    unsigned int i, j;
   6.308      struct sk_buff *skb;
   6.309 -    tx_entry_t tx;
   6.310 -    rx_shadow_entry_t *rx;
   6.311 +    tx_req_entry_t tx;
   6.312 +    rx_req_entry_t rx;
   6.313      unsigned long pfn;
   6.314      struct pfn_info *page;
   6.315      unsigned long *g_pte;    
   6.316 @@ -1873,31 +1820,32 @@ long do_net_update(void)
   6.317           * new producer index, but take care not to catch up with our own
   6.318           * consumer index.
   6.319           */
   6.320 -        for ( i = shadow_ring->tx_prod; 
   6.321 -              (i != net_ring->tx_prod) && 
   6.322 -                  (((shadow_ring->tx_cons-i) & (TX_RING_SIZE-1)) != 1); 
   6.323 +        j = shadow_ring->tx_prod;
   6.324 +        for ( i = shadow_ring->tx_req_cons; 
   6.325 +              (i != net_ring->tx_req_prod) && 
   6.326 +                  (((shadow_ring->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1); 
   6.327                i = TX_RING_INC(i) )
   6.328          {
   6.329 -            if ( copy_from_user(&tx, net_ring->tx_ring+i, sizeof(tx)) )
   6.330 +            if ( copy_from_user(&tx, &net_ring->tx_ring[i].req, sizeof(tx)) )
   6.331              {
   6.332                  DPRINTK("Bad copy_from_user for tx net descriptor\n");
   6.333 -                shadow_ring->tx_ring[i].status = RING_STATUS_ERR_CFU;
   6.334 +                make_tx_response(current_vif, tx.id, RING_STATUS_ERR_CFU);
   6.335                  continue;
   6.336              }
   6.337  
   6.338 -            shadow_ring->tx_ring[i].size   = tx.size;
   6.339 -            shadow_ring->tx_ring[i].status = RING_STATUS_BAD_PAGE;
   6.340 -
   6.341 -            if ( tx.size < PKT_PROT_LEN )
   6.342 +            if ( (tx.size < PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) )
   6.343              {
   6.344 -                DPRINTK("Runt packet %d\n", tx.size);
   6.345 +                DPRINTK("Bad packet size: %d\n", tx.size);
   6.346 +                make_tx_response(current_vif, tx.id, RING_STATUS_BAD_PAGE);
   6.347                  continue; 
   6.348              }
   6.349  
   6.350 +            /* No crossing a page boundary as the payload mustn't fragment. */
   6.351              if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE ) 
   6.352              {
   6.353                  DPRINTK("tx.addr: %lx, size: %u, end: %lu\n", 
   6.354                          tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size);
   6.355 +                make_tx_response(current_vif, tx.id, RING_STATUS_BAD_PAGE);
   6.356                  continue;
   6.357              }
   6.358  
   6.359 @@ -1909,6 +1857,7 @@ long do_net_update(void)
   6.360              {
   6.361                  DPRINTK("Bad page frame\n");
   6.362                  spin_unlock_irq(&current->page_lock);
   6.363 +                make_tx_response(current_vif, tx.id, RING_STATUS_BAD_PAGE);
   6.364                  continue;
   6.365              }
   6.366              
   6.367 @@ -1917,45 +1866,61 @@ long do_net_update(void)
   6.368              protocol = __constant_htons(
   6.369                  init_tx_header(g_data, tx.size, the_dev));
   6.370              if ( protocol == 0 )
   6.371 +            {
   6.372 +                make_tx_response(current_vif, tx.id, RING_STATUS_BAD_PAGE);
   6.373                  goto tx_unmap_and_continue;
   6.374 +            }
   6.375  
   6.376              target = __net_get_target_vif(g_data, tx.size, current_vif->id);
   6.377  
   6.378              if ( target > VIF_PHYSICAL_INTERFACE )
   6.379              {
   6.380                  /* Local delivery */
   6.381 -                if ( (skb = dev_alloc_skb(tx.size)) == NULL ) 
   6.382 +                if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL )
   6.383 +                {
   6.384 +                    make_tx_response(current_vif, tx.id, RING_STATUS_BAD_PAGE);
   6.385                      goto tx_unmap_and_continue;
   6.386 -                
   6.387 -                skb->destructor = tx_skb_release;
   6.388 -                get_vif(current_vif);
   6.389 -
   6.390 -                shadow_ring->tx_ring[i].status = RING_STATUS_OK;
   6.391 +                }
   6.392  
   6.393                  skb->src_vif = current_vif->id;
   6.394                  skb->dst_vif = target;
   6.395 -                skb->protocol = protocol;
   6.396 -                
   6.397 +                skb->protocol = protocol;                
   6.398 +
   6.399 +                /*
   6.400 +                 * We don't need a well-formed skb as netif_rx will fill these
   6.401 +                 * fields in as necessary. All we actually need is the right
   6.402 +                 * page offset in skb->data, and the right length in skb->len.
   6.403 +                 * Note that the correct address/length *excludes* link header.
   6.404 +                 */
   6.405                  skb->head = (u8 *)map_domain_mem(
   6.406                      ((skb->pf - frame_table) << PAGE_SHIFT));
   6.407 -                skb->data = skb->head + 16;
   6.408 -                skb_reserve(skb,2);
   6.409 +                skb->data = skb->head + 18;
   6.410                  memcpy(skb->data, g_data, tx.size);
   6.411 -                skb->len = tx.size;
   6.412 +                skb->data += ETH_HLEN;
   6.413 +                skb->len = tx.size - ETH_HLEN;
   6.414                  unmap_domain_mem(skb->head);
   6.415 -                skb->data += ETH_HLEN;
   6.416 +
   6.417                  (void)netif_rx(skb);
   6.418 +
   6.419 +                make_tx_response(current_vif, tx.id, RING_STATUS_OK);
   6.420              }
   6.421              else if ( target == VIF_PHYSICAL_INTERFACE )
   6.422              {
   6.423 -                shadow_ring->tx_ring[i].header = 
   6.424 +                shadow_ring->tx_ring[j].id     = tx.id;
   6.425 +                shadow_ring->tx_ring[j].size   = tx.size;
   6.426 +                shadow_ring->tx_ring[j].status = RING_STATUS_OK;
   6.427 +                shadow_ring->tx_ring[j].header = 
   6.428                      kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
   6.429 -                if ( shadow_ring->tx_ring[i].header == NULL ) 
   6.430 +                if ( shadow_ring->tx_ring[j].header == NULL )
   6.431 +                { 
   6.432 +                    make_tx_response(current_vif, tx.id, RING_STATUS_OK);
   6.433                      goto tx_unmap_and_continue;
   6.434 -                memcpy(shadow_ring->tx_ring[i].header, g_data, PKT_PROT_LEN);
   6.435 -                shadow_ring->tx_ring[i].payload = tx.addr + PKT_PROT_LEN;
   6.436 -                shadow_ring->tx_ring[i].status = RING_STATUS_OK;
   6.437 +                }
   6.438 +
   6.439 +                memcpy(shadow_ring->tx_ring[j].header, g_data, PKT_PROT_LEN);
   6.440 +                shadow_ring->tx_ring[j].payload = tx.addr + PKT_PROT_LEN;
   6.441                  get_page_tot(page);
   6.442 +                j = TX_RING_INC(j);
   6.443              }
   6.444  
   6.445          tx_unmap_and_continue:
   6.446 @@ -1963,10 +1928,12 @@ long do_net_update(void)
   6.447              spin_unlock_irq(&current->page_lock);
   6.448          }
   6.449  
   6.450 -        if ( shadow_ring->tx_prod != i )
   6.451 +        shadow_ring->tx_req_cons = i;
   6.452 +
   6.453 +        if ( shadow_ring->tx_prod != j )
   6.454          {
   6.455              smp_mb(); /* Let other CPUs see new descriptors first. */
   6.456 -            shadow_ring->tx_prod = i;
   6.457 +            shadow_ring->tx_prod = j;
   6.458              add_to_net_schedule_list_tail(current_vif);
   6.459              maybe_schedule_tx_action();
   6.460          }
   6.461 @@ -1980,29 +1947,23 @@ long do_net_update(void)
   6.462           * new producer index, but take care not to catch up with our own
   6.463           * consumer index.
   6.464           */
   6.465 -        for ( i = shadow_ring->rx_prod; 
   6.466 -              (i != net_ring->rx_prod) && 
   6.467 -                  (((shadow_ring->rx_cons-i) & (RX_RING_SIZE-1)) != 1); 
   6.468 +        j = shadow_ring->rx_prod;
   6.469 +        for ( i = shadow_ring->rx_req_cons; 
   6.470 +              (i != net_ring->rx_req_prod) && 
   6.471 +                  (((shadow_ring->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1); 
   6.472                i = RX_RING_INC(i) )
   6.473          {
   6.474 -            /* 
   6.475 -             * This copy assumes that rx_shadow_entry_t is an extension of 
   6.476 -             * rx_net_entry_t extra fields must be tacked on to the end.
   6.477 -             */
   6.478 -            if ( copy_from_user(shadow_ring->rx_ring+i, net_ring->rx_ring+i, 
   6.479 -                                sizeof (rx_entry_t) ) )
   6.480 +            if ( copy_from_user(&rx, &net_ring->rx_ring[i].req, sizeof(rx)) )
   6.481              {
   6.482 -                DPRINTK("Bad copy_from_user for rx ring\n");
   6.483 -                shadow_ring->rx_ring[i].status = RING_STATUS_ERR_CFU;
   6.484 +                DPRINTK("Bad copy_from_user for rx net descriptor\n");
   6.485 +                make_rx_response(current_vif, 
   6.486 +                                 rx.id, 0, RING_STATUS_ERR_CFU, 0);
   6.487                  continue;
   6.488 -            } 
   6.489 +            }
   6.490  
   6.491 -            rx = shadow_ring->rx_ring + i;
   6.492 -            pfn = rx->addr >> PAGE_SHIFT;
   6.493 +            pfn = rx.addr >> PAGE_SHIFT;
   6.494              page = frame_table + pfn;
   6.495              
   6.496 -            shadow_ring->rx_ring[i].status = RING_STATUS_BAD_PAGE;
   6.497 -            
   6.498              spin_lock_irq(&current->page_lock);
   6.499              if ( (pfn >= max_page) || 
   6.500                   (page->flags != (PGT_l1_page_table | current->domain)) ) 
   6.501 @@ -2010,14 +1971,18 @@ long do_net_update(void)
   6.502                  DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n",
   6.503                          current->domain, pfn, max_page, page->flags);
   6.504                  spin_unlock_irq(&current->page_lock);
   6.505 +                make_rx_response(current_vif, 
   6.506 +                                 rx.id, 0, RING_STATUS_BAD_PAGE, 0);
   6.507                  continue;
   6.508              }
   6.509              
   6.510 -            g_pte = map_domain_mem(rx->addr);
   6.511 +            g_pte = map_domain_mem(rx.addr);
   6.512              
   6.513              if ( !(*g_pte & _PAGE_PRESENT) )
   6.514              {
   6.515 -                DPRINTK("Inavlid PTE passed down (not present)\n");
   6.516 +                DPRINTK("Invalid PTE passed down (not present)\n");
   6.517 +                make_rx_response(current_vif, 
   6.518 +                                 rx.id, 0, RING_STATUS_BAD_PAGE, 0);
   6.519                  goto rx_unmap_and_continue;
   6.520              }
   6.521              
   6.522 @@ -2027,25 +1992,32 @@ long do_net_update(void)
   6.523              {
   6.524  		DPRINTK("RX page mapped multple times (%d/%d/%08x)\n",
   6.525        		page->type_count, page->tot_count, page->flags);
   6.526 -                
   6.527 +                make_rx_response(current_vif, 
   6.528 +                                 rx.id, 0, RING_STATUS_BAD_PAGE, 0);
   6.529                  goto rx_unmap_and_continue;
   6.530              }
   6.531              
   6.532              /* The pte they passed was good, so take it away from them. */
   6.533 -            shadow_ring->rx_ring[i].status = RING_STATUS_OK;
   6.534              *g_pte &= ~_PAGE_PRESENT;
   6.535              page->flags = (page->flags & ~PG_type_mask) | PGT_net_rx_buf;
   6.536 -            rx->flush_count = tlb_flush_count[smp_processor_id()];
   6.537 +            shadow_ring->rx_ring[j].id          = rx.id;
   6.538 +            shadow_ring->rx_ring[j].addr        = rx.addr;
   6.539 +            shadow_ring->rx_ring[j].status      = RING_STATUS_OK;
   6.540 +            shadow_ring->rx_ring[j].flush_count = 
   6.541 +                tlb_flush_count[smp_processor_id()];
   6.542 +            j = RX_RING_INC(j);
   6.543              
   6.544          rx_unmap_and_continue:
   6.545              unmap_domain_mem(g_pte);
   6.546              spin_unlock_irq(&current->page_lock);
   6.547          }
   6.548  
   6.549 -        if ( shadow_ring->rx_prod != i )
   6.550 +        shadow_ring->rx_req_cons = i;
   6.551 +
   6.552 +        if ( shadow_ring->rx_prod != j )
   6.553          {
   6.554              smp_mb(); /* Let other CPUs see new descriptors first. */
   6.555 -            shadow_ring->rx_prod = i;
   6.556 +            shadow_ring->rx_prod = j;
   6.557          }
   6.558      }
   6.559  
   6.560 @@ -2053,6 +2025,58 @@ long do_net_update(void)
   6.561  }
   6.562  
   6.563  
   6.564 +static void make_tx_response(net_vif_t *vif, 
   6.565 +                             unsigned long id, 
   6.566 +                             unsigned char st)
   6.567 +{
   6.568 +    unsigned long flags;
   6.569 +    net_shadow_ring_t *shadow = vif->shadow_ring;
   6.570 +    unsigned int pos;
   6.571 +    tx_resp_entry_t *resp, privresp;
   6.572 +
   6.573 +    /* Place on the response ring for the relevant domain. */ 
   6.574 +    local_irq_save(flags);
   6.575 +    pos  = shadow->tx_resp_prod;
   6.576 +    resp = &vif->net_ring->tx_ring[pos].resp;
   6.577 +    privresp.id     = id;
   6.578 +    privresp.status = st;
   6.579 +    copy_to_user(resp, &privresp, sizeof(privresp));
   6.580 +    pos = TX_RING_INC(pos);
   6.581 +    shadow->tx_resp_prod = vif->net_ring->tx_resp_prod = pos;
   6.582 +    if ( pos == vif->net_ring->rx_event )
   6.583 +        set_bit(_EVENT_NET_TX, &current->shared_info->events);
   6.584 +    local_irq_restore(flags);
   6.585 +}
   6.586 +
   6.587 +
   6.588 +static void make_rx_response(net_vif_t     *vif, 
   6.589 +                             unsigned long  id, 
   6.590 +                             unsigned short size,
   6.591 +                             unsigned char  st,
   6.592 +                             unsigned char  off)
   6.593 +{
   6.594 +    unsigned long flags;
   6.595 +    net_shadow_ring_t *shadow = vif->shadow_ring;
   6.596 +    unsigned int pos;
   6.597 +    rx_resp_entry_t *resp, privresp;
   6.598 +
   6.599 +    /* Place on the response ring for the relevant domain. */ 
   6.600 +    local_irq_save(flags);
   6.601 +    pos  = shadow->rx_resp_prod;
   6.602 +    resp = &vif->net_ring->rx_ring[pos].resp;
   6.603 +    privresp.id     = id;
   6.604 +    privresp.size   = size;
   6.605 +    privresp.status = st;
   6.606 +    privresp.offset = off;
   6.607 +    copy_to_user(resp, &privresp, sizeof(privresp));
   6.608 +    pos = RX_RING_INC(pos);
   6.609 +    shadow->rx_resp_prod = vif->net_ring->rx_resp_prod = pos;
   6.610 +    if ( pos == vif->net_ring->rx_event )
   6.611 +        set_bit(_EVENT_NET_RX, &current->shared_info->events);
   6.612 +    local_irq_restore(flags);
   6.613 +}
   6.614 +
   6.615 +
   6.616  int setup_network_devices(void)
   6.617  {
   6.618      int ret;
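
make_tx_response() and make_rx_response() above only raise a guest event when the response producer reaches the ring's tx_event/rx_event mark, and the guest driver (network_tx_buf_gc(), in the file below) keeps pushing that mark ahead of the producer to batch notifications. Here is a toy model of that event-moderation loop, with an invented entry count and the "move the mark half the remaining distance ahead" policy taken from the guest code:

    #include <stdio.h>

    #define RING_SIZE 256
    #define RING_INC(_i)    (((_i) + 1) & (RING_SIZE - 1))
    #define RING_ADD(_i,_j) (((_i) + (_j)) & (RING_SIZE - 1))

    int main(void)
    {
        unsigned int resp_prod = 0;     /* advanced by the "hypervisor" */
        unsigned int event     = 1;     /* guest asks to be notified at this index */
        unsigned int entries   = 16;    /* outstanding requests (arbitrary) */
        unsigned int notifications = 0;

        /* Hypervisor side: post responses, raising an event only when the
         * producer hits the event mark (cf. make_tx_response()). */
        for (unsigned int n = 0; n < entries; n++) {
            resp_prod = RING_INC(resp_prod);
            if (resp_prod == event) {
                notifications++;
                /* Guest side (cf. network_tx_buf_gc()): garbage-collect, then
                 * move the mark half of the remaining distance ahead. */
                unsigned int outstanding = entries - n - 1;
                event = RING_ADD(resp_prod, (outstanding >> 1) + 1);
            }
        }
        printf("%u responses delivered with %u notifications\n",
               entries, notifications);
        return 0;
    }
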
     7.1 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/network/network.c	Thu Apr 17 12:26:14 2003 +0000
     7.2 +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/network/network.c	Thu Apr 17 17:12:21 2003 +0000
     7.3 @@ -3,7 +3,7 @@
     7.4   * 
     7.5   * Virtual network driver for XenoLinux.
     7.6   * 
     7.7 - * Copyright (c) 2002, K A Fraser
     7.8 + * Copyright (c) 2002-2003, K A Fraser
     7.9   */
    7.10  
    7.11  #include <linux/config.h>
    7.12 @@ -47,21 +47,14 @@ static void cleanup_module(void);
    7.13  
    7.14  static struct list_head dev_list;
    7.15  
    7.16 -/*
    7.17 - * RX RING:   RX_IDX <= rx_cons <= rx_prod
    7.18 - * TX RING:   TX_IDX <= tx_cons <= tx_prod
    7.19 - * (*_IDX allocated privately here, *_cons & *_prod shared with hypervisor)
    7.20 - */
    7.21  struct net_private
    7.22  {
    7.23      struct list_head list;
    7.24      struct net_device *dev;
    7.25  
    7.26      struct net_device_stats stats;
    7.27 -    struct sk_buff **tx_skb_ring;
    7.28 -    struct sk_buff **rx_skb_ring;
    7.29      atomic_t tx_entries;
    7.30 -    unsigned int rx_idx, tx_idx, tx_full;
    7.31 +    unsigned int rx_resp_cons, tx_resp_cons, tx_full;
    7.32      net_ring_t *net_ring;
    7.33      spinlock_t tx_lock;
    7.34  };
    7.35 @@ -71,10 +64,10 @@ static void dbg_network_int(int irq, voi
    7.36  {
    7.37      struct net_device *dev = (struct net_device *)dev_id;
    7.38      struct net_private *np = dev->priv;
    7.39 -    printk(KERN_ALERT "tx_full = %d, tx_entries = %d, tx_idx = %d,"
    7.40 -           " tx_cons = %d, tx_prod = %d, tx_event = %d, state=%d\n",
    7.41 -           np->tx_full, atomic_read(&np->tx_entries), np->tx_idx, 
    7.42 -           np->net_ring->tx_cons, np->net_ring->tx_prod, 
    7.43 +    printk(KERN_ALERT "tx_full = %d, tx_entries = %d, tx_resp_cons = %d,"
    7.44 +           " tx_req_prod = %d, tx_resp_prod = %d, tx_event = %d, state=%d\n",
    7.45 +           np->tx_full, atomic_read(&np->tx_entries), np->tx_resp_cons, 
    7.46 +           np->net_ring->tx_req_prod, np->net_ring->tx_resp_prod, 
    7.47             np->net_ring->tx_event,
    7.48             test_bit(__LINK_STATE_XOFF, &dev->state));
    7.49  }
    7.50 @@ -85,29 +78,17 @@ static int network_open(struct net_devic
    7.51      struct net_private *np = dev->priv;
    7.52      int error = 0;
    7.53  
    7.54 -    np->rx_idx = np->tx_idx = np->tx_full = 0;
    7.55 -
    7.56 +    np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
    7.57      memset(&np->stats, 0, sizeof(np->stats));
    7.58 -
    7.59      spin_lock_init(&np->tx_lock);
    7.60 -
    7.61      atomic_set(&np->tx_entries, 0);
    7.62 +    memset(np->net_ring, 0, sizeof(*np->net_ring));
    7.63  
    7.64 -    np->net_ring->tx_prod = np->net_ring->tx_cons = np->net_ring->tx_event = 0;
    7.65 -    np->net_ring->rx_prod = np->net_ring->rx_cons = np->net_ring->rx_event = 0;
    7.66 -    np->net_ring->tx_ring = NULL;
    7.67 -    np->net_ring->rx_ring = NULL;
    7.68 -
    7.69 -    np->tx_skb_ring = kmalloc(TX_RING_SIZE * sizeof(struct sk_buff *),
    7.70 -                              GFP_KERNEL);
    7.71 -    np->rx_skb_ring = kmalloc(RX_RING_SIZE * sizeof(struct sk_buff *),
    7.72 -                              GFP_KERNEL);
    7.73      np->net_ring->tx_ring = kmalloc(TX_RING_SIZE * sizeof(tx_entry_t), 
    7.74                                      GFP_KERNEL);
    7.75      np->net_ring->rx_ring = kmalloc(RX_RING_SIZE * sizeof(rx_entry_t), 
    7.76                                      GFP_KERNEL);
    7.77 -    if ( (np->tx_skb_ring == NULL) || (np->rx_skb_ring == NULL) ||
    7.78 -         (np->net_ring->tx_ring == NULL) || (np->net_ring->rx_ring == NULL) )
    7.79 +    if ( (np->net_ring->tx_ring == NULL) || (np->net_ring->rx_ring == NULL) )
    7.80      {
    7.81          printk(KERN_WARNING "%s; Could not allocate ring memory\n", dev->name);
    7.82          error = -ENOBUFS;
    7.83 @@ -156,8 +137,6 @@ static int network_open(struct net_devic
    7.84   fail:
    7.85      if ( np->net_ring->rx_ring ) kfree(np->net_ring->rx_ring);
    7.86      if ( np->net_ring->tx_ring ) kfree(np->net_ring->tx_ring);
    7.87 -    if ( np->rx_skb_ring ) kfree(np->rx_skb_ring);
    7.88 -    if ( np->tx_skb_ring ) kfree(np->tx_skb_ring);
    7.89      kfree(np);
    7.90      return error;
    7.91  }
    7.92 @@ -169,28 +148,29 @@ static void network_tx_buf_gc(struct net
    7.93      struct net_private *np = dev->priv;
    7.94      struct sk_buff *skb;
    7.95      unsigned long flags;
    7.96 -    unsigned int cons;
    7.97 +    unsigned int prod;
    7.98 +    tx_entry_t *tx_ring = np->net_ring->tx_ring;
    7.99  
   7.100      spin_lock_irqsave(&np->tx_lock, flags);
   7.101  
   7.102      do {
   7.103 -        cons = np->net_ring->tx_cons;
   7.104 +        prod = np->net_ring->tx_resp_prod;
   7.105  
   7.106 -        for ( i = np->tx_idx; i != cons; i = TX_RING_INC(i) )
   7.107 +        for ( i = np->tx_resp_cons; i != prod; i = TX_RING_INC(i) )
   7.108          {
   7.109 -            skb = np->tx_skb_ring[i];
   7.110 +            skb = (struct sk_buff *)tx_ring[i].resp.id;
   7.111              dev_kfree_skb_any(skb);
   7.112              atomic_dec(&np->tx_entries);
   7.113          }
   7.114          
   7.115 -        np->tx_idx = i;
   7.116 +        np->tx_resp_cons = prod;
   7.117          
   7.118          /* Set a new event, then check for race with update of tx_cons. */
   7.119          np->net_ring->tx_event =
   7.120 -            TX_RING_ADD(cons, (atomic_read(&np->tx_entries)>>1) + 1);
   7.121 +            TX_RING_ADD(prod, (atomic_read(&np->tx_entries)>>1) + 1);
   7.122          smp_mb();
   7.123      }
   7.124 -    while ( cons != np->net_ring->tx_cons );
   7.125 +    while ( prod != np->net_ring->tx_resp_prod );
   7.126  
   7.127      if ( np->tx_full && (atomic_read(&np->tx_entries) < TX_MAX_ENTRIES) )
   7.128      {
   7.129 @@ -201,21 +181,13 @@ static void network_tx_buf_gc(struct net
   7.130      spin_unlock_irqrestore(&np->tx_lock, flags);
   7.131  }
   7.132  
   7.133 -inline unsigned long get_ppte(unsigned long addr)
   7.134 +inline pte_t *get_ppte(void *addr)
   7.135  {
   7.136 -    unsigned long ppte;
   7.137 -    pgd_t *pgd; pmd_t *pmd; pte_t *ptep;
   7.138 -    pgd = pgd_offset_k(addr);
   7.139 -
   7.140 -    if ( pgd_none(*pgd) || pgd_bad(*pgd) ) BUG();
   7.141 -        
   7.142 -    pmd = pmd_offset(pgd, addr);
   7.143 -    if ( pmd_none(*pmd) || pmd_bad(*pmd) ) BUG(); 
   7.144 -        
   7.145 -    ptep = pte_offset(pmd, addr);
   7.146 -    ppte = (unsigned long)phys_to_machine(virt_to_phys(ptep));
   7.147 -
   7.148 -    return ppte;
   7.149 +    pgd_t *pgd; pmd_t *pmd; pte_t *pte;
   7.150 +    pgd = pgd_offset_k(   (unsigned long)addr);
   7.151 +    pmd = pmd_offset(pgd, (unsigned long)addr);
   7.152 +    pte = pte_offset(pmd, (unsigned long)addr);
   7.153 +    return pte;
   7.154  }
   7.155  
   7.156  static void network_alloc_rx_buffers(struct net_device *dev)
   7.157 @@ -223,21 +195,21 @@ static void network_alloc_rx_buffers(str
   7.158      unsigned int i;
   7.159      struct net_private *np = dev->priv;
   7.160      struct sk_buff *skb;
   7.161 -    unsigned int end = RX_RING_ADD(np->rx_idx, RX_MAX_ENTRIES);    
   7.162 +    unsigned int end = RX_RING_ADD(np->rx_resp_cons, RX_MAX_ENTRIES);    
   7.163  
   7.164 -    for ( i = np->net_ring->rx_prod; i != end; i = RX_RING_INC(i) )
   7.165 +    for ( i = np->net_ring->rx_req_prod; i != end; i = RX_RING_INC(i) )
   7.166      {
   7.167          skb = dev_alloc_skb(RX_BUF_SIZE);
   7.168          if ( skb == NULL ) break;
   7.169          skb->dev = dev;
   7.170 -        np->rx_skb_ring[i] = skb;
   7.171 -        np->net_ring->rx_ring[i].addr = get_ppte((unsigned long)skb->head); 
   7.172 -        np->net_ring->rx_ring[i].size = RX_BUF_SIZE - 16; /* arbitrary */
   7.173 +        np->net_ring->rx_ring[i].req.id   = (unsigned long)skb;
   7.174 +        np->net_ring->rx_ring[i].req.addr = 
   7.175 +            virt_to_machine(get_ppte(skb->head));
   7.176      }
   7.177  
   7.178 -    np->net_ring->rx_prod = i;
   7.179 +    np->net_ring->rx_req_prod = i;
   7.180  
   7.181 -    np->net_ring->rx_event = RX_RING_INC(np->rx_idx);
   7.182 +    np->net_ring->rx_event = RX_RING_INC(np->rx_resp_cons);
   7.183  
   7.184      /*
   7.185       * We may have allocated buffers which have entries outstanding in
   7.186 @@ -254,9 +226,11 @@ static void network_free_rx_buffers(stru
   7.187      struct net_private *np = dev->priv;
   7.188      struct sk_buff *skb;    
   7.189  
   7.190 -    for ( i = np->rx_idx; i != np->net_ring->rx_prod; i = RX_RING_INC(i) )
   7.191 +    for ( i  = np->rx_resp_cons; 
   7.192 +          i != np->net_ring->rx_req_prod; 
   7.193 +          i  = RX_RING_INC(i) )
   7.194      {
   7.195 -        skb = np->rx_skb_ring[i];
   7.196 +        skb = (struct sk_buff *)np->net_ring->rx_ring[i].req.id;
   7.197          dev_kfree_skb_any(skb);
   7.198      }
   7.199  }
   7.200 @@ -272,7 +246,7 @@ static int network_start_xmit(struct sk_
   7.201          netif_stop_queue(dev);
   7.202          return -ENOBUFS;
   7.203      }
   7.204 -    i = np->net_ring->tx_prod;
   7.205 +    i = np->net_ring->tx_req_prod;
   7.206  
   7.207      if ( (((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >= PAGE_SIZE )
   7.208      {
   7.209 @@ -284,11 +258,11 @@ static int network_start_xmit(struct sk_
   7.210          skb = new_skb;
   7.211      }   
   7.212      
   7.213 -    np->tx_skb_ring[i] = skb;
   7.214 -    np->net_ring->tx_ring[i].addr =
   7.215 -        (unsigned long)phys_to_machine(virt_to_phys(skb->data));
   7.216 -    np->net_ring->tx_ring[i].size = skb->len;
   7.217 -    np->net_ring->tx_prod = TX_RING_INC(i);
   7.218 +    np->net_ring->tx_ring[i].req.id   = (unsigned long)skb;
   7.219 +    np->net_ring->tx_ring[i].req.addr =
   7.220 +        phys_to_machine(virt_to_phys(skb->data));
   7.221 +    np->net_ring->tx_ring[i].req.size = skb->len;
   7.222 +    np->net_ring->tx_req_prod = TX_RING_INC(i);
   7.223      atomic_inc(&np->tx_entries);
   7.224  
   7.225      np->stats.tx_bytes += skb->len;
   7.226 @@ -316,13 +290,15 @@ static void network_rx_int(int irq, void
   7.227      struct net_device *dev = (struct net_device *)dev_id;
   7.228      struct net_private *np = dev->priv;
   7.229      struct sk_buff *skb;
   7.230 -    rx_entry_t *rx;
   7.231 +    rx_resp_entry_t *rx;
   7.232      
   7.233   again:
   7.234 -    for ( i = np->rx_idx; i != np->net_ring->rx_cons; i = RX_RING_INC(i) )
   7.235 +    for ( i  = np->rx_resp_cons; 
   7.236 +          i != np->net_ring->rx_resp_prod; 
   7.237 +          i  = RX_RING_INC(i) )
   7.238      {
   7.239 -        rx  = &np->net_ring->rx_ring[i];
   7.240 -        skb = np->rx_skb_ring[i];
   7.241 +        rx  = &np->net_ring->rx_ring[i].resp;
   7.242 +        skb = (struct sk_buff *)rx->id;
   7.243  
   7.244          if ( rx->status != RING_STATUS_OK )
   7.245          {
   7.246 @@ -341,8 +317,7 @@ static void network_rx_int(int irq, void
   7.247          skb_shinfo(skb)->frag_list = NULL;
   7.248                                  
   7.249          phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
   7.250 -            (*(unsigned long *)phys_to_virt(machine_to_phys(rx->addr))
   7.251 -                ) >> PAGE_SHIFT;
   7.252 +            (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;
   7.253  
   7.254          if ( rx->offset < 16 )
   7.255          {
   7.256 @@ -353,23 +328,23 @@ static void network_rx_int(int irq, void
   7.257          
   7.258          skb_reserve(skb, rx->offset - 16);
   7.259  
   7.260 -        skb_put(skb, np->net_ring->rx_ring[i].size);
   7.261 +        skb_put(skb, rx->size);
   7.262          skb->protocol = eth_type_trans(skb, dev);
   7.263  
   7.264          np->stats.rx_packets++;
   7.265  
   7.266 -        np->stats.rx_bytes += np->net_ring->rx_ring[i].size;
   7.267 +        np->stats.rx_bytes += rx->size;
   7.268          netif_rx(skb);
   7.269          dev->last_rx = jiffies;
   7.270      }
   7.271  
   7.272 -    np->rx_idx = i;
   7.273 +    np->rx_resp_cons = i;
   7.274  
   7.275      network_alloc_rx_buffers(dev);
   7.276      
   7.277      /* Deal with hypervisor racing our resetting of rx_event. */
   7.278      smp_mb();
   7.279 -    if ( np->net_ring->rx_cons != i ) goto again;
   7.280 +    if ( np->net_ring->rx_resp_prod != i ) goto again;
   7.281  }
   7.282  
   7.283  
   7.284 @@ -382,8 +357,6 @@ static void network_tx_int(int irq, void
   7.285  
   7.286  int network_close(struct net_device *dev)
   7.287  {
   7.288 -    struct net_private *np = dev->priv;
   7.289 -
   7.290      netif_stop_queue(dev);
   7.291  
   7.292      free_irq(NET_RX_IRQ, dev);
   7.293 @@ -401,9 +374,6 @@ int network_close(struct net_device *dev
   7.294      kfree(np->net_ring->tx_ring);
   7.295  #endif
   7.296  
   7.297 -    kfree(np->rx_skb_ring);
   7.298 -    kfree(np->tx_skb_ring);
   7.299 -
   7.300      MOD_DEC_USE_COUNT;
   7.301  
   7.302      return 0;
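
With ids carried in the shared descriptors, the guest driver above can drop its private tx_skb_ring/rx_skb_ring arrays: it stores the sk_buff pointer directly in req.id and casts it back when the matching response appears. A tiny illustration of that pointer-in-an-opaque-id pattern, using a made-up buffer_t in place of struct sk_buff and assuming (as on the i386 target here) that unsigned long is wide enough to hold a pointer:

    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for struct sk_buff. */
    typedef struct buffer { int len; char data[64]; } buffer_t;

    typedef struct { unsigned long id; } req_t;
    typedef struct { unsigned long id; unsigned short size; } resp_t;

    int main(void)
    {
        buffer_t *buf = malloc(sizeof(*buf));
        buf->len = 60;

        /* Request path: stash the pointer in the opaque id field
         * (assumes sizeof(unsigned long) >= sizeof(void *)). */
        req_t req = { .id = (unsigned long)buf };

        /* Response path: the id comes back untouched, so the driver recovers
         * the buffer without keeping a separate skb ring. */
        resp_t resp = { .id = req.id, .size = 60 };
        buffer_t *done = (buffer_t *)resp.id;
        printf("completed buffer of %d bytes (reported size %u)\n",
               done->len, (unsigned)resp.size);

        free(done);
        return 0;
    }
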