ia64/xen-unstable

changeset 951:87fcc882f083

bitkeeper revision 1.607 (3fb80bcdDsC2bacgLhXMLo9Gck9Icg)

dev.c, netdevice.h, interrupt.h, xen_block.c, memory.c, mm.c:
Clean up locking in blkdev and net code. Many locks are no longer ever held in interrupt context.
author kaf24@scramble.cl.cam.ac.uk
date Sun Nov 16 23:44:13 2003 +0000 (2003-11-16)
parents 7c5471a2cb87
children 6c5d838ecf1e
files xen/arch/i386/mm.c xen/common/memory.c xen/drivers/block/xen_block.c xen/include/xeno/interrupt.h xen/include/xeno/netdevice.h xen/net/dev.c
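
Nearly every hunk below applies the same substitution: the spin_lock_irqsave()/spin_unlock_irqrestore() pairs around p->page_lock, p->blk_ring_lock, vif->tx_lock and vif->rx_lock become plain spin_lock()/spin_unlock(), because none of those locks is acquired from interrupt context any more; the interrupt-time work is deferred to softirqs instead. A minimal sketch of the substitution, using only primitives that appear in the hunks below (illustrative fragment, not code from the tree):

    /* Old form: required when an interrupt handler on the same CPU may
     * also take the lock; local interrupts must be disabled so that the
     * handler cannot spin on a lock its own CPU already holds. */
    unsigned long flags;
    spin_lock_irqsave(&p->page_lock, flags);
    /* ... critical section ... */
    spin_unlock_irqrestore(&p->page_lock, flags);

    /* New form: every acquirer now runs outside interrupt context, so
     * interrupts may stay enabled while the lock is held. */
    spin_lock(&p->page_lock);
    /* ... critical section ... */
    spin_unlock(&p->page_lock);

The payoff is that interrupts stay enabled across these critical sections; what the locks protect is unchanged.
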
line diff
     1.1 --- a/xen/arch/i386/mm.c	Sun Nov 16 18:50:57 2003 +0000
     1.2 +++ b/xen/arch/i386/mm.c	Sun Nov 16 23:44:13 2003 +0000
     1.3 @@ -213,12 +213,12 @@ long set_gdt(struct task_struct *p,
     1.4  {
     1.5      /* NB. There are 512 8-byte entries per GDT page. */
     1.6      unsigned int i, j, nr_pages = (entries + 511) / 512;
     1.7 -    unsigned long pfn, *gdt_page, flags;
     1.8 +    unsigned long pfn, *gdt_page;
     1.9      long ret = -EINVAL;
    1.10      struct pfn_info *page;
    1.11      struct desc_struct *vgdt;
    1.12  
    1.13 -    spin_lock_irqsave(&p->page_lock, flags);
    1.14 +    spin_lock(&p->page_lock);
    1.15  
    1.16      /* Check the new GDT. */
    1.17      for ( i = 0; i < nr_pages; i++ )
    1.18 @@ -284,7 +284,7 @@ long set_gdt(struct task_struct *p,
    1.19      ret = 0; /* success */
    1.20  
    1.21   out:
    1.22 -    spin_unlock_irqrestore(&p->page_lock, flags);
    1.23 +    spin_unlock(&p->page_lock);
    1.24      return ret;
    1.25  }
    1.26  
    1.27 @@ -314,14 +314,14 @@ long do_set_gdt(unsigned long *frame_lis
    1.28  long do_update_descriptor(
    1.29      unsigned long pa, unsigned long word1, unsigned long word2)
    1.30  {
    1.31 -    unsigned long *gdt_pent, flags, pfn = pa >> PAGE_SHIFT;
    1.32 +    unsigned long *gdt_pent, pfn = pa >> PAGE_SHIFT;
    1.33      struct pfn_info *page;
    1.34      long ret = -EINVAL;
    1.35  
    1.36      if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(word1, word2) )
    1.37          return -EINVAL;
    1.38  
    1.39 -    spin_lock_irqsave(&current->page_lock, flags);
    1.40 +    spin_lock(&current->page_lock);
    1.41  
    1.42      page = frame_table + pfn;
    1.43      if ( (page->flags & PG_domain_mask) != current->domain )
    1.44 @@ -353,6 +353,6 @@ long do_update_descriptor(
    1.45      ret = 0; /* success */
    1.46  
    1.47   out:
    1.48 -    spin_unlock_irqrestore(&current->page_lock, flags);
    1.49 +    spin_unlock(&current->page_lock);
    1.50      return ret;
    1.51  }
     2.1 --- a/xen/common/memory.c	Sun Nov 16 18:50:57 2003 +0000
     2.2 +++ b/xen/common/memory.c	Sun Nov 16 23:44:13 2003 +0000
     2.3 @@ -132,6 +132,7 @@
     2.4  #include <xeno/sched.h>
     2.5  #include <xeno/errno.h>
     2.6  #include <xeno/perfc.h>
     2.7 +#include <xeno/interrupt.h>
     2.8  #include <asm/page.h>
     2.9  #include <asm/flushtlb.h>
    2.10  #include <asm/io.h>
    2.11 @@ -253,11 +254,15 @@ int map_ldt_shadow_page(unsigned int off
    2.12  {
    2.13      struct task_struct *p = current;
    2.14      unsigned long addr = p->mm.ldt_base + (off << PAGE_SHIFT);
    2.15 -    unsigned long l1e, *ldt_page, flags;
    2.16 +    unsigned long l1e, *ldt_page;
    2.17      struct pfn_info *page;
    2.18      int i, ret = -1;
    2.19  
    2.20 -    spin_lock_irqsave(&p->page_lock, flags);
    2.21 +    /* We cannot take a page_lock in interrupt context. */
    2.22 +    if ( in_interrupt() )
    2.23 +        BUG();
    2.24 +
    2.25 +    spin_lock(&p->page_lock);
    2.26  
    2.27      __get_user(l1e, (unsigned long *)(linear_pg_table+(addr>>PAGE_SHIFT)));
    2.28      if ( unlikely(!(l1e & _PAGE_PRESENT)) )
    2.29 @@ -294,7 +299,7 @@ int map_ldt_shadow_page(unsigned int off
    2.30      ret = 0;
    2.31  
    2.32   out:
    2.33 -    spin_unlock_irqrestore(&p->page_lock, flags);
    2.34 +    spin_unlock(&p->page_lock);
    2.35      return ret;
    2.36  }
    2.37  
    2.38 @@ -865,7 +870,7 @@ int do_mmu_update(mmu_update_t *ureqs, i
    2.39  
    2.40          err = 1;
    2.41  
    2.42 -        spin_lock_irq(&current->page_lock);
    2.43 +        spin_lock(&current->page_lock);
    2.44  
    2.45          /* Get the page-frame number that a non-extended command references. */
    2.46          if ( (cmd == MMU_NORMAL_PT_UPDATE) || 
    2.47 @@ -974,7 +979,7 @@ int do_mmu_update(mmu_update_t *ureqs, i
    2.48          }
    2.49  
    2.50      unlock:
    2.51 -        spin_unlock_irq(&current->page_lock);
    2.52 +        spin_unlock(&current->page_lock);
    2.53  
    2.54          if ( unlikely(err) )
    2.55          {
    2.56 @@ -1015,7 +1020,7 @@ int do_update_va_mapping(unsigned long p
    2.57      if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
    2.58          goto out;
    2.59  
    2.60 -    spin_lock_irq(&p->page_lock);
    2.61 +    spin_lock(&p->page_lock);
    2.62  
    2.63      /* Check that the VA's page-directory entry is present.. */
    2.64      if ( unlikely((err = __get_user(_x, (unsigned long *)
    2.65 @@ -1047,7 +1052,7 @@ int do_update_va_mapping(unsigned long p
    2.66      if ( unlikely(cr0 != 0) )
    2.67          write_cr0(cr0);
    2.68   unlock_and_out:
    2.69 -    spin_unlock_irq(&p->page_lock);
    2.70 +    spin_unlock(&p->page_lock);
    2.71   out:
    2.72      return err;
    2.73  }
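
memory.c also asserts the new rule rather than assuming it: the added comment in map_ldt_shadow_page() states that page_lock may not be taken in interrupt context, and the BUG() turns any violation into a loud failure instead of a silent deadlock (a plain spin_lock() holder on the same CPU would spin forever against an interrupt-context acquirer). A hedged sketch of the idiom in isolation (hypothetical helper name, critical section elided):

    /* Hypothetical helper, shown only to isolate the guard used in
     * map_ldt_shadow_page() above. */
    static void take_page_lock_checked(struct task_struct *p)
    {
        /* page_lock is no longer IRQ-safe; catch violations loudly. */
        if ( in_interrupt() )
            BUG();

        spin_lock(&p->page_lock);
        /* ... critical section ... */
        spin_unlock(&p->page_lock);
    }
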
     3.1 --- a/xen/drivers/block/xen_block.c	Sun Nov 16 18:50:57 2003 +0000
     3.2 +++ b/xen/drivers/block/xen_block.c	Sun Nov 16 23:44:13 2003 +0000
     3.3 @@ -58,6 +58,8 @@ static spinlock_t pend_prod_lock = SPIN_
     3.4  static kmem_cache_t *buffer_head_cachep;
     3.5  static atomic_t nr_pending;
     3.6  
     3.7 +static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned;
     3.8 +
     3.9  static int __buffer_is_valid(struct task_struct *p, 
    3.10                               unsigned long buffer, 
    3.11                               unsigned short size,
    3.12 @@ -166,41 +168,68 @@ static void maybe_trigger_io_schedule(vo
    3.13  
    3.14  /******************************************************************
    3.15   * COMPLETION CALLBACK -- Called as bh->b_end_io()
    3.16 - * NB. This can be called from interrupt context!
    3.17   */
    3.18  
    3.19 +static void end_block_io_op_softirq(struct softirq_action *h)
    3.20 +{
    3.21 +    pending_req_t *pending_req;
    3.22 +    struct buffer_head *bh, *nbh;
    3.23 +    unsigned int cpu = smp_processor_id();
    3.24 +
    3.25 +    local_irq_disable();
    3.26 +    bh = completed_bhs[cpu];
    3.27 +    completed_bhs[cpu] = NULL;
    3.28 +    local_irq_enable();
    3.29 +
    3.30 +    while ( bh != NULL )
    3.31 +    {
    3.32 +        pending_req = bh->pending_req;
    3.33 +        
    3.34 +        unlock_buffer(pending_req->domain, 
    3.35 +                      virt_to_phys(bh->b_data), 
    3.36 +                      bh->b_size, 
    3.37 +                      (pending_req->operation==READ));
    3.38 +        
    3.39 +        if ( atomic_dec_and_test(&pending_req->pendcnt) )
    3.40 +        {
    3.41 +            make_response(pending_req->domain, pending_req->id,
    3.42 +                          pending_req->operation, pending_req->status);
    3.43 +            put_task_struct(pending_req->domain);
    3.44 +            spin_lock(&pend_prod_lock);
    3.45 +            pending_ring[pending_prod] = pending_req - pending_reqs;
    3.46 +            PENDREQ_IDX_INC(pending_prod);
    3.47 +            spin_unlock(&pend_prod_lock);
    3.48 +            atomic_dec(&nr_pending);
    3.49 +            maybe_trigger_io_schedule();
    3.50 +        }
    3.51 +        
    3.52 +        nbh = bh->b_reqnext;
    3.53 +        kmem_cache_free(buffer_head_cachep, bh);
    3.54 +        bh = nbh;
    3.55 +    }
    3.56 +}
    3.57 +
    3.58  static void end_block_io_op(struct buffer_head *bh, int uptodate)
    3.59  {
    3.60      unsigned long flags;
    3.61 -    pending_req_t *pending_req = bh->pending_req;
    3.62 +    unsigned int cpu = smp_processor_id();
    3.63  
    3.64      /* An error fails the entire request. */
    3.65      if ( !uptodate )
    3.66      {
    3.67          DPRINTK("Buffer not up-to-date at end of operation\n");
    3.68 -        pending_req->status = 2;
    3.69 +        bh->pending_req->status = 2;
    3.70      }
    3.71  
    3.72 -    unlock_buffer(pending_req->domain, 
    3.73 -                  virt_to_phys(bh->b_data), 
    3.74 -                  bh->b_size, 
    3.75 -                  (pending_req->operation==READ));
    3.76 +    local_irq_save(flags);
    3.77 +    bh->b_reqnext = completed_bhs[cpu];
    3.78 +    completed_bhs[cpu] = bh;
    3.79 +    local_irq_restore(flags);
    3.80  
    3.81 -    if ( atomic_dec_and_test(&pending_req->pendcnt) )
    3.82 -    {
    3.83 -        make_response(pending_req->domain, pending_req->id,
    3.84 -                      pending_req->operation, pending_req->status);
    3.85 -        put_task_struct(pending_req->domain);
    3.86 -        spin_lock_irqsave(&pend_prod_lock, flags);
    3.87 -        pending_ring[pending_prod] = pending_req - pending_reqs;
    3.88 -        PENDREQ_IDX_INC(pending_prod);
    3.89 -        spin_unlock_irqrestore(&pend_prod_lock, flags);
    3.90 -        atomic_dec(&nr_pending);
    3.91 -        maybe_trigger_io_schedule();
    3.92 -    }
    3.93 +    __cpu_raise_softirq(cpu, BLKDEV_RESPONSE_SOFTIRQ);
    3.94 +}
    3.95  
    3.96 -    kmem_cache_free(buffer_head_cachep, bh);
    3.97 -}
    3.98 +
    3.99  /* ----[ Syscall Interface ]------------------------------------------------*/
   3.100  
   3.101  long do_block_io_op(block_io_op_t *u_block_io_op)
   3.102 @@ -364,10 +393,10 @@ static void unlock_buffer(struct task_st
   3.103                            unsigned short size,
   3.104                            int writeable_buffer)
   3.105  {
   3.106 -    unsigned long    pfn, flags;
   3.107 +    unsigned long    pfn;
   3.108      struct pfn_info *page;
   3.109  
   3.110 -    spin_lock_irqsave(&p->page_lock, flags);
   3.111 +    spin_lock(&p->page_lock);
   3.112      for ( pfn = buffer >> PAGE_SHIFT; 
   3.113            pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
   3.114            pfn++ )
   3.115 @@ -377,7 +406,7 @@ static void unlock_buffer(struct task_st
   3.116              put_page_type(page);
   3.117          put_page_tot(page);
   3.118      }
   3.119 -    spin_unlock_irqrestore(&p->page_lock, flags);
   3.120 +    spin_unlock(&p->page_lock);
   3.121  }
   3.122  
   3.123  static int do_block_io_op_domain(struct task_struct *p, int max_to_do)
   3.124 @@ -438,7 +467,7 @@ static void dispatch_rw_block_io(struct 
   3.125      struct buffer_head *bh;
   3.126      int operation = (req->operation == XEN_BLOCK_WRITE) ? WRITE : READ;
   3.127      unsigned short nr_sects;
   3.128 -    unsigned long buffer, flags;
   3.129 +    unsigned long buffer;
   3.130      int i, tot_sects;
   3.131      pending_req_t *pending_req;
   3.132  
   3.133 @@ -446,7 +475,7 @@ static void dispatch_rw_block_io(struct 
   3.134      int new_segs, nr_psegs = 0;
   3.135      phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
   3.136  
   3.137 -    spin_lock_irqsave(&p->page_lock, flags);
   3.138 +    spin_lock(&p->page_lock);
   3.139  
   3.140      /* Check that number of segments is sane. */
   3.141      if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) )
   3.142 @@ -516,7 +545,7 @@ static void dispatch_rw_block_io(struct 
   3.143      for ( i = 0; i < nr_psegs; i++ )
   3.144          __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9, 
   3.145                        (operation==READ));
   3.146 -    spin_unlock_irqrestore(&p->page_lock, flags);
   3.147 +    spin_unlock(&p->page_lock);
   3.148  
   3.149      atomic_inc(&nr_pending);
   3.150      pending_req = pending_reqs + pending_ring[pending_cons];
   3.151 @@ -560,7 +589,7 @@ static void dispatch_rw_block_io(struct 
   3.152      return;
   3.153  
   3.154   bad_descriptor:
   3.155 -    spin_unlock_irqrestore(&p->page_lock, flags);
   3.156 +    spin_unlock(&p->page_lock);
   3.157      make_response(p, req->id, req->operation, 1);
   3.158  } 
   3.159  
   3.160 @@ -574,19 +603,19 @@ static void dispatch_rw_block_io(struct 
   3.161  static void make_response(struct task_struct *p, unsigned long id, 
   3.162  			  unsigned short op, unsigned long st)
   3.163  {
   3.164 -    unsigned long cpu_mask, flags;
   3.165 +    unsigned long cpu_mask;
   3.166      int position;
   3.167      blk_ring_t *blk_ring;
   3.168  
   3.169      /* Place on the response ring for the relevant domain. */ 
   3.170 -    spin_lock_irqsave(&p->blk_ring_lock, flags);
   3.171 +    spin_lock(&p->blk_ring_lock);
   3.172      blk_ring = p->blk_ring_base;
   3.173      position = p->blk_resp_prod;
   3.174      blk_ring->ring[position].resp.id        = id;
   3.175      blk_ring->ring[position].resp.operation = op;
   3.176      blk_ring->ring[position].resp.status    = st;
   3.177      p->blk_resp_prod = blk_ring->resp_prod = BLK_RING_INC(position);
   3.178 -    spin_unlock_irqrestore(&p->blk_ring_lock, flags);
   3.179 +    spin_unlock(&p->blk_ring_lock);
   3.180      
   3.181      /* Kick the relevant domain. */
   3.182      cpu_mask = mark_guest_event(p, _EVENT_BLKDEV);
   3.183 @@ -659,7 +688,13 @@ void initialize_block_io ()
   3.184      atomic_set(&nr_pending, 0);
   3.185      pending_prod = pending_cons = 0;
   3.186      memset(pending_reqs, 0, sizeof(pending_reqs));
   3.187 -    for ( i = 0; i < MAX_PENDING_REQS; i++ ) pending_ring[i] = i;
   3.188 +    for ( i = 0; i < MAX_PENDING_REQS; i++ )
   3.189 +        pending_ring[i] = i;
   3.190 +    
   3.191 +    for ( i = 0; i < NR_CPUS; i++ )
   3.192 +        completed_bhs[i] = NULL;
   3.193 +        
   3.194 +    open_softirq(BLKDEV_RESPONSE_SOFTIRQ, end_block_io_op_softirq, NULL);
   3.195  
   3.196      spin_lock_init(&io_schedule_list_lock);
   3.197      INIT_LIST_HEAD(&io_schedule_list);
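
end_block_io_op() used to do all completion work, including taking pend_prod_lock, directly in interrupt context. It now only chains the buffer_head onto a per-CPU list and raises BLKDEV_RESPONSE_SOFTIRQ; end_block_io_op_softirq() detaches the whole list in one short interrupts-off window and then runs with interrupts enabled, which is what lets the locks above drop their _irqsave variants. The generic shape of that split, reduced to its essentials (illustrative sketch; the function names are hypothetical, the primitives are the ones used in the hunk above):

    static struct buffer_head *deferred[NR_CPUS] __cacheline_aligned;

    /* Producer: runs as bh->b_end_io() in interrupt context.  It only
     * pushes onto the local CPU's list, so no spinlock is needed; the
     * brief local_irq_save() keeps a nested interrupt from corrupting
     * the list head. */
    static void producer_in_irq(struct buffer_head *bh)
    {
        unsigned long flags;
        unsigned int cpu = smp_processor_id();

        local_irq_save(flags);
        bh->b_reqnext = deferred[cpu];
        deferred[cpu] = bh;
        local_irq_restore(flags);

        __cpu_raise_softirq(cpu, BLKDEV_RESPONSE_SOFTIRQ);
    }

    /* Consumer: the softirq handler detaches the whole list at once,
     * then does the real work (unlock_buffer(), make_response(), ...)
     * with interrupts enabled and only plain spinlocks. */
    static void consumer_softirq(struct softirq_action *h)
    {
        unsigned int cpu = smp_processor_id();
        struct buffer_head *bh, *next;

        local_irq_disable();
        bh = deferred[cpu];
        deferred[cpu] = NULL;
        local_irq_enable();

        for ( ; bh != NULL; bh = next )
        {
            next = bh->b_reqnext;
            /* complete the request for this buffer_head here */
        }
    }

The same detach-then-drain shape reappears in net_rx_action() and net_tx_gc() in dev.c below.
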
     4.1 --- a/xen/include/xeno/interrupt.h	Sun Nov 16 18:50:57 2003 +0000
     4.2 +++ b/xen/include/xeno/interrupt.h	Sun Nov 16 23:44:13 2003 +0000
     4.3 @@ -21,33 +21,23 @@ struct irqaction {
     4.4  };
     4.5  
     4.6  
     4.7 -/* Who gets which entry in bh_base.  Things which will occur most often
     4.8 -   should come first */
     4.9 -   
    4.10  enum {
    4.11  	TIMER_BH = 0,
    4.12 -	TQUEUE_BH,
    4.13 -	SCSI_BH,
    4.14 -	IMMEDIATE_BH
    4.15 +	SCSI_BH
    4.16  };
    4.17  
    4.18  #include <asm/hardirq.h>
    4.19  #include <asm/softirq.h>
    4.20  
    4.21  
    4.22 -
    4.23 -/* PLEASE, avoid to allocate new softirqs, if you need not _really_ high
    4.24 -   frequency threaded job scheduling. For almost all the purposes
    4.25 -   tasklets are more than enough. F.e. all serial device BHs et
    4.26 -   al. should be converted to tasklets, not to softirqs.
    4.27 - */
    4.28 -
    4.29  enum
    4.30  {
    4.31  	HI_SOFTIRQ=0,
    4.32  	NET_RX_SOFTIRQ,
    4.33  	AC_TIMER_SOFTIRQ,
    4.34 -	TASKLET_SOFTIRQ
    4.35 +	TASKLET_SOFTIRQ,
    4.36 +        BLKDEV_RESPONSE_SOFTIRQ,
    4.37 +        NET_TX_SOFTIRQ
    4.38  };
    4.39  
    4.40  /* softirq mask and active fields moved to irq_cpustat_t in
     5.1 --- a/xen/include/xeno/netdevice.h	Sun Nov 16 18:50:57 2003 +0000
     5.2 +++ b/xen/include/xeno/netdevice.h	Sun Nov 16 23:44:13 2003 +0000
     5.3 @@ -40,6 +40,12 @@
     5.4  
     5.5  struct vlan_group;
     5.6  
     5.7 +extern struct skb_completion_queues {
     5.8 +    struct sk_buff *rx; /* Packets received in interrupt context. */
     5.9 +    unsigned int rx_qlen;
    5.10 +    struct sk_buff *tx; /* Tx buffers defunct in interrupt context. */
    5.11 +} skb_queue[NR_CPUS] __cacheline_aligned;
    5.12 +
    5.13  /* Backlog congestion levels */
    5.14  #define NET_RX_SUCCESS		0   /* keep 'em coming, baby */
    5.15  #define NET_RX_DROP		1  /* packet dropped */
    5.16 @@ -453,12 +459,30 @@ static inline int netif_running(struct n
    5.17  }
    5.18  
    5.19  
    5.20 -/*
    5.21 - * Xen does not need deferred skb freeing, as all destructor hook functions 
    5.22 - * are IRQ safe. Linux needed more care for some destructors...
    5.23 +/* Use this variant when it is known for sure that it
    5.24 + * is executing from interrupt context.
    5.25   */
    5.26 -#define dev_kfree_skb_irq(_skb) dev_kfree_skb(_skb)
    5.27 -#define dev_kfree_skb_any(_skb) dev_kfree_skb(_skb)
    5.28 +static inline void dev_kfree_skb_irq(struct sk_buff *skb)
    5.29 +{
    5.30 +	int cpu = smp_processor_id();
    5.31 +	unsigned long flags;
    5.32 +	local_irq_save(flags);
    5.33 +	skb->next = skb_queue[cpu].tx;
    5.34 +	skb_queue[cpu].tx = skb;
    5.35 +	__cpu_raise_softirq(cpu, NET_TX_SOFTIRQ);
    5.36 +	local_irq_restore(flags);
    5.37 +}
    5.38 +
    5.39 +/* Use this variant in places where it could be invoked
    5.40 + * either from interrupt or non-interrupt context.
    5.41 + */
    5.42 +static inline void dev_kfree_skb_any(struct sk_buff *skb)
    5.43 +{
    5.44 +	if (in_irq())
    5.45 +		dev_kfree_skb_irq(skb);
    5.46 +	else
    5.47 +		dev_kfree_skb(skb);
    5.48 +}
    5.49  
    5.50  extern void		net_call_rx_atomic(void (*fn)(void));
    5.51  extern int		netif_rx(struct sk_buff *skb);
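
The netdevice.h change replaces the old pass-through macros with genuine deferred freeing: dev_kfree_skb_irq() parks the skb on skb_queue[cpu].tx and raises NET_TX_SOFTIRQ, while dev_kfree_skb_any() chooses between that path and an immediate dev_kfree_skb() based on in_irq(). The deferral matters because the skb destructor path, tx_skb_release() in dev.c below, now takes page_lock and tx_lock without disabling interrupts, so it must not run from a hard interrupt. A hedged usage sketch (hypothetical driver function) of why the _any variant exists:

    /* A driver's transmit-completion path may be reached either from its
     * hard interrupt handler or from process context (e.g. device
     * teardown).  dev_kfree_skb_any() keeps the caller context-agnostic:
     * it defers the free iff in_irq(). */
    static void example_tx_done(struct sk_buff *skb)
    {
        dev_kfree_skb_any(skb);
    }
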
     6.1 --- a/xen/net/dev.c	Sun Nov 16 18:50:57 2003 +0000
     6.2 +++ b/xen/net/dev.c	Sun Nov 16 23:44:13 2003 +0000
     6.3 @@ -50,7 +50,7 @@
     6.4  #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
     6.5  #define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
     6.6  
     6.7 -static struct sk_buff_head rx_skb_queue[NR_CPUS] __cacheline_aligned;
     6.8 +struct skb_completion_queues skb_queue[NR_CPUS] __cacheline_aligned;
     6.9  
    6.10  static int get_tx_bufs(net_vif_t *vif);
    6.11  
    6.12 @@ -607,35 +607,40 @@ void deliver_packet(struct sk_buff *skb,
    6.13  
    6.14  int netif_rx(struct sk_buff *skb)
    6.15  {
    6.16 -    int this_cpu = smp_processor_id();
    6.17 -    struct sk_buff_head *q = &rx_skb_queue[this_cpu];
    6.18 +    int cpu = smp_processor_id();
    6.19      unsigned long flags;
    6.20  
    6.21 -    /* This oughtn't to happen, really! */
    6.22 -    if ( unlikely(skb_queue_len(q) > 100) )
    6.23 +    local_irq_save(flags);
    6.24 +
    6.25 +    if ( unlikely(skb_queue[cpu].rx_qlen > 100) )
    6.26      {
    6.27 +        local_irq_restore(flags);
    6.28          perfc_incr(net_rx_congestion_drop);
    6.29          return NET_RX_DROP;
    6.30      }
    6.31  
    6.32 -    local_irq_save(flags);
    6.33 -    __skb_queue_tail(q, skb);
    6.34 +    skb->next = skb_queue[cpu].rx;
    6.35 +    skb_queue[cpu].rx = skb;
    6.36 +
    6.37      local_irq_restore(flags);
    6.38  
    6.39 -    __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
    6.40 +    __cpu_raise_softirq(cpu, NET_RX_SOFTIRQ);
    6.41  
    6.42      return NET_RX_SUCCESS;
    6.43  }
    6.44  
    6.45  static void net_rx_action(struct softirq_action *h)
    6.46  {
    6.47 -    int offset, this_cpu = smp_processor_id();
    6.48 -    struct sk_buff_head *q = &rx_skb_queue[this_cpu];
    6.49 -    struct sk_buff *skb;
    6.50 +    int offset, cpu = smp_processor_id();
    6.51 +    struct sk_buff *skb, *nskb;
    6.52  
    6.53      local_irq_disable();
    6.54 -    
    6.55 -    while ( (skb = __skb_dequeue(q)) != NULL )
    6.56 +    skb = skb_queue[cpu].rx;
    6.57 +    skb_queue[cpu].rx = NULL;
    6.58 +    skb_queue[cpu].rx_qlen = 0;
    6.59 +    local_irq_enable();
    6.60 +
    6.61 +    while ( skb != NULL )
    6.62      {
    6.63          ASSERT(skb->skb_type == SKB_ZERO_COPY);
    6.64  
    6.65 @@ -652,7 +657,7 @@ static void net_rx_action(struct softirq
    6.66          skb_push(skb, ETH_HLEN);
    6.67          skb->mac.raw = skb->data;
    6.68          
    6.69 -        netdev_rx_stat[this_cpu].total++;
    6.70 +        netdev_rx_stat[cpu].total++;
    6.71          
    6.72          if ( skb->dst_vif == NULL )
    6.73              skb->dst_vif = net_get_target_vif(
    6.74 @@ -668,10 +673,11 @@ static void net_rx_action(struct softirq
    6.75          }
    6.76  
    6.77          unmap_domain_mem(skb->head);
    6.78 +
    6.79 +        nskb = skb->next;
    6.80          kfree_skb(skb);
    6.81 +        skb = nskb;
    6.82      }
    6.83 -
    6.84 -    local_irq_enable();
    6.85  }
    6.86  
    6.87  
    6.88 @@ -823,39 +829,58 @@ static inline void maybe_schedule_tx_act
    6.89  }
    6.90  
    6.91  
    6.92 +static void net_tx_gc(struct softirq_action *h)
    6.93 +{
    6.94 +    int cpu = smp_processor_id();
    6.95 +    struct sk_buff *skb, *nskb;
    6.96 +
    6.97 +    local_irq_disable();
    6.98 +    skb = skb_queue[cpu].tx;
    6.99 +    skb_queue[cpu].tx = NULL;
   6.100 +    local_irq_enable();
   6.101 +
   6.102 +    while ( skb != NULL )
   6.103 +    {
   6.104 +        nskb = skb->next;
   6.105 +        __kfree_skb(skb);
   6.106 +        skb = nskb;
   6.107 +    }
   6.108 +}
   6.109 +
   6.110  /* Destructor function for tx skbs. */
   6.111  static void tx_skb_release(struct sk_buff *skb)
   6.112  {
   6.113      int i;
   6.114 -    net_vif_t *vif = skb->src_vif;
   6.115 -    unsigned long flags;
   6.116 +    net_vif_t *vif;
   6.117 +
   6.118 +    vif = skb->src_vif;
   6.119      
   6.120 -    spin_lock_irqsave(&vif->domain->page_lock, flags);
   6.121 +    spin_lock(&vif->domain->page_lock);
   6.122      for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
   6.123          put_page_tot(skb_shinfo(skb)->frags[i].page);
   6.124 -    spin_unlock_irqrestore(&vif->domain->page_lock, flags);
   6.125 -
   6.126 +    spin_unlock(&vif->domain->page_lock);
   6.127 +    
   6.128      if ( skb->skb_type == SKB_NODATA )
   6.129          kmem_cache_free(net_header_cachep, skb->head);
   6.130 -
   6.131 +    
   6.132      skb_shinfo(skb)->nr_frags = 0; 
   6.133 -
   6.134 -    spin_lock_irqsave(&vif->tx_lock, flags);
   6.135 +    
   6.136 +    spin_lock(&vif->tx_lock);
   6.137      __make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
   6.138 -    spin_unlock_irqrestore(&vif->tx_lock, flags);
   6.139 -
   6.140 +    spin_unlock(&vif->tx_lock);
   6.141 +    
   6.142      /*
   6.143 -     * Checks below must happen after the above response is posted.
   6.144 -     * This avoids a possible race with a guest OS on another CPU.
   6.145 +     * Checks below must happen after the above response is posted. This avoids
   6.146 +     * a possible race with a guest OS on another CPU.
   6.147       */
   6.148      smp_mb();
   6.149 -
   6.150 +    
   6.151      if ( (vif->tx_cons == vif->tx_prod) && get_tx_bufs(vif) )
   6.152      {
   6.153          add_to_net_schedule_list_tail(vif);
   6.154          maybe_schedule_tx_action();        
   6.155      }
   6.156 -
   6.157 +    
   6.158      put_vif(vif);
   6.159  }
   6.160  
   6.161 @@ -1849,12 +1874,11 @@ static int get_tx_bufs(net_vif_t *vif)
   6.162      struct sk_buff     *skb;
   6.163      tx_req_entry_t      tx;
   6.164      int                 i, j, ret = 0;
   6.165 -    unsigned long       flags;
   6.166  
   6.167      if ( vif->tx_req_cons == shared_idxs->tx_req_prod )
   6.168          return 0;
   6.169  
   6.170 -    spin_lock_irqsave(&vif->tx_lock, flags);
   6.171 +    spin_lock(&vif->tx_lock);
   6.172  
   6.173      /* Currently waiting for more credit? */
   6.174      if ( vif->remaining_credit == 0 )
   6.175 @@ -2013,7 +2037,7 @@ static int get_tx_bufs(net_vif_t *vif)
   6.176          vif->tx_prod = j;
   6.177  
   6.178   out:
   6.179 -    spin_unlock_irqrestore(&vif->tx_lock, flags);
   6.180 +    spin_unlock(&vif->tx_lock);
   6.181  
   6.182      return ret;
   6.183  }
   6.184 @@ -2063,14 +2087,14 @@ static long get_bufs_from_vif(net_vif_t 
   6.185          pte_pfn = rx.addr >> PAGE_SHIFT;
   6.186          pte_page = frame_table + pte_pfn;
   6.187              
   6.188 -        spin_lock_irq(&p->page_lock);
   6.189 +        spin_lock(&p->page_lock);
   6.190          if ( (pte_pfn >= max_page) || 
   6.191               ((pte_page->flags & (PG_type_mask | PG_domain_mask)) != 
   6.192                (PGT_l1_page_table | p->domain)) ) 
   6.193          {
   6.194              DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n",
   6.195                      p->domain, pte_pfn, max_page, pte_page->flags);
   6.196 -            spin_unlock_irq(&p->page_lock);
   6.197 +            spin_unlock(&p->page_lock);
   6.198              make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
   6.199              continue;
   6.200          }
   6.201 @@ -2117,7 +2141,7 @@ static long get_bufs_from_vif(net_vif_t 
   6.202              
   6.203      rx_unmap_and_continue:
   6.204          unmap_domain_mem(ptep);
   6.205 -        spin_unlock_irq(&p->page_lock);
   6.206 +        spin_unlock(&p->page_lock);
   6.207      }
   6.208  
   6.209      vif->rx_req_cons = i;
   6.210 @@ -2135,7 +2159,7 @@ static long get_bufs_from_vif(net_vif_t 
   6.211  long flush_bufs_for_vif(net_vif_t *vif)
   6.212  {
   6.213      int i;
   6.214 -    unsigned long *pte, flags;
   6.215 +    unsigned long *pte;
   6.216      struct pfn_info *page;
   6.217      struct task_struct *p = vif->domain;
   6.218      rx_shadow_entry_t *rx;
   6.219 @@ -2143,7 +2167,7 @@ long flush_bufs_for_vif(net_vif_t *vif)
   6.220      net_idx_t *shared_idxs = vif->shared_idxs;
   6.221  
   6.222      /* Return any outstanding receive buffers to the guest OS. */
   6.223 -    spin_lock_irqsave(&p->page_lock, flags);
   6.224 +    spin_lock(&p->page_lock);
   6.225      for ( i = vif->rx_req_cons; 
   6.226            (i != shared_idxs->rx_req_prod) && 
   6.227                (((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1); 
   6.228 @@ -2181,13 +2205,13 @@ long flush_bufs_for_vif(net_vif_t *vif)
   6.229          make_rx_response(vif, rx->id, 0, RING_STATUS_DROPPED, 0);
   6.230      }
   6.231      vif->rx_cons = i;
   6.232 -    spin_unlock_irqrestore(&p->page_lock, flags);
   6.233 +    spin_unlock(&p->page_lock);
   6.234  
   6.235      /*
   6.236       * Flush pending transmit buffers. The guest may still have to wait for
   6.237       * buffers that are queued at a physical NIC.
   6.238       */
   6.239 -    spin_lock_irqsave(&vif->tx_lock, flags);
   6.240 +    spin_lock(&vif->tx_lock);
   6.241      for ( i = vif->tx_req_cons; 
   6.242            (i != shared_idxs->tx_req_prod) && 
   6.243                (((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1); 
   6.244 @@ -2197,7 +2221,7 @@ long flush_bufs_for_vif(net_vif_t *vif)
   6.245                             RING_STATUS_DROPPED);
   6.246      }
   6.247      vif->tx_req_cons = i;
   6.248 -    spin_unlock_irqrestore(&vif->tx_lock, flags);
   6.249 +    spin_unlock(&vif->tx_lock);
   6.250  
   6.251      return 0;
   6.252  }
   6.253 @@ -2236,7 +2260,7 @@ long do_net_io_op(netop_t *uop)
   6.254  
   6.255      case NETOP_RESET_RINGS:
   6.256          /* We take the tx_lock to avoid a race with get_tx_bufs. */
   6.257 -        spin_lock_irq(&vif->tx_lock);
   6.258 +        spin_lock(&vif->tx_lock);
   6.259          if ( (vif->rx_req_cons != vif->rx_resp_prod) || 
   6.260               (vif->tx_req_cons != vif->tx_resp_prod) )
   6.261          {
   6.262 @@ -2249,7 +2273,7 @@ long do_net_io_op(netop_t *uop)
   6.263              vif->tx_req_cons = vif->tx_resp_prod = 0;
   6.264              ret = 0;
   6.265          }
   6.266 -        spin_unlock_irq(&vif->tx_lock);
   6.267 +        spin_unlock(&vif->tx_lock);
   6.268          break;
   6.269  
   6.270      case NETOP_GET_VIF_INFO:
   6.271 @@ -2297,12 +2321,11 @@ static void make_rx_response(net_vif_t  
   6.272                               unsigned char  st,
   6.273                               unsigned char  off)
   6.274  {
   6.275 -    unsigned long flags;
   6.276      unsigned int pos;
   6.277      rx_resp_entry_t *resp;
   6.278  
   6.279      /* Place on the response ring for the relevant domain. */ 
   6.280 -    spin_lock_irqsave(&vif->rx_lock, flags);
   6.281 +    spin_lock(&vif->rx_lock);
   6.282      pos  = vif->rx_resp_prod;
   6.283      resp = &vif->shared_rings->rx_ring[pos].resp;
   6.284      resp->id     = id;
   6.285 @@ -2317,19 +2340,24 @@ static void make_rx_response(net_vif_t  
   6.286          unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
   6.287          guest_event_notify(cpu_mask);    
   6.288      }
   6.289 -    spin_unlock_irqrestore(&vif->rx_lock, flags);
   6.290 +    spin_unlock(&vif->rx_lock);
   6.291  }
   6.292  
   6.293  
   6.294  int setup_network_devices(void)
   6.295  {
   6.296 -    int i, ret;
   6.297 +    int ret;
   6.298      extern char opt_ifname[];
   6.299  
   6.300 -    for ( i = 0; i < smp_num_cpus; i++ )
   6.301 -        skb_queue_head_init(&rx_skb_queue[i]);
   6.302 +    memset(skb_queue, 0, sizeof(skb_queue));
   6.303 +
   6.304 +    /* Actual receive processing happens in softirq context. */
   6.305 +    open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
   6.306  
   6.307 -    open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
   6.308 +    /* Processing of defunct transmit buffers happens in softirq context. */
   6.309 +    open_softirq(NET_TX_SOFTIRQ, net_tx_gc, NULL);
   6.310 +
    6.311 +    /* Transmit scheduling happens in a tasklet to exclude other processors. */
   6.312      tasklet_enable(&net_tx_tasklet);
   6.313  
   6.314      if ( (the_dev = dev_get_by_name(opt_ifname)) == NULL )
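
Taken together, the changeset adds three deferred paths, each a pairing of a softirq number, a handler, and a per-CPU queue: BLKDEV_RESPONSE_SOFTIRQ drains completed_bhs[] via end_block_io_op_softirq(), NET_RX_SOFTIRQ drains skb_queue[].rx via net_rx_action(), and NET_TX_SOFTIRQ drains skb_queue[].tx via net_tx_gc(). A condensed sketch of the registrations, collected from initialize_block_io() and setup_network_devices() above (hypothetical wrapper function; the open_softirq() calls are the ones in the diff):

    static void register_deferred_handlers(void)
    {
        /* Block I/O completions queued by end_block_io_op(). */
        open_softirq(BLKDEV_RESPONSE_SOFTIRQ, end_block_io_op_softirq, NULL);

        /* Received packets queued by netif_rx(). */
        open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);

        /* Defunct transmit skbs queued by dev_kfree_skb_irq(). */
        open_softirq(NET_TX_SOFTIRQ, net_tx_gc, NULL);
    }

Interrupt-context code only ever appends to the local CPU's queue and raises the matching softirq; all lock-taking work happens in the handlers, which is what makes the plain spin_lock() conversions throughout this changeset safe.
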