direct-io.hg

changeset 299:665cb39b0125

bitkeeper revision 1.122.5.1 (3e71d6fe7FguR-sT8s7ha1pGTKuYSA)

Many files:
Sort out interrupt distribution in SMP systems. We now periodically redistribute towards the most idle processors. There's more sport to be had here though...
author kaf24@scramble.cl.cam.ac.uk
date Fri Mar 14 13:19:58 2003 +0000 (2003-03-14)
parents d2f6b07b136d
children f233fd11d69e
files xen/arch/i386/io_apic.c xen/arch/i386/irq.c xen/arch/i386/process.c xen/arch/i386/setup.c xen/arch/i386/smpboot.c xen/common/schedule.c xen/include/asm-i386/hardirq.h xen/include/asm-i386/smpboot.h xen/include/xeno/sched.h
line diff
     1.1 --- a/xen/arch/i386/io_apic.c	Thu Mar 13 21:33:05 2003 +0000
     1.2 +++ b/xen/arch/i386/io_apic.c	Fri Mar 14 13:19:58 2003 +0000
     1.3 @@ -189,6 +189,86 @@ static void clear_IO_APIC (void)
     1.4  			clear_IO_APIC_pin(apic, pin);
     1.5  }
     1.6  
     1.7 +static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
     1.8 +{
     1.9 +	unsigned long flags;
    1.10 +
    1.11 +	/*
    1.12 +	 * Only the first 8 bits are valid.
    1.13 +	 */
    1.14 +	mask = mask << 24;
    1.15 +	spin_lock_irqsave(&ioapic_lock, flags);
    1.16 +	__DO_ACTION(1, = mask, )
    1.17 +	spin_unlock_irqrestore(&ioapic_lock, flags);
    1.18 +}
    1.19 +
    1.20 +#if CONFIG_SMP
    1.21 +
    1.22 +typedef struct {
    1.23 +	unsigned int cpu;
    1.24 +	unsigned long timestamp;
    1.25 +} ____cacheline_aligned irq_balance_t;
    1.26 +
    1.27 +static irq_balance_t irq_balance[NR_IRQS] __cacheline_aligned
    1.28 +			= { [ 0 ... NR_IRQS-1 ] = { 0, 0 } };
    1.29 +
    1.30 +extern unsigned long irq_affinity [NR_IRQS];
    1.31 +
    1.32 +#endif
    1.33 +
    1.34 +#define IDLE_ENOUGH(cpu,now) \
    1.35 +		(idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1))
    1.36 +
    1.37 +#define IRQ_ALLOWED(cpu,allowed_mask) \
    1.38 +		((1 << cpu) & (allowed_mask))
    1.39 +
    1.40 +static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction)
    1.41 +{
    1.42 +	int search_idle = 1;
    1.43 +	int cpu = curr_cpu;
    1.44 +
    1.45 +	goto inside;
    1.46 +
    1.47 +	do {
    1.48 +		if (unlikely(cpu == curr_cpu))
    1.49 +			search_idle = 0;
    1.50 +inside:
    1.51 +		if (direction == 1) {
    1.52 +			cpu++;
    1.53 +			if (cpu >= smp_num_cpus)
    1.54 +				cpu = 0;
    1.55 +		} else {
    1.56 +			cpu--;
    1.57 +			if (cpu == -1)
    1.58 +				cpu = smp_num_cpus-1;
    1.59 +		}
    1.60 +	} while (!IRQ_ALLOWED(cpu,allowed_mask) ||
    1.61 +			(search_idle && !IDLE_ENOUGH(cpu,now)));
    1.62 +
    1.63 +	return cpu;
    1.64 +}
    1.65 +
    1.66 +static inline void balance_irq(int irq)
    1.67 +{
    1.68 +#if CONFIG_SMP
    1.69 +	irq_balance_t *entry = irq_balance + irq;
    1.70 +	unsigned long now = jiffies;
    1.71 +
    1.72 +	if (unlikely(entry->timestamp != now)) {
    1.73 +		unsigned long allowed_mask;
    1.74 +		int random_number;
    1.75 +
    1.76 +		rdtscl(random_number);
    1.77 +		random_number &= 1;
    1.78 +
    1.79 +		allowed_mask = cpu_online_map & irq_affinity[irq];
    1.80 +		entry->timestamp = now;
    1.81 +		entry->cpu = move(entry->cpu, allowed_mask, now, random_number);
    1.82 +		set_ioapic_affinity(irq, apicid_to_phys_cpu_present(entry->cpu));
    1.83 +	}
    1.84 +#endif
    1.85 +}
    1.86 +
    1.87  /*
    1.88   * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
    1.89   * specific CPU-side IRQs.
    1.90 @@ -1233,6 +1313,7 @@ static unsigned int startup_edge_ioapic_
    1.91   */
    1.92  static void ack_edge_ioapic_irq(unsigned int irq)
    1.93  {
    1.94 +	balance_irq(irq);
    1.95  	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
    1.96  					== (IRQ_PENDING | IRQ_DISABLED))
    1.97  		mask_IO_APIC_irq(irq);
    1.98 @@ -1272,6 +1353,8 @@ static void end_level_ioapic_irq (unsign
    1.99  	unsigned long v;
   1.100  	int i;
   1.101  
   1.102 +	balance_irq(irq);
   1.103 +
   1.104  /*
   1.105   * It appears there is an erratum which affects at least version 0x11
   1.106   * of I/O APIC (that's the 82093AA and cores integrated into various
   1.107 @@ -1328,19 +1411,6 @@ static void end_level_ioapic_irq (unsign
   1.108  
   1.109  static void mask_and_ack_level_ioapic_irq (unsigned int irq) { /* nothing */ }
   1.110  
   1.111 -static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
   1.112 -{
   1.113 -	unsigned long flags;
   1.114 -	/*
   1.115 -	 * Only the first 8 bits are valid.
   1.116 -	 */
   1.117 -	mask = mask << 24;
   1.118 -
   1.119 -	spin_lock_irqsave(&ioapic_lock, flags);
   1.120 -	__DO_ACTION(1, = mask, )
   1.121 -	spin_unlock_irqrestore(&ioapic_lock, flags);
   1.122 -}
   1.123 -
   1.124  /*
   1.125   * Level and edge triggered IO-APIC interrupts need different handling,
   1.126   * so we use two separate IRQ descriptors. Edge triggered IRQs can be
     2.1 --- a/xen/arch/i386/irq.c	Thu Mar 13 21:33:05 2003 +0000
     2.2 +++ b/xen/arch/i386/irq.c	Fri Mar 14 13:19:58 2003 +0000
     2.3 @@ -60,6 +60,11 @@
     2.4  irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned =
     2.5  { [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}};
     2.6  
     2.7 +#ifdef CONFIG_SMP
     2.8 +/* NB. XXX We'll want some way of fiddling with this from DOM0. */
     2.9 +unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = ~0UL };
    2.10 +#endif
    2.11 +
    2.12  /*
    2.13   * Special irq handlers.
    2.14   */
     3.1 --- a/xen/arch/i386/process.c	Thu Mar 13 21:33:05 2003 +0000
     3.2 +++ b/xen/arch/i386/process.c	Fri Mar 14 13:19:58 2003 +0000
     3.3 @@ -85,6 +85,7 @@ void cpu_idle (void)
     3.4  
     3.5      for ( ; ; )
     3.6      {
     3.7 +        irq_stat[cpu].idle_timestamp = jiffies;
     3.8          while (!current->hyp_events && !softirq_pending(cpu))
     3.9              default_idle();
    3.10          do_hyp_events();
     4.1 --- a/xen/arch/i386/setup.c	Thu Mar 13 21:33:05 2003 +0000
     4.2 +++ b/xen/arch/i386/setup.c	Fri Mar 14 13:19:58 2003 +0000
     4.3 @@ -20,6 +20,7 @@ unsigned long wait_init_idle;
     4.4  
     4.5  /* Basic page table for each CPU in the system. */
     4.6  l2_pgentry_t *idle_pg_table[NR_CPUS] = { idle0_pg_table };
     4.7 +struct task_struct *idle_task[NR_CPUS] = { &idle0_task };
     4.8  
     4.9  /* for asm/domain_page.h, map_domain_page() */
    4.10  unsigned long *mapcache[NR_CPUS];
     5.1 --- a/xen/arch/i386/smpboot.c	Thu Mar 13 21:33:05 2003 +0000
     5.2 +++ b/xen/arch/i386/smpboot.c	Fri Mar 14 13:19:58 2003 +0000
     5.3 @@ -699,6 +699,8 @@ static void __init do_boot_cpu (int apic
     5.4  
     5.5      SET_DEFAULT_FAST_TRAP(&idle->thread);
     5.6  
     5.7 +    idle_task[cpu] = idle;
     5.8 +
     5.9      /* start_eip had better be page-aligned! */
    5.10      start_eip = setup_trampoline();
    5.11  
     6.1 --- a/xen/common/schedule.c	Thu Mar 13 21:33:05 2003 +0000
     6.2 +++ b/xen/common/schedule.c	Fri Mar 14 13:19:58 2003 +0000
     6.3 @@ -174,6 +174,7 @@ long schedule_timeout(long timeout)
     6.4  }
     6.5  
     6.6  /* RN: XXX turn this into do_halt() */
     6.7 +/* KAF: No, turn it back into do_yield()! */
     6.8  /*
     6.9   * yield the current process
    6.10   */
    6.11 @@ -281,6 +282,15 @@ asmlinkage void schedule(void)
    6.12      return;
    6.13  }
    6.14  
    6.15 +
    6.16 +/* No locking needed -- pointer comparison is safe :-) */
    6.17 +int idle_cpu(int cpu)
    6.18 +{
    6.19 +    struct task_struct *p = schedule_data[cpu].curr;
    6.20 +    return p == idle_task[cpu];
    6.21 +}
    6.22 +
    6.23 +
    6.24  /*
    6.25   * The scheduling timer.
    6.26   */
     7.1 --- a/xen/include/asm-i386/hardirq.h	Thu Mar 13 21:33:05 2003 +0000
     7.2 +++ b/xen/include/asm-i386/hardirq.h	Fri Mar 14 13:19:58 2003 +0000
     7.3 @@ -10,6 +10,7 @@ typedef struct {
     7.4  	unsigned int __local_irq_count;
     7.5  	unsigned int __local_bh_count;
     7.6  	unsigned int __syscall_count;
     7.7 +	unsigned long idle_timestamp;
     7.8  } ____cacheline_aligned irq_cpustat_t;
     7.9  
    7.10  #include <xeno/irq_cpustat.h>	/* Standard mappings for irq_cpustat_t above */
     8.1 --- a/xen/include/asm-i386/smpboot.h	Thu Mar 13 21:33:05 2003 +0000
     8.2 +++ b/xen/include/asm-i386/smpboot.h	Fri Mar 14 13:19:58 2003 +0000
     8.3 @@ -30,6 +30,15 @@ static inline void detect_clustered_apic
     8.4  		/*Start cyclone clock*/
     8.5  		cyclone_setup(0);
     8.6  	}
     8.7 +	else if (!strncmp(oem, "IBM ENSW", 8) && !strncmp(prod, "RUTHLESS SMP", 9)){
     8.8 +		clustered_apic_mode = CLUSTERED_APIC_XAPIC;
     8.9 +		apic_broadcast_id = APIC_BROADCAST_ID_XAPIC;
    8.10 +		int_dest_addr_mode = APIC_DEST_PHYSICAL;
    8.11 +		int_delivery_mode = dest_Fixed;
    8.12 +		esr_disable = 1;
    8.13 +		/*Start cyclone clock*/
    8.14 +		cyclone_setup(0);
    8.15 +	}
    8.16  	else if (!strncmp(oem, "IBM NUMA", 8)){
    8.17  		clustered_apic_mode = CLUSTERED_APIC_NUMAQ;
    8.18  		apic_broadcast_id = APIC_BROADCAST_ID_APIC;
    8.19 @@ -116,15 +125,6 @@ static inline int target_cpus(void)
    8.20  	return cpu_online_map;
    8.21  }
    8.22  #else
    8.23 -/* KAF Xen: Round-robin allocate IRQs to CPUs. */
    8.24 -static inline int target_cpus(void)
    8.25 -{
    8.26 -    static unsigned int cpu_field = 1;
    8.27 -    do { 
    8.28 -        cpu_field <<= 1; 
    8.29 -        if ( cpu_field == 0x100 ) cpu_field = 1; /* logical field == 8 bits */ 
    8.30 -    } while ( (cpu_field & cpu_online_map) == 0 );
    8.31 -    return cpu_field;
    8.32 -}
    8.33 +#define target_cpus() (0xFF)
    8.34  #endif
    8.35  #endif
     9.1 --- a/xen/include/xeno/sched.h	Thu Mar 13 21:33:05 2003 +0000
     9.2 +++ b/xen/include/xeno/sched.h	Fri Mar 14 13:19:58 2003 +0000
     9.3 @@ -149,6 +149,7 @@ struct task_struct {
     9.4      next_task:   &(_t)           \
     9.5  }
     9.6  
     9.7 +extern struct task_struct *idle_task[NR_CPUS];
     9.8  #define IDLE_DOMAIN_ID   (~0)
     9.9  #define is_idle_task(_p) ((_p)->domain == IDLE_DOMAIN_ID)
    9.10  
    9.11 @@ -214,7 +215,8 @@ asmlinkage void schedule(void);
    9.12  
    9.13  void domain_init(void);
    9.14  
    9.15 -void cpu_idle(void);
    9.16 +int idle_cpu(int cpu); /* Is CPU 'cpu' idle right now? */
    9.17 +void cpu_idle(void);   /* Idle loop. */
    9.18  
    9.19  #define REMOVE_LINKS(p) do { \
    9.20          (p)->next_task->prev_task = (p)->prev_task; \