ia64/xen-unstable

changeset 17687:d795e15b85a7

x86: Fix lapic timer stop issue in deep C state

Local APIC timer may stop in deep C states (C3/C4...) at entry/exit. This
patch adds the logic that uses a platform timer (HPET) to re-enable the
local APIC timer on C state entry/exit.

Signed-off-by: Wei Gang <gang.wei@intel.com>
Signed-off-by: Yu Ke <ke.yu@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed May 21 10:38:57 2008 +0100 (2008-05-21)
parents 672c09aad49d
children 5d4eac11e14f
files xen/arch/x86/Makefile xen/arch/x86/acpi/cpu_idle.c xen/arch/x86/hpet.c xen/arch/x86/time.c xen/common/timer.c xen/include/asm-x86/hpet.h xen/include/xen/timer.h
line diff
     1.1 --- a/xen/arch/x86/Makefile	Tue May 20 14:50:45 2008 +0100
     1.2 +++ b/xen/arch/x86/Makefile	Wed May 21 10:38:57 2008 +0100
     1.3 @@ -50,6 +50,7 @@ obj-y += x86_emulate.o
     1.4  obj-y += machine_kexec.o
     1.5  obj-y += crash.o
     1.6  obj-y += tboot.o
     1.7 +obj-y += hpet.o
     1.8  
     1.9  obj-$(crash_debug) += gdbstub.o
    1.10  
     2.1 --- a/xen/arch/x86/acpi/cpu_idle.c	Tue May 20 14:50:45 2008 +0100
     2.2 +++ b/xen/arch/x86/acpi/cpu_idle.c	Wed May 21 10:38:57 2008 +0100
     2.3 @@ -39,6 +39,7 @@
     2.4  #include <xen/smp.h>
     2.5  #include <asm/cache.h>
     2.6  #include <asm/io.h>
     2.7 +#include <asm/hpet.h>
     2.8  #include <xen/guest_access.h>
     2.9  #include <public/platform.h>
    2.10  #include <asm/processor.h>
    2.11 @@ -438,19 +439,19 @@ static void acpi_processor_idle(void)
    2.12          t1 = inl(pmtmr_ioport);
    2.13  
    2.14          /*
    2.15 -         * FIXME: Before invoking C3, be aware that TSC/APIC timer may be 
    2.16 +         * Before invoking C3, be aware that TSC/APIC timer may be 
    2.17           * stopped by H/W. Without carefully handling of TSC/APIC stop issues,
    2.18           * deep C state can't work correctly.
    2.19           */
    2.20          /* preparing TSC stop */
    2.21          cstate_save_tsc();
    2.22 -        /* placeholder for preparing APIC stop */
    2.23 -
    2.24 +        /* preparing APIC stop */
    2.25 +        hpet_broadcast_enter();
    2.26          /* Invoke C3 */
    2.27          acpi_idle_do_entry(cx);
    2.28  
    2.29 -        /* placeholder for recovering APIC */
    2.30 -
    2.31 +        /* recovering APIC */
    2.32 +        hpet_broadcast_exit();
    2.33          /* recovering TSC */
    2.34          cstate_restore_tsc();
    2.35  
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/xen/arch/x86/hpet.c	Wed May 21 10:38:57 2008 +0100
     3.3 @@ -0,0 +1,291 @@
     3.4 +/******************************************************************************
     3.5 + * arch/x86/hpet.c
     3.6 + * 
     3.7 + * HPET management.
     3.8 + */
     3.9 +
    3.10 +#include <xen/config.h>
    3.11 +#include <xen/errno.h>
    3.12 +#include <xen/time.h>
    3.13 +#include <xen/timer.h>
    3.14 +#include <xen/smp.h>
    3.15 +#include <xen/softirq.h>
    3.16 +#include <asm/fixmap.h>
    3.17 +#include <asm/div64.h>
    3.18 +#include <asm/hpet.h>
    3.19 +
    3.20 +#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1))
    3.21 +
    3.22 +#define MAX_DELTA_NS MILLISECS(10*1000)
    3.23 +#define MIN_DELTA_NS MICROSECS(1)
    3.24 +
    3.25 +struct hpet_event_channel
    3.26 +{
    3.27 +    unsigned long mult;
    3.28 +    int           shift;
    3.29 +    s_time_t      next_event;
    3.30 +    cpumask_t     cpumask;
    3.31 +    spinlock_t    lock;
    3.32 +    void          (*event_handler)(struct hpet_event_channel *);
    3.33 +};
    3.34 +static struct hpet_event_channel hpet_event;
    3.35 +
    3.36 +unsigned long hpet_address;
    3.37 +
    3.38 +/*
    3.39 + * Calculate a multiplication factor for scaled math, which is used to convert
    3.40 + * nanoseconds based values to clock ticks:
    3.41 + *
    3.42 + * clock_ticks = (nanoseconds * factor) >> shift.
    3.43 + *
    3.44 + * div_sc is the rearranged equation to calculate a factor from a given clock
    3.45 + * ticks / nanoseconds ratio:
    3.46 + *
    3.47 + * factor = (clock_ticks << shift) / nanoseconds
    3.48 + */
    3.49 +static inline unsigned long div_sc(unsigned long ticks, unsigned long nsec,
    3.50 +                                   int shift)
    3.51 +{
    3.52 +    uint64_t tmp = ((uint64_t)ticks) << shift;
    3.53 +
    3.54 +    do_div(tmp, nsec);
    3.55 +    return (unsigned long) tmp;
    3.56 +}
    3.57 +
    3.58 +/*
    3.59 + * Convert nanoseconds based values to clock ticks:
    3.60 + *
    3.61 + * clock_ticks = (nanoseconds * factor) >> shift.
    3.62 + */
    3.63 +static inline unsigned long ns2ticks(unsigned long nsec, int shift,
    3.64 +                                     unsigned long factor)
    3.65 +{
    3.66 +    uint64_t tmp = ((uint64_t)nsec * factor) >> shift;
    3.67 +
    3.68 +    return (unsigned long) tmp;
    3.69 +}
    3.70 +
    3.71 +static int hpet_legacy_next_event(unsigned long delta)
    3.72 +{
    3.73 +    unsigned long cnt;
    3.74 +
    3.75 +    cnt = hpet_read32(HPET_COUNTER);
    3.76 +    cnt += delta;
    3.77 +    hpet_write32(cnt, HPET_T0_CMP);
    3.78 +
    3.79 +    return ((long)(hpet_read32(HPET_COUNTER) - cnt) > 0) ? -ETIME : 0;
    3.80 +}
    3.81 +
    3.82 +static int reprogram_hpet_evt_channel(
    3.83 +    struct hpet_event_channel *ch,
    3.84 +    s_time_t expire, s_time_t now, int force)
    3.85 +{
    3.86 +    int64_t delta;
    3.87 +    int ret;
    3.88 +
    3.89 +    if ( unlikely(expire < 0) )
    3.90 +    {
    3.91 +        printk(KERN_DEBUG "reprogram: expire < 0\n");
    3.92 +        return -ETIME;
    3.93 +    }
    3.94 +
    3.95 +    delta = expire - now;
    3.96 +    if ( delta <= 0 )
    3.97 +    {
    3.98 +        printk(KERN_DEBUG "reprogram: expire(%"PRIx64") < "
    3.99 +               "now(%"PRIx64")\n", expire, now);
   3.100 +        if ( !force )
   3.101 +            return -ETIME;
   3.102 +    }
   3.103 +
   3.104 +    ch->next_event = expire;
   3.105 +
   3.106 +    delta = min_t(int64_t, delta, MAX_DELTA_NS);
   3.107 +    delta = max_t(int64_t, delta, MIN_DELTA_NS);
   3.108 +    delta = ns2ticks(delta, ch->shift, ch->mult);
   3.109 +
   3.110 +    ret = hpet_legacy_next_event(delta);
   3.111 +    while ( ret && force )
   3.112 +    {
   3.113 +        delta += delta;
   3.114 +        ret = hpet_legacy_next_event(delta);
   3.115 +    }
   3.116 +
   3.117 +    return ret;
   3.118 +}
   3.119 +
   3.120 +static int evt_do_broadcast(cpumask_t mask)
   3.121 +{
   3.122 +    int ret = 0, cpu = smp_processor_id();
   3.123 +
   3.124 +    if ( cpu_isset(cpu, mask) )
   3.125 +    {
   3.126 +        cpu_clear(cpu, mask);
   3.127 +        raise_softirq(TIMER_SOFTIRQ);
   3.128 +        ret = 1;
   3.129 +    }
   3.130 +
   3.131 +    if ( !cpus_empty(mask) )
   3.132 +    {
   3.133 +       cpumask_raise_softirq(mask, TIMER_SOFTIRQ);
   3.134 +       ret = 1;
   3.135 +    }
   3.136 +    return ret;
   3.137 +}
   3.138 +
   3.139 +static void handle_hpet_broadcast(struct hpet_event_channel *ch)
   3.140 +{
   3.141 +    cpumask_t mask;
   3.142 +    s_time_t now, next_event;
   3.143 +    int cpu, current_cpu = smp_processor_id();
   3.144 +
   3.145 +    spin_lock(&ch->lock);
   3.146 +
   3.147 +    if ( cpu_isset(current_cpu, ch->cpumask) )
   3.148 +        printk(KERN_DEBUG "WARNING: current cpu%d in bc_mask\n", current_cpu);
   3.149 +again:
   3.150 +    ch->next_event = STIME_MAX;
   3.151 +    next_event = STIME_MAX;
   3.152 +    mask = (cpumask_t)CPU_MASK_NONE;
   3.153 +    now = NOW();
   3.154 +
   3.155 +    /* find all expired events */
   3.156 +    for_each_cpu_mask(cpu, ch->cpumask)
   3.157 +    {
   3.158 +        if ( per_cpu(timer_deadline, cpu) <= now )
   3.159 +            cpu_set(cpu, mask);
   3.160 +        else if ( per_cpu(timer_deadline, cpu) < next_event )
   3.161 +            next_event = per_cpu(timer_deadline, cpu);
   3.162 +    }
   3.163 +    if ( per_cpu(timer_deadline, current_cpu) <= now )
   3.164 +        cpu_set(current_cpu, mask);
   3.165 +
   3.166 +    /* wakeup the cpus which have an expired event. */
   3.167 +    evt_do_broadcast(mask);
   3.168 +
   3.169 +    if ( next_event != STIME_MAX )
   3.170 +    {
   3.171 +        if ( reprogram_hpet_evt_channel(ch, next_event, now, 0) )
   3.172 +            goto again;
   3.173 +    }
   3.174 +    spin_unlock(&ch->lock);
   3.175 +}
   3.176 +
   3.177 +void hpet_broadcast_init(void)
   3.178 +{
   3.179 +    u64 hpet_rate;
   3.180 +    u32 hpet_id, cfg;
   3.181 +
   3.182 +    hpet_rate = hpet_setup();
   3.183 +    if ( hpet_rate == 0 )
   3.184 +        return;
   3.185 +
   3.186 +    hpet_id = hpet_read32(HPET_ID);
   3.187 +    if ( !(hpet_id & HPET_ID_LEGSUP) )
   3.188 +        return;
   3.189 +
   3.190 +    /* Start HPET legacy interrupts */
   3.191 +    cfg = hpet_read32(HPET_CFG);
   3.192 +    cfg |= HPET_CFG_LEGACY;
   3.193 +    hpet_write32(cfg, HPET_CFG);
   3.194 +
   3.195 +    /* set HPET T0 as oneshot */
   3.196 +    cfg = hpet_read32(HPET_T0_CFG);
   3.197 +    cfg &= ~HPET_TN_PERIODIC;
   3.198 +    cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
   3.199 +    hpet_write32(cfg, HPET_T0_CFG);
   3.200 +
   3.201 +    /*
   3.202 +     * The period is a femto seconds value. We need to calculate the scaled
   3.203 +     * math multiplication factor for nanosecond to hpet tick conversion.
   3.204 +     */
   3.205 +    hpet_event.mult = div_sc((unsigned long)hpet_rate, 1000000000ul, 32);
   3.206 +    hpet_event.shift = 32;
   3.207 +    hpet_event.next_event = STIME_MAX;
   3.208 +    hpet_event.event_handler = handle_hpet_broadcast;
   3.209 +    spin_lock_init(&hpet_event.lock);
   3.210 +}
   3.211 +
   3.212 +void hpet_broadcast_enter(void)
   3.213 +{
   3.214 +    struct hpet_event_channel *ch = &hpet_event;
   3.215 +
   3.216 +    cpu_set(smp_processor_id(), ch->cpumask);
   3.217 +
   3.218 +    spin_lock(&ch->lock);
   3.219 +
   3.220 +    /* reprogram if current cpu expire time is nearer */
   3.221 +    if ( this_cpu(timer_deadline) < ch->next_event )
   3.222 +        reprogram_hpet_evt_channel(ch, this_cpu(timer_deadline), NOW(), 1);
   3.223 +
   3.224 +    spin_unlock(&ch->lock);
   3.225 +}
   3.226 +
   3.227 +void hpet_broadcast_exit(void)
   3.228 +{
   3.229 +    struct hpet_event_channel *ch = &hpet_event;
   3.230 +    int cpu = smp_processor_id();
   3.231 +
   3.232 +    if ( cpu_test_and_clear(cpu, ch->cpumask) )
   3.233 +        reprogram_timer(per_cpu(timer_deadline, cpu));
   3.234 +}
   3.235 +
   3.236 +int hpet_legacy_irq_tick(void)
   3.237 +{
   3.238 +    if ( !hpet_event.event_handler )
   3.239 +        return 0;
   3.240 +    hpet_event.event_handler(&hpet_event);
   3.241 +    return 1;
   3.242 +}
   3.243 +
   3.244 +u64 hpet_setup(void)
   3.245 +{
   3.246 +    static u64 hpet_rate;
   3.247 +    static int initialised;
   3.248 +    u32 hpet_id, hpet_period, cfg;
   3.249 +    int i;
   3.250 +
   3.251 +    if ( initialised )
   3.252 +        return hpet_rate;
   3.253 +    initialised = 1;
   3.254 +
   3.255 +    if ( hpet_address == 0 )
   3.256 +        return 0;
   3.257 +
   3.258 +    set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
   3.259 +
   3.260 +    hpet_id = hpet_read32(HPET_ID);
   3.261 +    if ( hpet_id == 0 )
   3.262 +    {
   3.263 +        printk("BAD HPET vendor id.\n");
   3.264 +        return 0;
   3.265 +    }
   3.266 +
   3.267 +    /* Check for sane period (100ps <= period <= 100ns). */
   3.268 +    hpet_period = hpet_read32(HPET_PERIOD);
   3.269 +    if ( (hpet_period > 100000000) || (hpet_period < 100000) )
   3.270 +    {
   3.271 +        printk("BAD HPET period %u.\n", hpet_period);
   3.272 +        return 0;
   3.273 +    }
   3.274 +
   3.275 +    cfg = hpet_read32(HPET_CFG);
   3.276 +    cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
   3.277 +    hpet_write32(cfg, HPET_CFG);
   3.278 +
   3.279 +    for ( i = 0; i <= ((hpet_id >> 8) & 31); i++ )
   3.280 +    {
   3.281 +        cfg = hpet_read32(HPET_T0_CFG + i*0x20);
   3.282 +        cfg &= ~HPET_TN_ENABLE;
    3.283 +        hpet_write32(cfg, HPET_T0_CFG + i*0x20);
   3.284 +    }
   3.285 +
   3.286 +    cfg = hpet_read32(HPET_CFG);
   3.287 +    cfg |= HPET_CFG_ENABLE;
   3.288 +    hpet_write32(cfg, HPET_CFG);
   3.289 +
   3.290 +    hpet_rate = 1000000000000000ULL; /* 10^15 */
   3.291 +    (void)do_div(hpet_rate, hpet_period);
   3.292 +
   3.293 +    return hpet_rate;
   3.294 +}
     4.1 --- a/xen/arch/x86/time.c	Tue May 20 14:50:45 2008 +0100
     4.2 +++ b/xen/arch/x86/time.c	Wed May 21 10:38:57 2008 +0100
     4.3 @@ -38,7 +38,6 @@ string_param("clocksource", opt_clocksou
     4.4  #define EPOCH MILLISECS(1000)
     4.5  
     4.6  unsigned long cpu_khz;  /* CPU clock frequency in kHz. */
     4.7 -unsigned long hpet_address;
     4.8  DEFINE_SPINLOCK(rtc_lock);
     4.9  unsigned long pit0_ticks;
    4.10  static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. */
    4.11 @@ -68,7 +67,8 @@ struct platform_timesource {
    4.12  
    4.13  static DEFINE_PER_CPU(struct cpu_time, cpu_time);
    4.14  
    4.15 -static u8 tsc_invariant=0;  /* TSC is invariant upon C state entry */
    4.16 +/* TSC is invariant on C state entry? */
    4.17 +static bool_t tsc_invariant;
    4.18  
    4.19  /*
    4.20   * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter.
    4.21 @@ -151,6 +151,9 @@ static void timer_interrupt(int irq, voi
    4.22  {
    4.23      ASSERT(local_irq_is_enabled());
    4.24  
    4.25 +    if ( hpet_legacy_irq_tick() )
    4.26 +        return;
    4.27 +
    4.28      /* Only for start-of-day interruopt tests in io_apic.c. */
    4.29      (*(volatile unsigned long *)&pit0_ticks)++;
    4.30  
    4.31 @@ -347,47 +350,10 @@ static u32 read_hpet_count(void)
    4.32  
    4.33  static int init_hpet(struct platform_timesource *pts)
    4.34  {
    4.35 -    u64 hpet_rate;
    4.36 -    u32 hpet_id, hpet_period, cfg;
    4.37 -    int i;
    4.38 -
    4.39 -    if ( hpet_address == 0 )
    4.40 -        return 0;
    4.41 -
    4.42 -    set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
    4.43 -
    4.44 -    hpet_id = hpet_read32(HPET_ID);
    4.45 -    if ( hpet_id == 0 )
    4.46 -    {
    4.47 -        printk("BAD HPET vendor id.\n");
    4.48 -        return 0;
    4.49 -    }
    4.50 +    u64 hpet_rate = hpet_setup();
    4.51  
    4.52 -    /* Check for sane period (100ps <= period <= 100ns). */
    4.53 -    hpet_period = hpet_read32(HPET_PERIOD);
    4.54 -    if ( (hpet_period > 100000000) || (hpet_period < 100000) )
    4.55 -    {
    4.56 -        printk("BAD HPET period %u.\n", hpet_period);
    4.57 +    if ( hpet_rate == 0 )
    4.58          return 0;
    4.59 -    }
    4.60 -
    4.61 -    cfg = hpet_read32(HPET_CFG);
    4.62 -    cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
    4.63 -    hpet_write32(cfg, HPET_CFG);
    4.64 -
    4.65 -    for ( i = 0; i <= ((hpet_id >> 8) & 31); i++ )
    4.66 -    {
    4.67 -        cfg = hpet_read32(HPET_T0_CFG + i*0x20);
    4.68 -        cfg &= ~HPET_TN_ENABLE;
    4.69 -        hpet_write32(cfg & ~HPET_TN_ENABLE, HPET_T0_CFG);
    4.70 -    }
    4.71 -
    4.72 -    cfg = hpet_read32(HPET_CFG);
    4.73 -    cfg |= HPET_CFG_ENABLE;
    4.74 -    hpet_write32(cfg, HPET_CFG);
    4.75 -
    4.76 -    hpet_rate = 1000000000000000ULL; /* 10^15 */
    4.77 -    (void)do_div(hpet_rate, hpet_period);
    4.78  
    4.79      pts->name = "HPET";
    4.80      pts->frequency = hpet_rate;
    4.81 @@ -1041,7 +1007,14 @@ static int __init disable_pit_irq(void)
    4.82          outb_p(0x30, PIT_MODE);
    4.83          outb_p(0, PIT_CH0);
    4.84          outb_p(0, PIT_CH0);
    4.85 +
    4.86 +        /*
    4.87 +         * If we do not rely on PIT CH0 then we can use HPET for one-shot
    4.88 +         * timer emulation when entering deep C states.
    4.89 +         */
    4.90 +        hpet_broadcast_init();
    4.91      }
    4.92 +
    4.93      return 0;
    4.94  }
    4.95  __initcall(disable_pit_irq);
     5.1 --- a/xen/common/timer.c	Tue May 20 14:50:45 2008 +0100
     5.2 +++ b/xen/common/timer.c	Wed May 21 10:38:57 2008 +0100
     5.3 @@ -35,7 +35,7 @@ struct timers {
     5.4  
     5.5  static DEFINE_PER_CPU(struct timers, timers);
     5.6  
     5.7 -extern int reprogram_timer(s_time_t timeout);
     5.8 +DEFINE_PER_CPU(s_time_t, timer_deadline);
     5.9  
    5.10  /****************************************************************************
    5.11   * HEAP OPERATIONS.
    5.12 @@ -323,8 +323,10 @@ static void timer_softirq_action(void)
    5.13          }
    5.14  
    5.15          ts->running = NULL;
    5.16 +
    5.17 +        this_cpu(timer_deadline) = GET_HEAP_SIZE(heap) ? heap[1]->expires : 0;
    5.18      }
    5.19 -    while ( !reprogram_timer(GET_HEAP_SIZE(heap) ? heap[1]->expires : 0) );
    5.20 +    while ( !reprogram_timer(this_cpu(timer_deadline)) );
    5.21  
    5.22      spin_unlock_irq(&ts->lock);
    5.23  }
     6.1 --- a/xen/include/asm-x86/hpet.h	Tue May 20 14:50:45 2008 +0100
     6.2 +++ b/xen/include/asm-x86/hpet.h	Wed May 21 10:38:57 2008 +0100
     6.3 @@ -49,4 +49,24 @@
     6.4  #define hpet_write32(y,x) \
     6.5      (*(volatile u32 *)(fix_to_virt(FIX_HPET_BASE) + (x)) = (y))
     6.6  
     6.7 +/*
     6.8 + * Detect and initialise HPET hardware: return counter update frequency.
     6.9 + * Return value is zero if HPET is unavailable.
    6.10 + */
    6.11 +u64 hpet_setup(void);
    6.12 +
    6.13 +/*
    6.14 + * Callback from legacy timer (PIT channel 0) IRQ handler.
    6.15 + * Returns 1 if tick originated from HPET; else 0.
    6.16 + */
    6.17 +int hpet_legacy_irq_tick(void);
    6.18 +
    6.19 +/*
    6.20 + * Temporarily use an HPET event counter for timer interrupt handling,
    6.21 + * rather than using the LAPIC timer. Used for Cx state entry.
    6.22 + */
    6.23 +void hpet_broadcast_init(void);
    6.24 +void hpet_broadcast_enter(void);
    6.25 +void hpet_broadcast_exit(void);
    6.26 +
    6.27  #endif /* __X86_HPET_H__ */
     7.1 --- a/xen/include/xen/timer.h	Tue May 20 14:50:45 2008 +0100
     7.2 +++ b/xen/include/xen/timer.h	Wed May 21 10:38:57 2008 +0100
     7.3 @@ -99,6 +99,15 @@ extern void process_pending_timers(void)
     7.4   */
     7.5  extern void timer_init(void);
     7.6  
     7.7 +/*
     7.8 + * Next timer deadline for each CPU.
     7.9 + * Modified only by the local CPU and never in interrupt context.
    7.10 + */
    7.11 +DECLARE_PER_CPU(s_time_t, timer_deadline);
    7.12 +
    7.13 +/* Arch-defined function to reprogram timer hardware for new deadline. */
    7.14 +extern int reprogram_timer(s_time_t timeout);
    7.15 +
    7.16  #endif /* _TIMER_H_ */
    7.17  
    7.18  /*