ia64/xen-unstable

changeset 17548:520519f5e346

Add ACPI C3 support for x86.

C3 and deeper C-states need either ARB_DIS (bus-master arbitration
disable) or a cache flush. ARB_DIS must be performed by the last CPU
that is ready to enter C3. Bus-master activity is checked and recorded
to drive C-state promotion/demotion decisions.

C3 is disabled by default until the TSC/APIC-stop issues are resolved.
Adding the command-line option "max_cstate=3" enables C3.
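
For example, with GRUB legacy the option is appended to the hypervisor
line in menu.lst (an illustrative entry; kernel paths and the remaining
options will vary by installation):

    title Xen (C3 enabled)
        kernel /boot/xen.gz max_cstate=3
        module /boot/vmlinuz-2.6.18-xen console=tty0
        module /boot/initrd-2.6.18-xen.img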

Signed-off-by: Wei Gang <gang.wei@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu May 01 10:46:59 2008 +0100 (2008-05-01)
parents 93bc6d9b5f31
children 4aec1797720f
files xen/arch/x86/acpi/cpu_idle.c
line diff
     1.1 --- a/xen/arch/x86/acpi/cpu_idle.c	Thu May 01 10:41:51 2008 +0100
     1.2 +++ b/xen/arch/x86/acpi/cpu_idle.c	Thu May 01 10:46:59 2008 +0100
     1.3 @@ -60,6 +60,16 @@ extern void (*pm_idle) (void);
     1.4  static void (*pm_idle_save) (void) __read_mostly;
     1.5  unsigned int max_cstate __read_mostly = 2;
     1.6  integer_param("max_cstate", max_cstate);
     1.7 +/*
     1.8 + * bm_history -- bit-mask with a bit per jiffy of bus-master activity
     1.9 + * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms
    1.10 + * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms
    1.11 + * 100 HZ: 0x0000000F: 4 jiffies = 40ms
    1.12 + * reduce history for more aggressive entry into C3
    1.13 + */
    1.14 +unsigned int bm_history __read_mostly =
    1.15 +    (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1));
    1.16 +integer_param("bm_history", bm_history);
    1.17  
    1.18  struct acpi_processor_cx;
    1.19  
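
(Not part of the patch: a standalone sketch reproducing the default
bm_history initializer above for a few HZ values, to make the 32ms/40ms
arithmetic in the comment concrete. The helper name default_bm_history
is hypothetical.)

    #include <stdio.h>

    /* Mirrors the initializer: a full 32-bit mask at HZ >= 800,
     * otherwise HZ/25 bits, i.e. HZ/25 jiffies = 40ms of history. */
    static unsigned int default_bm_history(unsigned int hz)
    {
        return (hz >= 800) ? 0xFFFFFFFF : ((1U << (hz / 25)) - 1);
    }

    int main(void)
    {
        static const unsigned int hz[] = { 1000, 800, 250, 100 };
        unsigned int i;

        for ( i = 0; i < sizeof(hz) / sizeof(hz[0]); i++ )
            printf("HZ=%4u -> bm_history=0x%08X\n",
                   hz[i], default_bm_history(hz[i]));
        /* HZ=1000 -> 0xFFFFFFFF (32 jiffies = 32ms)
         * HZ= 800 -> 0xFFFFFFFF (32 jiffies = 40ms)
         * HZ= 250 -> 0x000003FF (10 jiffies = 40ms)
         * HZ= 100 -> 0x0000000F ( 4 jiffies = 40ms) */
        return 0;
    }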
    1.20 @@ -91,10 +101,20 @@ struct acpi_processor_cx
    1.21      struct acpi_processor_cx_policy demotion;
    1.22  };
    1.23  
    1.24 +struct acpi_processor_flags
    1.25 +{
    1.26 +    u8 bm_control:1;
    1.27 +    u8 bm_check:1;
    1.28 +    u8 has_cst:1;
    1.29 +    u8 power_setup_done:1;
    1.30 +    u8 bm_rld_set:1;
    1.31 +};
    1.32 +
    1.33  struct acpi_processor_power
    1.34  {
    1.35 +    struct acpi_processor_flags flags;
    1.36      struct acpi_processor_cx *state;
    1.37 -    u64 bm_check_timestamp;
    1.38 +    s_time_t bm_check_timestamp;
    1.39      u32 default_state;
    1.40      u32 bm_activity;
    1.41      u32 count;
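
(For reference, the new flags follow the conventions of the
corresponding Linux ACPI code: bm_check -- bus-master activity must be
polled via BM_STS before entering C3; bm_control -- ARB_DIS
(bus-master arbitration disable) is available; has_cst -- the C-state
data came from an ACPI _CST object; power_setup_done -- one-time setup
has completed; bm_rld_set -- cached state of the BM_RLD bit.)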
    1.42 @@ -185,6 +205,29 @@ static void acpi_processor_power_activat
    1.43          old->promotion.count = 0;
    1.44      new->demotion.count = 0;
    1.45  
    1.46 +    /* Cleanup from old state. */
    1.47 +    if ( old )
    1.48 +    {
    1.49 +        switch ( old->type )
    1.50 +        {
    1.51 +        case ACPI_STATE_C3:
    1.52 +            /* Disable bus master reload */
    1.53 +            if ( new->type != ACPI_STATE_C3 && power->flags.bm_check )
    1.54 +                acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
    1.55 +            break;
    1.56 +        }
    1.57 +    }
    1.58 +
    1.59 +    /* Prepare to use new state. */
    1.60 +    switch ( new->type )
    1.61 +    {
    1.62 +    case ACPI_STATE_C3:
    1.63 +        /* Enable bus master reload */
    1.64 +        if ( (!old || old->type != ACPI_STATE_C3) && power->flags.bm_check )
    1.65 +            acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
    1.66 +        break;
    1.67 +    }
    1.68 +
    1.69      power->state = new;
    1.70  
    1.71      return;
    1.72 @@ -196,7 +239,7 @@ static void acpi_safe_halt(void)
    1.73      safe_halt();
    1.74  }
    1.75  
    1.76 -#define MWAIT_ECX_INTERRUPT_BREAK	(0x1)
    1.77 +#define MWAIT_ECX_INTERRUPT_BREAK   (0x1)
    1.78  
    1.79  static void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
    1.80  {
    1.81 @@ -229,6 +272,8 @@ static void acpi_idle_do_entry(struct ac
    1.82      }
    1.83  }
    1.84  
    1.85 +static atomic_t c3_cpu_count;
    1.86 +
    1.87  static void acpi_processor_idle(void)
    1.88  {
    1.89      struct acpi_processor_power *power = NULL;
    1.90 @@ -261,6 +306,62 @@ static void acpi_processor_idle(void)
    1.91      }
    1.92  
    1.93      /*
    1.94 +     * Check BM Activity
    1.95 +     * -----------------
    1.96 +     * Check for bus mastering activity (if required), record, and check
    1.97 +     * for demotion.
    1.98 +     */
    1.99 +    if ( power->flags.bm_check )
   1.100 +    {
   1.101 +        u32 bm_status = 0;
   1.102 +        unsigned long diff = (NOW() - power->bm_check_timestamp) >> 23;
   1.103 +
   1.104 +        if ( diff > 31 )
   1.105 +            diff = 31;
   1.106 +
   1.107 +        power->bm_activity <<= diff;
   1.108 +
   1.109 +        acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
   1.110 +        if ( bm_status )
   1.111 +        {
   1.112 +            power->bm_activity |= 0x1;
   1.113 +            acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
   1.114 +        }
   1.115 +        /*
   1.116 +         * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
   1.117 +         * the true state of bus-mastering activity, forcing us to
   1.118 +         * manually check the BMIDEA bit of each IDE channel.
   1.119 +         */
   1.120 +        /*else if ( errata.piix4.bmisx )
   1.121 +        {
   1.122 +            if ( (inb_p(errata.piix4.bmisx + 0x02) & 0x01)
   1.123 +                || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01) )
   1.124 +                power->bm_activity |= 0x1;
   1.125 +        }*/
   1.126 +
   1.127 +        power->bm_check_timestamp = NOW();
   1.128 +
   1.129 +        /*
   1.130 +         * If bus mastering is or was active this jiffy, demote
   1.131 +         * to avoid a faulty transition.  Note that the processor
   1.132 +         * won't enter a low-power state during this call (to this
   1.133 +         * function) but should upon the next.
   1.134 +         *
   1.135 +         * TBD: A better policy might be to fall back to the demotion
   1.136 +         *      state (use it for this quantum only) instead of
   1.137 +         *      demoting -- and rely on duration as our sole demotion
   1.138 +         *      qualification.  This may, however, introduce DMA
   1.139 +         *      issues (e.g. floppy DMA transfer overrun/underrun).
   1.140 +         */
   1.141 +        if ( (power->bm_activity & 0x1) && cx->demotion.threshold.bm )
   1.142 +        {
   1.143 +            local_irq_enable();
   1.144 +            next_state = cx->demotion.state;
   1.145 +            goto end;
   1.146 +        }
   1.147 +    }
   1.148 +
   1.149 +    /*
   1.150       * Sleep:
   1.151       * ------
   1.152       * Invoke the current Cx state to put the processor to sleep.
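
(Aside on the shift above: NOW() returns nanoseconds, and 2^23 ns =
8388608 ns, roughly 8.4 ms, so "diff" counts ~8 ms intervals -- about
one jiffy at the default HZ=100 -- without needing a division; the
history mask is then aged by at most 31 bit positions.)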
   1.153 @@ -303,6 +404,73 @@ static void acpi_processor_idle(void)
   1.154          sleep_ticks =
   1.155              ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
   1.156          break;
   1.157 +
   1.158 +    case ACPI_STATE_C3:
   1.159 +        /*
   1.160 +         * disable bus master
   1.161 +         * bm_check implies we need ARB_DIS
   1.162 +         * !bm_check implies we need cache flush
   1.163 +         * bm_control implies whether we can do ARB_DIS
   1.164 +         *
   1.165 +         * That leaves a case where bm_check is set and bm_control is
   1.166 +         * not set. In that case we cannot do much, we enter C3
   1.167 +         * without doing anything.
   1.168 +         */
   1.169 +        if ( power->flags.bm_check && power->flags.bm_control )
   1.170 +        {
   1.171 +            atomic_inc(&c3_cpu_count);
   1.172 +            if ( atomic_read(&c3_cpu_count) == num_online_cpus() )
   1.173 +            {
   1.174 +                /*
   1.175 +                 * All CPUs are trying to go to C3
   1.176 +                 * Disable bus master arbitration
   1.177 +                 */
   1.178 +                acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
   1.179 +            }
   1.180 +        }
   1.181 +        else if ( !power->flags.bm_check )
   1.182 +        {
   1.183 +            /* SMP with no shared cache... Invalidate cache  */
   1.184 +            ACPI_FLUSH_CPU_CACHE();
   1.185 +        }
   1.186 +
   1.187 +        /* Get start time (ticks) */
   1.188 +        t1 = inl(pmtmr_ioport);
   1.189 +
   1.190 +        /*
   1.191 +         * FIXME: Before invoking C3, be aware that TSC/APIC timer may be 
   1.192 +         * stopped by H/W. Without carefully handling of TSC/APIC stop issues,
   1.193 +         * deep C state can't work correctly.
   1.194 +         */
   1.195 +        /* placeholder for preparing TSC stop */
   1.196 +
   1.197 +        /* placeholder for preparing APIC stop */
   1.198 +
   1.199 +        /* Invoke C3 */
   1.200 +        acpi_idle_do_entry(cx);
   1.201 +
   1.202 +        /* placeholder for recovering APIC */
   1.203 +
   1.204 +        /* placeholder for recovering TSC */
   1.205 +
   1.206 +        /* Get end time (ticks) */
   1.207 +        t2 = inl(pmtmr_ioport);
   1.208 +        if ( power->flags.bm_check && power->flags.bm_control )
   1.209 +        {
   1.210 +            /* Enable bus master arbitration */
   1.211 +            atomic_dec(&c3_cpu_count);
   1.212 +            acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
   1.213 +        }
   1.214 +
   1.215 +        /* Compute time (ticks) that we were actually asleep */
   1.216 +        sleep_ticks = ticks_elapsed(t1, t2);
   1.217 +        /* Re-enable interrupts */
   1.218 +        local_irq_enable();
   1.219 +        /* Do not account our idle-switching overhead: */
   1.220 +        sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
   1.221 +
   1.222 +        break;
   1.223 +
   1.224      default:
   1.225          local_irq_enable();
   1.226          return;
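
(This is the "last CPU" rule from the description: each CPU entering C3
increments c3_cpu_count, and only the CPU that brings the count up to
num_online_cpus() disables bus-master arbitration; on wakeup every CPU
decrements the count and clears ARB_DIS again, so DMA is blocked only
while all online CPUs are idle simultaneously.)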
   1.227 @@ -331,8 +499,19 @@ static void acpi_processor_idle(void)
   1.228              cx->demotion.count = 0;
   1.229              if ( cx->promotion.count >= cx->promotion.threshold.count )
   1.230              {
   1.231 -                next_state = cx->promotion.state;
   1.232 -                goto end;
   1.233 +                if ( power->flags.bm_check )
   1.234 +                {
   1.235 +                    if ( !(power->bm_activity & cx->promotion.threshold.bm) )
   1.236 +                    {
   1.237 +                        next_state = cx->promotion.state;
   1.238 +                        goto end;
   1.239 +                    }
   1.240 +                }
   1.241 +                else
   1.242 +                {
   1.243 +                    next_state = cx->promotion.state;
   1.244 +                    goto end;
   1.245 +                }
   1.246              }
   1.247          }
   1.248      }
   1.249 @@ -425,6 +604,8 @@ static int acpi_processor_set_power_poli
   1.250              cx->demotion.state = lower;
   1.251              cx->demotion.threshold.ticks = cx->latency_ticks;
   1.252              cx->demotion.threshold.count = 1;
   1.253 +            if ( cx->type == ACPI_STATE_C3 )
   1.254 +                cx->demotion.threshold.bm = bm_history;
   1.255          }
   1.256  
   1.257          lower = cx;
   1.258 @@ -445,6 +626,8 @@ static int acpi_processor_set_power_poli
   1.259                  cx->promotion.threshold.count = 4;
   1.260              else
   1.261                  cx->promotion.threshold.count = 10;
   1.262 +            if ( higher->type == ACPI_STATE_C3 )
   1.263 +                cx->promotion.threshold.bm = bm_history;
   1.264          }
   1.265  
   1.266          higher = cx;
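
(Net effect of the two hunks above: a single jiffy of bus-master
activity is enough to demote out of C3, while promotion into C3
additionally requires bm_activity to have no bits set within the
bm_history mask, i.e. no bus-master activity for the whole history
window -- 40ms with the default mask at HZ=100.)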
   1.267 @@ -511,11 +694,40 @@ static int acpi_processor_ffh_cstate_pro
   1.268      return 0;
   1.269  }
   1.270  
   1.271 +/*
   1.272 + * Initialize bm_flags based on the CPU cache properties
   1.273 + * On SMP, it depends on the cache configuration:
   1.274 + * - When the cache is not shared among all CPUs, we flush the
   1.275 + *   cache before entering C3.
   1.276 + * - When the cache is shared among all CPUs, we use the bm_check
   1.277 + *   mechanism, as in the UP case.
   1.278 + *
   1.279 + * This routine is called only after all the CPUs are online
   1.280 + */
   1.281 +static void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags)
   1.282 +{
   1.283 +    struct cpuinfo_x86 *c = &current_cpu_data;
   1.284 +
   1.285 +    flags->bm_check = 0;
   1.286 +    if ( num_online_cpus() == 1 )
   1.287 +        flags->bm_check = 1;
   1.288 +    else if ( c->x86_vendor == X86_VENDOR_INTEL )
   1.289 +    {
   1.290 +        /*
   1.291 +         * Today all CPUs that support C3 share cache.
   1.292 +         * TBD: This needs to look at the cache shared map, once the
   1.293 +         * multi-core detection patch makes it into the base.
   1.294 +         */
   1.295 +        flags->bm_check = 1;
   1.296 +    }
   1.297 +}
   1.298 +
   1.299  #define VENDOR_INTEL                   (1)
   1.300  #define NATIVE_CSTATE_BEYOND_HALT      (2)
   1.301  
   1.302 -static int check_cx(xen_processor_cx_t *cx)
   1.303 +static int check_cx(struct acpi_processor_power *power, xen_processor_cx_t *cx)
   1.304  {
   1.305 +    static int bm_check_flag;
   1.306      if ( cx == NULL )
   1.307          return -EINVAL;
   1.308  
   1.309 @@ -543,6 +755,56 @@ static int check_cx(xen_processor_cx_t *
   1.310          return -ENODEV;
   1.311      }
   1.312  
   1.313 +    if ( cx->type == ACPI_STATE_C3 )
   1.314 +    {
   1.315 +    /* All the logic here assumes flags.bm_check is the same across all CPUs */
   1.316 +        if ( !bm_check_flag )
   1.317 +        {
   1.318 +            /* Determine whether bm_check is needed based on CPU  */
   1.319 +            acpi_processor_power_init_bm_check(&(power->flags));
   1.320 +            bm_check_flag = power->flags.bm_check;
   1.321 +        }
   1.322 +        else
   1.323 +        {
   1.324 +            power->flags.bm_check = bm_check_flag;
   1.325 +        }
   1.326 +
   1.327 +        if ( power->flags.bm_check )
   1.328 +        {
   1.329 +            if ( !power->flags.bm_control )
   1.330 +            {
   1.331 +                if ( power->flags.has_cst != 1 )
   1.332 +                {
   1.333 +                    /* bus mastering control is necessary */
   1.334 +                    ACPI_DEBUG_PRINT((ACPI_DB_INFO,
   1.335 +                        "C3 support requires BM control\n"));
   1.336 +                    return -1;
   1.337 +                }
   1.338 +                else
   1.339 +                {
   1.340 +                    /* Here we enter C3 without bus mastering */
   1.341 +                    ACPI_DEBUG_PRINT((ACPI_DB_INFO,
   1.342 +                        "C3 support without BM control\n"));
   1.343 +                }
   1.344 +            }
   1.345 +        }
   1.346 +        else
   1.347 +        {
   1.348 +            /*
   1.349 +             * The WBINVD flag should be set in the FADT for the C3
   1.350 +             * state to be supported when bm_check is not required.
   1.351 +             */
   1.352 +            if ( !(acpi_gbl_FADT.flags & ACPI_FADT_WBINVD) )
   1.353 +            {
   1.354 +                ACPI_DEBUG_PRINT((ACPI_DB_INFO,
   1.355 +                          "Cache invalidation should work properly"
   1.356 +                          " for C3 to be enabled on SMP systems\n"));
   1.357 +                return -1;
   1.358 +            }
   1.359 +            acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
   1.360 +        }
   1.361 +    }
   1.362 +
   1.363      return 0;
   1.364  }
   1.365  
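
(The C3 validation above reduces to the following decision table,
derived from the code; "-" means don't-care:

    bm_check  bm_control  has_cst  FADT WBINVD | outcome
       1          1          -          -      | C3 OK; ARB_DIS protocol at entry
       1          0          1          -      | C3 OK; entered without BM control
       1          0          0          -      | C3 rejected: BM control required
       0          -          -          1      | C3 OK; cache flush before entry
       0          -          -          0      | C3 rejected: WBINVD unusable)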
   1.366 @@ -552,7 +814,7 @@ static int set_cx(struct acpi_processor_
   1.367      struct acpi_processor_cx *cx;
   1.368  
   1.369      /* skip unsupported acpi cstate */
   1.370 -    if ( check_cx(xen_cx) )
   1.371 +    if ( check_cx(acpi_power, xen_cx) )
   1.372          return -EFAULT;
   1.373  
   1.374      cx = &acpi_power->states[xen_cx->type];
   1.375 @@ -663,6 +925,10 @@ long set_cx_pminfo(uint32_t cpu, struct 
   1.376  
   1.377      init_cx_pminfo(acpi_power);
   1.378  
   1.379 +    acpi_power->flags.bm_check = power->flags.bm_check;
   1.380 +    acpi_power->flags.bm_control = power->flags.bm_control;
   1.381 +    acpi_power->flags.has_cst = power->flags.has_cst;
   1.382 +
   1.383      states = power->states;
   1.384  
   1.385      for ( i = 0; i < power->count; i++ )
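
(Note: set_cx_pminfo() is the path by which this C-state data,
including the flags copied above, reaches Xen; the hypervisor does not
evaluate ACPI _CST itself but relies on dom0 to parse the tables and
upload the result through a platform hypercall.)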