ia64/xen-unstable

changeset 18473:706844309f36

CPUIDLE: Port Linux menu governor to replace the initial ladder governor

The ladder governor suffers from long promotion/demotion delays when
used in tickless mode, because it has to count usage before changing
state. The menu governor instead chooses the next state directly from a
break-event prediction based on factors such as the next timer event
and the last residency time, so it responds faster.

Signed-off-by: Gang Wei <gang.wei@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Sep 10 11:18:36 2008 +0100 (2008-09-10)
parents cfbe4df8d47c
children f5e72cbfbb17
files xen/arch/x86/acpi/Makefile xen/arch/x86/acpi/cpu_idle.c xen/arch/x86/acpi/cpuidle_menu.c xen/include/xen/cpuidle.h
line diff
     1.1 --- a/xen/arch/x86/acpi/Makefile	Wed Sep 10 11:17:13 2008 +0100
     1.2 +++ b/xen/arch/x86/acpi/Makefile	Wed Sep 10 11:18:36 2008 +0100
     1.3 @@ -1,5 +1,5 @@
     1.4  subdir-y += cpufreq
     1.5  
     1.6  obj-y += boot.o
     1.7 -obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o
     1.8 +obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o cpuidle_menu.o
     1.9  obj-y += pmstat.o
     2.1 --- a/xen/arch/x86/acpi/cpu_idle.c	Wed Sep 10 11:17:13 2008 +0100
     2.2 +++ b/xen/arch/x86/acpi/cpu_idle.c	Wed Sep 10 11:18:36 2008 +0100
     2.3 @@ -39,6 +39,7 @@
     2.4  #include <xen/smp.h>
     2.5  #include <xen/guest_access.h>
     2.6  #include <xen/keyhandler.h>
     2.7 +#include <xen/cpuidle.h>
     2.8  #include <asm/cache.h>
     2.9  #include <asm/io.h>
    2.10  #include <asm/hpet.h>
    2.11 @@ -49,13 +50,10 @@
    2.12  #define DEBUG_PM_CX
    2.13  
    2.14  #define US_TO_PM_TIMER_TICKS(t)     ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
    2.15 +#define PM_TIMER_TICKS_TO_US(t)     ((t * 1000) / (PM_TIMER_FREQUENCY / 1000))
    2.16  #define C2_OVERHEAD         4   /* 1us (3.579 ticks per us) */
    2.17  #define C3_OVERHEAD         4   /* 1us (3.579 ticks per us) */
    2.18  
    2.19 -#define ACPI_PROCESSOR_MAX_POWER        8
    2.20 -#define ACPI_PROCESSOR_MAX_C2_LATENCY   100
    2.21 -#define ACPI_PROCESSOR_MAX_C3_LATENCY   1000
    2.22 -
    2.23  static void (*lapic_timer_off)(void);
    2.24  static void (*lapic_timer_on)(void);
    2.25  
    2.26 @@ -65,66 +63,6 @@ extern void (*pm_idle) (void);
    2.27  static void (*pm_idle_save) (void) __read_mostly;
    2.28  unsigned int max_cstate __read_mostly = 2;
    2.29  integer_param("max_cstate", max_cstate);
    2.30 -/*
    2.31 - * bm_history -- bit-mask with a bit per jiffy of bus-master activity
    2.32 - * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms
    2.33 - * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms
    2.34 - * 100 HZ: 0x0000000F: 4 jiffies = 40ms
    2.35 - * reduce history for more aggressive entry into C3
    2.36 - */
    2.37 -unsigned int bm_history __read_mostly =
    2.38 -    (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1));
    2.39 -integer_param("bm_history", bm_history);
    2.40 -
    2.41 -struct acpi_processor_cx;
    2.42 -
    2.43 -struct acpi_processor_cx_policy
    2.44 -{
    2.45 -    u32 count;
    2.46 -    struct acpi_processor_cx *state;
    2.47 -    struct
    2.48 -    {
    2.49 -        u32 time;
    2.50 -        u32 ticks;
    2.51 -        u32 count;
    2.52 -        u32 bm;
    2.53 -    } threshold;
    2.54 -};
    2.55 -
    2.56 -struct acpi_processor_cx
    2.57 -{
    2.58 -    u8 valid;
    2.59 -    u8 type;
    2.60 -    u32 address;
    2.61 -    u8 space_id;
    2.62 -    u32 latency;
    2.63 -    u32 latency_ticks;
    2.64 -    u32 power;
    2.65 -    u32 usage;
    2.66 -    u64 time;
    2.67 -    struct acpi_processor_cx_policy promotion;
    2.68 -    struct acpi_processor_cx_policy demotion;
    2.69 -};
    2.70 -
    2.71 -struct acpi_processor_flags
    2.72 -{
    2.73 -    u8 bm_control:1;
    2.74 -    u8 bm_check:1;
    2.75 -    u8 has_cst:1;
    2.76 -    u8 power_setup_done:1;
    2.77 -    u8 bm_rld_set:1;
    2.78 -};
    2.79 -
    2.80 -struct acpi_processor_power
    2.81 -{
    2.82 -    struct acpi_processor_flags flags;
    2.83 -    struct acpi_processor_cx *state;
    2.84 -    s_time_t bm_check_timestamp;
    2.85 -    u32 default_state;
    2.86 -    u32 bm_activity;
    2.87 -    u32 count;
    2.88 -    struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
    2.89 -};
    2.90  
    2.91  static struct acpi_processor_power processor_powers[NR_CPUS];
    2.92  
    2.93 @@ -133,26 +71,21 @@ static void print_acpi_power(uint32_t cp
    2.94      uint32_t i;
    2.95  
    2.96      printk("==cpu%d==\n", cpu);
    2.97 -    printk("active state:\t\tC%d\n", (power->state)?power->state->type:-1);
    2.98 +    printk("active state:\t\tC%d\n",
    2.99 +           (power->last_state) ? power->last_state->type : -1);
   2.100      printk("max_cstate:\t\tC%d\n", max_cstate);
   2.101 -    printk("bus master activity:\t%08x\n", power->bm_activity);
   2.102      printk("states:\n");
   2.103      
   2.104      for ( i = 1; i < power->count; i++ )
   2.105      {
   2.106 -        printk((power->states[i].type == power->state->type) ? "   *" : "    ");
   2.107 +        if ( power->last_state && 
   2.108 +             power->states[i].type == power->last_state->type )
   2.109 +            printk("   *");
   2.110 +        else
   2.111 +            printk("    ");
   2.112          printk("C%d:\t\t", i);
   2.113          printk("type[C%d] ", power->states[i].type);
   2.114 -        if ( power->states[i].promotion.state )
   2.115 -            printk("promotion[C%d] ", power->states[i].promotion.state->type);
   2.116 -        else
   2.117 -            printk("promotion[--] ");
   2.118 -        if ( power->states[i].demotion.state )
   2.119 -            printk("demotion[C%d] ", power->states[i].demotion.state->type);
   2.120 -        else
   2.121 -            printk("demotion[--] ");
   2.122 -        printk("latency[%03d]\n ", power->states[i].latency);
   2.123 -        printk("\t\t\t");
   2.124 +        printk("latency[%03d] ", power->states[i].latency);
   2.125          printk("usage[%08d] ", power->states[i].usage);
   2.126          printk("duration[%"PRId64"]\n", power->states[i].time);
   2.127      }
   2.128 @@ -182,48 +115,6 @@ static inline u32 ticks_elapsed(u32 t1, 
   2.129          return ((0xFFFFFFFF - t1) + t2);
   2.130  }
   2.131  
   2.132 -static void acpi_processor_power_activate(struct acpi_processor_power *power,
   2.133 -                                          struct acpi_processor_cx *new)
   2.134 -{
   2.135 -    struct acpi_processor_cx *old;
   2.136 -
   2.137 -    if ( !power || !new )
   2.138 -        return;
   2.139 -
   2.140 -    old = power->state;
   2.141 -
   2.142 -    if ( old )
   2.143 -        old->promotion.count = 0;
   2.144 -    new->demotion.count = 0;
   2.145 -
   2.146 -    /* Cleanup from old state. */
   2.147 -    if ( old )
   2.148 -    {
   2.149 -        switch ( old->type )
   2.150 -        {
   2.151 -        case ACPI_STATE_C3:
   2.152 -            /* Disable bus master reload */
   2.153 -            if ( new->type != ACPI_STATE_C3 && power->flags.bm_check )
   2.154 -                acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
   2.155 -            break;
   2.156 -        }
   2.157 -    }
   2.158 -
   2.159 -    /* Prepare to use new state. */
   2.160 -    switch ( new->type )
   2.161 -    {
   2.162 -    case ACPI_STATE_C3:
   2.163 -        /* Enable bus master reload */
   2.164 -        if ( old->type != ACPI_STATE_C3 && power->flags.bm_check )
   2.165 -            acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
   2.166 -        break;
   2.167 -    }
   2.168 -
   2.169 -    power->state = new;
   2.170 -
   2.171 -    return;
   2.172 -}
   2.173 -
   2.174  static void acpi_safe_halt(void)
   2.175  {
   2.176      smp_mb__after_clear_bit();
   2.177 @@ -263,6 +154,40 @@ static void acpi_idle_do_entry(struct ac
   2.178      }
   2.179  }
   2.180  
   2.181 +static inline void acpi_idle_update_bm_rld(struct acpi_processor_power *power,
   2.182 +                                           struct acpi_processor_cx *target)
   2.183 +{
   2.184 +    if ( !power->flags.bm_check )
   2.185 +        return;
   2.186 +
   2.187 +    if ( power->flags.bm_rld_set && target->type != ACPI_STATE_C3 )
   2.188 +    {
   2.189 +        acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
   2.190 +        power->flags.bm_rld_set = 0;
   2.191 +    }
   2.192 +
   2.193 +    if ( !power->flags.bm_rld_set && target->type == ACPI_STATE_C3 )
   2.194 +    {
   2.195 +        acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
   2.196 +        power->flags.bm_rld_set = 1;
   2.197 +    }
   2.198 +}
   2.199 +
   2.200 +static int acpi_idle_bm_check(void)
   2.201 +{
   2.202 +    u32 bm_status = 0;
   2.203 +
   2.204 +    acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
   2.205 +    if ( bm_status )
   2.206 +        acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
   2.207 +    /*
   2.208 +     * TBD: PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
   2.209 +     * the true state of bus mastering activity; forcing us to
   2.210 +     * manually check the BMIDEA bit of each IDE channel.
   2.211 +     */
   2.212 +    return bm_status;
   2.213 +}
   2.214 +
   2.215  static struct {
   2.216      spinlock_t lock;
   2.217      unsigned int count;
   2.218 @@ -272,7 +197,7 @@ static void acpi_processor_idle(void)
   2.219  {
   2.220      struct acpi_processor_power *power = NULL;
   2.221      struct acpi_processor_cx *cx = NULL;
   2.222 -    struct acpi_processor_cx *next_state = NULL;
   2.223 +    int next_state;
   2.224      int sleep_ticks = 0;
   2.225      u32 t1, t2 = 0;
   2.226  
   2.227 @@ -290,7 +215,16 @@ static void acpi_processor_idle(void)
   2.228          return;
   2.229      }
   2.230  
   2.231 -    cx = power->state;
   2.232 +    next_state = cpuidle_current_governor->select(power);
   2.233 +    if ( next_state > 0 )
   2.234 +    {
   2.235 +        cx = &power->states[next_state];
   2.236 +        if ( power->flags.bm_check && acpi_idle_bm_check()
   2.237 +             && cx->type == ACPI_STATE_C3 )
   2.238 +            cx = power->safe_state;
   2.239 +        if ( cx->type > max_cstate )
   2.240 +            cx = &power->states[max_cstate];
   2.241 +    }
   2.242      if ( !cx )
   2.243      {
   2.244          if ( pm_idle_save )
   2.245 @@ -306,69 +240,14 @@ static void acpi_processor_idle(void)
   2.246          return;
   2.247      }
   2.248  
   2.249 -    /*
   2.250 -     * Check BM Activity
   2.251 -     * -----------------
   2.252 -     * Check for bus mastering activity (if required), record, and check
   2.253 -     * for demotion.
   2.254 -     */
   2.255 -    if ( power->flags.bm_check )
   2.256 -    {
   2.257 -        u32 bm_status = 0;
   2.258 -        unsigned long diff = (NOW() - power->bm_check_timestamp) >> 23;
   2.259 -
   2.260 -        if ( diff > 31 )
   2.261 -            diff = 31;
   2.262 -
   2.263 -        power->bm_activity <<= diff;
   2.264 -
   2.265 -        acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
   2.266 -        if ( bm_status )
   2.267 -        {
   2.268 -            power->bm_activity |= 0x1;
   2.269 -            acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
   2.270 -        }
   2.271 -        /*
   2.272 -         * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
   2.273 -         * the true state of bus mastering activity; forcing us to
   2.274 -         * manually check the BMIDEA bit of each IDE channel.
   2.275 -         */
   2.276 -        /*else if ( errata.piix4.bmisx )
   2.277 -        {
   2.278 -            if ( (inb_p(errata.piix4.bmisx + 0x02) & 0x01)
   2.279 -                || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01) )
   2.280 -                pr->power.bm_activity |= 0x1;
   2.281 -        }*/
   2.282 -
   2.283 -        power->bm_check_timestamp = NOW();
   2.284 -
   2.285 -        /*
   2.286 -         * If bus mastering is or was active this jiffy, demote
   2.287 -         * to avoid a faulty transition.  Note that the processor
   2.288 -         * won't enter a low-power state during this call (to this
   2.289 -         * function) but should upon the next.
   2.290 -         *
   2.291 -         * TBD: A better policy might be to fallback to the demotion
   2.292 -         *      state (use it for this quantum only) istead of
   2.293 -         *      demoting -- and rely on duration as our sole demotion
   2.294 -         *      qualification.  This may, however, introduce DMA
   2.295 -         *      issues (e.g. floppy DMA transfer overrun/underrun).
   2.296 -         */
   2.297 -        if ( (power->bm_activity & 0x1) && cx->demotion.threshold.bm )
   2.298 -        {
   2.299 -            local_irq_enable();
   2.300 -            next_state = cx->demotion.state;
   2.301 -            goto end;
   2.302 -        }
   2.303 -    }
   2.304 +    power->last_state = cx;
   2.305  
   2.306      /*
   2.307       * Sleep:
   2.308       * ------
   2.309       * Invoke the current Cx state to put the processor to sleep.
   2.310       */
   2.311 -    if ( cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3 )
   2.312 -        smp_mb__after_clear_bit();
   2.313 +    acpi_idle_update_bm_rld(power, cx);
   2.314  
   2.315      switch ( cx->type )
   2.316      {
   2.317 @@ -480,162 +359,13 @@ static void acpi_processor_idle(void)
   2.318  
   2.319      cx->usage++;
   2.320      if ( sleep_ticks > 0 )
   2.321 -        cx->time += sleep_ticks;
   2.322 -
   2.323 -    next_state = power->state;
   2.324 -
   2.325 -    /*
   2.326 -     * Promotion?
   2.327 -     * ----------
   2.328 -     * Track the number of longs (time asleep is greater than threshold)
   2.329 -     * and promote when the count threshold is reached.  Note that bus
   2.330 -     * mastering activity may prevent promotions.
   2.331 -     * Do not promote above max_cstate.
   2.332 -     */
   2.333 -    if ( cx->promotion.state &&
   2.334 -         ((cx->promotion.state - power->states) <= max_cstate) )
   2.335      {
   2.336 -        if ( sleep_ticks > cx->promotion.threshold.ticks )
   2.337 -        {
   2.338 -            cx->promotion.count++;
   2.339 -            cx->demotion.count = 0;
   2.340 -            if ( cx->promotion.count >= cx->promotion.threshold.count )
   2.341 -            {
   2.342 -                if ( power->flags.bm_check )
   2.343 -                {
   2.344 -                    if ( !(power->bm_activity & cx->promotion.threshold.bm) )
   2.345 -                    {
   2.346 -                        next_state = cx->promotion.state;
   2.347 -                        goto end;
   2.348 -                    }
   2.349 -                }
   2.350 -                else
   2.351 -                {
   2.352 -                    next_state = cx->promotion.state;
   2.353 -                    goto end;
   2.354 -                }
   2.355 -            }
   2.356 -        }
   2.357 -    }
   2.358 -
   2.359 -    /*
   2.360 -     * Demotion?
   2.361 -     * ---------
   2.362 -     * Track the number of shorts (time asleep is less than time threshold)
   2.363 -     * and demote when the usage threshold is reached.
   2.364 -     */
   2.365 -    if ( cx->demotion.state )
   2.366 -    {
   2.367 -        if ( sleep_ticks < cx->demotion.threshold.ticks )
   2.368 -        {
   2.369 -            cx->demotion.count++;
   2.370 -            cx->promotion.count = 0;
   2.371 -            if ( cx->demotion.count >= cx->demotion.threshold.count )
   2.372 -            {
   2.373 -                next_state = cx->demotion.state;
   2.374 -                goto end;
   2.375 -            }
   2.376 -        }
   2.377 -    }
   2.378 -
   2.379 -end:
   2.380 -    /*
   2.381 -     * Demote if current state exceeds max_cstate
   2.382 -     */
   2.383 -    if ( (power->state - power->states) > max_cstate )
   2.384 -    {
   2.385 -        if ( cx->demotion.state )
   2.386 -            next_state = cx->demotion.state;
   2.387 +        power->last_residency = PM_TIMER_TICKS_TO_US(sleep_ticks);
   2.388 +        cx->time += sleep_ticks;
   2.389      }
   2.390  
   2.391 -    /*
   2.392 -     * New Cx State?
   2.393 -     * -------------
   2.394 -     * If we're going to start using a new Cx state we must clean up
   2.395 -     * from the previous and prepare to use the new.
   2.396 -     */
   2.397 -    if ( next_state != power->state )
   2.398 -        acpi_processor_power_activate(power, next_state);
   2.399 -}
   2.400 -
   2.401 -static int acpi_processor_set_power_policy(struct acpi_processor_power *power)
   2.402 -{
   2.403 -    unsigned int i;
   2.404 -    unsigned int state_is_set = 0;
   2.405 -    struct acpi_processor_cx *lower = NULL;
   2.406 -    struct acpi_processor_cx *higher = NULL;
   2.407 -    struct acpi_processor_cx *cx;
   2.408 -
   2.409 -    if ( !power )
   2.410 -        return -EINVAL;
   2.411 -
   2.412 -    /*
   2.413 -     * This function sets the default Cx state policy (OS idle handler).
   2.414 -     * Our scheme is to promote quickly to C2 but more conservatively
   2.415 -     * to C3.  We're favoring C2  for its characteristics of low latency
   2.416 -     * (quick response), good power savings, and ability to allow bus
   2.417 -     * mastering activity.  Note that the Cx state policy is completely
   2.418 -     * customizable and can be altered dynamically.
   2.419 -     */
   2.420 -
   2.421 -    /* startup state */
   2.422 -    for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
   2.423 -    {
   2.424 -        cx = &power->states[i];
   2.425 -        if ( !cx->valid )
   2.426 -            continue;
   2.427 -
   2.428 -        if ( !state_is_set )
   2.429 -            power->state = cx;
   2.430 -        state_is_set++;
   2.431 -        break;
   2.432 -    }
   2.433 -
   2.434 -    if ( !state_is_set )
   2.435 -        return -ENODEV;
   2.436 -
   2.437 -    /* demotion */
   2.438 -    for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
   2.439 -    {
   2.440 -        cx = &power->states[i];
   2.441 -        if ( !cx->valid )
   2.442 -            continue;
   2.443 -
   2.444 -        if ( lower )
   2.445 -        {
   2.446 -            cx->demotion.state = lower;
   2.447 -            cx->demotion.threshold.ticks = cx->latency_ticks;
   2.448 -            cx->demotion.threshold.count = 1;
   2.449 -            if ( cx->type == ACPI_STATE_C3 )
   2.450 -                cx->demotion.threshold.bm = bm_history;
   2.451 -        }
   2.452 -
   2.453 -        lower = cx;
   2.454 -    }
   2.455 -
   2.456 -    /* promotion */
   2.457 -    for ( i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i-- )
   2.458 -    {
   2.459 -        cx = &power->states[i];
   2.460 -        if ( !cx->valid )
   2.461 -            continue;
   2.462 -
   2.463 -        if ( higher )
   2.464 -        {
   2.465 -            cx->promotion.state = higher;
   2.466 -            cx->promotion.threshold.ticks = cx->latency_ticks;
   2.467 -            if ( cx->type >= ACPI_STATE_C2 )
   2.468 -                cx->promotion.threshold.count = 4;
   2.469 -            else
   2.470 -                cx->promotion.threshold.count = 10;
   2.471 -            if ( higher->type == ACPI_STATE_C3 )
   2.472 -                cx->promotion.threshold.bm = bm_history;
   2.473 -        }
   2.474 -
   2.475 -        higher = cx;
   2.476 -    }
   2.477 -
   2.478 -    return 0;
   2.479 +    if ( cpuidle_current_governor->reflect )
   2.480 +        cpuidle_current_governor->reflect(power);
   2.481  }
   2.482  
   2.483  static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
   2.484 @@ -824,6 +554,8 @@ static int check_cx(struct acpi_processo
   2.485      return 0;
   2.486  }
   2.487  
   2.488 +static unsigned int latency_factor = 2;
   2.489 +
   2.490  static void set_cx(
   2.491      struct acpi_processor_power *acpi_power,
   2.492      xen_processor_cx_t *xen_cx)
   2.493 @@ -845,6 +577,9 @@ static void set_cx(
   2.494      cx->power    = xen_cx->power;
   2.495      
   2.496      cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
   2.497 +    cx->target_residency = cx->latency * latency_factor;
   2.498 +    if ( cx->type == ACPI_STATE_C1 || cx->type == ACPI_STATE_C2 )
   2.499 +        acpi_power->safe_state = cx;
   2.500  }
   2.501  
   2.502  int get_cpu_id(u8 acpi_id)
   2.503 @@ -939,6 +674,7 @@ long set_cx_pminfo(uint32_t cpu, struct 
   2.504  
   2.505      init_cx_pminfo(acpi_power);
   2.506  
   2.507 +    acpi_power->cpu = cpu_id;
   2.508      acpi_power->flags.bm_check = power->flags.bm_check;
   2.509      acpi_power->flags.bm_control = power->flags.bm_control;
   2.510      acpi_power->flags.has_cst = power->flags.has_cst;
   2.511 @@ -953,10 +689,11 @@ long set_cx_pminfo(uint32_t cpu, struct 
   2.512          set_cx(acpi_power, &xen_cx);
   2.513      }
   2.514  
   2.515 +    if ( cpuidle_current_governor->enable &&
   2.516 +         cpuidle_current_governor->enable(acpi_power) )
   2.517 +        return -EFAULT;
   2.518 +
   2.519      /* FIXME: C-state dependency is not supported by far */
   2.520 -    
   2.521 -    /* initialize default policy */
   2.522 -    acpi_processor_set_power_policy(acpi_power);
   2.523  
   2.524      print_acpi_power(cpu_id, acpi_power);
   2.525  
   2.526 @@ -981,7 +718,7 @@ int pmstat_get_cx_stat(uint32_t cpuid, s
   2.527      uint64_t usage;
   2.528      int i;
   2.529  
   2.530 -    stat->last = (power->state) ? power->state->type : 0;
   2.531 +    stat->last = (power->last_state) ? power->last_state->type : 0;
   2.532      stat->nr = processor_powers[cpuid].count;
   2.533      stat->idle_time = v->runstate.time[RUNSTATE_running];
   2.534      if ( v->is_running )
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/xen/arch/x86/acpi/cpuidle_menu.c	Wed Sep 10 11:18:36 2008 +0100
     3.3 @@ -0,0 +1,132 @@
     3.4 +/*
     3.5 + * cpuidle_menu - menu governor for cpu idle; the main idea comes from Linux.
     3.6 + *            drivers/cpuidle/governors/menu.c 
     3.7 + *
     3.8 + *  Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
     3.9 + *  Copyright (C) 2007, 2008 Intel Corporation
    3.10 + *
    3.11 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    3.12 + *
    3.13 + *  This program is free software; you can redistribute it and/or modify
    3.14 + *  it under the terms of the GNU General Public License as published by
    3.15 + *  the Free Software Foundation; either version 2 of the License, or (at
    3.16 + *  your option) any later version.
    3.17 + *
    3.18 + *  This program is distributed in the hope that it will be useful, but
    3.19 + *  WITHOUT ANY WARRANTY; without even the implied warranty of
    3.20 + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    3.21 + *  General Public License for more details.
    3.22 + *
    3.23 + *  You should have received a copy of the GNU General Public License along
    3.24 + *  with this program; if not, write to the Free Software Foundation, Inc.,
    3.25 + *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
    3.26 + *
    3.27 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    3.28 + */
    3.29 +#include <xen/config.h>
    3.30 +#include <xen/errno.h>
    3.31 +#include <xen/lib.h>
    3.32 +#include <xen/types.h>
    3.33 +#include <xen/acpi.h>
    3.34 +#include <xen/timer.h>
    3.35 +#include <xen/cpuidle.h>
    3.36 +
    3.37 +#define BREAK_FUZZ      4       /* 4 us */
    3.38 +#define USEC_PER_SEC 1000000
    3.39 +
    3.40 +struct menu_device
    3.41 +{
    3.42 +    int             last_state_idx;
    3.43 +    unsigned int    expected_us;
    3.44 +    unsigned int    predicted_us;
    3.45 +    unsigned int    last_measured_us;
    3.46 +    unsigned int    elapsed_us;
    3.47 +};
    3.48 +
    3.49 +static DEFINE_PER_CPU(struct menu_device, menu_devices);
    3.50 +
    3.51 +static s_time_t get_sleep_length_ns(void)
    3.52 +{
    3.53 +    return per_cpu(timer_deadline, smp_processor_id()) - NOW();
    3.54 +}
    3.55 +
    3.56 +static int menu_select(struct acpi_processor_power *power)
    3.57 +{
    3.58 +    struct menu_device *data = &__get_cpu_var(menu_devices);
    3.59 +    int i;
    3.60 +
    3.61 +    /* determine the expected residency time */
    3.62 +    data->expected_us = (u32) get_sleep_length_ns() / 1000;
    3.63 +
    3.64 +    /* find the deepest idle state that satisfies our constraints */
    3.65 +    for ( i = 1; i < power->count; i++ )
    3.66 +    {
    3.67 +        struct acpi_processor_cx *s = &power->states[i];
    3.68 +
    3.69 +        if ( s->target_residency > data->expected_us + s->latency )
    3.70 +            break;
    3.71 +        if ( s->target_residency > data->predicted_us )
    3.72 +            break;
    3.73 +        /* TBD: we need to check the QoS requirement in future */
    3.74 +    }
    3.75 +
    3.76 +    data->last_state_idx = i - 1;
    3.77 +    return i - 1;
    3.78 +}
    3.79 +
    3.80 +static void menu_reflect(struct acpi_processor_power *power)
    3.81 +{
    3.82 +    struct menu_device *data = &__get_cpu_var(menu_devices);
    3.83 +    struct acpi_processor_cx *target = &power->states[data->last_state_idx];
    3.84 +    unsigned int last_residency; 
    3.85 +    unsigned int measured_us;
    3.86 +
    3.87 +    /*
    3.88 +     * Ugh, this idle state doesn't support residency measurements, so we
    3.89 +     * are basically lost in the dark.  As a compromise, assume we slept
    3.90 +     * for one full standard timer tick.  However, be aware that this
    3.91 +     * could potentially result in a suboptimal state transition.
    3.92 +     */
    3.93 +    if ( target->type == ACPI_STATE_C1 )
    3.94 +        last_residency = USEC_PER_SEC / HZ;
    3.95 +    else
    3.96 +        last_residency = power->last_residency;
    3.97 +
    3.98 +    measured_us = last_residency + data->elapsed_us;
    3.99 +
   3.100 +    /* if wrapping, set to max uint (-1) */
   3.101 +    measured_us = data->elapsed_us <= measured_us ? measured_us : -1;
   3.102 +
   3.103 +    /* Predict time remaining until next break event */
   3.104 +    data->predicted_us = max(measured_us, data->last_measured_us);
   3.105 +
   3.106 +    /* Distinguish between expected & non-expected events */
   3.107 +    if ( last_residency + BREAK_FUZZ
   3.108 +         < data->expected_us + target->latency )
   3.109 +    {
   3.110 +        data->last_measured_us = measured_us;
   3.111 +        data->elapsed_us = 0;
   3.112 +    }
   3.113 +    else
   3.114 +        data->elapsed_us = measured_us;
   3.115 +}
   3.116 +
   3.117 +static int menu_enable_device(struct acpi_processor_power *power)
   3.118 +{
   3.119 +    struct menu_device *data = &per_cpu(menu_devices, power->cpu);
   3.120 +
   3.121 +    memset(data, 0, sizeof(struct menu_device));
   3.122 +
   3.123 +    return 0;
   3.124 +}
   3.125 +
   3.126 +static struct cpuidle_governor menu_governor =
   3.127 +{
   3.128 +    .name =         "menu",
   3.129 +    .rating =       20,
   3.130 +    .enable =       menu_enable_device,
   3.131 +    .select =       menu_select,
   3.132 +    .reflect =      menu_reflect,
   3.133 +};
   3.134 +
   3.135 +struct cpuidle_governor *cpuidle_current_governor = &menu_governor;
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/xen/include/xen/cpuidle.h	Wed Sep 10 11:18:36 2008 +0100
     4.3 @@ -0,0 +1,82 @@
     4.4 +/*
     4.5 + * cpuidle.h - xen idle state module derived from Linux 
     4.6 + *
     4.7 + * (C) 2007 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
     4.8 + *          Shaohua Li <shaohua.li@intel.com>
     4.9 + *          Adam Belay <abelay@novell.com>
    4.10 + *  Copyright (C) 2008 Intel Corporation
    4.11 + *
    4.12 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    4.13 + *
    4.14 + *  This program is free software; you can redistribute it and/or modify
    4.15 + *  it under the terms of the GNU General Public License as published by
    4.16 + *  the Free Software Foundation; either version 2 of the License, or (at
    4.17 + *  your option) any later version.
    4.18 + *
    4.19 + *  This program is distributed in the hope that it will be useful, but
    4.20 + *  WITHOUT ANY WARRANTY; without even the implied warranty of
    4.21 + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    4.22 + *  General Public License for more details.
    4.23 + *
    4.24 + *  You should have received a copy of the GNU General Public License along
    4.25 + *  with this program; if not, write to the Free Software Foundation, Inc.,
    4.26 + *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
    4.27 + *
    4.28 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    4.29 + */
    4.30 +#ifndef _XEN_CPUIDLE_H
    4.31 +#define _XEN_CPUIDLE_H
    4.32 +
    4.33 +#define ACPI_PROCESSOR_MAX_POWER        8
    4.34 +#define CPUIDLE_NAME_LEN                16
    4.35 +
    4.36 +struct acpi_processor_cx
    4.37 +{
    4.38 +    u8 valid;
    4.39 +    u8 type;
    4.40 +    u32 address;
    4.41 +    u8 space_id;
    4.42 +    u32 latency;
    4.43 +    u32 latency_ticks;
    4.44 +    u32 power;
    4.45 +    u32 usage;
    4.46 +    u64 time;
    4.47 +    u32 target_residency;
    4.48 +};
    4.49 +
    4.50 +struct acpi_processor_flags
    4.51 +{
    4.52 +    u8 bm_control:1;
    4.53 +    u8 bm_check:1;
    4.54 +    u8 has_cst:1;
    4.55 +    u8 power_setup_done:1;
    4.56 +    u8 bm_rld_set:1;
    4.57 +};
    4.58 +
    4.59 +struct acpi_processor_power
    4.60 +{
    4.61 +    unsigned int cpu;
    4.62 +    struct acpi_processor_flags flags;
    4.63 +    struct acpi_processor_cx *last_state;
    4.64 +    struct acpi_processor_cx *safe_state;
    4.65 +    u32 last_residency;
    4.66 +    void *gdata; /* governor specific data */
    4.67 +    u32 count;
    4.68 +    struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
    4.69 +};
    4.70 +
    4.71 +struct cpuidle_governor
    4.72 +{
    4.73 +    char                    name[CPUIDLE_NAME_LEN];
    4.74 +    unsigned int            rating;
    4.75 +
    4.76 +    int  (*enable)          (struct acpi_processor_power *dev);
    4.77 +    void (*disable)         (struct acpi_processor_power *dev);
    4.78 +
    4.79 +    int  (*select)          (struct acpi_processor_power *dev);
    4.80 +    void (*reflect)         (struct acpi_processor_power *dev);
    4.81 +};
    4.82 +
    4.83 +extern struct cpuidle_governor *cpuidle_current_governor;
    4.84 +
    4.85 +#endif /* _XEN_CPUIDLE_H */