ia64/xen-unstable

changeset 17901:00af74d06652

AMD PowerNow! in Xen

This patch set moves support for AMD's PowerNow! technology from
dom0 into the hypervisor, now that there is support for
transferring ACPI data to the hypervisor. It will only work for
AMD processors that support the architectural P-state driver,
such as 3rd generation Opterons, Phenoms, and Turion Ultras.

This patch creates the Architectural P-state driver inside
of the Xen hypervisor and hooks it into the Xen code. It
has been tested at AMD with a variety of para- and fully-
virtualized guests for a week without regressions.

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
Acked-by: Conny Seidel <conny.seidel@amd.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jun 20 18:02:46 2008 +0100 (2008-06-20)
parents 8fa124ab2e71
children 6ace85eb96c0
files xen/arch/x86/acpi/cpufreq/Makefile xen/arch/x86/acpi/cpufreq/powernow.c xen/arch/x86/platform_hypercall.c xen/include/acpi/cpufreq/processor_perf.h
line diff
     1.1 --- a/xen/arch/x86/acpi/cpufreq/Makefile	Fri Jun 20 17:45:23 2008 +0100
     1.2 +++ b/xen/arch/x86/acpi/cpufreq/Makefile	Fri Jun 20 18:02:46 2008 +0100
     1.3 @@ -1,3 +1,4 @@
     1.4  obj-y += cpufreq.o
     1.5  obj-y += utility.o
     1.6  obj-y += cpufreq_ondemand.o
     1.7 +obj-y += powernow.o
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/xen/arch/x86/acpi/cpufreq/powernow.c	Fri Jun 20 18:02:46 2008 +0100
     2.3 @@ -0,0 +1,305 @@
     2.4 +/*
     2.5 + *  powernow - AMD Architectural P-state Driver ($Revision: 1.4 $)
     2.6 + *
     2.7 + *  Copyright (C) 2008 Mark Langsdorf <mark.langsdorf@amd.com>
     2.8 + *
     2.9 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    2.10 + *
    2.11 + *  This program is free software; you can redistribute it and/or modify
    2.12 + *  it under the terms of the GNU General Public License as published by
    2.13 + *  the Free Software Foundation; either version 2 of the License, or (at
    2.14 + *  your option) any later version.
    2.15 + *
    2.16 + *  This program is distributed in the hope that it will be useful, but
    2.17 + *  WITHOUT ANY WARRANTY; without even the implied warranty of
    2.18 + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    2.19 + *  General Public License for more details.
    2.20 + *
    2.21 + *  You should have received a copy of the GNU General Public License along
    2.22 + *  with this program; if not, write to the Free Software Foundation, Inc.,
    2.23 + *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
    2.24 + *
    2.25 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    2.26 + */
    2.27 +
    2.28 +#include <xen/types.h>
    2.29 +#include <xen/errno.h>
    2.30 +#include <xen/delay.h>
    2.31 +#include <xen/cpumask.h>
    2.32 +#include <xen/timer.h>
    2.33 +#include <xen/xmalloc.h>
    2.34 +#include <asm/bug.h>
    2.35 +#include <asm/msr.h>
    2.36 +#include <asm/io.h>
    2.37 +#include <asm/config.h>
    2.38 +#include <asm/processor.h>
    2.39 +#include <asm/percpu.h>
    2.40 +#include <asm/cpufeature.h>
    2.41 +#include <acpi/acpi.h>
    2.42 +#include <acpi/cpufreq/cpufreq.h>
    2.43 +
    2.44 +#define CPUID_FREQ_VOLT_CAPABILITIES    0x80000007 /* CPUID leaf queried by powernow_cpufreq_init() */
    2.45 +#define USE_HW_PSTATE           0x00000080 /* EDX bit of that leaf that must be set for this driver */
    2.46 +#define HW_PSTATE_MASK          0x00000007 /* bits of perf->states[i].control kept as the P-state index */
    2.47 +#define HW_PSTATE_VALID_MASK    0x80000000 /* not used by the code in this file */
    2.48 +#define HW_PSTATE_MAX_MASK      0x000000f0 /* field taken from the MSR_PSTATE_CUR_LIMIT value */
    2.49 +#define HW_PSTATE_MAX_SHIFT     4 /* right-aligns the HW_PSTATE_MAX_MASK field */
    2.50 +#define MSR_PSTATE_DEF_BASE     0xc0010064 /* base of Pstate MSRs */
    2.51 +#define MSR_PSTATE_STATUS       0xc0010063 /* Pstate Status MSR */
    2.52 +#define MSR_PSTATE_CTRL         0xc0010062 /* Pstate control MSR */
    2.53 +#define MSR_PSTATE_CUR_LIMIT    0xc0010061 /* pstate current limit MSR */
    2.54 +
    2.55 +extern struct processor_pminfo processor_pminfo[NR_CPUS]; /* per-CPU ACPI performance data, defined elsewhere */
    2.56 +extern struct cpufreq_policy xen_px_policy[NR_CPUS]; /* per-CPU policy objects, defined elsewhere */
    2.57 +
    2.58 +struct powernow_cpufreq_data { /* per-CPU driver state, reachable through drv_data[] */
    2.59 +    struct processor_performance *acpi_data; /* set to &processor_pminfo[cpu].perf */
    2.60 +    struct cpufreq_frequency_table *freq_table; /* last entry has frequency CPUFREQ_TABLE_END */
    2.61 +    unsigned int max_freq; /* perf->states[0].core_frequency * 1000 */
    2.62 +    unsigned int resume; /* non-zero: next ->target() writes the MSR even if the state is unchanged */
    2.63 +    unsigned int cpu_feature; /* never referenced by name in this file */
    2.64 +};
    2.65 +
    2.66 +static struct powernow_cpufreq_data *drv_data[NR_CPUS]; /* indexed by CPU number */
    2.67 +
    2.68 +struct drv_cmd { /* argument handed to transition_pstate() */
    2.69 +    unsigned int type; /* never set or read in this file */
    2.70 +    cpumask_t mask; /* CPUs the command is run on */
    2.71 +    u64 addr; /* never set or read in this file */
    2.72 +    u32 val; /* value written to MSR_PSTATE_CTRL */
    2.73 +};
    2.74 +
    2.75 +/* Cross-CPU callback: write the drv_cmd's value to the P-state control MSR. */
    2.76 +static void transition_pstate(void *drvcmd)
    2.77 +{
    2.78 +    const struct drv_cmd *request = drvcmd;
    2.79 +
    2.80 +    wrmsr(MSR_PSTATE_CTRL, request->val, 0);
    2.81 +}
    2.82 +
    2.83 +/*
    2.84 + * Move @policy's CPUs to the table entry chosen for @target_freq/@relation.
    2.85 + * Returns 0 on success or no-op, -ENODEV if data or a matching entry is missing.
    2.86 + */
    2.87 +static int powernow_cpufreq_target(struct cpufreq_policy *policy,
    2.88 +                               unsigned int target_freq, unsigned int relation)
    2.89 +{
    2.90 +    struct powernow_cpufreq_data *data = drv_data[policy->cpu];
    2.91 +    struct processor_performance *perf;
    2.92 +    struct drv_cmd cmd;
    2.93 +    unsigned int next_state = 0; /* Index into freq_table */
    2.94 +    unsigned int next_perf_state = 0; /* Index into perf table */
    2.95 +    int result = 0;
    2.96 +
    2.97 +    if (unlikely(data == NULL ||
    2.98 +        data->acpi_data == NULL || data->freq_table == NULL)) {
    2.99 +        return -ENODEV;
   2.100 +    }
   2.101 +
   2.102 +    perf = data->acpi_data;
   2.103 +    result = cpufreq_frequency_table_target(policy,
   2.104 +                                            data->freq_table,
   2.105 +                                            target_freq,
   2.106 +                                            relation, &next_state);
   2.107 +    if (unlikely(result))
   2.108 +        return -ENODEV;
   2.109 +
   2.110 +    next_perf_state = data->freq_table[next_state].index;
   2.111 +    if (perf->state == next_perf_state) {
   2.112 +        /* Unchanged state: only rewrite the MSR while 'resume' is pending. */
   2.113 +        if (unlikely(data->resume))
   2.114 +            data->resume = 0;
   2.115 +        else
   2.116 +            return 0;
   2.117 +    }
   2.118 +
   2.119 +    cpus_clear(cmd.mask);
   2.120 +    if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
   2.121 +        cmd.mask = policy->cpus;
   2.122 +    else
   2.123 +        cpu_set(policy->cpu, cmd.mask);
   2.124 +
   2.125 +    cmd.val = next_perf_state;
   2.126 +    /*
   2.127 +     * cmd lives on this stack frame, so ask on_selected_cpus() to wait until
   2.128 +     * every target CPU has run transition_pstate() (final 'wait' argument).
   2.129 +     */
   2.130 +    on_selected_cpus(cmd.mask, transition_pstate, (void *) &cmd, 0, 1);
   2.131 +
   2.132 +    perf->state = next_perf_state;
   2.133 +    policy->cur = data->freq_table[next_state].frequency;
   2.134 +
   2.135 +    return result;
   2.136 +}
   2.137 +
   2.138 +/*
   2.139 + * Per-CPU setup: allocate drv_data[cpu], fill policy->cpus and build the
   2.140 + * frequency table.  Returns 0 or a negative errno; error paths free allocations.
   2.141 + */
   2.142 +static int powernow_cpufreq_cpu_init(struct cpufreq_policy *policy)
   2.143 +{
   2.144 +    unsigned int i;
   2.145 +    unsigned int valid_states = 0;
   2.146 +    unsigned int cpu = policy->cpu;
   2.147 +    struct powernow_cpufreq_data *data;
   2.148 +    int result = 0; /* signed: carries negative errno values */
   2.149 +    struct processor_performance *perf;
   2.150 +    u32 max_hw_pstate, limit_lo = 0, limit_hi = 0;
   2.151 +
   2.152 +    data = xmalloc(struct powernow_cpufreq_data);
   2.153 +    if (!data)
   2.154 +        return -ENOMEM;
   2.155 +    memset(data, 0, sizeof(struct powernow_cpufreq_data));
   2.156 +
   2.157 +    drv_data[cpu] = data;
   2.158 +    data->acpi_data = &processor_pminfo[cpu].perf;
   2.159 +    perf = data->acpi_data;
   2.160 +    policy->shared_type = perf->shared_type;
   2.161 +
   2.162 +    /* policy->cpus reflects the dependency only for software coordination. */
   2.163 +    if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
   2.164 +        policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
   2.165 +        policy->cpus = perf->shared_cpu_map;
   2.166 +    } else {
   2.167 +        policy->cpus = cpumask_of_cpu(cpu);
   2.168 +    }
   2.169 +
   2.170 +    /* capability check */
   2.171 +    if (perf->state_count <= 1) {
   2.172 +        printk("No P-States\n");
   2.173 +        result = -ENODEV;
   2.174 +        goto err_unreg;
   2.175 +    }
   2.176 +    /* The limit field is taken from the first output word of rdmsr(). */
   2.177 +    rdmsr(MSR_PSTATE_CUR_LIMIT, limit_lo, limit_hi);
   2.178 +    max_hw_pstate = (limit_lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
   2.179 +
   2.180 +    if (perf->control_register.space_id != perf->status_register.space_id) {
   2.181 +        result = -ENODEV;
   2.182 +        goto err_unreg;
   2.183 +    }
   2.184 +
   2.185 +    data->freq_table = xmalloc_array(struct cpufreq_frequency_table,
   2.186 +                                    (perf->state_count+1));
   2.187 +    if (!data->freq_table) {
   2.188 +        result = -ENOMEM;
   2.189 +        goto err_unreg;
   2.190 +    }
   2.191 +
   2.192 +    /* detect transition latency */
   2.193 +    policy->cpuinfo.transition_latency = 0;
   2.194 +    for (i=0; i<perf->state_count; i++) {
   2.195 +        if ((perf->states[i].transition_latency * 1000) >
   2.196 +            policy->cpuinfo.transition_latency)
   2.197 +            policy->cpuinfo.transition_latency =
   2.198 +                perf->states[i].transition_latency * 1000;
   2.199 +    }
   2.200 +
   2.201 +    data->max_freq = perf->states[0].core_frequency * 1000;
   2.202 +    /*
   2.203 +     * table init.  Per the AMD BKDG the limit is the last usable state: inclusive.
   2.204 +     */
   2.205 +    for (i=0; i<perf->state_count && i<=max_hw_pstate; i++) {
   2.206 +        if (i>0 && perf->states[i].core_frequency >=
   2.207 +            data->freq_table[valid_states-1].frequency / 1000)
   2.208 +            continue;
   2.209 +
   2.210 +        data->freq_table[valid_states].index = perf->states[i].control & HW_PSTATE_MASK;
   2.211 +        data->freq_table[valid_states].frequency =
   2.212 +            perf->states[i].core_frequency * 1000;
   2.213 +        valid_states++;
   2.214 +    }
   2.215 +    data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
   2.216 +    perf->state = 0;
   2.217 +
   2.218 +    result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
   2.219 +    if (result)
   2.220 +        goto err_freqfree;
   2.221 +
   2.222 +    /* Force the first ->target() call to actually write the control MSR. */
   2.223 +    data->resume = 1;
   2.224 +
   2.225 +    /* Entry 0 pairs with perf->state == 0; 'i' is past the filled entries. */
   2.226 +    policy->cur = data->freq_table[0].frequency;
   2.227 +    return result;
   2.228 +
   2.229 +err_freqfree:
   2.230 +    xfree(data->freq_table);
   2.231 +err_unreg:
   2.232 +    xfree(data);
   2.233 +    drv_data[cpu] = NULL;
   2.234 +
   2.235 +    return result;
   2.236 +}
   2.237 +
   2.238 +static struct cpufreq_driver powernow_cpufreq_driver = { /* installed as cpufreq_driver by powernow_cpufreq_init() */
   2.239 +    .target = powernow_cpufreq_target, /* programs the P-state for a target frequency */
   2.240 +    .init   = powernow_cpufreq_cpu_init, /* per-CPU setup; powernow_cpufreq_init() also calls it directly */
   2.241 +};
   2.242 +
   2.243 +/*
   2.244 + * Entry point: check the online CPUs, group them by ACPI domain, install the
   2.245 + * driver, init each CPU and start the dbs governor per domain.  Returns 0 or -errno.
   2.246 + */
   2.247 +int powernow_cpufreq_init(void)
   2.248 +{
   2.249 +    unsigned int i;
   2.250 +    unsigned int dom, max_dom = 0;
   2.251 +    int ret = 0; /* signed: carries negative errno values */
   2.252 +    cpumask_t *pt, dom_mask;
   2.253 +
   2.254 +    cpus_clear(dom_mask);
   2.255 +    for_each_online_cpu(i) {
   2.256 +        struct cpuinfo_x86 *c = &cpu_data[i];
   2.257 +        u32 eax, ebx, ecx, edx;
   2.258 +        if (c->x86_vendor != X86_VENDOR_AMD)
   2.259 +            return -ENODEV;
   2.260 +        /* NOTE(review): cpuid() takes no CPU argument, so CPU i itself is not queried - confirm intent. */
   2.261 +        cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
   2.262 +        if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE)
   2.263 +            return -ENODEV;
   2.264 +        /* dom_mask is a cpumask_t, so it can only record domains below NR_CPUS. */
   2.265 +        if (processor_pminfo[i].perf.domain_info.domain >= NR_CPUS)
   2.266 +            return -EINVAL;
   2.267 +        cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask);
   2.268 +        if (max_dom < processor_pminfo[i].perf.domain_info.domain)
   2.269 +            max_dom = processor_pminfo[i].perf.domain_info.domain;
   2.270 +    }
   2.271 +    max_dom++;
   2.272 +
   2.273 +    pt = xmalloc_array(cpumask_t, max_dom);
   2.274 +    if (!pt)
   2.275 +        return -ENOMEM;
   2.276 +    memset(pt, 0, max_dom * sizeof(cpumask_t));
   2.277 +
   2.278 +    /* get cpumask of each psd domain */
   2.279 +    for_each_online_cpu(i)
   2.280 +        cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]);
   2.281 +    for_each_online_cpu(i)
   2.282 +        processor_pminfo[i].perf.shared_cpu_map =
   2.283 +            pt[processor_pminfo[i].perf.domain_info.domain];
   2.284 +
   2.285 +    cpufreq_driver = &powernow_cpufreq_driver;
   2.286 +
   2.287 +    /* setup cpufreq infrastructure */
   2.288 +    for_each_online_cpu(i) {
   2.289 +        xen_px_policy[i].cpu = i;
   2.290 +        ret = powernow_cpufreq_cpu_init(&xen_px_policy[i]);
   2.291 +        if (ret)
   2.292 +            goto cpufreq_init_out;
   2.293 +    }
   2.294 +
   2.295 +    /* setup ondemand cpufreq: one governor start per populated domain */
   2.296 +    for (dom=0; dom<max_dom; dom++) {
   2.297 +        if (!cpu_isset(dom, dom_mask))
   2.298 +            continue;
   2.299 +        i = first_cpu(pt[dom]);
   2.300 +        ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START);
   2.301 +        if (ret)
   2.302 +            goto cpufreq_init_out;
   2.303 +    }
   2.304 +
   2.305 +cpufreq_init_out:
   2.306 +    xfree(pt);
   2.307 +    return ret;
   2.308 +}
     3.1 --- a/xen/arch/x86/platform_hypercall.c	Fri Jun 20 17:45:23 2008 +0100
     3.2 +++ b/xen/arch/x86/platform_hypercall.c	Fri Jun 20 18:02:46 2008 +0100
     3.3 @@ -408,7 +408,12 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
     3.4                  cpu_count++;
     3.5              }
     3.6              if ( cpu_count == num_online_cpus() )
     3.7 -                ret = acpi_cpufreq_init();
     3.8 +            {
     3.9 +                if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
    3.10 +                    ret = powernow_cpufreq_init();
    3.11 +                else
    3.12 +                    ret = acpi_cpufreq_init();
    3.13 +            }
    3.14              break;
    3.15          }
    3.16   
     4.1 --- a/xen/include/acpi/cpufreq/processor_perf.h	Fri Jun 20 17:45:23 2008 +0100
     4.2 +++ b/xen/include/acpi/cpufreq/processor_perf.h	Fri Jun 20 18:02:46 2008 +0100
     4.3 @@ -6,6 +6,7 @@
     4.4  
     4.5  int get_cpu_id(u8);
     4.6  int acpi_cpufreq_init(void);
     4.7 +int powernow_cpufreq_init(void);
     4.8  void px_statistic_update(cpumask_t, uint8_t, uint8_t);
     4.9  int  px_statistic_init(int);
    4.10  void px_statistic_reset(int);