ia64/xen-unstable

changeset 18948:4d5203f95498

Enable CMCI for Intel CPUs

Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
Signed-off-by: Liping Ke <liping.ke@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Dec 22 08:12:33 2008 +0000 (2008-12-22)
parents 2dffa6ceb0af
children aa0fee8a6ef5
files xen/arch/x86/apic.c xen/arch/x86/cpu/mcheck/Makefile xen/arch/x86/cpu/mcheck/k7.c xen/arch/x86/cpu/mcheck/mce.c xen/arch/x86/cpu/mcheck/mce.h xen/arch/x86/cpu/mcheck/mce_intel.c xen/arch/x86/cpu/mcheck/non-fatal.c xen/arch/x86/cpu/mcheck/p4.c xen/arch/x86/cpu/mcheck/p6.c xen/arch/x86/cpu/mcheck/x86_mca.h xen/arch/x86/hvm/vmx/vmx.c xen/arch/x86/i8259.c xen/arch/x86/smpboot.c xen/common/stop_machine.c xen/include/asm-x86/apicdef.h xen/include/asm-x86/config.h xen/include/asm-x86/irq.h xen/include/asm-x86/mach-default/irq_vectors.h xen/include/asm-x86/msr-index.h xen/include/asm-x86/smp.h xen/include/public/arch-x86/xen-mca.h xen/include/xen/stop_machine.h
line diff
     1.1 --- a/xen/arch/x86/apic.c	Fri Dec 19 14:56:36 2008 +0000
     1.2 +++ b/xen/arch/x86/apic.c	Mon Dec 22 08:12:33 2008 +0000
     1.3 @@ -99,8 +99,11 @@ void __init apic_intr_init(void)
     1.4      /* Performance Counters Interrupt */
     1.5      set_intr_gate(PMU_APIC_VECTOR, pmu_apic_interrupt);
     1.6  
     1.7 -    /* thermal monitor LVT interrupt */
     1.8 -#ifdef CONFIG_X86_MCE_P4THERMAL
     1.9 +    /* CMCI Correctable Machine Check Interrupt */
    1.10 +    set_intr_gate(CMCI_APIC_VECTOR, cmci_interrupt);
    1.11 +
     1.12 +    /* thermal monitor LVT interrupt, for P4 and later Intel CPUs */
    1.13 +#ifdef CONFIG_X86_MCE_THERMAL
    1.14      set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
    1.15  #endif
    1.16  }
    1.17 @@ -172,12 +175,17 @@ void clear_local_APIC(void)
    1.18      }
    1.19  
    1.20  /* lets not touch this if we didn't frob it */
    1.21 -#ifdef CONFIG_X86_MCE_P4THERMAL
    1.22 +#ifdef CONFIG_X86_MCE_THERMAL
    1.23      if (maxlvt >= 5) {
    1.24          v = apic_read(APIC_LVTTHMR);
    1.25          apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
    1.26      }
    1.27  #endif
    1.28 +
    1.29 +    if (maxlvt >= 6) {
    1.30 +        v = apic_read(APIC_CMCI);
    1.31 +        apic_write_around(APIC_CMCI, v | APIC_LVT_MASKED);
    1.32 +    }
    1.33      /*
    1.34       * Clean APIC state for other OSs:
    1.35       */
    1.36 @@ -189,10 +197,13 @@ void clear_local_APIC(void)
    1.37      if (maxlvt >= 4)
    1.38          apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
    1.39  
    1.40 -#ifdef CONFIG_X86_MCE_P4THERMAL
    1.41 +#ifdef CONFIG_X86_MCE_THERMAL
    1.42      if (maxlvt >= 5)
    1.43          apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
    1.44  #endif
    1.45 +    if (maxlvt >= 6)
    1.46 +        apic_write_around(APIC_CMCI, APIC_LVT_MASKED);
    1.47 +
    1.48      v = GET_APIC_VERSION(apic_read(APIC_LVR));
    1.49      if (APIC_INTEGRATED(v)) {  /* !82489DX */
    1.50          if (maxlvt > 3)        /* Due to Pentium errata 3AP and 11AP. */
    1.51 @@ -597,6 +608,7 @@ static struct {
    1.52      unsigned int apic_spiv;
    1.53      unsigned int apic_lvtt;
    1.54      unsigned int apic_lvtpc;
    1.55 +    unsigned int apic_lvtcmci;
    1.56      unsigned int apic_lvt0;
    1.57      unsigned int apic_lvt1;
    1.58      unsigned int apic_lvterr;
    1.59 @@ -608,7 +620,7 @@ static struct {
    1.60  int lapic_suspend(void)
    1.61  {
    1.62      unsigned long flags;
    1.63 -
    1.64 +    int maxlvt = get_maxlvt();
    1.65      if (!apic_pm_state.active)
    1.66          return 0;
    1.67  
    1.68 @@ -620,6 +632,11 @@ int lapic_suspend(void)
    1.69      apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
    1.70      apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
    1.71      apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
    1.72 +
    1.73 +    if (maxlvt >= 6) {
    1.74 +        apic_pm_state.apic_lvtcmci = apic_read(APIC_CMCI);
    1.75 +    }
    1.76 +
    1.77      apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
    1.78      apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
    1.79      apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
    1.80 @@ -637,6 +654,7 @@ int lapic_resume(void)
    1.81  {
    1.82      unsigned int l, h;
    1.83      unsigned long flags;
    1.84 +    int maxlvt = get_maxlvt();
    1.85  
    1.86      if (!apic_pm_state.active)
    1.87          return 0;
    1.88 @@ -669,6 +687,11 @@ int lapic_resume(void)
    1.89      apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
    1.90      apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
    1.91      apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
    1.92 +
    1.93 +    if (maxlvt >= 6) {
    1.94 +        apic_write(APIC_CMCI, apic_pm_state.apic_lvtcmci);
    1.95 +    }
    1.96 +
    1.97      apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
    1.98      apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
    1.99      apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
     2.1 --- a/xen/arch/x86/cpu/mcheck/Makefile	Fri Dec 19 14:56:36 2008 +0000
     2.2 +++ b/xen/arch/x86/cpu/mcheck/Makefile	Mon Dec 22 08:12:33 2008 +0000
     2.3 @@ -3,8 +3,7 @@ obj-y += k7.o
     2.4  obj-y += amd_k8.o
     2.5  obj-y += amd_f10.o
     2.6  obj-y += mce.o
     2.7 +obj-y += mce_intel.o
     2.8  obj-y += non-fatal.o
     2.9 -obj-y += p4.o
    2.10  obj-$(x86_32) += p5.o
    2.11 -obj-$(x86_32) += p6.o
    2.12  obj-$(x86_32) += winchip.o
     3.1 --- a/xen/arch/x86/cpu/mcheck/k7.c	Fri Dec 19 14:56:36 2008 +0000
     3.2 +++ b/xen/arch/x86/cpu/mcheck/k7.c	Mon Dec 22 08:12:33 2008 +0000
     3.3 @@ -14,6 +14,7 @@
     3.4  #include <asm/msr.h>
     3.5  
     3.6  #include "mce.h"
     3.7 +#include "x86_mca.h"
     3.8  
     3.9  /* Machine Check Handler For AMD Athlon/Duron */
    3.10  static fastcall void k7_machine_check(struct cpu_user_regs * regs, long error_code)
     4.1 --- a/xen/arch/x86/cpu/mcheck/mce.c	Fri Dec 19 14:56:36 2008 +0000
     4.2 +++ b/xen/arch/x86/cpu/mcheck/mce.c	Mon Dec 22 08:12:33 2008 +0000
     4.3 @@ -27,7 +27,7 @@ EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-
     4.4   * to physical cpus present in the machine.
     4.5   * The more physical cpus are available, the more entries you need.
     4.6   */
     4.7 -#define MAX_MCINFO	10
     4.8 +#define MAX_MCINFO	20
     4.9  
    4.10  struct mc_machine_notify {
    4.11  	struct mc_info mc;
    4.12 @@ -110,6 +110,22 @@ static void amd_mcheck_init(struct cpuin
    4.13  	}
    4.14  }
    4.15  
     4.16 +/* Check whether the Machine Check architecture is available */
    4.17 +int mce_available(struct cpuinfo_x86 *c)
    4.18 +{
    4.19 +	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
    4.20 +}
    4.21 +
     4.22 +/* Make sure there are no machine checks on offlined or suspended CPUs */
    4.23 +void mce_disable_cpu(void)
    4.24 +{
    4.25 +    if (!mce_available(&current_cpu_data) || mce_disabled == 1)
    4.26 +         return;
    4.27 +    printk(KERN_DEBUG "MCE: disable mce on CPU%d\n", smp_processor_id());
    4.28 +    clear_in_cr4(X86_CR4_MCE);
    4.29 +}
    4.30 +
    4.31 +
    4.32  /* This has to be run for each processor */
    4.33  void mcheck_init(struct cpuinfo_x86 *c)
    4.34  {
    4.35 @@ -135,11 +151,13 @@ void mcheck_init(struct cpuinfo_x86 *c)
    4.36  #ifndef CONFIG_X86_64
    4.37  		if (c->x86==5)
    4.38  			intel_p5_mcheck_init(c);
    4.39 -		if (c->x86==6)
    4.40 -			intel_p6_mcheck_init(c);
    4.41  #endif
    4.42 -		if (c->x86==15)
    4.43 -			intel_p4_mcheck_init(c);
     4.44 +		/* P6 or P4 family, including the Core 2 Duo series */
    4.45 +		if (c->x86 == 6 || c->x86==15)
    4.46 +		{
     4.47 +			printk(KERN_DEBUG "MCE: Intel new family MC init\n");
    4.48 +			intel_mcheck_init(c);
    4.49 +		}
    4.50  		break;
    4.51  
    4.52  #ifndef CONFIG_X86_64
    4.53 @@ -413,7 +431,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
    4.54  		if (mic == NULL)
    4.55  			return;
    4.56  		if (mic->type != MC_TYPE_BANK)
    4.57 -			continue;
    4.58 +			goto next;
    4.59  
    4.60  		mc_bank = (struct mcinfo_bank *)mic;
    4.61  	
    4.62 @@ -426,6 +444,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
    4.63  			printk(" at %16"PRIx64, mc_bank->mc_addr);
    4.64  
    4.65  		printk("\n");
    4.66 +next:
    4.67  		mic = x86_mcinfo_next(mic); /* next entry */
    4.68  		if ((mic == NULL) || (mic->size == 0))
    4.69  			break;
     5.1 --- a/xen/arch/x86/cpu/mcheck/mce.h	Fri Dec 19 14:56:36 2008 +0000
     5.2 +++ b/xen/arch/x86/cpu/mcheck/mce.h	Mon Dec 22 08:12:33 2008 +0000
     5.3 @@ -1,14 +1,22 @@
     5.4  #include <xen/init.h>
     5.5 +#include <asm/types.h>
     5.6  #include <asm/traps.h>
     5.7 +#include <asm/atomic.h>
     5.8 +#include <asm/percpu.h>
     5.9 +
    5.10  
    5.11  /* Init functions */
    5.12  void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
    5.13  void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
    5.14  void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
    5.15  void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
    5.16 -void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
    5.17 +
    5.18 +
    5.19 +void intel_mcheck_timer(struct cpuinfo_x86 *c);
    5.20  void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
    5.21 -void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
    5.22 +void intel_mcheck_init(struct cpuinfo_x86 *c);
    5.23 +void mce_intel_feature_init(struct cpuinfo_x86 *c);
    5.24 +
    5.25  void winchip_mcheck_init(struct cpuinfo_x86 *c);
    5.26  
    5.27  /* Function pointer used in the handlers to collect additional information
    5.28 @@ -19,6 +27,7 @@ extern int (*mc_callback_bank_extended)(
    5.29  		uint16_t bank, uint64_t status);
    5.30  
    5.31  
    5.32 +int mce_available(struct cpuinfo_x86 *c);
    5.33  /* Helper functions used for collecting error telemetry */
    5.34  struct mc_info *x86_mcinfo_getptr(void);
    5.35  void x86_mcinfo_clear(struct mc_info *mi);
    5.36 @@ -26,6 +35,3 @@ int x86_mcinfo_add(struct mc_info *mi, v
    5.37  void x86_mcinfo_dump(struct mc_info *mi);
    5.38  void mc_panic(char *s);
    5.39  
    5.40 -/* Global variables */
    5.41 -extern int mce_disabled;
    5.42 -extern unsigned int nr_mce_banks;
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Mon Dec 22 08:12:33 2008 +0000
     6.3 @@ -0,0 +1,681 @@
     6.4 +#include <xen/init.h>
     6.5 +#include <xen/types.h>
     6.6 +#include <xen/irq.h>
     6.7 +#include <xen/event.h>
     6.8 +#include <xen/kernel.h>
     6.9 +#include <xen/smp.h>
    6.10 +#include <asm/processor.h> 
    6.11 +#include <asm/system.h>
    6.12 +#include <asm/msr.h>
    6.13 +#include "mce.h"
    6.14 +#include "x86_mca.h"
    6.15 +
    6.16 +DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned);
    6.17 +
    6.18 +static int nr_intel_ext_msrs = 0;
    6.19 +static int cmci_support = 0;
    6.20 +extern int firstbank;
    6.21 +
    6.22 +#ifdef CONFIG_X86_MCE_THERMAL
    6.23 +static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
    6.24 +{	
    6.25 +    printk(KERN_ERR "Thermal: CPU%d: Unexpected LVT TMR interrupt!\n",
    6.26 +                smp_processor_id());
    6.27 +    add_taint(TAINT_MACHINE_CHECK);
    6.28 +}
    6.29 +
    6.30 +/* P4/Xeon Thermal transition interrupt handler */
    6.31 +static void intel_thermal_interrupt(struct cpu_user_regs *regs)
    6.32 +{
    6.33 +    u32 l, h;
    6.34 +    unsigned int cpu = smp_processor_id();
    6.35 +    static s_time_t next[NR_CPUS];
    6.36 +
    6.37 +    ack_APIC_irq();
    6.38 +    if (NOW() < next[cpu])
    6.39 +        return;
    6.40 +
    6.41 +    next[cpu] = NOW() + MILLISECS(5000);
    6.42 +    rdmsr(MSR_IA32_THERM_STATUS, l, h);
    6.43 +    if (l & 0x1) {
    6.44 +        printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
    6.45 +        printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
    6.46 +                cpu);
    6.47 +        add_taint(TAINT_MACHINE_CHECK);
    6.48 +    } else {
    6.49 +        printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
    6.50 +    }
    6.51 +}
    6.52 +
    6.53 +/* Thermal interrupt handler for this CPU setup */
    6.54 +static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs) 
    6.55 +        = unexpected_thermal_interrupt;
    6.56 +
    6.57 +fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs)
    6.58 +{
    6.59 +    irq_enter();
    6.60 +    vendor_thermal_interrupt(regs);
    6.61 +    irq_exit();
    6.62 +}
    6.63 +
    6.64 +/* P4/Xeon Thermal regulation detect and init */
    6.65 +static void intel_init_thermal(struct cpuinfo_x86 *c)
    6.66 +{
    6.67 +    u32 l, h;
    6.68 +    int tm2 = 0;
    6.69 +    unsigned int cpu = smp_processor_id();
    6.70 +
    6.71 +    /* Thermal monitoring */
    6.72 +    if (!cpu_has(c, X86_FEATURE_ACPI))
    6.73 +        return;	/* -ENODEV */
    6.74 +
    6.75 +    /* Clock modulation */
    6.76 +    if (!cpu_has(c, X86_FEATURE_ACC))
    6.77 +        return;	/* -ENODEV */
    6.78 +
    6.79 +    /* first check if its enabled already, in which case there might
    6.80 +     * be some SMM goo which handles it, so we can't even put a handler
    6.81 +     * since it might be delivered via SMI already -zwanem.
    6.82 +     */
    6.83 +    rdmsr (MSR_IA32_MISC_ENABLE, l, h);
    6.84 +    h = apic_read(APIC_LVTTHMR);
    6.85 +    if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
    6.86 +        printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",cpu);
    6.87 +        return; /* -EBUSY */
    6.88 +    }
    6.89 +
    6.90 +    if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
    6.91 +        tm2 = 1;
    6.92 +
     6.93 +    /* check whether a vector already exists, temporarily masked? */
    6.94 +    if (h & APIC_VECTOR_MASK) {
    6.95 +        printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n",
    6.96 +                 cpu, (h & APIC_VECTOR_MASK));
    6.97 +        return; /* -EBUSY */
    6.98 +    }
    6.99 +
   6.100 +    /* The temperature transition interrupt handler setup */
   6.101 +    h = THERMAL_APIC_VECTOR;		/* our delivery vector */
   6.102 +    h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */
   6.103 +    apic_write_around(APIC_LVTTHMR, h);
   6.104 +
   6.105 +    rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
   6.106 +    wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
   6.107 +
   6.108 +    /* ok we're good to go... */
   6.109 +    vendor_thermal_interrupt = intel_thermal_interrupt;
   6.110 +
   6.111 +    rdmsr (MSR_IA32_MISC_ENABLE, l, h);
   6.112 +    wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
   6.113 +
   6.114 +    l = apic_read (APIC_LVTTHMR);
   6.115 +    apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
   6.116 +    printk (KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 
   6.117 +            cpu, tm2 ? "TM2" : "TM1");
   6.118 +    return;
   6.119 +}
   6.120 +#endif /* CONFIG_X86_MCE_THERMAL */
   6.121 +
   6.122 +static inline void intel_get_extended_msrs(struct mcinfo_extended *mc_ext)
   6.123 +{
   6.124 +    if (nr_intel_ext_msrs == 0)
   6.125 +        return;
   6.126 +
    6.127 +    /* this function will be called when CAP(9).MCG_EXT_P = 1 */
   6.128 +    memset(mc_ext, 0, sizeof(struct mcinfo_extended));
   6.129 +    mc_ext->common.type = MC_TYPE_EXTENDED;
    6.130 +    mc_ext->common.size = sizeof(struct mcinfo_extended);
   6.131 +    mc_ext->mc_msrs = 10;
   6.132 +
   6.133 +    mc_ext->mc_msr[0].reg = MSR_IA32_MCG_EAX;
   6.134 +    rdmsrl(MSR_IA32_MCG_EAX, mc_ext->mc_msr[0].value);
   6.135 +    mc_ext->mc_msr[1].reg = MSR_IA32_MCG_EBX;
   6.136 +    rdmsrl(MSR_IA32_MCG_EBX, mc_ext->mc_msr[1].value);
   6.137 +    mc_ext->mc_msr[2].reg = MSR_IA32_MCG_ECX;
   6.138 +    rdmsrl(MSR_IA32_MCG_ECX, mc_ext->mc_msr[2].value);
   6.139 +
   6.140 +    mc_ext->mc_msr[3].reg = MSR_IA32_MCG_EDX;
   6.141 +    rdmsrl(MSR_IA32_MCG_EDX, mc_ext->mc_msr[3].value);
   6.142 +    mc_ext->mc_msr[4].reg = MSR_IA32_MCG_ESI;
   6.143 +    rdmsrl(MSR_IA32_MCG_ESI, mc_ext->mc_msr[4].value);
   6.144 +    mc_ext->mc_msr[5].reg = MSR_IA32_MCG_EDI;
   6.145 +    rdmsrl(MSR_IA32_MCG_EDI, mc_ext->mc_msr[5].value);
   6.146 +
   6.147 +    mc_ext->mc_msr[6].reg = MSR_IA32_MCG_EBP;
   6.148 +    rdmsrl(MSR_IA32_MCG_EBP, mc_ext->mc_msr[6].value);
   6.149 +    mc_ext->mc_msr[7].reg = MSR_IA32_MCG_ESP;
   6.150 +    rdmsrl(MSR_IA32_MCG_ESP, mc_ext->mc_msr[7].value);
   6.151 +    mc_ext->mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
   6.152 +    rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext->mc_msr[8].value);
   6.153 +    mc_ext->mc_msr[9].reg = MSR_IA32_MCG_EIP;
   6.154 +    rdmsrl(MSR_IA32_MCG_EIP, mc_ext->mc_msr[9].value);
   6.155 +}
   6.156 +
    6.157 +/* machine_check_poll may be called in the following contexts:
    6.158 + * 1. during mcheck_init.
    6.159 + * 2. from the CMCI interrupt handler
    6.160 + * 3. from the polling handler
    6.161 + * It will generate a new mc_info item if CE/UC errors are found. DOM0 is
    6.162 + * the consumer.
    6.163 + */
   6.164 +static int machine_check_poll(struct mc_info *mi, int calltype)
   6.165 +{
   6.166 +    int exceptions = (read_cr4() & X86_CR4_MCE);
   6.167 +    int i, nr_unit = 0, uc = 0, pcc = 0;
   6.168 +    uint64_t status, addr;
   6.169 +    struct mcinfo_global mcg;
   6.170 +    struct mcinfo_extended mce;
   6.171 +    unsigned int cpu;
   6.172 +    struct domain *d;
   6.173 +
   6.174 +    cpu = smp_processor_id();
   6.175 +
   6.176 +    if (!mi) {
   6.177 +        printk(KERN_ERR "mcheck_poll: Failed to get mc_info entry\n");
   6.178 +        return 0;
   6.179 +    }
   6.180 +    x86_mcinfo_clear(mi);
   6.181 +
   6.182 +    memset(&mcg, 0, sizeof(mcg));
   6.183 +    mcg.common.type = MC_TYPE_GLOBAL;
   6.184 +    mcg.common.size = sizeof(mcg);
    6.185 +    /* If called from the cpu-reset check, there is no need to fill them.
    6.186 +     * If called from the CMCI context, we'll try to fill the domid from the memory address.
    6.187 +     */
   6.188 +    mcg.mc_domid = -1;
   6.189 +    mcg.mc_vcpuid = -1;
   6.190 +    if (calltype == MC_FLAG_POLLED || calltype == MC_FLAG_RESET)
   6.191 +        mcg.mc_flags = MC_FLAG_POLLED;
   6.192 +    else if (calltype == MC_FLAG_CMCI)
   6.193 +        mcg.mc_flags = MC_FLAG_CMCI;
   6.194 +    mcg.mc_socketid = phys_proc_id[cpu];
   6.195 +    mcg.mc_coreid = cpu_core_id[cpu];
   6.196 +    mcg.mc_apicid = cpu_physical_id(cpu);
   6.197 +    mcg.mc_core_threadid = mcg.mc_apicid & ( 1 << (smp_num_siblings - 1)); 
   6.198 +    rdmsrl(MSR_IA32_MCG_STATUS, mcg.mc_gstatus);
   6.199 +
   6.200 +    for ( i = 0; i < nr_mce_banks; i++ ) {
   6.201 +        struct mcinfo_bank mcb;
    6.202 +        /* For CMCI, only the owner checks the owned MSRs */
   6.203 +        if ( !test_bit(i, __get_cpu_var(mce_banks_owned)) &&
   6.204 +			(calltype & MC_FLAG_CMCI) )
   6.205 +            continue;
   6.206 +        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
   6.207 +
   6.208 +        if (! (status & MCi_STATUS_VAL) )
   6.209 +            continue;
   6.210 +        /*
   6.211 +         * Uncorrected events are handled by the exception
   6.212 +         * handler when it is enabled. But when the exception
    6.213 +         * is disabled, such as during mcheck_init, log everything.
   6.214 +         */
   6.215 +        if ((status & MCi_STATUS_UC) && exceptions)
   6.216 +            continue;
   6.217 +
   6.218 +        if (status & MCi_STATUS_UC)
   6.219 +            uc = 1;
   6.220 +        if (status & MCi_STATUS_PCC)
   6.221 +            pcc = 1;
   6.222 +
   6.223 +        memset(&mcb, 0, sizeof(mcb));
   6.224 +        mcb.common.type = MC_TYPE_BANK;
   6.225 +        mcb.common.size = sizeof(mcb);
   6.226 +        mcb.mc_bank = i;
   6.227 +        mcb.mc_status = status;
   6.228 +        if (status & MCi_STATUS_MISCV)
   6.229 +            rdmsrl(MSR_IA32_MC0_MISC + 4 * i, mcb.mc_misc);
   6.230 +        if (status & MCi_STATUS_ADDRV) {
   6.231 +            rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
   6.232 +            d = maddr_get_owner(addr);
   6.233 +            if ( d && (calltype == MC_FLAG_CMCI || calltype == MC_FLAG_POLLED) )
   6.234 +                mcb.mc_domid = d->domain_id;
   6.235 +        }
   6.236 +        if (cmci_support)
   6.237 +            rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
   6.238 +        if (calltype == MC_FLAG_CMCI)
   6.239 +            rdtscll(mcb.mc_tsc);
   6.240 +        x86_mcinfo_add(mi, &mcb);
   6.241 +        nr_unit++;
   6.242 +        add_taint(TAINT_MACHINE_CHECK);
   6.243 +        /*Clear state for this bank */
   6.244 +        wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0);
   6.245 +        printk(KERN_DEBUG "mcheck_poll: bank%i CPU%d status[%lx]\n", 
   6.246 +                i, cpu, status);
   6.247 +        printk(KERN_DEBUG "mcheck_poll: CPU%d, SOCKET%d, CORE%d, APICID[%d], "
   6.248 +                "thread[%d]\n", cpu, mcg.mc_socketid, 
   6.249 +                mcg.mc_coreid, mcg.mc_apicid, mcg.mc_core_threadid);
   6.250 + 
   6.251 +    }
    6.252 +    /* if pcc == 1, uc must be 1 */
   6.253 +    if (pcc)
   6.254 +        mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
   6.255 +    else if (uc)
   6.256 +        mcg.mc_flags |= MC_FLAG_RECOVERABLE;
   6.257 +    else /*correctable*/
   6.258 +        mcg.mc_flags |= MC_FLAG_CORRECTABLE;
   6.259 +
   6.260 +    if (nr_unit && nr_intel_ext_msrs && 
   6.261 +                    (mcg.mc_gstatus & MCG_STATUS_EIPV)) {
   6.262 +        intel_get_extended_msrs(&mce);
   6.263 +        x86_mcinfo_add(mi, &mce);
   6.264 +    }
   6.265 +    if (nr_unit) 
   6.266 +        x86_mcinfo_add(mi, &mcg);
   6.267 +    /*Clear global state*/
   6.268 +    return nr_unit;
   6.269 +}
   6.270 +
   6.271 +static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code)
   6.272 +{
    6.273 +    /* The MACHINE CHECK error handler will be submitted in another patch;
    6.274 +     * simply copy the old solution here. This code will be replaced
    6.275 +     * by the upcoming machine check patches.
   6.276 +     */
   6.277 +
   6.278 +    int recover=1;
   6.279 +    u32 alow, ahigh, high, low;
   6.280 +    u32 mcgstl, mcgsth;
   6.281 +    int i;
   6.282 +   
   6.283 +    rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
   6.284 +    if (mcgstl & (1<<0))	/* Recoverable ? */
   6.285 +    	recover=0;
   6.286 +    
   6.287 +    printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
   6.288 +    	smp_processor_id(), mcgsth, mcgstl);
   6.289 +    
   6.290 +    for (i=0; i<nr_mce_banks; i++) {
   6.291 +    	rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
   6.292 +    	if (high & (1<<31)) {
   6.293 +    		if (high & (1<<29))
   6.294 +    			recover |= 1;
   6.295 +    		if (high & (1<<25))
   6.296 +    			recover |= 2;
   6.297 +    		printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
   6.298 +    		high &= ~(1<<31);
   6.299 +    		if (high & (1<<27)) {
   6.300 +    			rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
   6.301 +    			printk ("[%08x%08x]", ahigh, alow);
   6.302 +    		}
   6.303 +    		if (high & (1<<26)) {
   6.304 +    			rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
   6.305 +    			printk (" at %08x%08x", ahigh, alow);
   6.306 +    		}
   6.307 +    		printk ("\n");
   6.308 +    	}
   6.309 +    }
   6.310 +    
   6.311 +    if (recover & 2)
   6.312 +    	mc_panic ("CPU context corrupt");
   6.313 +    if (recover & 1)
   6.314 +    	mc_panic ("Unable to continue");
   6.315 +    
   6.316 +    printk(KERN_EMERG "Attempting to continue.\n");
   6.317 +    /* 
   6.318 +     * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
   6.319 +     * recoverable/continuable.This will allow BIOS to look at the MSRs
   6.320 +     * for errors if the OS could not log the error.
   6.321 +     */
   6.322 +    for (i=0; i<nr_mce_banks; i++) {
   6.323 +    	u32 msr;
   6.324 +    	msr = MSR_IA32_MC0_STATUS+i*4;
   6.325 +    	rdmsr (msr, low, high);
   6.326 +    	if (high&(1<<31)) {
   6.327 +    		/* Clear it */
   6.328 +    		wrmsr(msr, 0UL, 0UL);
   6.329 +    		/* Serialize */
   6.330 +    		wmb();
   6.331 +    		add_taint(TAINT_MACHINE_CHECK);
   6.332 +    	}
   6.333 +    }
   6.334 +    mcgstl &= ~(1<<2);
   6.335 +    wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
   6.336 +}
   6.337 +
   6.338 +extern void (*cpu_down_handler)(int down_cpu);
   6.339 +extern void (*cpu_down_rollback_handler)(int down_cpu);
   6.340 +extern void mce_disable_cpu(void);
   6.341 +static bool_t cmci_clear_lock = 0;
   6.342 +static DEFINE_SPINLOCK(cmci_discover_lock);
   6.343 +static DEFINE_PER_CPU(cpu_banks_t, no_cmci_banks);
   6.344 +
   6.345 +/*
   6.346 + * Discover bank sharing using the algorithm recommended in the SDM.
   6.347 + */
   6.348 +static int do_cmci_discover(int i)
   6.349 +{
   6.350 +    unsigned msr = MSR_IA32_MC0_CTL2 + i;
   6.351 +    u64 val;
   6.352 +
   6.353 +    rdmsrl(msr, val);
   6.354 +    /* Some other CPU already owns this bank. */
   6.355 +    if (val & CMCI_EN) {
   6.356 +    	clear_bit(i, __get_cpu_var(mce_banks_owned));
   6.357 +    	goto out;
   6.358 +    }
   6.359 +    wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD);
   6.360 +    rdmsrl(msr, val);
   6.361 +
   6.362 +    if (!(val & CMCI_EN)) {
   6.363 +     /*
   6.364 +      * This bank does not support CMCI. The polling
   6.365 +      * timer has to handle it. 
   6.366 +      */
   6.367 +    	set_bit(i, __get_cpu_var(no_cmci_banks));
   6.368 +    	return 0;
   6.369 +    }
   6.370 +    set_bit(i, __get_cpu_var(mce_banks_owned));
   6.371 +out:
   6.372 +    clear_bit(i, __get_cpu_var(no_cmci_banks));
   6.373 +    return 1;
   6.374 +}
   6.375 +
   6.376 +void cmci_discover(void)
   6.377 +{
   6.378 +    int i;
   6.379 +
   6.380 +    printk(KERN_DEBUG "CMCI: find owner on CPU%d\n", smp_processor_id());
   6.381 +    spin_lock(&cmci_discover_lock);
   6.382 +    for (i = 0; i < nr_mce_banks; i++) {
    6.383 +        /* If this CPU already owns the bank, there is no need to re-discover */
   6.384 +        if (test_bit(i, __get_cpu_var(mce_banks_owned)))
   6.385 +            continue;
   6.386 +        do_cmci_discover(i);
   6.387 +    }
   6.388 +    spin_unlock(&cmci_discover_lock);
   6.389 +    printk(KERN_DEBUG "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n", 
   6.390 +            smp_processor_id(), 
   6.391 +            *((unsigned long *)__get_cpu_var(mce_banks_owned)), 
   6.392 +            *((unsigned long *)__get_cpu_var(no_cmci_banks)));
   6.393 +}
   6.394 +
   6.395 +/*
   6.396 + * Define an owner for each bank. Banks can be shared between CPUs
   6.397 + * and to avoid reporting events multiple times always set up one
   6.398 + * CPU as owner. 
   6.399 + *
   6.400 + * The assignment has to be redone when CPUs go offline and
   6.401 + * any of the owners goes away. Also pollers run in parallel so we
   6.402 + * have to be careful to update the banks in a way that doesn't
   6.403 + * lose or duplicate events.
   6.404 + */
   6.405 +
   6.406 +static void mce_set_owner(void)
   6.407 +{
   6.408 +
   6.409 +    if (!cmci_support || mce_disabled == 1)
   6.410 +        return;
   6.411 +
   6.412 +    cmci_discover();
   6.413 +}
   6.414 +
   6.415 +static void clear_cmci(void)
   6.416 +{
   6.417 +    int i;
   6.418 +
   6.419 +    if (!cmci_support || mce_disabled == 1)
   6.420 +        return;
   6.421 +
   6.422 +    printk(KERN_DEBUG "CMCI: clear_cmci support on CPU%d\n", 
   6.423 +            smp_processor_id());
   6.424 +
   6.425 +    for (i = 0; i < nr_mce_banks; i++) {
   6.426 +        unsigned msr = MSR_IA32_MC0_CTL2 + i;
   6.427 +        u64 val;
   6.428 +        if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
   6.429 +            continue;
   6.430 +        rdmsrl(msr, val);
   6.431 +        if (val & (CMCI_EN|CMCI_THRESHOLD_MASK))
   6.432 +            wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
   6.433 +        clear_bit(i, __get_cpu_var(mce_banks_owned));
   6.434 +    }
   6.435 +}
   6.436 +
    6.437 +/* we need to re-set CMCI owners when cpu_down fails or when a CPU comes back up */
   6.438 +static void cmci_reenable_cpu(void *h)
   6.439 +{
   6.440 +    if (!mce_available(&current_cpu_data) || mce_disabled == 1)
   6.441 +         return;
   6.442 +    printk(KERN_DEBUG "CMCI: reenable mce on CPU%d\n", smp_processor_id());
   6.443 +    mce_set_owner();
   6.444 +    set_in_cr4(X86_CR4_MCE);
   6.445 +}
   6.446 +
    6.447 +/* When taking a CPU down, we need to re-run the CMCI owner selection algorithm.
    6.448 + * First, we clear the ownership on the dying CPU.
    6.449 + * Then, the other CPUs check whether to take over the banks owned by down_cpu.
    6.450 + * CPU0 need not, and never will, execute this path.
    6.451 + */
   6.452 +void  __cpu_clear_cmci( int down_cpu)
   6.453 +{
   6.454 +    int cpu = smp_processor_id();
   6.455 +
    6.456 +    if (!cmci_support || mce_disabled == 1)
   6.457 +        return;
   6.458 +
   6.459 +    if (cpu == 0) {
   6.460 +        printk(KERN_DEBUG "CMCI: CPU0 need not be cleared\n");
   6.461 +        return;
   6.462 +    }
   6.463 +
   6.464 +    local_irq_disable();
   6.465 +    if (cpu == down_cpu){
   6.466 +        mce_disable_cpu();
   6.467 +        clear_cmci();
   6.468 +        wmb();
   6.469 +        test_and_set_bool(cmci_clear_lock);
   6.470 +        return;
   6.471 +    }
   6.472 +    while (!cmci_clear_lock)
   6.473 +        cpu_relax();
   6.474 +    if (cpu != down_cpu)
   6.475 +        mce_set_owner();
   6.476 +
   6.477 +    test_and_clear_bool(cmci_clear_lock);
   6.478 +    local_irq_enable();
   6.479 +
   6.480 +}
   6.481 +
   6.482 +void  __cpu_clear_cmci_rollback( int down_cpu)
   6.483 +{
   6.484 +    cpumask_t down_map;
   6.485 +    if (!cmci_support || mce_disabled == 1) 
   6.486 +        return;
   6.487 +
   6.488 +    cpus_clear(down_map);
   6.489 +    cpu_set(down_cpu, down_map);
    6.490 +    printk(KERN_ERR "CMCI: cpu_down failed. "
    6.491 +        "Re-enabling CMCI on CPU%d\n", down_cpu);
   6.492 +    on_selected_cpus(down_map, cmci_reenable_cpu, NULL, 1, 1);
   6.493 +}
   6.494 +
   6.495 +static void intel_init_cmci(struct cpuinfo_x86 *c)
   6.496 +{
   6.497 +    u32 l, apic;
   6.498 +    int cpu = smp_processor_id();
   6.499 +
   6.500 +    if (!mce_available(c) || !cmci_support) {
   6.501 +        printk(KERN_DEBUG "CMCI: CPU%d has no CMCI support\n", cpu);
   6.502 +        return;
   6.503 +    }
   6.504 +
   6.505 +    apic = apic_read(APIC_CMCI);
   6.506 +    if ( apic & APIC_VECTOR_MASK )
   6.507 +    {
   6.508 +        printk(KERN_WARNING "CPU%d CMCI LVT vector (%#x) already installed\n",
   6.509 +            cpu, ( apic & APIC_VECTOR_MASK ));
   6.510 +        return;
   6.511 +    }
   6.512 +
   6.513 +    apic = CMCI_APIC_VECTOR;
   6.514 +    apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
   6.515 +    apic_write_around(APIC_CMCI, apic);
   6.516 +
    6.517 +    /* now clear the mask flag */
   6.518 +    l = apic_read(APIC_CMCI);
   6.519 +    apic_write_around(APIC_CMCI, l & ~APIC_LVT_MASKED);
   6.520 +    cpu_down_handler =  __cpu_clear_cmci;
   6.521 +    cpu_down_rollback_handler = __cpu_clear_cmci_rollback; 
   6.522 +}
   6.523 +
   6.524 +fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
   6.525 +{
   6.526 +    int nr_unit;
   6.527 +    struct mc_info *mi =  x86_mcinfo_getptr();
   6.528 +    int cpu = smp_processor_id();
   6.529 +
   6.530 +    ack_APIC_irq();
   6.531 +    irq_enter();
   6.532 +    printk(KERN_DEBUG "CMCI: cmci_intr happen on CPU%d\n", cpu);
   6.533 +    nr_unit = machine_check_poll(mi, MC_FLAG_CMCI);
   6.534 +    if (nr_unit) {
   6.535 +        x86_mcinfo_dump(mi);
   6.536 +        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
   6.537 +            send_guest_global_virq(dom0, VIRQ_MCA);
   6.538 +    }
   6.539 +    irq_exit();
   6.540 +}
   6.541 +
   6.542 +void mce_intel_feature_init(struct cpuinfo_x86 *c)
   6.543 +{
   6.544 +
   6.545 +#ifdef CONFIG_X86_MCE_THERMAL
   6.546 +    intel_init_thermal(c);
   6.547 +#endif
   6.548 +    intel_init_cmci(c);
   6.549 +}
   6.550 +
   6.551 +static void mce_cap_init(struct cpuinfo_x86 *c)
   6.552 +{
   6.553 +    u32 l, h;
   6.554 +
   6.555 +    rdmsr (MSR_IA32_MCG_CAP, l, h);
   6.556 +    if ((l & MCG_CMCI_P) && cpu_has_apic)
   6.557 +        cmci_support = 1;
   6.558 +
   6.559 +    nr_mce_banks = l & 0xff;
   6.560 +    if (nr_mce_banks > MAX_NR_BANKS)
   6.561 +        printk(KERN_WARNING "MCE: exceed max mce banks\n");
   6.562 +    if (l & MCG_EXT_P)
   6.563 +    {
   6.564 +        nr_intel_ext_msrs = (l >> MCG_EXT_CNT) & 0xff;
   6.565 +        printk (KERN_INFO "CPU%d: Intel Extended MCE MSRs (%d) available\n",
   6.566 +            smp_processor_id(), nr_intel_ext_msrs);
   6.567 +    }
    6.568 +    /* For most of the P6 family, bank 0 is an alias to a BIOS MSR.
    6.569 +     * But from model 0x1A onwards, bank 0 is available. */
   6.570 +    if ( c->x86 == 6 && c->x86_vendor == X86_VENDOR_INTEL
   6.571 +            && c->x86_model < 0x1A)
   6.572 +        firstbank = 1;
   6.573 +    else
   6.574 +        firstbank = 0;
   6.575 +}
   6.576 +
   6.577 +static void mce_init(void)
   6.578 +{
   6.579 +    u32 l, h;
   6.580 +    int i, nr_unit;
   6.581 +    struct mc_info *mi =  x86_mcinfo_getptr();
   6.582 +    clear_in_cr4(X86_CR4_MCE);
   6.583 +    /* log the machine checks left over from the previous reset.
   6.584 +     * This also clears all registers*/
   6.585 +
   6.586 +    nr_unit = machine_check_poll(mi, MC_FLAG_RESET);
    6.587 +    /* in the boot-up stage we do not expect to inject into DOM0,
    6.588 +     * but we do print it out */
   6.589 +    if (nr_unit > 0)
   6.590 +        x86_mcinfo_dump(mi);
   6.591 +
   6.592 +    set_in_cr4(X86_CR4_MCE);
   6.593 +    rdmsr (MSR_IA32_MCG_CAP, l, h);
   6.594 +    if (l & MCG_CTL_P)	/* Control register present ? */
   6.595 +        wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
   6.596 +
   6.597 +    for (i = firstbank; i < nr_mce_banks; i++)
   6.598 +    {
    6.599 +        /* Some banks are shared across cores; use MCi_CTL to check whether
    6.600 +         * this bank has already been initialized by another core. */
   6.601 +        rdmsr(MSR_IA32_MC0_CTL + 4*i, l, h);
    6.602 +        if (!l && !h)
   6.603 +        {
   6.604 +            /*if ctl is 0, this bank is never initialized*/
   6.605 +            printk(KERN_DEBUG "mce_init: init bank%d\n", i);
   6.606 +            wrmsr (MSR_IA32_MC0_CTL + 4*i, 0xffffffff, 0xffffffff);
   6.607 +            wrmsr (MSR_IA32_MC0_STATUS + 4*i, 0x0, 0x0);
   6.608 +       }
   6.609 +    }
    6.610 +    if (firstbank) /* if CMCI is enabled, firstbank = 0 */
   6.611 +        wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
   6.612 +}
   6.613 +
    6.614 +/* The P4/P6 families have a similar MCA initialization process */
   6.615 +void intel_mcheck_init(struct cpuinfo_x86 *c)
   6.616 +{
   6.617 +	
   6.618 +	mce_cap_init(c);
   6.619 +	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
   6.620 +		smp_processor_id());
   6.621 +	/* machine check is available */
   6.622 +	machine_check_vector = intel_machine_check;
   6.623 +	mce_init();
   6.624 +	mce_intel_feature_init(c);
   6.625 +	mce_set_owner();
   6.626 +}
   6.627 +
   6.628 +/*
   6.629 + * Periodic polling timer for "silent" machine check errors. If the
   6.630 + * poller finds an MCE, poll faster. When the poller finds no more 
    6.631 + * errors, poll more slowly.
    6.632 + */
   6.633 +static struct timer mce_timer;
   6.634 +
   6.635 +#define MCE_PERIOD 4000
   6.636 +#define MCE_MIN    2000
   6.637 +#define MCE_MAX    32000
   6.638 +
   6.639 +static u64 period = MCE_PERIOD;
   6.640 +static int adjust = 0;
   6.641 +
   6.642 +static void mce_intel_checkregs(void *info)
   6.643 +{
   6.644 +    int nr_unit;
   6.645 +    struct mc_info *mi =  x86_mcinfo_getptr();
   6.646 +
   6.647 +    if( !mce_available(&current_cpu_data))
   6.648 +        return;
   6.649 +    nr_unit = machine_check_poll(mi, MC_FLAG_POLLED);
   6.650 +    if (nr_unit)
   6.651 +    {
   6.652 +        x86_mcinfo_dump(mi);
   6.653 +        adjust++;
   6.654 +        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
   6.655 +            send_guest_global_virq(dom0, VIRQ_MCA);
   6.656 +    }
   6.657 +}
   6.658 +
   6.659 +static void mce_intel_work_fn(void *data)
   6.660 +{
   6.661 +    on_each_cpu(mce_intel_checkregs, data, 1, 1);
   6.662 +    if (adjust) {
   6.663 +        period = period / (adjust + 1);
    6.664 +        printk(KERN_DEBUG "mcheck_poll: Found error, shortening interval to %"PRIu64"\n",
   6.665 +            period);
   6.666 +    }
   6.667 +    else {
   6.668 +        period *= 2;
   6.669 +    }
   6.670 +    if (period > MCE_MAX) 
   6.671 +        period = MCE_MAX;
   6.672 +    if (period < MCE_MIN)
   6.673 +        period = MCE_MIN;
   6.674 +    set_timer(&mce_timer, NOW() + MILLISECS(period));
   6.675 +    adjust = 0;
   6.676 +}
   6.677 +
   6.678 +void intel_mcheck_timer(struct cpuinfo_x86 *c)
   6.679 +{
   6.680 +    printk(KERN_DEBUG "mcheck_poll: Init_mcheck_timer\n");
   6.681 +    init_timer(&mce_timer, mce_intel_work_fn, NULL, 0);
   6.682 +    set_timer(&mce_timer, NOW() + MILLISECS(MCE_PERIOD));
   6.683 +}
   6.684 +
     7.1 --- a/xen/arch/x86/cpu/mcheck/non-fatal.c	Fri Dec 19 14:56:36 2008 +0000
     7.2 +++ b/xen/arch/x86/cpu/mcheck/non-fatal.c	Mon Dec 22 08:12:33 2008 +0000
     7.3 @@ -19,8 +19,8 @@
     7.4  #include <asm/msr.h>
     7.5  
     7.6  #include "mce.h"
     7.7 -
     7.8 -static int firstbank;
     7.9 +#include "x86_mca.h"
    7.10 +int firstbank = 0;
    7.11  static struct timer mce_timer;
    7.12  
    7.13  #define MCE_PERIOD MILLISECS(15000)
    7.14 @@ -61,13 +61,8 @@ static int __init init_nonfatal_mce_chec
    7.15  	struct cpuinfo_x86 *c = &boot_cpu_data;
    7.16  
    7.17  	/* Check for MCE support */
    7.18 -	if (!cpu_has(c, X86_FEATURE_MCE))
    7.19 +	if (!mce_available(c))
    7.20  		return -ENODEV;
    7.21 -
    7.22 -	/* Check for PPro style MCA */
    7.23 -	if (!cpu_has(c, X86_FEATURE_MCA))
    7.24 -		return -ENODEV;
    7.25 -
    7.26  	/*
    7.27  	 * Check for non-fatal errors every MCE_RATE s
    7.28  	 */
    7.29 @@ -85,12 +80,20 @@ static int __init init_nonfatal_mce_chec
    7.30  		break;
    7.31  
    7.32  	case X86_VENDOR_INTEL:
    7.33 -		init_timer(&mce_timer, mce_work_fn, NULL, 0);
    7.34 -		set_timer(&mce_timer, NOW() + MCE_PERIOD);
     7.35 +		/* The P5 family is different. P4/P6 and later CPUs share the
     7.36 +		 * same polling method.
     7.37 +		 */
    7.38 +		if ( c->x86 != 5 )
    7.39 +		{
     7.40 +			/* some CPUs or banks don't support CMCI, so we need
     7.41 +			 * to enable the polling timer anyway
    7.42 +			 */
    7.43 +			intel_mcheck_timer(c);
    7.44 +		}
    7.45  		break;
    7.46  	}
    7.47  
    7.48 -	printk(KERN_INFO "MCA: Machine check polling timer started.\n");
    7.49 +	printk(KERN_INFO "mcheck_poll: Machine check polling timer started.\n");
    7.50  	return 0;
    7.51  }
    7.52  __initcall(init_nonfatal_mce_checker);
     8.1 --- a/xen/arch/x86/cpu/mcheck/p4.c	Fri Dec 19 14:56:36 2008 +0000
     8.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.3 @@ -1,270 +0,0 @@
     8.4 -/*
     8.5 - * P4 specific Machine Check Exception Reporting
     8.6 - */
     8.7 -
     8.8 -#include <xen/init.h>
     8.9 -#include <xen/types.h>
    8.10 -#include <xen/kernel.h>
    8.11 -#include <xen/config.h>
    8.12 -#include <xen/smp.h>
    8.13 -#include <xen/irq.h>
    8.14 -#include <xen/time.h>
    8.15 -#include <asm/processor.h> 
    8.16 -#include <asm/system.h>
    8.17 -#include <asm/msr.h>
    8.18 -#include <asm/apic.h>
    8.19 -
    8.20 -#include "mce.h"
    8.21 -
    8.22 -/* as supported by the P4/Xeon family */
    8.23 -struct intel_mce_extended_msrs {
    8.24 -	u32 eax;
    8.25 -	u32 ebx;
    8.26 -	u32 ecx;
    8.27 -	u32 edx;
    8.28 -	u32 esi;
    8.29 -	u32 edi;
    8.30 -	u32 ebp;
    8.31 -	u32 esp;
    8.32 -	u32 eflags;
    8.33 -	u32 eip;
    8.34 -	/* u32 *reserved[]; */
    8.35 -};
    8.36 -
    8.37 -static int mce_num_extended_msrs = 0;
    8.38 -
    8.39 -
    8.40 -#ifdef CONFIG_X86_MCE_P4THERMAL
    8.41 -static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
    8.42 -{	
    8.43 -	printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
    8.44 -			smp_processor_id());
    8.45 -	add_taint(TAINT_MACHINE_CHECK);
    8.46 -}
    8.47 -
    8.48 -/* P4/Xeon Thermal transition interrupt handler */
    8.49 -static void intel_thermal_interrupt(struct cpu_user_regs *regs)
    8.50 -{
    8.51 -	u32 l, h;
    8.52 -	unsigned int cpu = smp_processor_id();
    8.53 -	static s_time_t next[NR_CPUS];
    8.54 -
    8.55 -	ack_APIC_irq();
    8.56 -
    8.57 -	if (NOW() < next[cpu])
    8.58 -		return;
    8.59 -
    8.60 -	next[cpu] = NOW() + MILLISECS(5000);
    8.61 -	rdmsr(MSR_IA32_THERM_STATUS, l, h);
    8.62 -	if (l & 0x1) {
    8.63 -		printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
    8.64 -		printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
    8.65 -				cpu);
    8.66 -		add_taint(TAINT_MACHINE_CHECK);
    8.67 -	} else {
    8.68 -		printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
    8.69 -	}
    8.70 -}
    8.71 -
    8.72 -/* Thermal interrupt handler for this CPU setup */
    8.73 -static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs) = unexpected_thermal_interrupt;
    8.74 -
    8.75 -fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs)
    8.76 -{
    8.77 -	irq_enter();
    8.78 -	vendor_thermal_interrupt(regs);
    8.79 -	irq_exit();
    8.80 -}
    8.81 -
    8.82 -/* P4/Xeon Thermal regulation detect and init */
    8.83 -static void intel_init_thermal(struct cpuinfo_x86 *c)
    8.84 -{
    8.85 -	u32 l, h;
    8.86 -	unsigned int cpu = smp_processor_id();
    8.87 -
    8.88 -	/* Thermal monitoring */
    8.89 -	if (!cpu_has(c, X86_FEATURE_ACPI))
    8.90 -		return;	/* -ENODEV */
    8.91 -
    8.92 -	/* Clock modulation */
    8.93 -	if (!cpu_has(c, X86_FEATURE_ACC))
    8.94 -		return;	/* -ENODEV */
    8.95 -
    8.96 -	/* first check if its enabled already, in which case there might
    8.97 -	 * be some SMM goo which handles it, so we can't even put a handler
    8.98 -	 * since it might be delivered via SMI already -zwanem.
    8.99 -	 */
   8.100 -	rdmsr (MSR_IA32_MISC_ENABLE, l, h);
   8.101 -	h = apic_read(APIC_LVTTHMR);
   8.102 -	if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
   8.103 -		printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
   8.104 -				cpu);
   8.105 -		return; /* -EBUSY */
   8.106 -	}
   8.107 -
   8.108 -	/* check whether a vector already exists, temporarily masked? */	
   8.109 -	if (h & APIC_VECTOR_MASK) {
   8.110 -		printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
   8.111 -				"installed\n",
   8.112 -			cpu, (h & APIC_VECTOR_MASK));
   8.113 -		return; /* -EBUSY */
   8.114 -	}
   8.115 -
   8.116 -	/* The temperature transition interrupt handler setup */
   8.117 -	h = THERMAL_APIC_VECTOR;		/* our delivery vector */
   8.118 -	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */
   8.119 -	apic_write_around(APIC_LVTTHMR, h);
   8.120 -
   8.121 -	rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
   8.122 -	wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
   8.123 -
   8.124 -	/* ok we're good to go... */
   8.125 -	vendor_thermal_interrupt = intel_thermal_interrupt;
   8.126 -	
   8.127 -	rdmsr (MSR_IA32_MISC_ENABLE, l, h);
   8.128 -	wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
   8.129 -	
   8.130 -	l = apic_read (APIC_LVTTHMR);
   8.131 -	apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
   8.132 -	printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
   8.133 -	return;
   8.134 -}
   8.135 -#endif /* CONFIG_X86_MCE_P4THERMAL */
   8.136 -
   8.137 -
   8.138 -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
   8.139 -static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
   8.140 -{
   8.141 -	u32 h;
   8.142 -
   8.143 -	if (mce_num_extended_msrs == 0)
   8.144 -		goto done;
   8.145 -
   8.146 -	rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
   8.147 -	rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
   8.148 -	rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
   8.149 -	rdmsr (MSR_IA32_MCG_EDX, r->edx, h);
   8.150 -	rdmsr (MSR_IA32_MCG_ESI, r->esi, h);
   8.151 -	rdmsr (MSR_IA32_MCG_EDI, r->edi, h);
   8.152 -	rdmsr (MSR_IA32_MCG_EBP, r->ebp, h);
   8.153 -	rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
   8.154 -	rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
   8.155 -	rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
   8.156 -
   8.157 -	/* can we rely on kmalloc to do a dynamic
   8.158 -	 * allocation for the reserved registers?
   8.159 -	 */
   8.160 -done:
   8.161 -	return mce_num_extended_msrs;
   8.162 -}
   8.163 -
   8.164 -static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code)
   8.165 -{
   8.166 -	int recover=1;
   8.167 -	u32 alow, ahigh, high, low;
   8.168 -	u32 mcgstl, mcgsth;
   8.169 -	int i;
   8.170 -	struct intel_mce_extended_msrs dbg;
   8.171 -
   8.172 -	rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
   8.173 -	if (mcgstl & (1<<0))	/* Recoverable ? */
   8.174 -		recover=0;
   8.175 -
   8.176 -	printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
   8.177 -		smp_processor_id(), mcgsth, mcgstl);
   8.178 -
   8.179 -	if (intel_get_extended_msrs(&dbg)) {
   8.180 -		printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
   8.181 -			smp_processor_id(), dbg.eip, dbg.eflags);
   8.182 -		printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
   8.183 -			dbg.eax, dbg.ebx, dbg.ecx, dbg.edx);
   8.184 -		printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
   8.185 -			dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
   8.186 -	}
   8.187 -
   8.188 -	for (i=0; i<nr_mce_banks; i++) {
   8.189 -		rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
   8.190 -		if (high & (1<<31)) {
   8.191 -			if (high & (1<<29))
   8.192 -				recover |= 1;
   8.193 -			if (high & (1<<25))
   8.194 -				recover |= 2;
   8.195 -			printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
   8.196 -			high &= ~(1<<31);
   8.197 -			if (high & (1<<27)) {
   8.198 -				rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
   8.199 -				printk ("[%08x%08x]", ahigh, alow);
   8.200 -			}
   8.201 -			if (high & (1<<26)) {
   8.202 -				rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
   8.203 -				printk (" at %08x%08x", ahigh, alow);
   8.204 -			}
   8.205 -			printk ("\n");
   8.206 -		}
   8.207 -	}
   8.208 -
   8.209 -	if (recover & 2)
   8.210 -		mc_panic ("CPU context corrupt");
   8.211 -	if (recover & 1)
   8.212 -		mc_panic ("Unable to continue");
   8.213 -
   8.214 -	printk(KERN_EMERG "Attempting to continue.\n");
   8.215 -	/* 
   8.216 -	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
   8.217 -	 * recoverable/continuable.This will allow BIOS to look at the MSRs
   8.218 -	 * for errors if the OS could not log the error.
   8.219 -	 */
   8.220 -	for (i=0; i<nr_mce_banks; i++) {
   8.221 -		u32 msr;
   8.222 -		msr = MSR_IA32_MC0_STATUS+i*4;
   8.223 -		rdmsr (msr, low, high);
   8.224 -		if (high&(1<<31)) {
   8.225 -			/* Clear it */
   8.226 -			wrmsr(msr, 0UL, 0UL);
   8.227 -			/* Serialize */
   8.228 -			wmb();
   8.229 -			add_taint(TAINT_MACHINE_CHECK);
   8.230 -		}
   8.231 -	}
   8.232 -	mcgstl &= ~(1<<2);
   8.233 -	wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
   8.234 -}
   8.235 -
   8.236 -
   8.237 -void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
   8.238 -{
   8.239 -	u32 l, h;
   8.240 -	int i;
   8.241 -	
   8.242 -	machine_check_vector = intel_machine_check;
   8.243 -	wmb();
   8.244 -
   8.245 -	printk (KERN_INFO "Intel machine check architecture supported.\n");
   8.246 -	rdmsr (MSR_IA32_MCG_CAP, l, h);
   8.247 -	if (l & (1<<8))	/* Control register present ? */
   8.248 -		wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
   8.249 -	nr_mce_banks = l & 0xff;
   8.250 -
   8.251 -	for (i=0; i<nr_mce_banks; i++) {
   8.252 -		wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
   8.253 -		wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
   8.254 -	}
   8.255 -
   8.256 -	set_in_cr4 (X86_CR4_MCE);
   8.257 -	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
   8.258 -		smp_processor_id());
   8.259 -
   8.260 -	/* Check for P4/Xeon extended MCE MSRs */
   8.261 -	rdmsr (MSR_IA32_MCG_CAP, l, h);
   8.262 -	if (l & (1<<9))	{/* MCG_EXT_P */
   8.263 -		mce_num_extended_msrs = (l >> 16) & 0xff;
   8.264 -		printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
   8.265 -				" available\n",
   8.266 -			smp_processor_id(), mce_num_extended_msrs);
   8.267 -
   8.268 -#ifdef CONFIG_X86_MCE_P4THERMAL
   8.269 -		/* Check for P4/Xeon Thermal monitor */
   8.270 -		intel_init_thermal(c);
   8.271 -#endif
   8.272 -	}
   8.273 -}
     9.1 --- a/xen/arch/x86/cpu/mcheck/p6.c	Fri Dec 19 14:56:36 2008 +0000
     9.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.3 @@ -1,118 +0,0 @@
     9.4 -/*
     9.5 - * P6 specific Machine Check Exception Reporting
     9.6 - * (C) Copyright 2002 Alan Cox <alan@redhat.com>
     9.7 - */
     9.8 -
     9.9 -#include <xen/init.h>
    9.10 -#include <xen/types.h>
    9.11 -#include <xen/kernel.h>
    9.12 -#include <xen/smp.h>
    9.13 -
    9.14 -#include <asm/processor.h> 
    9.15 -#include <asm/system.h>
    9.16 -#include <asm/msr.h>
    9.17 -
    9.18 -#include "mce.h"
    9.19 -
    9.20 -/* Machine Check Handler For PII/PIII */
    9.21 -static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code)
    9.22 -{
    9.23 -	int recover=1;
    9.24 -	u32 alow, ahigh, high, low;
    9.25 -	u32 mcgstl, mcgsth;
    9.26 -	int i;
    9.27 -
    9.28 -	rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
    9.29 -	if (mcgstl & (1<<0))	/* Recoverable ? */
    9.30 -		recover=0;
    9.31 -
    9.32 -	printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
    9.33 -		smp_processor_id(), mcgsth, mcgstl);
    9.34 -
    9.35 -	for (i=0; i<nr_mce_banks; i++) {
    9.36 -		rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
    9.37 -		if (high & (1<<31)) {
    9.38 -			if (high & (1<<29))
    9.39 -				recover |= 1;
    9.40 -			if (high & (1<<25))
    9.41 -				recover |= 2;
    9.42 -			printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
    9.43 -			high &= ~(1<<31);
    9.44 -			if (high & (1<<27)) {
    9.45 -				rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
    9.46 -				printk ("[%08x%08x]", ahigh, alow);
    9.47 -			}
    9.48 -			if (high & (1<<26)) {
    9.49 -				rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
    9.50 -				printk (" at %08x%08x", ahigh, alow);
    9.51 -			}
    9.52 -			printk ("\n");
    9.53 -		}
    9.54 -	}
    9.55 -
    9.56 -	if (recover & 2)
    9.57 -		mc_panic ("CPU context corrupt");
    9.58 -	if (recover & 1)
    9.59 -		mc_panic ("Unable to continue");
    9.60 -
    9.61 -	printk (KERN_EMERG "Attempting to continue.\n");
    9.62 -	/* 
    9.63 -	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
    9.64 -	 * recoverable/continuable.This will allow BIOS to look at the MSRs
    9.65 -	 * for errors if the OS could not log the error.
    9.66 -	 */
    9.67 -	for (i=0; i<nr_mce_banks; i++) {
    9.68 -		unsigned int msr;
    9.69 -		msr = MSR_IA32_MC0_STATUS+i*4;
    9.70 -		rdmsr (msr,low, high);
    9.71 -		if (high & (1<<31)) {
    9.72 -			/* Clear it */
    9.73 -			wrmsr (msr, 0UL, 0UL);
    9.74 -			/* Serialize */
    9.75 -			wmb();
    9.76 -			add_taint(TAINT_MACHINE_CHECK);
    9.77 -		}
    9.78 -	}
    9.79 -	mcgstl &= ~(1<<2);
    9.80 -	wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
    9.81 -}
    9.82 -
    9.83 -/* Set up machine check reporting for processors with Intel style MCE */
    9.84 -void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
    9.85 -{
    9.86 -	u32 l, h;
    9.87 -	int i;
    9.88 -	
    9.89 -	/* Check for MCE support */
    9.90 -	if (!cpu_has(c, X86_FEATURE_MCE))
    9.91 -		return;
    9.92 -
    9.93 -	/* Check for PPro style MCA */
    9.94 - 	if (!cpu_has(c, X86_FEATURE_MCA))
    9.95 -		return;
    9.96 -
    9.97 -	/* Ok machine check is available */
    9.98 -	machine_check_vector = intel_machine_check;
    9.99 -	wmb();
   9.100 -
   9.101 -	printk (KERN_INFO "Intel machine check architecture supported.\n");
   9.102 -	rdmsr (MSR_IA32_MCG_CAP, l, h);
   9.103 -	if (l & (1<<8))	/* Control register present ? */
   9.104 -		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
   9.105 -	nr_mce_banks = l & 0xff;
   9.106 -
   9.107 -	/*
   9.108 -	 * Following the example in IA-32 SDM Vol 3:
   9.109 -	 * - MC0_CTL should not be written
   9.110 -	 * - Status registers on all banks should be cleared on reset
   9.111 -	 */
   9.112 -	for (i=1; i<nr_mce_banks; i++)
   9.113 -		wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
   9.114 -
   9.115 -	for (i=0; i<nr_mce_banks; i++)
   9.116 -		wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
   9.117 -
   9.118 -	set_in_cr4 (X86_CR4_MCE);
   9.119 -	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
   9.120 -		smp_processor_id());
   9.121 -}
    10.1 --- a/xen/arch/x86/cpu/mcheck/x86_mca.h	Fri Dec 19 14:56:36 2008 +0000
    10.2 +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Mon Dec 22 08:12:33 2008 +0000
    10.3 @@ -28,7 +28,10 @@
    10.4  /* Bitfield of the MSR_IA32_MCG_CAP register */
    10.5  #define MCG_CAP_COUNT           0x00000000000000ffULL
    10.6  #define MCG_CTL_P               0x0000000000000100ULL
    10.7 -/* Bits 9-63 are reserved */
    10.8 +#define MCG_EXT_P		(1UL<<9)
    10.9 +#define MCG_EXT_CNT		(16)
   10.10 +#define MCG_CMCI_P		(1UL<<10)
   10.11 +/* Other bits are reserved */
   10.12  
   10.13  /* Bitfield of the MSR_IA32_MCG_STATUS register */
   10.14  #define MCG_STATUS_RIPV         0x0000000000000001ULL
   10.15 @@ -70,3 +73,17 @@
   10.16  /* reserved bits */
   10.17  #define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
   10.18  
    10.19 +/* Intel-specific bitfields */
   10.20 +#define CMCI_THRESHOLD			0x2
   10.21 +
   10.22 +
   10.23 +#define MAX_NR_BANKS 128
   10.24 +
   10.25 +typedef DECLARE_BITMAP(cpu_banks_t, MAX_NR_BANKS);
   10.26 +DECLARE_PER_CPU(cpu_banks_t, mce_banks_owned);
   10.27 +
   10.28 +/* Global variables */
   10.29 +extern int mce_disabled;
   10.30 +extern unsigned int nr_mce_banks;
   10.31 +extern int firstbank;
   10.32 +
    11.1 --- a/xen/arch/x86/hvm/vmx/vmx.c	Fri Dec 19 14:56:36 2008 +0000
    11.2 +++ b/xen/arch/x86/hvm/vmx/vmx.c	Mon Dec 22 08:12:33 2008 +0000
    11.3 @@ -2030,7 +2030,8 @@ static void vmx_do_extint(struct cpu_use
    11.4      fastcall void smp_spurious_interrupt(struct cpu_user_regs *regs);
    11.5      fastcall void smp_error_interrupt(struct cpu_user_regs *regs);
    11.6      fastcall void smp_pmu_apic_interrupt(struct cpu_user_regs *regs);
    11.7 -#ifdef CONFIG_X86_MCE_P4THERMAL
    11.8 +    fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs);
    11.9 +#ifdef CONFIG_X86_MCE_THERMAL
   11.10      fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs);
   11.11  #endif
   11.12  
   11.13 @@ -2060,10 +2061,13 @@ static void vmx_do_extint(struct cpu_use
   11.14      case ERROR_APIC_VECTOR:
   11.15          smp_error_interrupt(regs);
   11.16          break;
   11.17 +    case CMCI_APIC_VECTOR:
   11.18 +        smp_cmci_interrupt(regs);
   11.19 +        break;
   11.20      case PMU_APIC_VECTOR:
   11.21          smp_pmu_apic_interrupt(regs);
   11.22          break;
   11.23 -#ifdef CONFIG_X86_MCE_P4THERMAL
   11.24 +#ifdef CONFIG_X86_MCE_THERMAL
   11.25      case THERMAL_APIC_VECTOR:
   11.26          smp_thermal_interrupt(regs);
   11.27          break;
    12.1 --- a/xen/arch/x86/i8259.c	Fri Dec 19 14:56:36 2008 +0000
    12.2 +++ b/xen/arch/x86/i8259.c	Mon Dec 22 08:12:33 2008 +0000
    12.3 @@ -74,6 +74,7 @@ BUILD_SMP_INTERRUPT(error_interrupt,ERRO
    12.4  BUILD_SMP_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
    12.5  BUILD_SMP_INTERRUPT(pmu_apic_interrupt,PMU_APIC_VECTOR)
    12.6  BUILD_SMP_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
    12.7 +BUILD_SMP_INTERRUPT(cmci_interrupt, CMCI_APIC_VECTOR)
    12.8  
    12.9  #define IRQ(x,y) \
   12.10      IRQ##x##y##_interrupt
    13.1 --- a/xen/arch/x86/smpboot.c	Fri Dec 19 14:56:36 2008 +0000
    13.2 +++ b/xen/arch/x86/smpboot.c	Mon Dec 22 08:12:33 2008 +0000
    13.3 @@ -1237,11 +1237,25 @@ remove_siblinginfo(int cpu)
    13.4  }
    13.5  
    13.6  extern void fixup_irqs(cpumask_t map);
    13.7 -int __cpu_disable(void)
    13.8 +
     13.9 +/*
    13.10 + * Hooks called when taking a CPU offline.
    13.11 + * Recent Intel CPU families need extra processing on CPU
    13.12 + * hotplug, such as handing CMCI bank ownership over from
    13.13 + * the CPU that is going down.
    13.14 + */
   13.15 +void (*cpu_down_handler)(int down_cpu) = NULL;
   13.16 +void (*cpu_down_rollback_handler)(int down_cpu) = NULL;
   13.17 +
   13.18 +
   13.19 +int __cpu_disable(int down_cpu)
   13.20  {
   13.21  	cpumask_t map = cpu_online_map;
   13.22  	int cpu = smp_processor_id();
   13.23  
    13.24 +	/* Only down_cpu needs to execute this function. */
   13.25 +	if (cpu != down_cpu)
   13.26 +		return 0;
   13.27  	/*
   13.28  	 * Perhaps use cpufreq to drop frequency, but that could go
   13.29  	 * into generic code.
   13.30 @@ -1293,10 +1307,14 @@ void __cpu_die(unsigned int cpu)
   13.31  	}
   13.32   	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
   13.33  }
   13.34 +static int take_cpu_down(void *down_cpu)
   13.35 +{
   13.36  
   13.37 -static int take_cpu_down(void *unused)
   13.38 -{
   13.39 -    return __cpu_disable();
   13.40 +    if (cpu_down_handler)
   13.41 +        cpu_down_handler(*(int *)down_cpu);
   13.42 +    wmb();
   13.43 +
   13.44 +    return __cpu_disable(*(int *)down_cpu);
   13.45  }
   13.46  
   13.47  int cpu_down(unsigned int cpu)
   13.48 @@ -1322,7 +1340,7 @@ int cpu_down(unsigned int cpu)
   13.49  
   13.50  	printk("Prepare to bring CPU%d down...\n", cpu);
   13.51  
   13.52 -	err = stop_machine_run(take_cpu_down, NULL, cpu);
   13.53 +	err = stop_machine_run(take_cpu_down, &cpu, cpu_online_map);
   13.54  	if ( err < 0 )
   13.55  		goto out;
   13.56  
   13.57 @@ -1333,6 +1351,10 @@ int cpu_down(unsigned int cpu)
   13.58  		err = -EBUSY;
   13.59  	}
   13.60  out:
    13.61 +	/* If the CPU offline failed, re-check CMCI bank ownership. */
    13.62 +
    13.63 +	if ( err < 0 && cpu_down_rollback_handler )
    13.64 +		cpu_down_rollback_handler(cpu);
   13.65  	spin_unlock(&cpu_add_remove_lock);
   13.66  	return err;
   13.67  }
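
cpu_down_handler and cpu_down_rollback_handler are plain function pointers, so the machine-check code can hook CPU offlining without smpboot.c knowing anything about CMCI: the first runs in take_cpu_down() on every CPU (before __cpu_disable() filters for the victim), the second runs only if the offline attempt fails. A hedged sketch of how the MCE code might install such hooks (the handler names are illustrative; the real ones live in mce_intel.c, which is not shown in this hunk):

    extern void (*cpu_down_handler)(int down_cpu);
    extern void (*cpu_down_rollback_handler)(int down_cpu);

    /* Illustrative handlers: surviving CPUs claim the banks the dying
     * CPU owned, or hand them back if the offline is rolled back. */
    static void cmci_handoff(int down_cpu)      { /* re-scan bank ownership */ }
    static void cmci_handoff_undo(int down_cpu) { /* restore bank ownership */ }

    static void __init install_cmci_hotplug_hooks(void)
    {
        cpu_down_handler = cmci_handoff;
        cpu_down_rollback_handler = cmci_handoff_undo;
    }
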
    14.1 --- a/xen/common/stop_machine.c	Fri Dec 19 14:56:36 2008 +0000
    14.2 +++ b/xen/common/stop_machine.c	Mon Dec 22 08:12:33 2008 +0000
    14.3 @@ -45,7 +45,7 @@ struct stopmachine_data {
    14.4      enum stopmachine_state state;
    14.5      atomic_t done;
    14.6  
    14.7 -    unsigned int fn_cpu;
    14.8 +    cpumask_t fn_cpus;
    14.9      int fn_result;
   14.10      int (*fn)(void *);
   14.11      void *fn_data;
   14.12 @@ -63,21 +63,22 @@ static void stopmachine_set_state(enum s
   14.13          cpu_relax();
   14.14  }
   14.15  
   14.16 -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
   14.17 +int stop_machine_run(int (*fn)(void *), void *data, cpumask_t cpus)
   14.18  {
   14.19      cpumask_t allbutself;
   14.20      unsigned int i, nr_cpus;
   14.21 -    int ret;
   14.22 +    int cur_cpu, ret;
   14.23  
   14.24      BUG_ON(!local_irq_is_enabled());
   14.25  
   14.26      allbutself = cpu_online_map;
   14.27 -    cpu_clear(smp_processor_id(), allbutself);
   14.28 +    cur_cpu = smp_processor_id();
   14.29 +    cpu_clear(cur_cpu, allbutself);
   14.30      nr_cpus = cpus_weight(allbutself);
   14.31  
   14.32      if ( nr_cpus == 0 )
   14.33      {
   14.34 -        BUG_ON(cpu != smp_processor_id());
   14.35 +        BUG_ON(!cpu_isset(cur_cpu, cpus));
   14.36          return (*fn)(data);
   14.37      }
   14.38  
   14.39 @@ -91,7 +92,8 @@ int stop_machine_run(int (*fn)(void *), 
   14.40      stopmachine_data.fn = fn;
   14.41      stopmachine_data.fn_data = data;
   14.42      stopmachine_data.nr_cpus = nr_cpus;
   14.43 -    stopmachine_data.fn_cpu = cpu;
   14.44 +    stopmachine_data.fn_cpus = cpus;
   14.45 +    stopmachine_data.fn_result = 0;
   14.46      atomic_set(&stopmachine_data.done, 0);
   14.47      stopmachine_data.state = STOPMACHINE_START;
   14.48  
   14.49 @@ -105,8 +107,13 @@ int stop_machine_run(int (*fn)(void *), 
   14.50      local_irq_disable();
   14.51      stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
   14.52  
   14.53 -    if ( cpu == smp_processor_id() )
   14.54 -        stopmachine_data.fn_result = (*fn)(data);
    14.55 +    /* The callback runs on each CPU in the input map.
    14.56 +     * If it fails on any of them, stop_machine_run()
    14.57 +     * returns the OR of the individual failure codes.
    14.58 +     */
    14.59 +    if ( cpu_isset(cur_cpu, cpus) ) {
   14.60 +        stopmachine_data.fn_result |= (*fn)(data);
   14.61 +    }
   14.62      stopmachine_set_state(STOPMACHINE_INVOKE);
   14.63      ret = stopmachine_data.fn_result;
   14.64  
   14.65 @@ -121,7 +128,6 @@ int stop_machine_run(int (*fn)(void *), 
   14.66  static void stopmachine_softirq(void)
   14.67  {
   14.68      enum stopmachine_state state = STOPMACHINE_START;
   14.69 -
   14.70      smp_mb();
   14.71  
   14.72      while ( state != STOPMACHINE_EXIT )
   14.73 @@ -136,10 +142,11 @@ static void stopmachine_softirq(void)
   14.74              local_irq_disable();
   14.75              break;
   14.76          case STOPMACHINE_INVOKE:
   14.77 -            if ( stopmachine_data.fn_cpu == smp_processor_id() )
   14.78 -                stopmachine_data.fn_result =
    14.79 +            if ( cpu_isset(smp_processor_id(), stopmachine_data.fn_cpus) ) {
    14.80 +                stopmachine_data.fn_result |=
   14.81                      stopmachine_data.fn(stopmachine_data.fn_data);
   14.82 -            break;
   14.83 +            }
    14.84 +            break;
   14.85          default:
   14.86              break;
   14.87          }
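
With the widened interface, stop_machine_run() now runs the callback on every CPU in the supplied mask while the whole machine is quiesced with interrupts off, and the return value is the OR of the per-CPU results, so a failure on any one CPU is visible to the caller. A usage sketch under those semantics (the callback and wrapper names are illustrative):

    /* Sketch: run a callback on every online CPU under stop_machine. */
    static int reset_cmci_state(void *unused)
    {
        /* Executed with interrupts disabled on each CPU in the mask;
         * return non-zero to signal failure. */
        return 0;
    }

    static int reset_cmci_on_all_cpus(void)
    {
        return stop_machine_run(reset_cmci_state, NULL, cpu_online_map);
    }
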
    15.1 --- a/xen/include/asm-x86/apicdef.h	Fri Dec 19 14:56:36 2008 +0000
    15.2 +++ b/xen/include/asm-x86/apicdef.h	Mon Dec 22 08:12:33 2008 +0000
    15.3 @@ -80,6 +80,8 @@
    15.4  #define		APIC_LVTTHMR	0x330
    15.5  #define		APIC_LVTPC	0x340
    15.6  #define		APIC_LVT0	0x350
    15.7 +#define		APIC_CMCI	0x2F0
    15.8 +
    15.9  #define			APIC_LVT_TIMER_BASE_MASK	(0x3<<18)
   15.10  #define			GET_APIC_TIMER_BASE(x)		(((x)>>18)&0x3)
   15.11  #define			SET_APIC_TIMER_BASE(x)		(((x)<<18))
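
APIC_CMCI (offset 0x2F0) is the local-vector-table entry through which the threshold interrupt is delivered; apic.c only masks it on shutdown and suspend, and the Intel MCE initialization is expected to point it at CMCI_APIC_VECTOR. A sketch of that programming step using the existing apic_read/apic_write_around helpers (the wrapping function is illustrative):

    /* Sketch: route the CMCI LVT to the CMCI vector and unmask it. */
    static void program_cmci_lvt(void)
    {
        uint32_t v = apic_read(APIC_CMCI);

        v &= ~(APIC_LVT_MASKED | 0xff);   /* clear mask bit and old vector */
        v |= CMCI_APIC_VECTOR;
        apic_write_around(APIC_CMCI, v);
    }
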
    16.1 --- a/xen/include/asm-x86/config.h	Fri Dec 19 14:56:36 2008 +0000
    16.2 +++ b/xen/include/asm-x86/config.h	Mon Dec 22 08:12:33 2008 +0000
    16.3 @@ -22,7 +22,7 @@
    16.4  #define CONFIG_X86_IO_APIC 1
    16.5  #define CONFIG_X86_PM_TIMER 1
    16.6  #define CONFIG_HPET_TIMER 1
    16.7 -#define CONFIG_X86_MCE_P4THERMAL 1
    16.8 +#define CONFIG_X86_MCE_THERMAL 1
    16.9  #define CONFIG_NUMA 1
   16.10  #define CONFIG_DISCONTIGMEM 1
   16.11  #define CONFIG_NUMA_EMU 1
    17.1 --- a/xen/include/asm-x86/irq.h	Fri Dec 19 14:56:36 2008 +0000
    17.2 +++ b/xen/include/asm-x86/irq.h	Mon Dec 22 08:12:33 2008 +0000
    17.3 @@ -33,6 +33,7 @@ fastcall void error_interrupt(void);
    17.4  fastcall void pmu_apic_interrupt(void);
    17.5  fastcall void spurious_interrupt(void);
    17.6  fastcall void thermal_interrupt(void);
    17.7 +fastcall void cmci_interrupt(void);
    17.8  
    17.9  void disable_8259A_irq(unsigned int irq);
   17.10  void enable_8259A_irq(unsigned int irq);
    18.1 --- a/xen/include/asm-x86/mach-default/irq_vectors.h	Fri Dec 19 14:56:36 2008 +0000
    18.2 +++ b/xen/include/asm-x86/mach-default/irq_vectors.h	Mon Dec 22 08:12:33 2008 +0000
    18.3 @@ -10,13 +10,13 @@
    18.4  #define THERMAL_APIC_VECTOR	0xfa
    18.5  #define LOCAL_TIMER_VECTOR	0xf9
    18.6  #define PMU_APIC_VECTOR 	0xf8
    18.7 -
    18.8 +#define CMCI_APIC_VECTOR	0xf7
    18.9  /*
   18.10   * High-priority dynamically-allocated vectors. For interrupts that
   18.11   * must be higher priority than any guest-bound interrupt.
   18.12   */
   18.13  #define FIRST_HIPRIORITY_VECTOR	0xf0
   18.14 -#define LAST_HIPRIORITY_VECTOR  0xf7
   18.15 +#define LAST_HIPRIORITY_VECTOR  0xf6
   18.16  
   18.17  /* Legacy PIC uses vectors 0xe0-0xef. */
   18.18  #define FIRST_LEGACY_VECTOR	0xe0
    19.1 --- a/xen/include/asm-x86/msr-index.h	Fri Dec 19 14:56:36 2008 +0000
    19.2 +++ b/xen/include/asm-x86/msr-index.h	Mon Dec 22 08:12:33 2008 +0000
    19.3 @@ -92,8 +92,10 @@
    19.4  #define MSR_IA32_MC0_STATUS		0x00000401
    19.5  #define MSR_IA32_MC0_ADDR		0x00000402
    19.6  #define MSR_IA32_MC0_MISC		0x00000403
    19.7 +#define MSR_IA32_MC0_CTL2		0x00000280
    19.8 +#define CMCI_EN 			(1UL<<30)
    19.9 +#define CMCI_THRESHOLD_MASK		0x7FFF
   19.10  
   19.11 -#define MSR_IA32_MC1_CTL		0x00000404
   19.12  #define MSR_IA32_MC1_STATUS		0x00000405
   19.13  #define MSR_IA32_MC1_ADDR		0x00000406
   19.14  #define MSR_IA32_MC1_MISC		0x00000407
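
MSR_IA32_MC0_CTL2 (0x280) is the first of the per-bank IA32_MCi_CTL2 registers, which sit at consecutive MSR addresses; CMCI is armed on a bank by setting CMCI_EN (bit 30) together with an error-count threshold in the low bits, where CMCI_THRESHOLD_MASK covers the threshold field. A minimal per-bank enable sketch, assuming bank i's CTL2 lives at MSR_IA32_MC0_CTL2 + i and reusing the CMCI_THRESHOLD value from x86_mca.h:

    /* Sketch: arm CMCI on bank i with the default threshold. */
    static void cmci_enable_bank(unsigned int i)
    {
        uint64_t val;

        rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
        val &= ~(uint64_t)CMCI_THRESHOLD_MASK;
        val |= CMCI_EN | CMCI_THRESHOLD;
        wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
    }
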
    20.1 --- a/xen/include/asm-x86/smp.h	Fri Dec 19 14:56:36 2008 +0000
    20.2 +++ b/xen/include/asm-x86/smp.h	Mon Dec 22 08:12:33 2008 +0000
    20.3 @@ -101,7 +101,7 @@ static __inline int logical_smp_processo
    20.4  
    20.5  #endif
    20.6  
    20.7 -extern int __cpu_disable(void);
    20.8 +extern int __cpu_disable(int down_cpu);
    20.9  extern void __cpu_die(unsigned int cpu);
   20.10  #endif /* !__ASSEMBLY__ */
   20.11  
    21.1 --- a/xen/include/public/arch-x86/xen-mca.h	Fri Dec 19 14:56:36 2008 +0000
    21.2 +++ b/xen/include/public/arch-x86/xen-mca.h	Mon Dec 22 08:12:33 2008 +0000
    21.3 @@ -106,7 +106,10 @@ struct mcinfo_common {
    21.4  
    21.5  #define MC_FLAG_CORRECTABLE     (1 << 0)
    21.6  #define MC_FLAG_UNCORRECTABLE   (1 << 1)
    21.7 -
    21.8 +#define MC_FLAG_RECOVERABLE	(1 << 2)
    21.9 +#define MC_FLAG_POLLED		(1 << 3)
   21.10 +#define MC_FLAG_RESET		(1 << 4)
   21.11 +#define MC_FLAG_CMCI		(1 << 5)
   21.12  /* contains global x86 mc information */
   21.13  struct mcinfo_global {
   21.14      struct mcinfo_common common;
   21.15 @@ -115,6 +118,7 @@ struct mcinfo_global {
   21.16      uint16_t mc_domid;
   21.17      uint32_t mc_socketid; /* physical socket of the physical core */
   21.18      uint16_t mc_coreid; /* physical impacted core */
   21.19 +    uint8_t  mc_apicid;
   21.20      uint16_t mc_core_threadid; /* core thread of physical core */
   21.21      uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
   21.22      uint64_t mc_gstatus; /* global status */
   21.23 @@ -132,6 +136,8 @@ struct mcinfo_bank {
   21.24      uint64_t mc_addr;   /* bank address, only valid
   21.25                           * if addr bit is set in mc_status */
   21.26      uint64_t mc_misc;
   21.27 +    uint64_t mc_ctrl2;
   21.28 +    uint64_t mc_tsc;
   21.29  };
   21.30  
   21.31  
   21.32 @@ -150,7 +156,12 @@ struct mcinfo_extended {
   21.33       * multiple times. */
   21.34  
   21.35      uint32_t mc_msrs; /* Number of msr with valid values. */
   21.36 -    struct mcinfo_msr mc_msr[5];
    21.37 +    /*
    21.38 +     * The Intel extended MSRs (32/64-bit) cover all general-purpose
    21.39 +     * registers plus E(R)DI, E(R)BP, E(R)SP, E(R)FLAGS, E(R)IP and
    21.40 +     * E(R)MISC; only about 10 are likely useful, so expand this array to 10.
    21.41 +     */
   21.42 +    struct mcinfo_msr mc_msr[10];
   21.43  };
   21.44  
   21.45  #define MCINFO_HYPERCALLSIZE	1024
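
On the hypercall interface, mcinfo_bank now carries the bank's CTL2 value and a TSC timestamp alongside the status/address/misc data, mcinfo_global records the reporting CPU's APIC ID, and MC_FLAG_CMCI lets dom0 distinguish CMCI-sourced records from polled or exception-time ones. A sketch of filling the new bank fields (mc_bank and mc_status are existing members not shown in this hunk; rdmsrl/rdtscll are the usual Xen helpers):

    /* Sketch: capture the extended per-bank data for bank i. */
    static void fill_bank_info(struct mcinfo_bank *mib, unsigned int i)
    {
        mib->mc_bank = i;
        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, mib->mc_status);
        rdmsrl(MSR_IA32_MC0_CTL2 + i, mib->mc_ctrl2);
        rdtscll(mib->mc_tsc);
    }
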
    22.1 --- a/xen/include/xen/stop_machine.h	Fri Dec 19 14:56:36 2008 +0000
    22.2 +++ b/xen/include/xen/stop_machine.h	Mon Dec 22 08:12:33 2008 +0000
    22.3 @@ -5,7 +5,7 @@
    22.4   * stop_machine_run: freeze the machine on all CPUs and run this function
    22.5   * @fn: the function to run
    22.6   * @data: the data ptr for the @fn()
    22.7 - * @cpu: the cpu to run @fn() on (or any, if @cpu == NR_CPUS).
    22.8 + * @cpus: cpus to run @fn() on.
    22.9   *
   22.10   * Description: This causes every other cpu to enter a safe point, with
   22.11   * each of which disables interrupts, and finally interrupts are disabled
   22.12 @@ -14,6 +14,6 @@
   22.13   *
   22.14   * This can be thought of as a very heavy write lock, equivalent to
   22.15   * grabbing every spinlock in the kernel. */
   22.16 -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu);
   22.17 +int stop_machine_run(int (*fn)(void *), void *data, cpumask_t cpu);
   22.18  
   22.19  #endif /* __XEN_STOP_MACHINE_H__ */