ia64/xen-unstable

changeset 19372:9c1be8f2013b

x86 mcheck: Replace the hypervisor MCA telemetry structures with a more
robust design that makes terminal error telemetry available to the dom0
panic flow for diagnosis on reboot.

Consolidate much of the AMD and Intel MCE handling into common code.
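
For context, the telemetry lifecycle the new mctelem interfaces impose on
the #MC handler and the pollers (reserve and fill inside mcheck_mca_logout(),
then either commit the cookie for dom0 to log or dismiss it) reduces to the
pattern sketched below. It is condensed from the poller changes in this
patch and is illustrative only, not additional code in this changeset; the
wrapper function name is made up:

	static void example_poll_and_report(void)	/* illustrative name */
	{
		struct mca_summary bs;
		mctelem_cookie_t mctc;

		/* Read out all banks; a telemetry slot is reserved on demand. */
		mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs);

		if (bs.errcnt && mctc != NULL) {
			mctelem_commit(mctc);		/* hand off for dom0 logging */
			send_guest_global_virq(dom0, VIRQ_MCA);
		} else if (mctc != NULL) {
			mctelem_dismiss(mctc);		/* nothing worth keeping */
		}
	}

Dom0 later fetches committed entries through XEN_MC_fetch (urgent or
non-urgent class) and releases them with the XEN_MC_ACK flag.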

Signed-off-by: Gavin Maltby <gavin.maltby@sun.com>
Signed-off-by: Frank van der Linden <frank.vanderlinden@sun.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Mar 17 14:22:50 2009 +0000 (2009-03-17)
parents 0b1ce09f4577
children 372ec886ad0c
files xen/arch/x86/cpu/mcheck/Makefile xen/arch/x86/cpu/mcheck/amd_f10.c xen/arch/x86/cpu/mcheck/amd_k8.c xen/arch/x86/cpu/mcheck/amd_nonfatal.c xen/arch/x86/cpu/mcheck/k7.c xen/arch/x86/cpu/mcheck/mce.c xen/arch/x86/cpu/mcheck/mce.h xen/arch/x86/cpu/mcheck/mce_intel.c xen/arch/x86/cpu/mcheck/mctelem.c xen/arch/x86/cpu/mcheck/mctelem.h xen/arch/x86/cpu/mcheck/non-fatal.c xen/arch/x86/cpu/mcheck/p5.c xen/arch/x86/cpu/mcheck/winchip.c xen/arch/x86/cpu/mcheck/x86_mca.h xen/include/asm-x86/traps.h xen/include/public/arch-x86/xen-mca.h
line diff
     1.1 --- a/xen/arch/x86/cpu/mcheck/Makefile	Tue Mar 17 14:21:18 2009 +0000
     1.2 +++ b/xen/arch/x86/cpu/mcheck/Makefile	Tue Mar 17 14:22:50 2009 +0000
     1.3 @@ -2,6 +2,7 @@ obj-y += amd_nonfatal.o
     1.4  obj-y += k7.o
     1.5  obj-y += amd_k8.o
     1.6  obj-y += amd_f10.o
     1.7 +obj-y += mctelem.o
     1.8  obj-y += mce.o
     1.9  obj-y += mce_intel.o
    1.10  obj-y += non-fatal.o
     2.1 --- a/xen/arch/x86/cpu/mcheck/amd_f10.c	Tue Mar 17 14:21:18 2009 +0000
     2.2 +++ b/xen/arch/x86/cpu/mcheck/amd_f10.c	Tue Mar 17 14:22:50 2009 +0000
     2.3 @@ -49,20 +49,21 @@
     2.4  #include "x86_mca.h"
     2.5  
     2.6  
     2.7 -static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
     2.8 +static enum mca_extinfo
     2.9 +amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
    2.10  {
    2.11  	struct mcinfo_extended mc_ext;
    2.12  
    2.13  	/* Family 0x10 introduced additional MSR that belong to the
    2.14  	 * northbridge bank (4). */
    2.15 -	if (bank != 4)
    2.16 -		return 0;
    2.17 +	if (mi == NULL || bank != 4)
    2.18 +		return MCA_EXTINFO_IGNORED;
    2.19  
    2.20  	if (!(status & MCi_STATUS_VAL))
    2.21 -		return 0;
    2.22 +		return MCA_EXTINFO_IGNORED;
    2.23  
    2.24  	if (!(status & MCi_STATUS_MISCV))
    2.25 -		return 0;
    2.26 +		return MCA_EXTINFO_IGNORED;
    2.27  
    2.28  	memset(&mc_ext, 0, sizeof(mc_ext));
    2.29  	mc_ext.common.type = MC_TYPE_EXTENDED;
    2.30 @@ -78,23 +79,25 @@ static int amd_f10_handler(struct mc_inf
    2.31  	rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
    2.32  	
    2.33  	x86_mcinfo_add(mi, &mc_ext);
    2.34 -	return 1;
    2.35 +	return MCA_EXTINFO_LOCAL;
    2.36  }
    2.37  
    2.38  
    2.39  extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
    2.40  
    2.41  /* AMD Family10 machine check */
    2.42 -void amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
    2.43 +int amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
    2.44  { 
    2.45  	uint64_t value;
    2.46  	uint32_t i;
    2.47  	int cpu_nr;
    2.48  
    2.49 -	machine_check_vector = k8_machine_check;
    2.50 -	mc_callback_bank_extended = amd_f10_handler;
    2.51 +	if (!cpu_has(c, X86_FEATURE_MCA))
    2.52 +		return 0;
    2.53 +
    2.54 +	x86_mce_vector_register(k8_machine_check);
    2.55 +	x86_mce_callback_register(amd_f10_handler);
    2.56  	cpu_nr = smp_processor_id();
    2.57 -	wmb();
    2.58  
    2.59  	rdmsrl(MSR_IA32_MCG_CAP, value);
    2.60  	if (value & MCG_CTL_P)	/* Control register present ? */
    2.61 @@ -104,18 +107,9 @@ void amd_f10_mcheck_init(struct cpuinfo_
    2.62  	for (i = 0; i < nr_mce_banks; i++) {
    2.63  		switch (i) {
    2.64  		case 4: /* Northbridge */
    2.65 -			/* Enable error reporting of all errors,
    2.66 -			 * enable error checking and
    2.67 -			 * disable sync flooding */
    2.68 -			wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
    2.69 +			/* Enable error reporting of all errors */
    2.70 +			wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL);
    2.71  			wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
    2.72 -
    2.73 -			/* XXX: We should write the value 0x1087821UL into
    2.74 -			 * to register F3x180 here, which sits in
    2.75 -			 * the PCI extended configuration space.
    2.76 -			 * Since this is not possible here, we can only hope,
    2.77 -			 * Dom0 is doing that.
    2.78 -			 */
    2.79  			break;
    2.80  
    2.81  		default:
    2.82 @@ -128,4 +122,5 @@ void amd_f10_mcheck_init(struct cpuinfo_
    2.83  
    2.84  	set_in_cr4(X86_CR4_MCE);
    2.85  	printk("CPU%i: AMD Family10h machine check reporting enabled.\n", cpu_nr);
    2.86 +	return 1;
    2.87  }
     3.1 --- a/xen/arch/x86/cpu/mcheck/amd_k8.c	Tue Mar 17 14:21:18 2009 +0000
     3.2 +++ b/xen/arch/x86/cpu/mcheck/amd_k8.c	Tue Mar 17 14:22:50 2009 +0000
     3.3 @@ -67,234 +67,27 @@
     3.4  #include <asm/msr.h>
     3.5  
     3.6  #include "mce.h"
     3.7 -#include "x86_mca.h"
     3.8  
     3.9  
    3.10  /* Machine Check Handler for AMD K8 family series */
    3.11  void k8_machine_check(struct cpu_user_regs *regs, long error_code)
    3.12  {
    3.13 -	struct vcpu *vcpu = current;
    3.14 -	struct domain *curdom;
    3.15 -	struct mc_info *mc_data;
    3.16 -	struct mcinfo_global mc_global;
    3.17 -	struct mcinfo_bank mc_info;
    3.18 -	uint64_t status, addrv, miscv, uc;
    3.19 -	uint32_t i;
    3.20 -	unsigned int cpu_nr;
    3.21 -	uint32_t xen_impacted = 0;
    3.22 -#define DOM_NORMAL	0
    3.23 -#define DOM0_TRAP	1
    3.24 -#define DOMU_TRAP	2
    3.25 -#define DOMU_KILLED	4
    3.26 -	uint32_t dom_state = DOM_NORMAL;
    3.27 -
    3.28 -	/* This handler runs as interrupt gate. So IPIs from the
    3.29 -	 * polling service routine are defered until we finished.
    3.30 -	 */
    3.31 -
    3.32 -        /* Disable interrupts for the _vcpu_. It may not re-scheduled to
    3.33 -	 * an other physical CPU or the impacted process in the guest
    3.34 -	 * continues running with corrupted data, otherwise. */
    3.35 -        vcpu_schedule_lock_irq(vcpu);
    3.36 -
    3.37 -	mc_data = x86_mcinfo_getptr();
    3.38 -	cpu_nr = smp_processor_id();
    3.39 -	BUG_ON(cpu_nr != vcpu->processor);
    3.40 -
    3.41 -	curdom = vcpu->domain;
    3.42 -
    3.43 -	memset(&mc_global, 0, sizeof(mc_global));
    3.44 -	mc_global.common.type = MC_TYPE_GLOBAL;
    3.45 -	mc_global.common.size = sizeof(mc_global);
    3.46 -
    3.47 -	mc_global.mc_domid = curdom->domain_id; /* impacted domain */
    3.48 -
    3.49 -	x86_mc_get_cpu_info(cpu_nr, &mc_global.mc_socketid,
    3.50 -	    &mc_global.mc_coreid, &mc_global.mc_core_threadid,
    3.51 -	    &mc_global.mc_apicid, NULL, NULL, NULL);
    3.52 -
    3.53 -	mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
    3.54 -	mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
    3.55 -	rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
    3.56 -
    3.57 -	/* Quick check, who is impacted */
    3.58 -	xen_impacted = is_idle_domain(curdom);
    3.59 -
    3.60 -	/* Dom0 */
    3.61 -	x86_mcinfo_clear(mc_data);
    3.62 -	x86_mcinfo_add(mc_data, &mc_global);
    3.63 -
    3.64 -	for (i = 0; i < nr_mce_banks; i++) {
    3.65 -		struct domain *d;
    3.66 -
    3.67 -		rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
    3.68 -
    3.69 -		if (!(status & MCi_STATUS_VAL))
    3.70 -			continue;
    3.71 -
    3.72 -		/* An error happened in this bank.
    3.73 -		 * This is expected to be an uncorrectable error,
    3.74 -		 * since correctable errors get polled.
    3.75 -		 */
    3.76 -		uc = status & MCi_STATUS_UC;
    3.77 -
    3.78 -		memset(&mc_info, 0, sizeof(mc_info));
    3.79 -		mc_info.common.type = MC_TYPE_BANK;
    3.80 -		mc_info.common.size = sizeof(mc_info);
    3.81 -		mc_info.mc_bank = i;
    3.82 -		mc_info.mc_status = status;
    3.83 -
    3.84 -		addrv = 0;
    3.85 -		if (status & MCi_STATUS_ADDRV) {
    3.86 -			rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
    3.87 -			
    3.88 -			d = maddr_get_owner(addrv);
    3.89 -			if (d != NULL)
    3.90 -				mc_info.mc_domid = d->domain_id;
    3.91 -		}
    3.92 -
    3.93 -		miscv = 0;
    3.94 -		if (status & MCi_STATUS_MISCV)
    3.95 -			rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
    3.96 -
    3.97 -		mc_info.mc_addr = addrv;
    3.98 -		mc_info.mc_misc = miscv;
    3.99 -
   3.100 -		x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
   3.101 -
   3.102 -		if (mc_callback_bank_extended)
   3.103 -			mc_callback_bank_extended(mc_data, i, status);
   3.104 -
   3.105 -		/* clear status */
   3.106 -		wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
   3.107 -		wmb();
   3.108 -		add_taint(TAINT_MACHINE_CHECK);
   3.109 -	}
   3.110 -
   3.111 -	status = mc_global.mc_gstatus;
   3.112 -
   3.113 -	/* clear MCIP or cpu enters shutdown state
   3.114 -	 * in case another MCE occurs. */
   3.115 -	status &= ~MCG_STATUS_MCIP;
   3.116 -	wrmsrl(MSR_IA32_MCG_STATUS, status);
   3.117 -	wmb();
   3.118 -
   3.119 -	/* For the details see the discussion "MCE/MCA concept" on xen-devel.
   3.120 -	 * The thread started here:
   3.121 -	 * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
   3.122 -	 */
   3.123 -
   3.124 -	/* MCG_STATUS_RIPV: 
   3.125 -	 * When this bit is not set, then the instruction pointer onto the stack
   3.126 -	 * to resume at is not valid. If xen is interrupted, then we panic anyway
   3.127 -	 * right below. Otherwise it is up to the guest to figure out if 
   3.128 -	 * guest kernel or guest userland is affected and should kill either
   3.129 -	 * itself or the affected process.
   3.130 -	 */
   3.131 -
   3.132 -	/* MCG_STATUS_EIPV:
   3.133 -	 * Evaluation of EIPV is the job of the guest.
   3.134 -	 */
   3.135 -
   3.136 -	if (xen_impacted) {
   3.137 -		/* Now we are going to panic anyway. Allow interrupts, so that
   3.138 -		 * printk on serial console can work. */
   3.139 -		vcpu_schedule_unlock_irq(vcpu);
   3.140 -
   3.141 -		/* Uh, that means, machine check exception
   3.142 -		 * inside Xen occured. */
   3.143 -		printk("Machine check exception occured in Xen.\n");
   3.144 -
   3.145 -		/* if MCG_STATUS_EIPV indicates, the IP on the stack is related
   3.146 -		 * to the error then it makes sense to print a stack trace.
   3.147 -		 * That can be useful for more detailed error analysis and/or
   3.148 -		 * error case studies to figure out, if we can clear
   3.149 -		 * xen_impacted and kill a DomU instead
   3.150 -		 * (i.e. if a guest only control structure is affected, but then
   3.151 -		 * we must ensure the bad pages are not re-used again).
   3.152 -		 */
   3.153 -		if (status & MCG_STATUS_EIPV) {
   3.154 -			printk("MCE: Instruction Pointer is related to the error. "
   3.155 -				"Therefore, print the execution state.\n");
   3.156 -			show_execution_state(regs);
   3.157 -		}
   3.158 -		x86_mcinfo_dump(mc_data);
   3.159 -		mc_panic("End of MCE. Use mcelog to decode above error codes.\n");
   3.160 -	}
   3.161 -
   3.162 -	/* If Dom0 registered a machine check handler, which is only possible
   3.163 -	 * with a PV MCA driver, then ... */
   3.164 -	if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
   3.165 -		dom_state = DOM0_TRAP;
   3.166 -
   3.167 -		/* ... deliver machine check trap to Dom0. */
   3.168 -		send_guest_trap(dom0, 0, TRAP_machine_check);
   3.169 -
   3.170 -		/* Xen may tell Dom0 now to notify the DomU.
   3.171 -		 * But this will happen through a hypercall. */
   3.172 -	} else
   3.173 -		/* Dom0 did not register a machine check handler, but if DomU
   3.174 -		 * did so, then... */
   3.175 -                if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) {
   3.176 -			dom_state = DOMU_TRAP;
   3.177 -
   3.178 -			/* ... deliver machine check trap to DomU */
   3.179 -			send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
   3.180 -	} else {
   3.181 -		/* hmm... noone feels responsible to handle the error.
   3.182 -		 * So, do a quick check if a DomU is impacted or not.
   3.183 -		 */
   3.184 -		if (curdom == dom0) {
   3.185 -			/* Dom0 is impacted. Since noone can't handle
   3.186 -			 * this error, panic! */
   3.187 -			x86_mcinfo_dump(mc_data);
   3.188 -			mc_panic("MCE occured in Dom0, which it can't handle\n");
   3.189 -
   3.190 -			/* UNREACHED */
   3.191 -		} else {
   3.192 -			dom_state = DOMU_KILLED;
   3.193 -
   3.194 -			/* Enable interrupts. This basically results in
   3.195 -			 * calling sti on the *physical* cpu. But after
   3.196 -			 * domain_crash() the vcpu pointer is invalid.
   3.197 -			 * Therefore, we must unlock the irqs before killing
   3.198 -			 * it. */
   3.199 -			vcpu_schedule_unlock_irq(vcpu);
   3.200 -
   3.201 -			/* DomU is impacted. Kill it and continue. */
   3.202 -			domain_crash(curdom);
   3.203 -		}
   3.204 -	}
   3.205 -
   3.206 -
   3.207 -	switch (dom_state) {
   3.208 -	case DOM0_TRAP:
   3.209 -	case DOMU_TRAP:
   3.210 -		/* Enable interrupts. */
   3.211 -		vcpu_schedule_unlock_irq(vcpu);
   3.212 -
   3.213 -		/* guest softirqs and event callbacks are scheduled
   3.214 -		 * immediately after this handler exits. */
   3.215 -		break;
   3.216 -	case DOMU_KILLED:
   3.217 -		/* Nothing to do here. */
   3.218 -		break;
   3.219 -	default:
   3.220 -		BUG();
   3.221 -	}
   3.222 +	mcheck_cmn_handler(regs, error_code, mca_allbanks);
   3.223  }
   3.224  
   3.225 -
   3.226  /* AMD K8 machine check */
   3.227 -void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
   3.228 +int amd_k8_mcheck_init(struct cpuinfo_x86 *c)
   3.229  {
   3.230  	uint64_t value;
   3.231  	uint32_t i;
   3.232  	int cpu_nr;
   3.233  
   3.234 -	machine_check_vector = k8_machine_check;
   3.235 +	/* Check for PPro style MCA; our caller has confirmed MCE support. */
   3.236 +	if (!cpu_has(c, X86_FEATURE_MCA))
   3.237 +		return 0;
   3.238 +
   3.239 +	x86_mce_vector_register(k8_machine_check);
   3.240  	cpu_nr = smp_processor_id();
   3.241 -	wmb();
   3.242  
   3.243  	rdmsrl(MSR_IA32_MCG_CAP, value);
   3.244  	if (value & MCG_CTL_P)	/* Control register present ? */
   3.245 @@ -304,10 +97,8 @@ void amd_k8_mcheck_init(struct cpuinfo_x
   3.246  	for (i = 0; i < nr_mce_banks; i++) {
   3.247  		switch (i) {
   3.248  		case 4: /* Northbridge */
   3.249 -			/* Enable error reporting of all errors,
   3.250 -			 * enable error checking and
   3.251 -			 * disable sync flooding */
   3.252 -			wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
   3.253 +			/* Enable error reporting of all errors */
   3.254 +			wrmsrl(MSR_IA32_MC4_CTL, 0xffffffffffffffffULL);
   3.255  			wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
   3.256  			break;
   3.257  
   3.258 @@ -321,4 +112,6 @@ void amd_k8_mcheck_init(struct cpuinfo_x
   3.259  
   3.260  	set_in_cr4(X86_CR4_MCE);
   3.261  	printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr);
   3.262 +
   3.263 +	return 1;
   3.264  }
     4.1 --- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c	Tue Mar 17 14:21:18 2009 +0000
     4.2 +++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c	Tue Mar 17 14:22:50 2009 +0000
     4.3 @@ -58,22 +58,23 @@
     4.4  #include <xen/smp.h>
     4.5  #include <xen/timer.h>
     4.6  #include <xen/event.h>
     4.7 -#include <asm/processor.h> 
     4.8 +
     4.9 +#include <asm/processor.h>
    4.10  #include <asm/system.h>
    4.11  #include <asm/msr.h>
    4.12  
    4.13  #include "mce.h"
    4.14 -#include "x86_mca.h"
    4.15  
    4.16  static struct timer mce_timer;
    4.17  
    4.18 -#define MCE_PERIOD MILLISECS(15000)
    4.19 +#define MCE_PERIOD MILLISECS(10000)
    4.20  #define MCE_MIN    MILLISECS(2000)
    4.21  #define MCE_MAX    MILLISECS(30000)
    4.22  
    4.23  static s_time_t period = MCE_PERIOD;
    4.24  static int hw_threshold = 0;
    4.25  static int adjust = 0;
    4.26 +static int variable_period = 1;
    4.27  
    4.28  /* The polling service routine:
    4.29   * Collects information of correctable errors and notifies
    4.30 @@ -81,99 +82,46 @@ static int adjust = 0;
    4.31   */
    4.32  void mce_amd_checkregs(void *info)
    4.33  {
    4.34 -	struct vcpu *vcpu = current;
    4.35 -	struct mc_info *mc_data;
    4.36 -	struct mcinfo_global mc_global;
    4.37 -	struct mcinfo_bank mc_info;
    4.38 -	uint64_t status, addrv, miscv;
    4.39 -	unsigned int i;
    4.40 +	mctelem_cookie_t mctc;
    4.41 +	struct mca_summary bs;
    4.42  	unsigned int event_enabled;
    4.43 -	unsigned int cpu_nr;
    4.44 -	int error_found;
    4.45 -
    4.46 -	/* We don't need a slot yet. Only allocate one on error. */
    4.47 -	mc_data = NULL;
    4.48 -
    4.49 -	cpu_nr = smp_processor_id();
    4.50 -	BUG_ON(cpu_nr != vcpu->processor);
    4.51 -	event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
    4.52 -	error_found = 0;
    4.53 -
    4.54 -	memset(&mc_global, 0, sizeof(mc_global));
    4.55 -	mc_global.common.type = MC_TYPE_GLOBAL;
    4.56 -	mc_global.common.size = sizeof(mc_global);
    4.57 -
    4.58 -	mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
    4.59 -	mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
    4.60 -
    4.61 -	x86_mc_get_cpu_info(cpu_nr, &mc_global.mc_socketid,
    4.62 -	    &mc_global.mc_coreid, &mc_global.mc_core_threadid,
    4.63 -	    &mc_global.mc_apicid, NULL, NULL, NULL);
    4.64 -
    4.65 -	mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
    4.66 -	rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
    4.67 -
    4.68 -	for (i = 0; i < nr_mce_banks; i++) {
    4.69 -		struct domain *d;
    4.70 -
    4.71 -		rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
    4.72 -
    4.73 -		if (!(status & MCi_STATUS_VAL))
    4.74 -			continue;
    4.75  
    4.76 -		if (mc_data == NULL) {
    4.77 -			/* Now we need a slot to fill in error telemetry. */
    4.78 -			mc_data = x86_mcinfo_getptr();
    4.79 -			BUG_ON(mc_data == NULL);
    4.80 -			x86_mcinfo_clear(mc_data);
    4.81 -			x86_mcinfo_add(mc_data, &mc_global);
    4.82 -		}
    4.83 -
    4.84 -		memset(&mc_info, 0, sizeof(mc_info));
    4.85 -		mc_info.common.type = MC_TYPE_BANK;
    4.86 -		mc_info.common.size = sizeof(mc_info);
    4.87 -		mc_info.mc_bank = i;
    4.88 -		mc_info.mc_status = status;
    4.89 -
    4.90 -		/* Increase polling frequency */
    4.91 -		error_found = 1;
    4.92 +	mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs);
    4.93  
    4.94 -		addrv = 0;
    4.95 -		if (status & MCi_STATUS_ADDRV) {
    4.96 -			rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
    4.97 -
    4.98 -			d = maddr_get_owner(addrv);
    4.99 -			if (d != NULL)
   4.100 -				mc_info.mc_domid = d->domain_id;
   4.101 -		}
   4.102 +	event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
   4.103  
   4.104 -		miscv = 0;
   4.105 -		if (status & MCi_STATUS_MISCV)
   4.106 -			rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
   4.107 +	if (bs.errcnt && mctc != NULL) {
   4.108 +		static uint64_t dumpcount = 0;
   4.109  
   4.110 -		mc_info.mc_addr = addrv;
   4.111 -		mc_info.mc_misc = miscv;
   4.112 -		x86_mcinfo_add(mc_data, &mc_info);
   4.113 +		/* If Dom0 enabled the VIRQ_MCA event, then notify it.
   4.114 +		 * Otherwise, if dom0 has had plenty of time to register
   4.115 +		 * the virq handler but still hasn't then dump telemetry
   4.116 +		 * to the Xen console.  The call count may be incremented
   4.117 +		 * on multiple cpus at once and is indicative only - just
   4.118 +		 * a simple-minded attempt to avoid spamming the console
   4.119 +		 * for corrected errors in early startup. */
   4.120  
   4.121 -		if (mc_callback_bank_extended)
   4.122 -			mc_callback_bank_extended(mc_data, i, status);
   4.123 -
   4.124 -		/* clear status */
   4.125 -		wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
   4.126 -		wmb();
   4.127 +		if (event_enabled) {
   4.128 +			mctelem_commit(mctc);
   4.129 +			send_guest_global_virq(dom0, VIRQ_MCA);
   4.130 +		} else if (++dumpcount >= 10) {
   4.131 +			x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));
   4.132 +			mctelem_dismiss(mctc);
   4.133 +		} else {
   4.134 +			mctelem_dismiss(mctc);
   4.135 +		}
   4.136 +		
   4.137 +	} else if (mctc != NULL) {
   4.138 +		mctelem_dismiss(mctc);
   4.139  	}
   4.140  
   4.141 -	if (error_found > 0) {
   4.142 -		/* If Dom0 enabled the VIRQ_MCA event, then ... */
   4.143 -		if (event_enabled)
   4.144 -			/* ... notify it. */
   4.145 -			send_guest_global_virq(dom0, VIRQ_MCA);
   4.146 -		else
   4.147 -			/* ... or dump it */
   4.148 -			x86_mcinfo_dump(mc_data);
   4.149 -	}
   4.150 -
   4.151 -	adjust += error_found;
   4.152 +	/* adjust is global and all cpus may attempt to increment it without
   4.153 +	 * synchronisation, so they race and the final adjust count
   4.154 +	 * (number of cpus seeing any error) is approximate.  We can
   4.155 +	 * guarantee that if any cpu observes an error that the
    4.156 +	 * guarantee that if any cpu observes an error the
    4.157 +	 * adjust count is at least 1. */
   4.158 +		adjust++;
   4.159  }
   4.160  
   4.161  /* polling service routine invoker:
   4.162 @@ -188,7 +136,7 @@ static void mce_amd_work_fn(void *data)
   4.163  	on_each_cpu(mce_amd_checkregs, data, 1, 1);
   4.164  
   4.165  	if (adjust > 0) {
   4.166 -		if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
   4.167 +		if (!guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
   4.168  			/* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
   4.169  			printk("MCE: polling routine found correctable error. "
   4.170  				" Use mcelog to parse above error output.\n");
   4.171 @@ -229,19 +177,19 @@ static void mce_amd_work_fn(void *data)
   4.172  		}
   4.173  	}
   4.174  
   4.175 -	if (adjust > 0) {
   4.176 +	if (variable_period && adjust > 0) {
   4.177  		/* Increase polling frequency */
   4.178  		adjust++; /* adjust == 1 must have an effect */
   4.179  		period /= adjust;
   4.180 -	} else {
   4.181 +	} else if (variable_period) {
   4.182  		/* Decrease polling frequency */
   4.183  		period *= 2;
   4.184  	}
   4.185 -	if (period > MCE_MAX) {
   4.186 +	if (variable_period && period > MCE_MAX) {
   4.187  		/* limit: Poll at least every 30s */
   4.188  		period = MCE_MAX;
   4.189  	}
   4.190 -	if (period < MCE_MIN) {
   4.191 +	if (variable_period && period < MCE_MIN) {
   4.192  		/* limit: Poll every 2s.
   4.193  		 * When this is reached an uncorrectable error
   4.194  		 * is expected to happen, if Dom0 does nothing.
   4.195 @@ -262,7 +210,7 @@ void amd_nonfatal_mcheck_init(struct cpu
   4.196  
   4.197  	/* The threshold bitfields in MSR_IA32_MC4_MISC has
   4.198  	 * been introduced along with the SVME feature bit. */
   4.199 -	if (cpu_has(c, X86_FEATURE_SVME)) {
   4.200 +	if (variable_period && cpu_has(c, X86_FEATURE_SVME)) {
   4.201  		uint64_t value;
   4.202  
   4.203  		/* hw threshold registers present */
     5.1 --- a/xen/arch/x86/cpu/mcheck/k7.c	Tue Mar 17 14:21:18 2009 +0000
     5.2 +++ b/xen/arch/x86/cpu/mcheck/k7.c	Tue Mar 17 14:22:50 2009 +0000
     5.3 @@ -68,13 +68,16 @@ static fastcall void k7_machine_check(st
     5.4  
     5.5  
     5.6  /* AMD K7 machine check */
     5.7 -void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
     5.8 +int amd_k7_mcheck_init(struct cpuinfo_x86 *c)
     5.9  {
    5.10  	u32 l, h;
    5.11  	int i;
    5.12  
    5.13 -	machine_check_vector = k7_machine_check;
    5.14 -	wmb();
    5.15 +	/* Check for PPro style MCA; our caller has confirmed MCE support. */
    5.16 +	if (!cpu_has(c, X86_FEATURE_MCA))
    5.17 +		return 0;
    5.18 +
    5.19 +	x86_mce_vector_register(k7_machine_check);
    5.20  
    5.21  	rdmsr (MSR_IA32_MCG_CAP, l, h);
    5.22  	if (l & (1<<8))	/* Control register present ? */
    5.23 @@ -92,4 +95,6 @@ void amd_k7_mcheck_init(struct cpuinfo_x
    5.24  	set_in_cr4 (X86_CR4_MCE);
    5.25  	printk (KERN_INFO "CPU%d: AMD K7 machine check reporting enabled.\n",
    5.26  		smp_processor_id());
    5.27 +
    5.28 +	return 1;
    5.29  }
     6.1 --- a/xen/arch/x86/cpu/mcheck/mce.c	Tue Mar 17 14:21:18 2009 +0000
     6.2 +++ b/xen/arch/x86/cpu/mcheck/mce.c	Tue Mar 17 14:22:50 2009 +0000
     6.3 @@ -10,104 +10,490 @@
     6.4  #include <xen/smp.h>
     6.5  #include <xen/errno.h>
     6.6  #include <xen/console.h>
     6.7 +#include <xen/sched.h>
     6.8 +#include <xen/sched-if.h>
     6.9 +#include <xen/cpumask.h>
    6.10 +#include <xen/event.h>
    6.11 +#include <xen/guest_access.h>
    6.12  
    6.13 -#include <asm/processor.h> 
    6.14 +#include <asm/processor.h>
    6.15  #include <asm/system.h>
    6.16 +#include <asm/msr.h>
    6.17  
    6.18  #include "mce.h"
    6.19 -#include "x86_mca.h"
    6.20  
    6.21  int mce_disabled = 0;
    6.22  unsigned int nr_mce_banks;
    6.23  
    6.24  EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
    6.25  
    6.26 -/* XXX For now a fixed array is used. Later this should be changed
    6.27 - * to a dynamic allocated array with the size calculated in relation
    6.28 - * to physical cpus present in the machine.
    6.29 - * The more physical cpus are available, the more entries you need.
    6.30 - */
    6.31 -#define MAX_MCINFO	20
    6.32 -
    6.33 -struct mc_machine_notify {
    6.34 -	struct mc_info mc;
    6.35 -	uint32_t fetch_idx;
    6.36 -	uint32_t valid;
    6.37 -};
    6.38 -
    6.39 -struct mc_machine {
    6.40 -
    6.41 -	/* Array structure used for collecting machine check error telemetry. */
    6.42 -	struct mc_info mc[MAX_MCINFO];
    6.43 +static void mcinfo_clear(struct mc_info *);
    6.44  
    6.45 -	/* We handle multiple machine check reports lockless by
    6.46 -	 * iterating through the array using the producer/consumer concept.
    6.47 -	 */
    6.48 -	/* Producer array index to fill with machine check error data.
    6.49 -	 * Index must be increased atomically. */
    6.50 -	uint32_t error_idx;
    6.51 -
    6.52 -	/* Consumer array index to fetch machine check error data from.
    6.53 -	 * Index must be increased atomically. */
    6.54 -	uint32_t fetch_idx;
    6.55 +#define	SEG_PL(segsel) ((segsel) & 0x3)
    6.56  
    6.57 -	/* Integer array holding the indeces of the mc array that allows
    6.58 -         * a Dom0 to notify a DomU to re-fetch the same machine check error
    6.59 -         * data. The notification and refetch also uses its own 
    6.60 -	 * producer/consumer mechanism, because Dom0 may decide to not report
    6.61 -	 * every error to the impacted DomU.
    6.62 -	 */
    6.63 -	struct mc_machine_notify notify[MAX_MCINFO];
    6.64 +#if 1	/* XXFM switch to 0 for putback */
    6.65  
    6.66 -	/* Array index to get fetch_idx from.
    6.67 -	 * Index must be increased atomically. */
    6.68 -	uint32_t notifyproducer_idx;
    6.69 -	uint32_t notifyconsumer_idx;
    6.70 -};
    6.71 +#define	x86_mcerr(str, err) _x86_mcerr(str, err)
    6.72  
    6.73 -/* Global variable with machine check information. */
    6.74 -struct mc_machine mc_data;
    6.75 +static int _x86_mcerr(const char *msg, int err)
    6.76 +{
    6.77 +	printk("x86_mcerr: %s, returning %d\n",
    6.78 +	    msg != NULL ? msg : "", err);
    6.79 +	return err;
    6.80 +}
    6.81 +#else
    6.82 +#define x86_mcerr(str,err)
    6.83 +#endif
    6.84 +
    6.85 +cpu_banks_t mca_allbanks;
    6.86  
    6.87  /* Handle unconfigured int18 (should never happen) */
    6.88  static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
    6.89 -{	
    6.90 +{
    6.91  	printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
    6.92  		smp_processor_id());
    6.93  }
    6.94  
    6.95  
    6.96 +static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
    6.97 +
    6.98 +void x86_mce_vector_register(x86_mce_vector_t hdlr)
    6.99 +{
   6.100 +	_machine_check_vector = hdlr;
   6.101 +	wmb();
   6.102 +}
   6.103 +
   6.104  /* Call the installed machine check handler for this CPU setup. */
   6.105 -void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code) = unexpected_machine_check;
   6.106 +
   6.107 +void machine_check_vector(struct cpu_user_regs *regs, long error_code)
   6.108 +{
   6.109 +	_machine_check_vector(regs, error_code);
   6.110 +}
   6.111  
   6.112  /* Init machine check callback handler
   6.113   * It is used to collect additional information provided by newer
   6.114   * CPU families/models without the need to duplicate the whole handler.
   6.115   * This avoids having many handlers doing almost nearly the same and each
   6.116   * with its own tweaks ands bugs. */
   6.117 -int (*mc_callback_bank_extended)(struct mc_info *, uint16_t, uint64_t) = NULL;
   6.118 +static x86_mce_callback_t mc_callback_bank_extended = NULL;
   6.119 +
   6.120 +void x86_mce_callback_register(x86_mce_callback_t cbfunc)
   6.121 +{
   6.122 +	mc_callback_bank_extended = cbfunc;
   6.123 +}
   6.124 +
   6.125 +/* Utility function to perform MCA bank telemetry readout and to push that
   6.126 + * telemetry towards an interested dom0 for logging and diagnosis.
   6.127 + * The caller - #MC handler or MCA poll function - must arrange that we
   6.128 + * do not migrate cpus. */
   6.129 +
   6.130 +/* XXFM Could add overflow counting? */
   6.131 +mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
   6.132 +    struct mca_summary *sp)
   6.133 +{
   6.134 +	struct vcpu *v = current;
   6.135 +	struct domain *d;
   6.136 +	uint64_t gstatus, status, addr, misc;
   6.137 +	struct mcinfo_global mcg;	/* on stack */
   6.138 +	struct mcinfo_common *mic;
   6.139 +	struct mcinfo_global *mig;	/* on stack */
   6.140 +	mctelem_cookie_t mctc = NULL;
   6.141 +	uint32_t uc = 0, pcc = 0;
   6.142 +	struct mc_info *mci = NULL;
   6.143 +	mctelem_class_t which = MC_URGENT;	/* XXXgcc */
   6.144 +	unsigned int cpu_nr;
   6.145 +	int errcnt = 0;
   6.146 +	int i;
   6.147 +	enum mca_extinfo cbret = MCA_EXTINFO_IGNORED;
   6.148 +
   6.149 +	cpu_nr = smp_processor_id();
   6.150 +	BUG_ON(cpu_nr != v->processor);
   6.151 +
   6.152 +	rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
   6.153 +
   6.154 +	memset(&mcg, 0, sizeof (mcg));
   6.155 +	mcg.common.type = MC_TYPE_GLOBAL;
   6.156 +	mcg.common.size = sizeof (mcg);
   6.157 +	if (v != NULL && ((d = v->domain) != NULL)) {
   6.158 +		mcg.mc_domid = d->domain_id;
   6.159 +		mcg.mc_vcpuid = v->vcpu_id;
   6.160 +	} else {
   6.161 +		mcg.mc_domid = -1;
   6.162 +		mcg.mc_vcpuid = -1;
   6.163 +	}
   6.164 +	mcg.mc_gstatus = gstatus;	/* MCG_STATUS */
   6.165 +
   6.166 +	switch (who) {
   6.167 +	case MCA_MCE_HANDLER:
   6.168 +		mcg.mc_flags = MC_FLAG_MCE;
   6.169 +		which = MC_URGENT;
   6.170 +		break;
   6.171 +
   6.172 +	case MCA_POLLER:
   6.173 +	case MCA_RESET:
   6.174 +		mcg.mc_flags = MC_FLAG_POLLED;
   6.175 +		which = MC_NONURGENT;
   6.176 +		break;
   6.177 +
   6.178 +	case MCA_CMCI_HANDLER:
   6.179 +		mcg.mc_flags = MC_FLAG_CMCI;
   6.180 +		which = MC_NONURGENT;
   6.181 +		break;
   6.182 +
   6.183 +	default:
   6.184 +		BUG();
   6.185 +	}
   6.186 +
   6.187 +	/* Retrieve detector information */
   6.188 +	x86_mc_get_cpu_info(cpu_nr, &mcg.mc_socketid,
   6.189 +	    &mcg.mc_coreid, &mcg.mc_core_threadid,
   6.190 +	    &mcg.mc_apicid, NULL, NULL, NULL);
   6.191 +
   6.192 +	for (i = 0; i < 32 && i < nr_mce_banks; i++) {
   6.193 +		struct mcinfo_bank mcb;		/* on stack */
   6.194 +
   6.195 +		/* Skip bank if corresponding bit in bankmask is clear */
   6.196 +		if (!test_bit(i, bankmask))
   6.197 +			continue;
   6.198 +
   6.199 +		rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
   6.200 +		if (!(status & MCi_STATUS_VAL))
   6.201 +			continue;	/* this bank has no valid telemetry */
   6.202 +
   6.203 +		/* If this is the first bank with valid MCA DATA, then
   6.204 +		 * try to reserve an entry from the urgent/nonurgent queue
    6.205 +		 * depending on whether we are called from an exception or
   6.206 +		 * a poller;  this can fail (for example dom0 may not
   6.207 +		 * yet have consumed past telemetry). */
   6.208 +		if (errcnt == 0) {
   6.209 +			if ((mctc = mctelem_reserve(which)) != NULL) {
   6.210 +				mci = mctelem_dataptr(mctc);
   6.211 +				mcinfo_clear(mci);
   6.212 +			}
   6.213 +		}
   6.214 +
   6.215 +		memset(&mcb, 0, sizeof (mcb));
   6.216 +		mcb.common.type = MC_TYPE_BANK;
   6.217 +		mcb.common.size = sizeof (mcb);
   6.218 +		mcb.mc_bank = i;
   6.219 +		mcb.mc_status = status;
   6.220 +
   6.221 +		/* form a mask of which banks have logged uncorrected errors */
   6.222 +		if ((status & MCi_STATUS_UC) != 0)
   6.223 +			uc |= (1 << i);
   6.224 +
   6.225 +		/* likewise for those with processor context corrupt */
   6.226 +		if ((status & MCi_STATUS_PCC) != 0)
   6.227 +			pcc |= (1 << i);
   6.228 +
   6.229 +		addr = misc = 0;
   6.230 +
   6.231 +		if (status & MCi_STATUS_ADDRV) {
   6.232 +			rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
   6.233 +			d = maddr_get_owner(addr);
   6.234 +			if (d != NULL && (who == MCA_POLLER ||
   6.235 +			    who == MCA_CMCI_HANDLER))
   6.236 +				mcb.mc_domid = d->domain_id;
   6.237 +		}
   6.238 +
   6.239 +		if (status & MCi_STATUS_MISCV)
   6.240 +			rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
   6.241 +
   6.242 +		mcb.mc_addr = addr;
   6.243 +		mcb.mc_misc = misc;
   6.244 +
   6.245 +		if (who == MCA_CMCI_HANDLER) {
   6.246 +			rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
   6.247 +			rdtscll(mcb.mc_tsc);
   6.248 +		}
   6.249 +
   6.250 +		/* Increment the error count;  if this is the first bank
   6.251 +		 * with a valid error then add the global info to the mcinfo. */
   6.252 +		if (errcnt++ == 0 && mci != NULL)
   6.253 +			x86_mcinfo_add(mci, &mcg);
   6.254 +
   6.255 +		/* Add the bank data */
   6.256 +		if (mci != NULL)
   6.257 +			x86_mcinfo_add(mci, &mcb);
   6.258 +
   6.259 +		if (mc_callback_bank_extended && cbret != MCA_EXTINFO_GLOBAL) {
   6.260 +			cbret = mc_callback_bank_extended(mci, i, status);
   6.261 +		}
   6.262 +
   6.263 +		/* Clear status */
   6.264 +		wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
   6.265 +		wmb();
   6.266 +	}
   6.267 +
   6.268 +	if (mci != NULL && errcnt > 0) {
   6.269 +		x86_mcinfo_lookup(mic, mci, MC_TYPE_GLOBAL);
   6.270 +		mig = (struct mcinfo_global *)mic;
   6.271 +		if (pcc)
   6.272 +			mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
   6.273 +		else if (uc)
   6.274 +			mcg.mc_flags |= MC_FLAG_RECOVERABLE;
   6.275 +		else
   6.276 +			mcg.mc_flags |= MC_FLAG_CORRECTABLE;
   6.277 +	}
   6.278  
   6.279  
   6.280 -static void amd_mcheck_init(struct cpuinfo_x86 *ci)
   6.281 +	if (sp) {
   6.282 +		sp->errcnt = errcnt;
   6.283 +		sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
   6.284 +		sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
   6.285 +		sp->uc = uc;
   6.286 +		sp->pcc = pcc;
   6.287 +	}
   6.288 +
   6.289 +	return mci != NULL ? mctc : NULL;	/* may be NULL */
   6.290 +}
   6.291 +
   6.292 +#define DOM_NORMAL	0
   6.293 +#define DOM0_TRAP	1
   6.294 +#define DOMU_TRAP	2
   6.295 +#define DOMU_KILLED	4
   6.296 +
   6.297 +/* Shared #MC handler. */
   6.298 +void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
   6.299 +    cpu_banks_t bankmask)
   6.300  {
   6.301 +	int xen_state_lost, dom0_state_lost, domU_state_lost;
   6.302 +	struct vcpu *v = current;
   6.303 +	struct domain *curdom = v->domain;
   6.304 +	domid_t domid = curdom->domain_id;
   6.305 +	int ctx_xen, ctx_dom0, ctx_domU;
   6.306 +	uint32_t dom_state = DOM_NORMAL;
   6.307 +	mctelem_cookie_t mctc = NULL;
   6.308 +	struct mca_summary bs;
   6.309 +	struct mc_info *mci = NULL;
   6.310 +	int irqlocked = 0;
   6.311 +	uint64_t gstatus;
   6.312 +	int ripv;
   6.313 +
    6.314 +	/* This handler runs as an interrupt gate, so IPIs from the
    6.315 +	 * polling service routine are deferred until we're finished.
   6.316 +	 */
   6.317 +
    6.318 +	/* Disable interrupts for the _vcpu_. It may not be re-scheduled to
   6.319 +	 * another physical CPU. */
   6.320 +	vcpu_schedule_lock_irq(v);
   6.321 +	irqlocked = 1;
   6.322 +
   6.323 +	/* Read global status;  if it does not indicate machine check
   6.324 +	 * in progress then bail as long as we have a valid ip to return to. */
   6.325 +	rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
   6.326 +	ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
   6.327 +	if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
   6.328 +		add_taint(TAINT_MACHINE_CHECK); /* questionable */
   6.329 +		vcpu_schedule_unlock_irq(v);
   6.330 +		irqlocked = 0;
   6.331 +		goto cmn_handler_done;
   6.332 +	}
   6.333 +
   6.334 +	/* Go and grab error telemetry.  We must choose whether to commit
   6.335 +	 * for logging or dismiss the cookie that is returned, and must not
   6.336 +	 * reference the cookie after that action.
   6.337 +	 */
   6.338 +	mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs);
   6.339 +	if (mctc != NULL)
   6.340 +		mci = (struct mc_info *)mctelem_dataptr(mctc);
   6.341 +
   6.342 +	/* Clear MCIP or another #MC will enter shutdown state */
   6.343 +	gstatus &= ~MCG_STATUS_MCIP;
   6.344 +	wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
   6.345 +	wmb();
   6.346 +
   6.347 +	/* If no valid errors and our stack is intact, we're done */
   6.348 +	if (ripv && bs.errcnt == 0) {
   6.349 +		vcpu_schedule_unlock_irq(v);
   6.350 +		irqlocked = 0;
   6.351 +		goto cmn_handler_done;
   6.352 +	}
   6.353 +
   6.354 +	if (bs.uc || bs.pcc)
   6.355 +		add_taint(TAINT_MACHINE_CHECK);
   6.356 +
   6.357 +	/* Machine check exceptions will usually be for UC and/or PCC errors,
   6.358 +	 * but it is possible to configure machine check for some classes
   6.359 +	 * of corrected error.
   6.360 +	 *
   6.361 +	 * UC errors could compromise any domain or the hypervisor
   6.362 +	 * itself - for example a cache writeback of modified data that
   6.363 +	 * turned out to be bad could be for data belonging to anyone, not
   6.364 +	 * just the current domain.  In the absence of known data poisoning
   6.365 +	 * to prevent consumption of such bad data in the system we regard
   6.366 +	 * all UC errors as terminal.  It may be possible to attempt some
   6.367 +	 * heuristics based on the address affected, which guests have
   6.368 +	 * mappings to that mfn etc.
   6.369 +	 *
   6.370 +	 * PCC errors apply to the current context.
   6.371 +	 *
   6.372 +	 * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
   6.373 +	 * and not PCC is terminal - the return instruction pointer
   6.374 +	 * pushed onto the stack is bogus.  If the interrupt context is
   6.375 +	 * the hypervisor or dom0 the game is over, otherwise we can
   6.376 +	 * limit the impact to a single domU but only if we trampoline
   6.377 +	 * somewhere safely - we can't return and unwind the stack.
   6.378 +	 * Since there is no trampoline in place we will treat !RIPV
   6.379 +	 * as terminal for any context.
   6.380 +	 */
   6.381 +	ctx_xen = SEG_PL(regs->cs) == 0;
   6.382 +	ctx_dom0 = !ctx_xen && (domid == dom0->domain_id);
   6.383 +	ctx_domU = !ctx_xen && !ctx_dom0;
   6.384 +
   6.385 +	xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
   6.386 +	    !ripv;
   6.387 +	dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
   6.388 +	domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));
   6.389 +
   6.390 +	if (xen_state_lost) {
   6.391 +		/* Now we are going to panic anyway. Allow interrupts, so that
   6.392 +		 * printk on serial console can work. */
   6.393 +		vcpu_schedule_unlock_irq(v);
   6.394 +		irqlocked = 0;
   6.395 +
    6.396 +		printk("Terminal machine check exception occurred in "
   6.397 +		    "hypervisor context.\n");
   6.398 +
   6.399 +		/* If MCG_STATUS_EIPV indicates, the IP on the stack is related
   6.400 +		 * to the error then it makes sense to print a stack trace.
   6.401 +		 * That can be useful for more detailed error analysis and/or
   6.402 +		 * error case studies to figure out, if we can clear
   6.403 +		 * xen_impacted and kill a DomU instead
   6.404 +		 * (i.e. if a guest only control structure is affected, but then
   6.405 +		 * we must ensure the bad pages are not re-used again).
   6.406 +		 */
   6.407 +		if (bs.eipv & MCG_STATUS_EIPV) {
   6.408 +			printk("MCE: Instruction Pointer is related to the "
   6.409 +			    "error, therefore print the execution state.\n");
   6.410 +			show_execution_state(regs);
   6.411 +		}
   6.412 +
   6.413 +		/* Commit the telemetry so that panic flow can find it. */
   6.414 +		if (mctc != NULL) {
   6.415 +			x86_mcinfo_dump(mci);
   6.416 +			mctelem_commit(mctc);
   6.417 +		}
   6.418 +		mc_panic("Hypervisor state lost due to machine check "
   6.419 +		    "exception.\n");
   6.420 +		/*NOTREACHED*/
   6.421 +	}
   6.422 +
   6.423 +	/*
   6.424 +	 * Xen hypervisor state is intact.  If dom0 state is lost then
   6.425 +	 * give it a chance to decide what to do if it has registered
   6.426 +	 * a handler for this event, otherwise panic.
   6.427 +	 *
   6.428 +	 * XXFM Could add some Solaris dom0 contract kill here?
   6.429 +	 */
   6.430 +	if (dom0_state_lost) {
   6.431 +		if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
   6.432 +			dom_state = DOM0_TRAP;
   6.433 +			send_guest_trap(dom0, 0, TRAP_machine_check);
   6.434 +			/* XXFM case of return with !ripv ??? */
   6.435 +		} else {
   6.436 +			/* Commit telemetry for panic flow. */
   6.437 +			if (mctc != NULL) {
   6.438 +				x86_mcinfo_dump(mci);
   6.439 +				mctelem_commit(mctc);
   6.440 +			}
   6.441 +			mc_panic("Dom0 state lost due to machine check "
   6.442 +			    "exception\n");
   6.443 +			/*NOTREACHED*/
   6.444 +		}
   6.445 +	}
   6.446 +
   6.447 +	/*
   6.448 +	 * If a domU has lost state then send it a trap if it has registered
   6.449 +	 * a handler, otherwise crash the domain.
   6.450 +	 * XXFM Revisit this functionality.
   6.451 +	 */
   6.452 +	if (domU_state_lost) {
   6.453 +		if (guest_has_trap_callback(v->domain, v->vcpu_id,
   6.454 +		    TRAP_machine_check)) {
   6.455 +			dom_state = DOMU_TRAP;
   6.456 +			send_guest_trap(curdom, v->vcpu_id,
   6.457 +			    TRAP_machine_check);
   6.458 +		} else {
   6.459 +			dom_state = DOMU_KILLED;
   6.460 +			/* Enable interrupts. This basically results in
   6.461 +			 * calling sti on the *physical* cpu. But after
   6.462 +			 * domain_crash() the vcpu pointer is invalid.
   6.463 +			 * Therefore, we must unlock the irqs before killing
   6.464 +			 * it. */
   6.465 +			vcpu_schedule_unlock_irq(v);
   6.466 +			irqlocked = 0;
   6.467 +
   6.468 +			/* DomU is impacted. Kill it and continue. */
   6.469 +			domain_crash(curdom);
   6.470 +		}
   6.471 +	}
   6.472 +
   6.473 +	switch (dom_state) {
   6.474 +	case DOM0_TRAP:
   6.475 +	case DOMU_TRAP:
   6.476 +		/* Enable interrupts. */
   6.477 +		vcpu_schedule_unlock_irq(v);
   6.478 +		irqlocked = 0;
   6.479 +
   6.480 +		/* guest softirqs and event callbacks are scheduled
   6.481 +		 * immediately after this handler exits. */
   6.482 +		break;
   6.483 +	case DOMU_KILLED:
   6.484 +		/* Nothing to do here. */
   6.485 +		break;
   6.486 +
   6.487 +	case DOM_NORMAL:
   6.488 +		vcpu_schedule_unlock_irq(v);
   6.489 +		irqlocked = 0;
   6.490 +		break;
   6.491 +	}
   6.492 +
   6.493 +cmn_handler_done:
   6.494 +	BUG_ON(irqlocked);
   6.495 +	BUG_ON(!ripv);
   6.496 +
   6.497 +	if (bs.errcnt) {
    6.498 +		/* Not panicking, so forward telemetry to dom0 now if it
   6.499 +		 * is interested. */
   6.500 +		if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
   6.501 +			if (mctc != NULL)
   6.502 +				mctelem_commit(mctc);
   6.503 +			send_guest_global_virq(dom0, VIRQ_MCA);
   6.504 +		} else {
   6.505 +			x86_mcinfo_dump(mci);
   6.506 +			if (mctc != NULL)
   6.507 +				mctelem_dismiss(mctc);
   6.508 +		}
   6.509 +	} else if (mctc != NULL) {
   6.510 +		mctelem_dismiss(mctc);
   6.511 +	}
   6.512 +}
   6.513 +
   6.514 +static int amd_mcheck_init(struct cpuinfo_x86 *ci)
   6.515 +{
   6.516 +	int rc = 0;
   6.517  
   6.518  	switch (ci->x86) {
   6.519  	case 6:
   6.520 -		amd_k7_mcheck_init(ci);
   6.521 +		rc = amd_k7_mcheck_init(ci);
   6.522  		break;
   6.523  
   6.524  	case 0xf:
   6.525 -		amd_k8_mcheck_init(ci);
   6.526 +		rc = amd_k8_mcheck_init(ci);
   6.527  		break;
   6.528  
   6.529  	case 0x10:
   6.530 -		amd_f10_mcheck_init(ci);
   6.531 +		rc = amd_f10_mcheck_init(ci);
   6.532  		break;
   6.533  
   6.534  	default:
   6.535  		/* Assume that machine check support is available.
   6.536  		 * The minimum provided support is at least the K8. */
   6.537 -		amd_k8_mcheck_init(ci);
   6.538 +		rc = amd_k8_mcheck_init(ci);
   6.539  	}
   6.540 +
   6.541 +	return rc;
   6.542  }
   6.543  
   6.544  /*check the existence of Machine Check*/
   6.545 @@ -116,50 +502,81 @@ int mce_available(struct cpuinfo_x86 *c)
   6.546  	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
   6.547  }
   6.548  
   6.549 +/*
   6.550 + * Check if bank 0 is usable for MCE. It isn't for AMD K7,
   6.551 + * and Intel P6 family before model 0x1a.
   6.552 + */
   6.553 +int mce_firstbank(struct cpuinfo_x86 *c)
   6.554 +{
   6.555 +	if (c->x86 == 6) {
   6.556 +		if (c->x86_vendor == X86_VENDOR_AMD)
   6.557 +			return 1;
   6.558 +
   6.559 +		if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a)
   6.560 +			return 1;
   6.561 +	}
   6.562 +
   6.563 +	return 0;
   6.564 +}
   6.565 +
   6.566  /* This has to be run for each processor */
   6.567  void mcheck_init(struct cpuinfo_x86 *c)
   6.568  {
   6.569 +	int inited = 0, i;
   6.570 +
   6.571  	if (mce_disabled == 1) {
   6.572  		printk(XENLOG_INFO "MCE support disabled by bootparam\n");
   6.573  		return;
   6.574  	}
   6.575  
   6.576 +	for (i = 0; i < MAX_NR_BANKS; i++)
   6.577 +		set_bit(i,mca_allbanks);
   6.578 +
   6.579 +	/* Enforce at least MCE support in CPUID information.  Individual
   6.580 +	 * families may also need to enforce a check for MCA support. */
   6.581  	if (!cpu_has(c, X86_FEATURE_MCE)) {
   6.582  		printk(XENLOG_INFO "CPU%i: No machine check support available\n",
   6.583  			smp_processor_id());
   6.584  		return;
   6.585  	}
   6.586  
   6.587 -	memset(&mc_data, 0, sizeof(struct mc_machine));
   6.588 +	mctelem_init(sizeof (struct mc_info));
   6.589  
   6.590  	switch (c->x86_vendor) {
   6.591  	case X86_VENDOR_AMD:
   6.592 -		amd_mcheck_init(c);
   6.593 +		inited = amd_mcheck_init(c);
   6.594  		break;
   6.595  
   6.596  	case X86_VENDOR_INTEL:
   6.597 +		switch (c->x86) {
   6.598 +		case 5:
   6.599  #ifndef CONFIG_X86_64
   6.600 -		if (c->x86==5)
   6.601 -			intel_p5_mcheck_init(c);
   6.602 +			inited = intel_p5_mcheck_init(c);
   6.603  #endif
   6.604 -		/*If it is P6 or P4 family, including CORE 2 DUO series*/
   6.605 -		if (c->x86 == 6 || c->x86==15)
   6.606 -		{
   6.607 -			printk(KERN_DEBUG "MCE: Intel newly family MC Init\n");
   6.608 -			intel_mcheck_init(c);
   6.609 +			break;
   6.610 +
   6.611 +		case 6:
   6.612 +		case 15:
   6.613 +			inited = intel_mcheck_init(c);
   6.614 +			break;
   6.615  		}
   6.616  		break;
   6.617  
   6.618  #ifndef CONFIG_X86_64
   6.619  	case X86_VENDOR_CENTAUR:
   6.620 -		if (c->x86==5)
   6.621 -			winchip_mcheck_init(c);
   6.622 +		if (c->x86==5) {
   6.623 +			inited = winchip_mcheck_init(c);
   6.624 +		}
   6.625  		break;
   6.626  #endif
   6.627  
   6.628  	default:
   6.629  		break;
   6.630  	}
   6.631 +
   6.632 +	if (!inited)
   6.633 +		printk(XENLOG_INFO "CPU%i: No machine check initialization\n",
   6.634 +		    smp_processor_id());
   6.635  }
   6.636  
   6.637  
   6.638 @@ -176,191 +593,12 @@ static void __init mcheck_enable(char *s
   6.639  custom_param("nomce", mcheck_disable);
   6.640  custom_param("mce", mcheck_enable);
   6.641  
   6.642 -
   6.643 -#include <xen/guest_access.h>
   6.644 -#include <asm/traps.h>
   6.645 -
   6.646 -struct mc_info *x86_mcinfo_getptr(void)
   6.647 -{
   6.648 -	struct mc_info *mi;
   6.649 -	uint32_t entry, next;
   6.650 -
   6.651 -	for (;;) {
   6.652 -		entry = mc_data.error_idx;
   6.653 -		smp_rmb();
   6.654 -		next = entry + 1;
   6.655 -		if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
   6.656 -			break;
   6.657 -	}
   6.658 -
   6.659 -	mi = &(mc_data.mc[(entry % MAX_MCINFO)]);
   6.660 -	BUG_ON(mc_data.error_idx < mc_data.fetch_idx);
   6.661 -
   6.662 -	return mi;
   6.663 -}
   6.664 -
   6.665 -static int x86_mcinfo_matches_guest(const struct mc_info *mi,
   6.666 -			const struct domain *d, const struct vcpu *v)
   6.667 -{
   6.668 -	struct mcinfo_common *mic;
   6.669 -	struct mcinfo_global *mig;
   6.670 -
   6.671 -	x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
   6.672 -	mig = (struct mcinfo_global *)mic;
   6.673 -	if (mig == NULL)
   6.674 -		return 0;
   6.675 -
   6.676 -	if (d->domain_id != mig->mc_domid)
   6.677 -		return 0;
   6.678 -
   6.679 -	if (v->vcpu_id != mig->mc_vcpuid)
   6.680 -		return 0;
   6.681 -
   6.682 -	return 1;
   6.683 -}
   6.684 -
   6.685 -
   6.686 -#define x86_mcinfo_mcdata(idx) (mc_data.mc[(idx % MAX_MCINFO)])
   6.687 -
   6.688 -static struct mc_info *x86_mcinfo_getfetchptr(uint32_t *fetch_idx,
   6.689 -				const struct domain *d, const struct vcpu *v)
   6.690 -{
   6.691 -	struct mc_info *mi;
   6.692 -
   6.693 -	/* This function is called from the fetch hypercall with
   6.694 -	 * the mc_lock spinlock held. Thus, no need for locking here.
   6.695 -	 */
   6.696 -	mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
   6.697 -	if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
   6.698 -		/* Bogus domU command detected. */
   6.699 -		*fetch_idx = 0;
   6.700 -		return NULL;
   6.701 -	}
   6.702 -
   6.703 -	*fetch_idx = mc_data.fetch_idx;
   6.704 -	mc_data.fetch_idx++;
   6.705 -	BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
   6.706 -
   6.707 -	return mi;
   6.708 -}
   6.709 -
   6.710 -
   6.711 -static void x86_mcinfo_marknotified(struct xen_mc_notifydomain *mc_notifydomain)
   6.712 -{
   6.713 -	struct mc_machine_notify *mn;
   6.714 -	struct mcinfo_common *mic = NULL;
   6.715 -	struct mcinfo_global *mig;
   6.716 -	struct domain *d;
   6.717 -	int i;
   6.718 -
   6.719 -	/* This function is called from the notifier hypercall with
   6.720 -	 * the mc_notify_lock spinlock held. Thus, no need for locking here.
   6.721 -	 */
   6.722 -
   6.723 -	/* First invalidate entries for guests that disappeared after
   6.724 -	 * notification (e.g. shutdown/crash). This step prevents the
   6.725 -	 * notification array from filling up with stalling/leaking entries.
   6.726 -	 */
   6.727 -	for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
   6.728 -		mn = &(mc_data.notify[(i % MAX_MCINFO)]);
   6.729 -		x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
   6.730 -		BUG_ON(mic == NULL);
   6.731 -		mig = (struct mcinfo_global *)mic;
   6.732 -		d = get_domain_by_id(mig->mc_domid);
   6.733 -		if (d == NULL) {
   6.734 -			/* Domain does not exist. */
   6.735 -			mn->valid = 0;
   6.736 -		}
   6.737 -		if ((!mn->valid) && (i == mc_data.notifyconsumer_idx))
   6.738 -			mc_data.notifyconsumer_idx++;
   6.739 -	}
   6.740 -
   6.741 -	/* Now put in the error telemetry. Since all error data fetchable
   6.742 -	 * by domUs are uncorrectable errors, they are very important.
   6.743 -	 * So we dump them before overriding them. When a guest takes that long,
   6.744 -	 * then we can assume something bad already happened (crash, hang, etc.)
   6.745 -	 */
   6.746 -	mn = &(mc_data.notify[(mc_data.notifyproducer_idx % MAX_MCINFO)]);
   6.747 -
   6.748 -	if (mn->valid) {
   6.749 -		struct mcinfo_common *mic = NULL;
   6.750 -		struct mcinfo_global *mig;
   6.751 -
   6.752 -		/* To not loose the information, we dump it. */
   6.753 -		x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
   6.754 -		BUG_ON(mic == NULL);
   6.755 -		mig = (struct mcinfo_global *)mic;
   6.756 -		printk(XENLOG_WARNING "Domain ID %u was notified by Dom0 to "
   6.757 -			"fetch machine check error telemetry. But Domain ID "
   6.758 -			"did not do that in time.\n",
   6.759 -			mig->mc_domid);
   6.760 -		x86_mcinfo_dump(&mn->mc);
   6.761 -	}
   6.762 -
   6.763 -	memcpy(&mn->mc, &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)),
   6.764 -		sizeof(struct mc_info));
   6.765 -	mn->fetch_idx = mc_notifydomain->fetch_idx;
   6.766 -	mn->valid = 1;
   6.767 -
   6.768 -	mc_data.notifyproducer_idx++;
   6.769 -
   6.770 -	/* By design there can never be more notifies than machine check errors.
   6.771 -	 * If that ever happens, then we hit a bug. */
   6.772 -	BUG_ON(mc_data.notifyproducer_idx > mc_data.fetch_idx);
   6.773 -	BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
   6.774 -}
   6.775 -
   6.776 -static struct mc_info *x86_mcinfo_getnotifiedptr(uint32_t *fetch_idx,
   6.777 -				const struct domain *d, const struct vcpu *v)
   6.778 -{
   6.779 -	struct mc_machine_notify *mn = NULL;
   6.780 -	uint32_t i;
   6.781 -	int found;
   6.782 -
   6.783 -	/* This function is called from the fetch hypercall with
   6.784 -	 * the mc_notify_lock spinlock held. Thus, no need for locking here.
   6.785 -	 */
   6.786 -
   6.787 -	/* The notifier data is filled in the order guests get notified, but
   6.788 -	 * guests may fetch them in a different order. That's why we need
   6.789 -	 * the game with valid/invalid entries. */
   6.790 -	found = 0;
   6.791 -	for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
   6.792 -		mn = &(mc_data.notify[(i % MAX_MCINFO)]);
   6.793 -		if (!mn->valid) {
   6.794 -			if (i == mc_data.notifyconsumer_idx)
   6.795 -				mc_data.notifyconsumer_idx++;
   6.796 -			continue;
   6.797 -		}
   6.798 -		if (x86_mcinfo_matches_guest(&mn->mc, d, v)) {
   6.799 -			found = 1;
   6.800 -			break;
   6.801 -		}
   6.802 -	}
   6.803 -
   6.804 -	if (!found) {
   6.805 -		/* This domain has never been notified. This must be
   6.806 -		 * a bogus domU command. */
   6.807 -		*fetch_idx = 0;
   6.808 -		return NULL;
   6.809 -	}
   6.810 -
   6.811 -	BUG_ON(mn == NULL);
   6.812 -	*fetch_idx = mn->fetch_idx;
   6.813 -	mn->valid = 0;
   6.814 -
   6.815 -	BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
   6.816 -	return &mn->mc;
   6.817 -}
   6.818 -
   6.819 -
   6.820 -void x86_mcinfo_clear(struct mc_info *mi)
   6.821 +static void mcinfo_clear(struct mc_info *mi)
   6.822  {
   6.823  	memset(mi, 0, sizeof(struct mc_info));
   6.824  	x86_mcinfo_nentries(mi) = 0;
   6.825  }
   6.826  
   6.827 -
   6.828  int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
   6.829  {
   6.830  	int i;
   6.831 @@ -380,7 +618,7 @@ int x86_mcinfo_add(struct mc_info *mi, v
   6.832  	end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
   6.833  
   6.834  	if (end1 < end2)
   6.835 -		return -ENOSPC; /* No space. Can't add entry. */
    6.836 +		return x86_mcerr("mcinfo_add: no more space", -ENOSPC);
   6.837  
   6.838  	/* there's enough space. add entry. */
   6.839  	memcpy(mic_index, mic, mic->size);
   6.840 @@ -389,7 +627,6 @@ int x86_mcinfo_add(struct mc_info *mi, v
   6.841  	return 0;
   6.842  }
   6.843  
   6.844 -
   6.845  /* Dump machine check information in a format,
   6.846   * mcelog can parse. This is used only when
   6.847   * Dom0 does not take the notification. */
   6.848 @@ -404,7 +641,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
   6.849  	if (mic == NULL)
   6.850  		return;
   6.851  	mc_global = (struct mcinfo_global *)mic;
   6.852 -	if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
   6.853 +	if (mc_global->mc_flags & MC_FLAG_MCE) {
   6.854  		printk(XENLOG_WARNING
   6.855  			"CPU%d: Machine Check Exception: %16"PRIx64"\n",
   6.856  			mc_global->mc_coreid, mc_global->mc_gstatus);
   6.857 @@ -424,7 +661,7 @@ void x86_mcinfo_dump(struct mc_info *mi)
   6.858  			goto next;
   6.859  
   6.860  		mc_bank = (struct mcinfo_bank *)mic;
   6.861 -	
   6.862 +
   6.863  		printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
   6.864  			mc_bank->mc_bank,
   6.865  			mc_bank->mc_status);
   6.866 @@ -441,8 +678,6 @@ next:
   6.867  	} while (1);
   6.868  }
   6.869  
   6.870 -
   6.871 -
   6.872  static void do_mc_get_cpu_info(void *v)
   6.873  {
   6.874  	int cpu = smp_processor_id();
   6.875 @@ -533,183 +768,141 @@ void x86_mc_get_cpu_info(unsigned cpu, u
   6.876  	}
   6.877  }
   6.878  
   6.879 +#if BITS_PER_LONG == 64
   6.880 +
   6.881 +#define	ID2COOKIE(id)	((mctelem_cookie_t)(id))
   6.882 +#define	COOKIE2ID(c) ((uint64_t)(c))
   6.883 +
   6.884 +#elif BITS_PER_LONG == 32
   6.885 +
   6.886 +#define	ID2COOKIE(id)	((mctelem_cookie_t)(uint32_t)((id) & 0xffffffffU))
   6.887 +#define	COOKIE2ID(c)	((uint64_t)(uint32_t)(c))
   6.888 +
   6.889 +#elif defined(BITS_PER_LONG)
   6.890 +#error BITS_PER_LONG has unexpected value
   6.891 +#else
   6.892 +#error BITS_PER_LONG definition absent
   6.893 +#endif
   6.894 +
   6.895  /* Machine Check Architecture Hypercall */
   6.896  long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
   6.897  {
   6.898  	long ret = 0;
   6.899  	struct xen_mc curop, *op = &curop;
   6.900  	struct vcpu *v = current;
   6.901 -	struct domain *domU;
   6.902  	struct xen_mc_fetch *mc_fetch;
   6.903 -	struct xen_mc_notifydomain *mc_notifydomain;
   6.904  	struct xen_mc_physcpuinfo *mc_physcpuinfo;
   6.905 -	struct mc_info *mi;
   6.906 -	uint32_t flags;
   6.907 -	uint32_t fetch_idx;
   6.908 -        uint16_t vcpuid;
   6.909 -	/* Use a different lock for the notify hypercall in order to allow
   6.910 -	 * a DomU to fetch mc data while Dom0 notifies another DomU. */
   6.911 -	static DEFINE_SPINLOCK(mc_lock);
   6.912 -	static DEFINE_SPINLOCK(mc_notify_lock);
   6.913 +	uint32_t flags, cmdflags;
   6.914  	int nlcpu;
   6.915  	xen_mc_logical_cpu_t *log_cpus = NULL;
   6.916 +	mctelem_cookie_t mctc;
   6.917 +	mctelem_class_t which;
   6.918  
   6.919  	if ( copy_from_guest(op, u_xen_mc, 1) )
   6.920 -		return -EFAULT;
   6.921 +		return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
   6.922  
   6.923  	if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
   6.924 -		return -EACCES;
   6.925 +		return x86_mcerr("do_mca: interface version mismatch", -EACCES);
   6.926  
   6.927 -	switch ( op->cmd ) {
   6.928 +	switch (op->cmd) {
   6.929  	case XEN_MC_fetch:
   6.930 -		/* This hypercall is for any domain */
   6.931  		mc_fetch = &op->u.mc_fetch;
   6.932 +		cmdflags = mc_fetch->flags;
   6.933  
   6.934 -		switch (mc_fetch->flags) {
   6.935 -		case XEN_MC_CORRECTABLE:
   6.936 -			/* But polling mode is Dom0 only, because
   6.937 -			 * correctable errors are reported to Dom0 only */
   6.938 -			if ( !IS_PRIV(v->domain) )
   6.939 -				return -EPERM;
   6.940 +		/* This hypercall is for Dom0 only */
    6.941 +		if ( !IS_PRIV(v->domain) )
   6.942 +			return x86_mcerr(NULL, -EPERM);
   6.943 +
   6.944 +		switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) {
   6.945 +		case XEN_MC_NONURGENT:
   6.946 +			which = MC_NONURGENT;
   6.947  			break;
   6.948  
   6.949 -		case XEN_MC_TRAP:
   6.950 +		case XEN_MC_URGENT:
   6.951 +			which = MC_URGENT;
   6.952  			break;
   6.953 +
   6.954  		default:
   6.955 -			return -EFAULT;
   6.956 +			return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
   6.957  		}
   6.958  
   6.959  		flags = XEN_MC_OK;
   6.960 -		spin_lock(&mc_lock);
   6.961  
   6.962 -		if ( IS_PRIV(v->domain) ) {
   6.963 -			/* this must be Dom0. So a notify hypercall
   6.964 -			 * can't have happened before. */
   6.965 -			mi = x86_mcinfo_getfetchptr(&fetch_idx, dom0, v);
   6.966 +		if (cmdflags & XEN_MC_ACK) {
   6.967 +			mctelem_cookie_t cookie = ID2COOKIE(mc_fetch->fetch_id);
   6.968 +			mctelem_ack(which, cookie);
   6.969  		} else {
   6.970 -			/* Hypercall comes from an unprivileged domain */
   6.971 -			domU = v->domain;
   6.972 -			if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
   6.973 -				/* Dom0 must have notified this DomU before
   6.974 -				 * via the notify hypercall. */
   6.975 -				mi = x86_mcinfo_getnotifiedptr(&fetch_idx, domU, v);
   6.976 +			if (guest_handle_is_null(mc_fetch->data))
   6.977 +				return x86_mcerr("do_mca fetch: guest buffer "
   6.978 +				    "invalid", -EINVAL);
   6.979 +
   6.980 +			if ((mctc = mctelem_consume_oldest_begin(which))) {
   6.981 +				struct mc_info *mcip = mctelem_dataptr(mctc);
   6.982 +				if (copy_to_guest(mc_fetch->data, mcip, 1)) {
   6.983 +					ret = -EFAULT;
   6.984 +					flags |= XEN_MC_FETCHFAILED;
   6.985 +					mc_fetch->fetch_id = 0;
   6.986 +				} else {
   6.987 +					mc_fetch->fetch_id = COOKIE2ID(mctc);
   6.988 +				}
   6.989 +				mctelem_consume_oldest_end(mctc);
   6.990  			} else {
   6.991 -				/* Xen notified the DomU. */
   6.992 -				mi = x86_mcinfo_getfetchptr(&fetch_idx, domU, v);
   6.993 +				/* There is no data */
   6.994 +				flags |= XEN_MC_NODATA;
   6.995 +				mc_fetch->fetch_id = 0;
   6.996  			}
   6.997 +
   6.998 +			mc_fetch->flags = flags;
   6.999 +			if (copy_to_guest(u_xen_mc, op, 1) != 0)
  6.1000 +				ret = -EFAULT;
  6.1001  		}
  6.1002  
  6.1003 -		if (mi) {
  6.1004 -			memcpy(&mc_fetch->mc_info, mi,
  6.1005 -				sizeof(struct mc_info));
  6.1006 -		} else {
  6.1007 -			/* There is no data for a bogus DomU command. */
  6.1008 -			flags |= XEN_MC_NODATA;
  6.1009 -			memset(&mc_fetch->mc_info, 0, sizeof(struct mc_info));
  6.1010 -		}
  6.1011 -
  6.1012 -		mc_fetch->flags = flags;
  6.1013 -		mc_fetch->fetch_idx = fetch_idx;
  6.1014 -
  6.1015 -		if ( copy_to_guest(u_xen_mc, op, 1) )
  6.1016 -			ret = -EFAULT;
  6.1017 -
  6.1018 -		spin_unlock(&mc_lock);
  6.1019  		break;
  6.1020  
  6.1021  	case XEN_MC_notifydomain:
  6.1022 -		/* This hypercall is for Dom0 only */
  6.1023 -		if ( !IS_PRIV(v->domain) )
  6.1024 -			return -EPERM;
  6.1025 -
  6.1026 -		spin_lock(&mc_notify_lock);
  6.1027 -
  6.1028 -		mc_notifydomain = &op->u.mc_notifydomain;
  6.1029 -		domU = get_domain_by_id(mc_notifydomain->mc_domid);
  6.1030 -		vcpuid = mc_notifydomain->mc_vcpuid;
  6.1031 +		return x86_mcerr("do_mca notify unsupported", -EINVAL);
  6.1032  
  6.1033 -		if ((domU == NULL) || (domU == dom0)) {
  6.1034 -			/* It's not possible to notify a non-existent domain
  6.1035 -			 * or the dom0. */
  6.1036 -			spin_unlock(&mc_notify_lock);
  6.1037 -			return -EACCES;
  6.1038 -		}
  6.1039 +	case XEN_MC_physcpuinfo:
  6.1040 +		if ( !IS_PRIV(v->domain) )
  6.1041 +			return x86_mcerr("do_mca cpuinfo", -EPERM);
  6.1042  
  6.1043 -		if (vcpuid >= MAX_VIRT_CPUS) {
  6.1044 -			/* It's not possible to notify a vcpu, Xen can't
  6.1045 -			 * assign to a domain. */
  6.1046 -			spin_unlock(&mc_notify_lock);
  6.1047 -			return -EACCES;
  6.1048 +		mc_physcpuinfo = &op->u.mc_physcpuinfo;
  6.1049 +		nlcpu = num_online_cpus();
  6.1050 +
  6.1051 +		if (!guest_handle_is_null(mc_physcpuinfo->info)) {
  6.1052 +			if (mc_physcpuinfo->ncpus <= 0)
  6.1053 +				return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
  6.1054 +				    -EINVAL);
  6.1055 +			nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus);
  6.1056 +			log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
  6.1057 +			if (log_cpus == NULL)
  6.1058 +				return x86_mcerr("do_mca cpuinfo", -ENOMEM);
  6.1059 +
  6.1060 +			if (on_each_cpu(do_mc_get_cpu_info, log_cpus,
  6.1061 +			    1, 1) != 0) {
  6.1062 +				xfree(log_cpus);
  6.1063 +				return x86_mcerr("do_mca cpuinfo", -EIO);
  6.1064 +			}
  6.1065  		}
  6.1066  
  6.1067 -		mc_notifydomain->flags = XEN_MC_OK;
  6.1068 +		mc_physcpuinfo->ncpus = nlcpu;
  6.1069  
  6.1070 -		mi = &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx));
  6.1071 -		if (!x86_mcinfo_matches_guest(mi, domU, domU->vcpu[vcpuid])) {
  6.1072 -			/* The error telemetry is not for the guest, Dom0
  6.1073 -			 * wants to notify. */
  6.1074 -			mc_notifydomain->flags |= XEN_MC_NOMATCH;
  6.1075 -		} else if ( guest_has_trap_callback(domU, vcpuid,
  6.1076 -						TRAP_machine_check) )
  6.1077 -		{
  6.1078 -			/* Send notification */
  6.1079 -			if ( send_guest_trap(domU, vcpuid, TRAP_machine_check) )
  6.1080 -				mc_notifydomain->flags |= XEN_MC_NOTDELIVERED;
  6.1081 -		} else
  6.1082 -			mc_notifydomain->flags |= XEN_MC_CANNOTHANDLE;
  6.1083 -
  6.1084 -#ifdef DEBUG
  6.1085 -		/* sanity check - these two flags are mutually exclusive */
  6.1086 -		if ((flags & XEN_MC_CANNOTHANDLE) && (flags & XEN_MC_NOTDELIVERED))
  6.1087 -			BUG();
  6.1088 -#endif
  6.1089 -
  6.1090 -		if ( copy_to_guest(u_xen_mc, op, 1) )
  6.1091 -			ret = -EFAULT;
  6.1092 -
  6.1093 -		if (ret == 0) {
  6.1094 -			x86_mcinfo_marknotified(mc_notifydomain);
  6.1095 +		if (copy_to_guest(u_xen_mc, op, 1)) {
  6.1096 +			if (log_cpus != NULL)
  6.1097 +				xfree(log_cpus);
  6.1098 +			return x86_mcerr("do_mca cpuinfo", -EFAULT);
  6.1099  		}
  6.1100  
  6.1101 -		spin_unlock(&mc_notify_lock);
  6.1102 +		if (!guest_handle_is_null(mc_physcpuinfo->info)) {
  6.1103 +			if (copy_to_guest(mc_physcpuinfo->info,
  6.1104 +			    log_cpus, nlcpu))
  6.1105 +				ret = -EFAULT;
  6.1106 +			xfree(log_cpus);
  6.1107 +		}
  6.1108  		break;
  6.1109  
  6.1110 -       case XEN_MC_physcpuinfo:
  6.1111 -	       if ( !IS_PRIV(v->domain) )
  6.1112 -		       return -EPERM;
  6.1113 - 
  6.1114 -	       mc_physcpuinfo = &op->u.mc_physcpuinfo;
  6.1115 -	       nlcpu = num_online_cpus();
  6.1116 - 
  6.1117 -	       if (!guest_handle_is_null(mc_physcpuinfo->info)) {
  6.1118 -		       if (mc_physcpuinfo->ncpus <= 0)
  6.1119 -			       return -EINVAL;
  6.1120 -		       nlcpu = min(nlcpu, (int)mc_physcpuinfo->ncpus);
  6.1121 -		       log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
  6.1122 -		       if (log_cpus == NULL)
  6.1123 -			       return -ENOMEM;
  6.1124 - 
  6.1125 -		       if (on_each_cpu(do_mc_get_cpu_info, log_cpus,
  6.1126 -			   1, 1) != 0) {
  6.1127 -			       xfree(log_cpus);
  6.1128 -			       return -EIO;
  6.1129 -		       }
  6.1130 -	       }
  6.1131 - 
  6.1132 -	       mc_physcpuinfo->ncpus = nlcpu;
  6.1133 - 
  6.1134 -	       if (copy_to_guest(u_xen_mc, op, 1)) {
  6.1135 -		       if (log_cpus != NULL)
  6.1136 -			       xfree(log_cpus);
  6.1137 -		       return -EFAULT;
  6.1138 -	       }
  6.1139 - 
  6.1140 -	       if (!guest_handle_is_null(mc_physcpuinfo->info)) {
  6.1141 -		       if (copy_to_guest(mc_physcpuinfo->info,
  6.1142 -			   log_cpus, nlcpu))
  6.1143 -			       ret = -EFAULT;
  6.1144 -		       xfree(log_cpus);
  6.1145 -	       }
  6.1146 +	default:
  6.1147 +		return x86_mcerr("do_mca: bad command", -EINVAL);
  6.1148  	}
  6.1149  
  6.1150  	return ret;
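
For reference, an illustrative dom0-side sketch of the fetch/ack protocol that the reworked do_mca above implements. The command, flag and field names (XEN_MC_fetch, XEN_MC_NONURGENT, XEN_MC_ACK, XEN_MC_NODATA, XEN_MC_FETCHFAILED, fetch_id, u.mc_fetch.data) are taken from this changeset; mca_hypercall() stands in for whatever privileged hypercall path the tool stack uses and is an assumption, not part of the patch.

static int fetch_one_nonurgent(struct mc_info *buf)
{
	struct xen_mc op;
	struct xen_mc_fetch *mf = &op.u.mc_fetch;
	uint64_t id;
	int rc;

	memset(&op, 0, sizeof(op));
	op.cmd = XEN_MC_fetch;
	op.interface_version = XEN_MCA_INTERFACE_VERSION;
	mf->flags = XEN_MC_NONURGENT;			/* poll the non-urgent class */
	set_xen_guest_handle(mf->data, buf);

	if ((rc = mca_hypercall(&op)) != 0)		/* hypothetical wrapper */
		return rc;
	if (mf->flags & XEN_MC_NODATA)
		return -ENOENT;				/* nothing committed yet */
	if (mf->flags & XEN_MC_FETCHFAILED)
		return -EIO;
	id = mf->fetch_id;				/* cookie needed for the ack */

	/* ... write *buf to stable storage ... */

	memset(&op, 0, sizeof(op));
	op.cmd = XEN_MC_fetch;
	op.interface_version = XEN_MCA_INTERFACE_VERSION;
	mf->flags = XEN_MC_NONURGENT | XEN_MC_ACK;	/* release the entry */
	mf->fetch_id = id;
	return mca_hypercall(&op);
}

The separate ACK step is what lets the hypervisor keep the telemetry on its processing list until the consumer has it safely on stable storage.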
     7.1 --- a/xen/arch/x86/cpu/mcheck/mce.h	Tue Mar 17 14:21:18 2009 +0000
     7.2 +++ b/xen/arch/x86/cpu/mcheck/mce.h	Tue Mar 17 14:22:50 2009 +0000
     7.3 @@ -1,38 +1,98 @@
     7.4 +#ifndef _MCE_H
     7.5 +
     7.6 +#define _MCE_H
     7.7 +
     7.8  #include <xen/init.h>
     7.9 +#include <xen/smp.h>
    7.10  #include <asm/types.h>
    7.11  #include <asm/traps.h>
    7.12  #include <asm/atomic.h>
    7.13  #include <asm/percpu.h>
    7.14  
    7.15 +#include "x86_mca.h"
    7.16 +#include "mctelem.h"
    7.17  
    7.18  /* Init functions */
    7.19 -void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
    7.20 -void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
    7.21 -void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
    7.22 -void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
    7.23 +int amd_k7_mcheck_init(struct cpuinfo_x86 *c);
    7.24 +int amd_k8_mcheck_init(struct cpuinfo_x86 *c);
    7.25 +int amd_f10_mcheck_init(struct cpuinfo_x86 *c);
    7.26  
    7.27 +int intel_p5_mcheck_init(struct cpuinfo_x86 *c);
    7.28 +int winchip_mcheck_init(struct cpuinfo_x86 *c);
    7.29 +int intel_mcheck_init(struct cpuinfo_x86 *c);
    7.30  
    7.31  void intel_mcheck_timer(struct cpuinfo_x86 *c);
    7.32 -void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
    7.33 -void intel_mcheck_init(struct cpuinfo_x86 *c);
    7.34  void mce_intel_feature_init(struct cpuinfo_x86 *c);
    7.35 -
    7.36 -void winchip_mcheck_init(struct cpuinfo_x86 *c);
    7.37 -
    7.38 -/* Function pointer used in the handlers to collect additional information
    7.39 - * provided by newer CPU families/models without the need to duplicate
    7.40 - * the whole handler resulting in various handlers each with its own
    7.41 - * tweaks and bugs */
    7.42 -extern int (*mc_callback_bank_extended)(struct mc_info *mi,
    7.43 -		uint16_t bank, uint64_t status);
    7.44 -
    7.45 +void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
    7.46  
    7.47  int mce_available(struct cpuinfo_x86 *c);
    7.48 +int mce_firstbank(struct cpuinfo_x86 *c);
    7.49  /* Helper functions used for collecting error telemetry */
    7.50  struct mc_info *x86_mcinfo_getptr(void);
    7.51 -void x86_mcinfo_clear(struct mc_info *mi);
    7.52 -int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
    7.53 -void x86_mcinfo_dump(struct mc_info *mi);
    7.54  void mc_panic(char *s);
    7.55  void x86_mc_get_cpu_info(unsigned, uint32_t *, uint16_t *, uint16_t *,
    7.56  			 uint32_t *, uint32_t *, uint32_t *, uint32_t *);
    7.57 +
    7.58 +
    7.59 +/* Register a handler for machine check exceptions. */
    7.60 +typedef void (*x86_mce_vector_t)(struct cpu_user_regs *, long);
    7.61 +extern void x86_mce_vector_register(x86_mce_vector_t);
    7.62 +
    7.63 +/* Common generic MCE handler that implementations may nominate
    7.64 + * via x86_mce_vector_register. */
    7.65 +extern void mcheck_cmn_handler(struct cpu_user_regs *, long, cpu_banks_t);
    7.66 +
    7.67 +/* Utility function to "logout" all architectural MCA telemetry from the MCA
    7.68 + * banks of the current processor.  A cookie is returned which may be
    7.69 + * used to reference the data so logged (the cookie can be NULL if
    7.70 + * no logout structures were available).  The caller can also pass a pointer
    7.71 + * to a structure which will be completed with some summary information
    7.72 + * of the MCA data observed in the logout operation. */
    7.73 +
    7.74 +enum mca_source {
    7.75 +	MCA_MCE_HANDLER,
    7.76 +	MCA_POLLER,
    7.77 +	MCA_CMCI_HANDLER,
    7.78 +	MCA_RESET
    7.79 +};
    7.80 +
    7.81 +enum mca_extinfo {
    7.82 +	MCA_EXTINFO_LOCAL,
    7.83 +	MCA_EXTINFO_GLOBAL,
    7.84 +	MCA_EXTINFO_IGNORED
    7.85 +};
    7.86 +
    7.87 +struct mca_summary {
    7.88 +	uint32_t	errcnt;	/* number of banks with valid errors */
    7.89 +	int		ripv;	/* meaningful on #MC */
    7.90 +	int		eipv;	/* meaningful on #MC */
    7.91 +	uint32_t	uc;	/* bitmask of banks with UC */
    7.92 +	uint32_t	pcc;	/* bitmask of banks with PCC */
    7.93 +};
    7.94 +
    7.95 +extern cpu_banks_t mca_allbanks;
    7.96 +
    7.97 +extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, cpu_banks_t,
    7.98 +    struct mca_summary *);
    7.99 +
   7.100 +/* Register a callback to be made during bank telemetry logout.
   7.101 + * This callback is only available to those machine check handlers
    7.102 + * that call the common mcheck_cmn_handler or who use the common
   7.103 + * telemetry logout function mcheck_mca_logout in error polling.
   7.104 + *
   7.105 + * This can be used to collect additional information (typically non-
   7.106 + * architectural) provided by newer CPU families/models without the need
   7.107 + * to duplicate the whole handler resulting in various handlers each with
    7.108 + * its own tweaks and bugs.  The callback receives a struct mc_info pointer
   7.109 + * which it can use with x86_mcinfo_add to add additional telemetry,
   7.110 + * the current MCA bank number we are reading telemetry from, and the
   7.111 + * MCi_STATUS value for that bank.
   7.112 + */
   7.113 +typedef enum mca_extinfo (*x86_mce_callback_t)
   7.114 +    (struct mc_info *, uint16_t, uint64_t);
   7.115 +extern void x86_mce_callback_register(x86_mce_callback_t);
   7.116 +
   7.117 +int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
   7.118 +void x86_mcinfo_dump(struct mc_info *mi);
   7.119 +
   7.120 +#endif /* _MCE_H */
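
Purely as an illustration of the registration interface declared above, a minimal sketch of how a vendor init routine is expected to hook in. The function names prefixed with example_ are hypothetical; the flow mirrors what intel_mcheck_init below does with the same calls.

static enum mca_extinfo
example_ext_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
{
	/* Decorate only valid errors on a (hypothetical) bank of interest. */
	if (mi == NULL || bank != 4 || !(status & MCi_STATUS_VAL))
		return MCA_EXTINFO_IGNORED;

	/* ... fill a struct mcinfo_extended and x86_mcinfo_add(mi, &ext) ... */
	return MCA_EXTINFO_LOCAL;
}

static void example_machine_check(struct cpu_user_regs *regs, long error_code)
{
	/* Defer to the common handler, scanning every bank. */
	mcheck_cmn_handler(regs, error_code, mca_allbanks);
}

int example_mcheck_init(struct cpuinfo_x86 *c)
{
	if (!mce_available(c))
		return 0;

	x86_mce_vector_register(example_machine_check);
	x86_mce_callback_register(example_ext_handler);
	return 1;
}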
     8.1 --- a/xen/arch/x86/cpu/mcheck/mce_intel.c	Tue Mar 17 14:21:18 2009 +0000
     8.2 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Tue Mar 17 14:22:50 2009 +0000
     8.3 @@ -14,6 +14,7 @@ DEFINE_PER_CPU(cpu_banks_t, mce_banks_ow
     8.4  
     8.5  static int nr_intel_ext_msrs = 0;
     8.6  static int cmci_support = 0;
     8.7 +static int firstbank;
     8.8  
     8.9  #ifdef CONFIG_X86_MCE_THERMAL
    8.10  static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
    8.11 @@ -115,222 +116,51 @@ static void intel_init_thermal(struct cp
    8.12  }
    8.13  #endif /* CONFIG_X86_MCE_THERMAL */
    8.14  
    8.15 -static inline void intel_get_extended_msrs(struct mcinfo_extended *mc_ext)
    8.16 +static enum mca_extinfo
    8.17 +intel_get_extended_msrs(struct mc_info *mci, uint16_t bank, uint64_t status)
    8.18  {
    8.19 -    if (nr_intel_ext_msrs == 0)
    8.20 -        return;
    8.21 +    struct mcinfo_extended mc_ext;
    8.22 +
    8.23 +    if (mci == NULL || nr_intel_ext_msrs == 0 || !(status & MCG_STATUS_EIPV))
    8.24 +        return MCA_EXTINFO_IGNORED;
    8.25  
    8.26      /* this function will called when CAP(9).MCG_EXT_P = 1 */
    8.27 -    memset(mc_ext, 0, sizeof(struct mcinfo_extended));
    8.28 -    mc_ext->common.type = MC_TYPE_EXTENDED;
    8.29 -    mc_ext->common.size = sizeof(mc_ext);
    8.30 -    mc_ext->mc_msrs = 10;
    8.31 -
    8.32 -    mc_ext->mc_msr[0].reg = MSR_IA32_MCG_EAX;
    8.33 -    rdmsrl(MSR_IA32_MCG_EAX, mc_ext->mc_msr[0].value);
    8.34 -    mc_ext->mc_msr[1].reg = MSR_IA32_MCG_EBX;
    8.35 -    rdmsrl(MSR_IA32_MCG_EBX, mc_ext->mc_msr[1].value);
    8.36 -    mc_ext->mc_msr[2].reg = MSR_IA32_MCG_ECX;
    8.37 -    rdmsrl(MSR_IA32_MCG_ECX, mc_ext->mc_msr[2].value);
    8.38 +    memset(&mc_ext, 0, sizeof(struct mcinfo_extended));
    8.39 +    mc_ext.common.type = MC_TYPE_EXTENDED;
    8.40 +    mc_ext.common.size = sizeof(mc_ext);
    8.41 +    mc_ext.mc_msrs = 10;
    8.42  
    8.43 -    mc_ext->mc_msr[3].reg = MSR_IA32_MCG_EDX;
    8.44 -    rdmsrl(MSR_IA32_MCG_EDX, mc_ext->mc_msr[3].value);
    8.45 -    mc_ext->mc_msr[4].reg = MSR_IA32_MCG_ESI;
    8.46 -    rdmsrl(MSR_IA32_MCG_ESI, mc_ext->mc_msr[4].value);
    8.47 -    mc_ext->mc_msr[5].reg = MSR_IA32_MCG_EDI;
    8.48 -    rdmsrl(MSR_IA32_MCG_EDI, mc_ext->mc_msr[5].value);
    8.49 +    mc_ext.mc_msr[0].reg = MSR_IA32_MCG_EAX;
    8.50 +    rdmsrl(MSR_IA32_MCG_EAX, mc_ext.mc_msr[0].value);
    8.51 +    mc_ext.mc_msr[1].reg = MSR_IA32_MCG_EBX;
    8.52 +    rdmsrl(MSR_IA32_MCG_EBX, mc_ext.mc_msr[1].value);
    8.53 +    mc_ext.mc_msr[2].reg = MSR_IA32_MCG_ECX;
    8.54 +    rdmsrl(MSR_IA32_MCG_ECX, mc_ext.mc_msr[2].value);
    8.55  
    8.56 -    mc_ext->mc_msr[6].reg = MSR_IA32_MCG_EBP;
    8.57 -    rdmsrl(MSR_IA32_MCG_EBP, mc_ext->mc_msr[6].value);
    8.58 -    mc_ext->mc_msr[7].reg = MSR_IA32_MCG_ESP;
    8.59 -    rdmsrl(MSR_IA32_MCG_ESP, mc_ext->mc_msr[7].value);
    8.60 -    mc_ext->mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
    8.61 -    rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext->mc_msr[8].value);
    8.62 -    mc_ext->mc_msr[9].reg = MSR_IA32_MCG_EIP;
    8.63 -    rdmsrl(MSR_IA32_MCG_EIP, mc_ext->mc_msr[9].value);
    8.64 +    mc_ext.mc_msr[3].reg = MSR_IA32_MCG_EDX;
    8.65 +    rdmsrl(MSR_IA32_MCG_EDX, mc_ext.mc_msr[3].value);
    8.66 +    mc_ext.mc_msr[4].reg = MSR_IA32_MCG_ESI;
    8.67 +    rdmsrl(MSR_IA32_MCG_ESI, mc_ext.mc_msr[4].value);
    8.68 +    mc_ext.mc_msr[5].reg = MSR_IA32_MCG_EDI;
    8.69 +    rdmsrl(MSR_IA32_MCG_EDI, mc_ext.mc_msr[5].value);
    8.70 +
    8.71 +    mc_ext.mc_msr[6].reg = MSR_IA32_MCG_EBP;
    8.72 +    rdmsrl(MSR_IA32_MCG_EBP, mc_ext.mc_msr[6].value);
    8.73 +    mc_ext.mc_msr[7].reg = MSR_IA32_MCG_ESP;
    8.74 +    rdmsrl(MSR_IA32_MCG_ESP, mc_ext.mc_msr[7].value);
    8.75 +    mc_ext.mc_msr[8].reg = MSR_IA32_MCG_EFLAGS;
    8.76 +    rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext.mc_msr[8].value);
    8.77 +    mc_ext.mc_msr[9].reg = MSR_IA32_MCG_EIP;
    8.78 +    rdmsrl(MSR_IA32_MCG_EIP, mc_ext.mc_msr[9].value);
    8.79 +
    8.80 +    x86_mcinfo_add(mci, &mc_ext);
    8.81 +
    8.82 +    return MCA_EXTINFO_GLOBAL;
    8.83  }
    8.84  
    8.85 -/* machine_check_poll might be called by following types:
    8.86 - * 1. called when do mcheck_init.
    8.87 - * 2. called in cmci interrupt handler
    8.88 - * 3. called in polling handler
    8.89 - * It will generate a new mc_info item if found CE/UC errors. DOM0 is the 
    8.90 - * consumer.
    8.91 - */
    8.92 -static struct mc_info *machine_check_poll(int calltype)
    8.93 +static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
    8.94  {
    8.95 -    struct mc_info *mi = NULL;
    8.96 -    int exceptions = (read_cr4() & X86_CR4_MCE);
    8.97 -    int i, nr_unit = 0, uc = 0, pcc = 0;
    8.98 -    uint64_t status, addr;
    8.99 -    struct mcinfo_global mcg;
   8.100 -    struct mcinfo_extended mce;
   8.101 -    unsigned int cpu;
   8.102 -    struct domain *d;
   8.103 -
   8.104 -    cpu = smp_processor_id();
   8.105 -
   8.106 -    memset(&mcg, 0, sizeof(mcg));
   8.107 -    mcg.common.type = MC_TYPE_GLOBAL;
   8.108 -    mcg.common.size = sizeof(mcg);
   8.109 -    /* If called from cpu-reset check, don't need to fill them.
   8.110 -     * If called from cmci context, we'll try to fill domid by memory addr
   8.111 -     */
   8.112 -    mcg.mc_domid = -1;
   8.113 -    mcg.mc_vcpuid = -1;
   8.114 -    if (calltype == MC_FLAG_POLLED || calltype == MC_FLAG_RESET)
   8.115 -        mcg.mc_flags = MC_FLAG_POLLED;
   8.116 -    else if (calltype == MC_FLAG_CMCI)
   8.117 -        mcg.mc_flags = MC_FLAG_CMCI;
   8.118 -    x86_mc_get_cpu_info(
   8.119 -        cpu, &mcg.mc_socketid, &mcg.mc_coreid,
   8.120 -        &mcg.mc_core_threadid, &mcg.mc_apicid, NULL, NULL, NULL);
   8.121 -    rdmsrl(MSR_IA32_MCG_STATUS, mcg.mc_gstatus);
   8.122 -
   8.123 -    for ( i = 0; i < nr_mce_banks; i++ ) {
   8.124 -        struct mcinfo_bank mcb;
   8.125 -        /* For CMCI, only owners checks the owned MSRs */
   8.126 -        if ( !test_bit(i, __get_cpu_var(mce_banks_owned)) &&
   8.127 -             (calltype & MC_FLAG_CMCI) )
   8.128 -            continue;
   8.129 -        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
   8.130 -
   8.131 -        if (! (status & MCi_STATUS_VAL) )
   8.132 -            continue;
   8.133 -        /*
   8.134 -         * Uncorrected events are handled by the exception
   8.135 -         * handler when it is enabled. But when the exception
   8.136 -         * is disabled such as when mcheck_init, log everything.
   8.137 -         */
   8.138 -        if ((status & MCi_STATUS_UC) && exceptions)
   8.139 -            continue;
   8.140 -
   8.141 -        if (status & MCi_STATUS_UC)
   8.142 -            uc = 1;
   8.143 -        if (status & MCi_STATUS_PCC)
   8.144 -            pcc = 1;
   8.145 -
   8.146 -        if (!mi) {
   8.147 -            mi = x86_mcinfo_getptr();
   8.148 -            if (!mi) {
   8.149 -                printk(KERN_ERR "mcheck_poll: Failed to get mc_info entry\n");
   8.150 -                return NULL;
   8.151 -            }
   8.152 -            x86_mcinfo_clear(mi);
   8.153 -        }
   8.154 -        memset(&mcb, 0, sizeof(mcb));
   8.155 -        mcb.common.type = MC_TYPE_BANK;
   8.156 -        mcb.common.size = sizeof(mcb);
   8.157 -        mcb.mc_bank = i;
   8.158 -        mcb.mc_status = status;
   8.159 -        if (status & MCi_STATUS_MISCV)
   8.160 -            rdmsrl(MSR_IA32_MC0_MISC + 4 * i, mcb.mc_misc);
   8.161 -        if (status & MCi_STATUS_ADDRV) {
   8.162 -            rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
   8.163 -            d = maddr_get_owner(addr);
   8.164 -            if ( d && (calltype == MC_FLAG_CMCI || calltype == MC_FLAG_POLLED) )
   8.165 -                mcb.mc_domid = d->domain_id;
   8.166 -        }
   8.167 -        if (cmci_support)
   8.168 -            rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
   8.169 -        if (calltype == MC_FLAG_CMCI)
   8.170 -            rdtscll(mcb.mc_tsc);
   8.171 -        x86_mcinfo_add(mi, &mcb);
   8.172 -        nr_unit++;
   8.173 -        add_taint(TAINT_MACHINE_CHECK);
   8.174 -        /* Clear state for this bank */
   8.175 -        wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0);
   8.176 -        printk(KERN_DEBUG "mcheck_poll: bank%i CPU%d status[%"PRIx64"]\n", 
   8.177 -                i, cpu, status);
   8.178 -        printk(KERN_DEBUG "mcheck_poll: CPU%d, SOCKET%d, CORE%d, APICID[%d], "
   8.179 -                "thread[%d]\n", cpu, mcg.mc_socketid, 
   8.180 -                mcg.mc_coreid, mcg.mc_apicid, mcg.mc_core_threadid);
   8.181 - 
   8.182 -    }
   8.183 -    /* if pcc = 1, uc must be 1 */
   8.184 -    if (pcc)
   8.185 -        mcg.mc_flags |= MC_FLAG_UNCORRECTABLE;
   8.186 -    else if (uc)
   8.187 -        mcg.mc_flags |= MC_FLAG_RECOVERABLE;
   8.188 -    else /* correctable */
   8.189 -        mcg.mc_flags |= MC_FLAG_CORRECTABLE;
   8.190 -
   8.191 -    if (nr_unit && nr_intel_ext_msrs && 
   8.192 -                    (mcg.mc_gstatus & MCG_STATUS_EIPV)) {
   8.193 -        intel_get_extended_msrs(&mce);
   8.194 -        x86_mcinfo_add(mi, &mce);
   8.195 -    }
   8.196 -    if (nr_unit) 
   8.197 -        x86_mcinfo_add(mi, &mcg);
   8.198 -    /* Clear global state */
   8.199 -    return mi;
   8.200 -}
   8.201 -
   8.202 -static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code)
   8.203 -{
   8.204 -    /* MACHINE CHECK Error handler will be sent in another patch,
   8.205 -     * simply copy old solutions here. This code will be replaced
   8.206 -     * by upcoming machine check patches
   8.207 -     */
   8.208 -
   8.209 -    int recover=1;
   8.210 -    u32 alow, ahigh, high, low;
   8.211 -    u32 mcgstl, mcgsth;
   8.212 -    int i;
   8.213 -   
   8.214 -    rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
   8.215 -    if (mcgstl & (1<<0))       /* Recoverable ? */
   8.216 -        recover=0;
   8.217 -    
   8.218 -    printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
   8.219 -           smp_processor_id(), mcgsth, mcgstl);
   8.220 -    
   8.221 -    for (i=0; i<nr_mce_banks; i++) {
   8.222 -        rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
   8.223 -        if (high & (1<<31)) {
   8.224 -            if (high & (1<<29))
   8.225 -                recover |= 1;
   8.226 -            if (high & (1<<25))
   8.227 -                recover |= 2;
   8.228 -            printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
   8.229 -            high &= ~(1<<31);
   8.230 -            if (high & (1<<27)) {
   8.231 -                rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
   8.232 -                printk ("[%08x%08x]", ahigh, alow);
   8.233 -            }
   8.234 -            if (high & (1<<26)) {
   8.235 -                rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
   8.236 -                printk (" at %08x%08x", ahigh, alow);
   8.237 -            }
   8.238 -            printk ("\n");
   8.239 -        }
   8.240 -    }
   8.241 -    
   8.242 -    if (recover & 2)
   8.243 -        mc_panic ("CPU context corrupt");
   8.244 -    if (recover & 1)
   8.245 -        mc_panic ("Unable to continue");
   8.246 -    
   8.247 -    printk(KERN_EMERG "Attempting to continue.\n");
   8.248 -    /* 
   8.249 -     * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
   8.250 -     * recoverable/continuable.This will allow BIOS to look at the MSRs
   8.251 -     * for errors if the OS could not log the error.
   8.252 -     */
   8.253 -    for (i=0; i<nr_mce_banks; i++) {
   8.254 -        u32 msr;
   8.255 -        msr = MSR_IA32_MC0_STATUS+i*4;
   8.256 -        rdmsr (msr, low, high);
   8.257 -        if (high&(1<<31)) {
   8.258 -            /* Clear it */
   8.259 -            wrmsr(msr, 0UL, 0UL);
   8.260 -            /* Serialize */
   8.261 -            wmb();
   8.262 -            add_taint(TAINT_MACHINE_CHECK);
   8.263 -        }
   8.264 -    }
   8.265 -    mcgstl &= ~(1<<2);
   8.266 -    wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
   8.267 +	mcheck_cmn_handler(regs, error_code, mca_allbanks);
   8.268  }
   8.269  
   8.270  static DEFINE_SPINLOCK(cmci_discover_lock);
   8.271 @@ -369,6 +199,8 @@ static void cmci_discover(void)
   8.272      unsigned long flags;
   8.273      int i;
   8.274      struct mc_info *mi = NULL;
   8.275 +    mctelem_cookie_t mctc;
   8.276 +    struct mca_summary bs;
   8.277  
   8.278      printk(KERN_DEBUG "CMCI: find owner on CPU%d\n", smp_processor_id());
   8.279  
   8.280 @@ -385,12 +217,20 @@ static void cmci_discover(void)
   8.281       * MCi_status (error_count bit 38~52) is not cleared,
   8.282       * the CMCI interrupt will never be triggered again.
   8.283       */
   8.284 -    mi = machine_check_poll(MC_FLAG_CMCI);
   8.285 -    if (mi) {
   8.286 -        x86_mcinfo_dump(mi);
   8.287 -        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
   8.288 +
   8.289 +    mctc = mcheck_mca_logout(
   8.290 +        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
   8.291 +
   8.292 +    if (bs.errcnt && mctc != NULL) {
   8.293 +        if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
   8.294 +            mctelem_commit(mctc);
   8.295              send_guest_global_virq(dom0, VIRQ_MCA);
   8.296 -    }
   8.297 +        } else {
    8.298 +            x86_mcinfo_dump(mctelem_dataptr(mctc));
    8.299 +            mctelem_dismiss(mctc);
    8.300 +        }
   8.301 +    } else if (mctc != NULL)
   8.302 +        mctelem_dismiss(mctc);
   8.303  
   8.304      printk(KERN_DEBUG "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n", 
   8.305             smp_processor_id(), 
   8.306 @@ -487,17 +327,26 @@ static void intel_init_cmci(struct cpuin
   8.307  fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
   8.308  {
   8.309      struct mc_info *mi = NULL;
   8.310 -    int cpu = smp_processor_id();
   8.311 +    mctelem_cookie_t mctc;
   8.312 +    struct mca_summary bs;
   8.313  
   8.314      ack_APIC_irq();
   8.315      irq_enter();
   8.316 -    printk(KERN_DEBUG "CMCI: cmci_intr happen on CPU%d\n", cpu);
   8.317 -    mi = machine_check_poll(MC_FLAG_CMCI);
   8.318 -    if (mi) {
   8.319 -        x86_mcinfo_dump(mi);
   8.320 -        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
   8.321 +
   8.322 +    mctc = mcheck_mca_logout(
   8.323 +        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
   8.324 +
   8.325 +    if (bs.errcnt && mctc != NULL) {
   8.326 +        if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
   8.327 +            mctelem_commit(mctc);
   8.328              send_guest_global_virq(dom0, VIRQ_MCA);
   8.329 -    }
   8.330 +        } else {
    8.331 +            x86_mcinfo_dump(mctelem_dataptr(mctc));
    8.332 +            mctelem_dismiss(mctc);
    8.333 +        }
   8.334 +    } else if (mctc != NULL)
   8.335 +        mctelem_dismiss(mctc);
   8.336 +
   8.337      irq_exit();
   8.338  }
   8.339  
   8.340 @@ -527,28 +376,28 @@ static void mce_cap_init(struct cpuinfo_
   8.341          printk (KERN_INFO "CPU%d: Intel Extended MCE MSRs (%d) available\n",
   8.342              smp_processor_id(), nr_intel_ext_msrs);
   8.343      }
   8.344 -    /* for most of p6 family, bank 0 is an alias bios MSR.
   8.345 -     * But after model>1a, bank 0 is available*/
   8.346 -    if ( c->x86 == 6 && c->x86_vendor == X86_VENDOR_INTEL
   8.347 -            && c->x86_model < 0x1A)
   8.348 -        firstbank = 1;
   8.349 -    else
   8.350 -        firstbank = 0;
   8.351 +    firstbank = mce_firstbank(c);
   8.352  }
   8.353  
   8.354  static void mce_init(void)
   8.355  {
   8.356      u32 l, h;
   8.357      int i;
   8.358 -    struct mc_info *mi;
   8.359 +    mctelem_cookie_t mctc;
   8.360 +    struct mca_summary bs;
   8.361 +
   8.362      clear_in_cr4(X86_CR4_MCE);
   8.363 +
   8.364      /* log the machine checks left over from the previous reset.
   8.365       * This also clears all registers*/
   8.366  
   8.367 -    mi = machine_check_poll(MC_FLAG_RESET);
   8.368 +    mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs);
   8.369 +
   8.370      /* in the boot up stage, don't inject to DOM0, but print out */
   8.371 -    if (mi)
   8.372 -        x86_mcinfo_dump(mi);
   8.373 +    if (bs.errcnt && mctc != NULL) {
   8.374 +        x86_mcinfo_dump(mctelem_dataptr(mctc));
   8.375 +        mctelem_dismiss(mctc);
   8.376 +    }
   8.377  
   8.378      set_in_cr4(X86_CR4_MCE);
   8.379      rdmsr (MSR_IA32_MCG_CAP, l, h);
   8.380 @@ -573,71 +422,19 @@ static void mce_init(void)
   8.381  }
   8.382  
   8.383  /* p4/p6 family have similar MCA initialization process */
   8.384 -void intel_mcheck_init(struct cpuinfo_x86 *c)
   8.385 +int intel_mcheck_init(struct cpuinfo_x86 *c)
   8.386  {
   8.387      mce_cap_init(c);
   8.388      printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
   8.389              smp_processor_id());
   8.390 +
   8.391      /* machine check is available */
   8.392 -    machine_check_vector = intel_machine_check;
   8.393 +    x86_mce_vector_register(intel_machine_check);
   8.394 +    x86_mce_callback_register(intel_get_extended_msrs);
   8.395 +
   8.396      mce_init();
   8.397      mce_intel_feature_init(c);
   8.398      mce_set_owner();
   8.399 -}
   8.400 -
   8.401 -/*
   8.402 - * Periodic polling timer for "silent" machine check errors. If the
   8.403 - * poller finds an MCE, poll faster. When the poller finds no more 
   8.404 - * errors, poll slower
   8.405 -*/
   8.406 -static struct timer mce_timer;
   8.407 -
   8.408 -#define MCE_PERIOD 4000
   8.409 -#define MCE_MIN    2000
   8.410 -#define MCE_MAX    32000
   8.411 -
   8.412 -static u64 period = MCE_PERIOD;
   8.413 -static int adjust = 0;
   8.414 -
   8.415 -static void mce_intel_checkregs(void *info)
   8.416 -{
   8.417 -    struct mc_info *mi;
   8.418  
   8.419 -    if( !mce_available(&current_cpu_data))
   8.420 -        return;
   8.421 -    mi = machine_check_poll(MC_FLAG_POLLED);
   8.422 -    if (mi)
   8.423 -    {
   8.424 -        x86_mcinfo_dump(mi);
   8.425 -        adjust++;
   8.426 -        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
   8.427 -            send_guest_global_virq(dom0, VIRQ_MCA);
   8.428 -    }
   8.429 +    return 1;
   8.430  }
   8.431 -
   8.432 -static void mce_intel_work_fn(void *data)
   8.433 -{
   8.434 -    on_each_cpu(mce_intel_checkregs, data, 1, 1);
   8.435 -    if (adjust) {
   8.436 -        period = period / (adjust + 1);
   8.437 -        printk(KERN_DEBUG "mcheck_poll: Find error, shorten interval "
   8.438 -               "to %"PRIu64"\n", period);
   8.439 -    }
   8.440 -    else {
   8.441 -        period *= 2;
   8.442 -    }
   8.443 -    if (period > MCE_MAX) 
   8.444 -        period = MCE_MAX;
   8.445 -    if (period < MCE_MIN)
   8.446 -        period = MCE_MIN;
   8.447 -    set_timer(&mce_timer, NOW() + MILLISECS(period));
   8.448 -    adjust = 0;
   8.449 -}
   8.450 -
   8.451 -void intel_mcheck_timer(struct cpuinfo_x86 *c)
   8.452 -{
   8.453 -    printk(KERN_DEBUG "mcheck_poll: Init_mcheck_timer\n");
   8.454 -    init_timer(&mce_timer, mce_intel_work_fn, NULL, 0);
   8.455 -    set_timer(&mce_timer, NOW() + MILLISECS(MCE_PERIOD));
   8.456 -}
   8.457 -
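
The CMCI handler above, the boot-time MCA_RESET scan and the poller in non-fatal.c all follow the same logout pattern; condensed here into a single hedged sketch (logout_and_deliver is a hypothetical name, and the sketch assumes dom0->vcpu[0] is valid at the call site, which the real call sites guarantee).

static void logout_and_deliver(enum mca_source who, cpu_banks_t banks)
{
	struct mca_summary bs;
	mctelem_cookie_t mctc = mcheck_mca_logout(who, banks, &bs);

	if (mctc == NULL)
		return;				/* no telemetry entry was available */

	if (bs.errcnt && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
		mctelem_commit(mctc);		/* hand the entry over to dom0 */
		send_guest_global_virq(dom0, VIRQ_MCA);
	} else {
		if (bs.errcnt)
			x86_mcinfo_dump(mctelem_dataptr(mctc));
		mctelem_dismiss(mctc);		/* nobody consumes it; recycle */
	}
}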
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/xen/arch/x86/cpu/mcheck/mctelem.c	Tue Mar 17 14:22:50 2009 +0000
     9.3 @@ -0,0 +1,443 @@
     9.4 +/*
     9.5 + * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     9.6 + * Use is subject to license terms.
     9.7 + *
     9.8 + * This program is free software; you can redistribute it and/or
     9.9 + * modify it under the terms of the GNU General Public License as
    9.10 + * published by the Free Software Foundation, version 2 of the
    9.11 + * License.
    9.12 + */
    9.13 +
    9.14 +/*
    9.15 + * mctelem.c - x86 Machine Check Telemetry Transport
    9.16 + */
    9.17 +
    9.18 +#include <xen/init.h>
    9.19 +#include <xen/types.h>
    9.20 +#include <xen/kernel.h>
    9.21 +#include <xen/config.h>
    9.22 +#include <xen/smp.h>
    9.23 +#include <xen/errno.h>
    9.24 +#include <xen/sched.h>
    9.25 +#include <xen/sched-if.h>
    9.26 +#include <xen/cpumask.h>
    9.27 +#include <xen/event.h>
    9.28 +
    9.29 +#include <asm/processor.h>
    9.30 +#include <asm/system.h>
    9.31 +#include <asm/msr.h>
    9.32 +
    9.33 +#include "mce.h"
    9.34 +
    9.35 +struct mctelem_ent {
    9.36 +	struct mctelem_ent *mcte_next;	/* next in chronological order */
    9.37 +	struct mctelem_ent *mcte_prev;	/* previous in chronological order */
    9.38 +	uint32_t mcte_flags;		/* See MCTE_F_* below */
    9.39 +	uint32_t mcte_refcnt;		/* Reference count */
    9.40 +	void *mcte_data;		/* corresponding data payload */
    9.41 +};
    9.42 +
    9.43 +#define	MCTE_F_HOME_URGENT		0x0001U	/* free to urgent freelist */
    9.44 +#define	MCTE_F_HOME_NONURGENT		0x0002U /* free to nonurgent freelist */
    9.45 +#define	MCTE_F_CLASS_URGENT		0x0004U /* in use - urgent errors */
    9.46 +#define	MCTE_F_CLASS_NONURGENT		0x0008U /* in use - nonurgent errors */
    9.47 +#define	MCTE_F_STATE_FREE		0x0010U	/* on a freelist */
    9.48 +#define	MCTE_F_STATE_UNCOMMITTED	0x0020U	/* reserved; on no list */
    9.49 +#define	MCTE_F_STATE_COMMITTED		0x0040U	/* on a committed list */
    9.50 +#define	MCTE_F_STATE_PROCESSING		0x0080U	/* on a processing list */
    9.51 +
    9.52 +#define	MCTE_F_MASK_HOME	(MCTE_F_HOME_URGENT | MCTE_F_HOME_NONURGENT)
    9.53 +#define	MCTE_F_MASK_CLASS	(MCTE_F_CLASS_URGENT | MCTE_F_CLASS_NONURGENT)
    9.54 +#define	MCTE_F_MASK_STATE	(MCTE_F_STATE_FREE | \
    9.55 +				MCTE_F_STATE_UNCOMMITTED | \
    9.56 +				MCTE_F_STATE_COMMITTED | \
    9.57 +				MCTE_F_STATE_PROCESSING)
    9.58 +
    9.59 +#define	MCTE_HOME(tep) ((tep)->mcte_flags & MCTE_F_MASK_HOME)
    9.60 +
    9.61 +#define	MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS)
    9.62 +#define	MCTE_SET_CLASS(tep, new) do { \
    9.63 +    (tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \
    9.64 +    (tep)->mcte_flags |= MCTE_F_CLASS_##new; } while (0)
    9.65 +
    9.66 +#define	MCTE_STATE(tep) ((tep)->mcte_flags & MCTE_F_MASK_STATE)
    9.67 +#define	MCTE_TRANSITION_STATE(tep, old, new) do { \
    9.68 +    BUG_ON(MCTE_STATE(tep) != (MCTE_F_STATE_##old)); \
    9.69 +    (tep)->mcte_flags &= ~MCTE_F_MASK_STATE; \
    9.70 +    (tep)->mcte_flags |= (MCTE_F_STATE_##new); } while (0)
    9.71 +
    9.72 +#define	MC_URGENT_NENT		10
    9.73 +#define	MC_NONURGENT_NENT	20
    9.74 +
    9.75 +#define	MC_NCLASSES		(MC_NONURGENT + 1)
    9.76 +
    9.77 +#define	COOKIE2MCTE(c)		((struct mctelem_ent *)(c))
    9.78 +#define	MCTE2COOKIE(tep)	((mctelem_cookie_t)(tep))
    9.79 +
    9.80 +static struct mc_telem_ctl {
    9.81 +	/* Linked lists that thread the array members together.
    9.82 +	 *
    9.83 +	 * The free lists are singly-linked via mcte_next, and we allocate
    9.84 +	 * from them by atomically unlinking an element from the head.
    9.85 +	 * Consumed entries are returned to the head of the free list.
    9.86 +	 * When an entry is reserved off the free list it is not linked
    9.87 +	 * on any list until it is committed or dismissed.
    9.88 +	 *
    9.89 +	 * The committed list grows at the head and we do not maintain a
    9.90 +	 * tail pointer; insertions are performed atomically.  The head
    9.91 +	 * thus has the most-recently committed telemetry, i.e. the
    9.92 +	 * list is in reverse chronological order.  The committed list
    9.93 +	 * is singly-linked via mcte_prev pointers, and mcte_next is NULL.
    9.94 +	 * When we move telemetry from the committed list to the processing
    9.95 +	 * list we atomically unlink the committed list and keep a pointer
    9.96 +	 * to the head of that list;  we then traverse the list following
    9.97 +	 * mcte_prev and fill in mcte_next to doubly-link the list, and then
    9.98 +	 * append the tail of the list onto the processing list.  If we panic
    9.99 +	 * during this manipulation of the committed list we still have
   9.100 +	 * the pointer to its head so we can recover all entries during
   9.101 +	 * the panic flow (albeit in reverse chronological order).
   9.102 +	 *
   9.103 +	 * The processing list is updated in a controlled context, and
   9.104 +	 * we can lock it for updates.  The head of the processing list
   9.105 +	 * always has the oldest telemetry, and we append (as above)
   9.106 +	 * at the tail of the processing list. */
   9.107 +	struct mctelem_ent *mctc_free[MC_NCLASSES];
   9.108 +	struct mctelem_ent *mctc_committed[MC_NCLASSES];
   9.109 +	struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
   9.110 +	struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
   9.111 +	/*
   9.112 +	 * Telemetry array
   9.113 +	 */
   9.114 +	struct mctelem_ent *mctc_elems;
   9.115 +} mctctl;
   9.116 +
   9.117 +/* Lock protecting all processing lists */
   9.118 +static DEFINE_SPINLOCK(processing_lock);
   9.119 +
   9.120 +static void *cmpxchgptr(void *ptr, void *old, void *new)
   9.121 +{
   9.122 +	unsigned long *ulp = (unsigned long *)ptr;
   9.123 +	unsigned long a = (unsigned long)old;
   9.124 +	unsigned long b = (unsigned long)new;
   9.125 +
   9.126 +	return (void *)cmpxchg(ulp, a, b);
   9.127 +}
   9.128 +
   9.129 +/* Free an entry to its native free list; the entry must not be linked on
   9.130 + * any list.
   9.131 + */
   9.132 +static void mctelem_free(struct mctelem_ent *tep)
   9.133 +{
   9.134 +	mctelem_class_t target = MCTE_HOME(tep) == MCTE_F_HOME_URGENT ?
   9.135 +	    MC_URGENT : MC_NONURGENT;
   9.136 +	struct mctelem_ent **freelp;
   9.137 +	struct mctelem_ent *oldhead;
   9.138 +
   9.139 +	BUG_ON(tep->mcte_refcnt != 0);
   9.140 +	BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);
   9.141 +
   9.142 +	tep->mcte_prev = NULL;
   9.143 +	freelp = &mctctl.mctc_free[target];
   9.144 +	for (;;) {
   9.145 +		oldhead = *freelp;
   9.146 +		tep->mcte_next = oldhead;
   9.147 +		wmb();
   9.148 +		if (cmpxchgptr(freelp, oldhead, tep) == oldhead)
   9.149 +			break;
   9.150 +	}
   9.151 +}
   9.152 +
   9.153 +/* Increment the reference count of an entry that is not linked on to
   9.154 + * any list and which only the caller has a pointer to.
   9.155 + */
   9.156 +static void mctelem_hold(struct mctelem_ent *tep)
   9.157 +{
   9.158 +	tep->mcte_refcnt++;
   9.159 +}
   9.160 +
   9.161 +/* Increment the reference count on an entry that is linked at the head of
   9.162 + * a processing list.  The caller is responsible for locking the list.
   9.163 + */
   9.164 +static void mctelem_processing_hold(struct mctelem_ent *tep)
   9.165 +{
   9.166 +	int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
   9.167 +	    MC_URGENT : MC_NONURGENT;
   9.168 +
   9.169 +	BUG_ON(tep != mctctl.mctc_processing_head[which]);
   9.170 +	tep->mcte_refcnt++;
   9.171 +}
   9.172 +
   9.173 +/* Decrement the reference count on an entry that is linked at the head of
   9.174 + * a processing list.  The caller is responsible for locking the list.
   9.175 + */
   9.176 +static void mctelem_processing_release(struct mctelem_ent *tep)
   9.177 +{
   9.178 +	int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
   9.179 +	    MC_URGENT : MC_NONURGENT;
   9.180 +
   9.181 +	BUG_ON(tep != mctctl.mctc_processing_head[which]);
   9.182 +	if (--tep->mcte_refcnt == 0) {
   9.183 +		MCTE_TRANSITION_STATE(tep, PROCESSING, FREE);
   9.184 +		mctctl.mctc_processing_head[which] = tep->mcte_next;
   9.185 +		mctelem_free(tep);
   9.186 +	}
   9.187 +}
   9.188 +
   9.189 +void mctelem_init(int reqdatasz)
   9.190 +{
   9.191 +	static int called = 0;
   9.192 +	static int datasz = 0, realdatasz = 0;
   9.193 +	char *datarr;
   9.194 +	int i;
    9.195 +
   9.196 +	BUG_ON(MC_URGENT != 0 || MC_NONURGENT != 1 || MC_NCLASSES != 2);
   9.197 +
   9.198 +	/* Called from mcheck_init for all processors; initialize for the
   9.199 +	 * first call only (no race here since the boot cpu completes
   9.200 +	 * init before others start up). */
   9.201 +	if (++called == 1) {
   9.202 +		realdatasz = reqdatasz;
   9.203 +		datasz = (reqdatasz & ~0xf) + 0x10;	/* 16 byte roundup */
   9.204 +	} else {
   9.205 +		BUG_ON(reqdatasz != realdatasz);
   9.206 +		return;
   9.207 +	}
   9.208 +
   9.209 +	if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent,
   9.210 +	    MC_URGENT_NENT + MC_NONURGENT_NENT)) == NULL ||
   9.211 +	    (datarr = xmalloc_bytes((MC_URGENT_NENT + MC_NONURGENT_NENT) *
   9.212 +	    datasz)) == NULL) {
   9.213 +		if (mctctl.mctc_elems)
   9.214 +			xfree(mctctl.mctc_elems);
   9.215 +		printk("Allocations for MCA telemetry failed\n");
   9.216 +		return;
   9.217 +	}
   9.218 +
   9.219 +	for (i = 0; i < MC_URGENT_NENT + MC_NONURGENT_NENT; i++) {
   9.220 +		struct mctelem_ent *tep, **tepp;
   9.221 +
   9.222 +		tep = mctctl.mctc_elems + i;
   9.223 +		tep->mcte_flags = MCTE_F_STATE_FREE;
   9.224 +		tep->mcte_refcnt = 0;
   9.225 +		tep->mcte_data = datarr + i * datasz;
   9.226 +
   9.227 +		if (i < MC_URGENT_NENT) {
   9.228 +			tepp = &mctctl.mctc_free[MC_URGENT];
   9.229 +			tep->mcte_flags |= MCTE_F_HOME_URGENT;
   9.230 +		} else {
   9.231 +			tepp = &mctctl.mctc_free[MC_NONURGENT];
   9.232 +			tep->mcte_flags |= MCTE_F_HOME_NONURGENT;
   9.233 +		}
   9.234 +
   9.235 +		tep->mcte_next = *tepp;
   9.236 +		tep->mcte_prev = NULL;
   9.237 +		*tepp = tep;
   9.238 +	}
   9.239 +}
   9.240 +
   9.241 +/* incremented non-atomically when reserve fails */
   9.242 +static int mctelem_drop_count;
   9.243 +
   9.244 +/* Reserve a telemetry entry, or return NULL if none available.
   9.245 + * If we return an entry then the caller must subsequently call exactly one of
    9.246 + * mctelem_dismiss or mctelem_commit for that entry.
   9.247 + */
   9.248 +mctelem_cookie_t mctelem_reserve(mctelem_class_t which)
   9.249 +{
   9.250 +	struct mctelem_ent **freelp;
   9.251 +	struct mctelem_ent *oldhead, *newhead;
   9.252 +	mctelem_class_t target = (which == MC_URGENT) ?
   9.253 +	    MC_URGENT : MC_NONURGENT;
   9.254 +
   9.255 +	freelp = &mctctl.mctc_free[target];
   9.256 +	for (;;) {
   9.257 +		if ((oldhead = *freelp) == NULL) {
   9.258 +			if (which == MC_URGENT && target == MC_URGENT) {
   9.259 +				/* raid the non-urgent freelist */
   9.260 +				target = MC_NONURGENT;
   9.261 +				freelp = &mctctl.mctc_free[target];
   9.262 +				continue;
   9.263 +			} else {
   9.264 +				mctelem_drop_count++;
   9.265 +				return (NULL);
   9.266 +			}
   9.267 +		}
   9.268 +
   9.269 +		newhead = oldhead->mcte_next;
   9.270 +		if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) {
   9.271 +			struct mctelem_ent *tep = oldhead;
   9.272 +
   9.273 +			mctelem_hold(tep);
   9.274 +			MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);
   9.275 +			tep->mcte_next = NULL;
   9.276 +			tep->mcte_prev = NULL;
   9.277 +			if (which == MC_URGENT)
   9.278 +				MCTE_SET_CLASS(tep, URGENT);
   9.279 +			else
   9.280 +				MCTE_SET_CLASS(tep, NONURGENT);
   9.281 +			return MCTE2COOKIE(tep);
   9.282 +		}
   9.283 +	}
   9.284 +}
   9.285 +
   9.286 +void *mctelem_dataptr(mctelem_cookie_t cookie)
   9.287 +{
   9.288 +	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
   9.289 +
   9.290 +	return tep->mcte_data;
   9.291 +}
   9.292 +
   9.293 +/* Release a previously reserved entry back to the freelist without
   9.294 + * submitting it for logging.  The entry must not be linked on to any
   9.295 + * list - that's how mctelem_reserve handed it out.
   9.296 + */
   9.297 +void mctelem_dismiss(mctelem_cookie_t cookie)
   9.298 +{
   9.299 +	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
   9.300 +
   9.301 +	tep->mcte_refcnt--;
   9.302 +	MCTE_TRANSITION_STATE(tep, UNCOMMITTED, FREE);
   9.303 +	mctelem_free(tep);
   9.304 +}
   9.305 +
   9.306 +/* Commit an entry with completed telemetry for logging.  The caller must
   9.307 + * not reference the entry after this call.  Note that we add entries
   9.308 + * at the head of the committed list, so that list therefore has entries
   9.309 + * in reverse chronological order.
   9.310 + */
   9.311 +void mctelem_commit(mctelem_cookie_t cookie)
   9.312 +{
   9.313 +	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
   9.314 +	struct mctelem_ent **commlp;
   9.315 +	struct mctelem_ent *oldhead;
   9.316 +	mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
   9.317 +	    MC_URGENT : MC_NONURGENT;
   9.318 +
   9.319 +	BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL);
   9.320 +	MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED);
   9.321 +
   9.322 +	commlp = &mctctl.mctc_committed[target];
   9.323 +	for (;;) {
   9.324 +		oldhead = *commlp;
   9.325 +		tep->mcte_prev = oldhead;
   9.326 +		wmb();
   9.327 +		if (cmpxchgptr(commlp, oldhead, tep) == oldhead)
   9.328 +			break;
   9.329 +	}
   9.330 +}
   9.331 +
   9.332 +/* Move telemetry from committed list to processing list, reversing the
   9.333 + * list into chronological order.  The processing list has been
   9.334 + * locked by the caller, and may be non-empty.  We append the
   9.335 + * reversed committed list on to the tail of the processing list.
   9.336 + * The committed list may grow even while we run, so use atomic
    9.337 + * operations to swap NULL into the committed list head.
   9.338 + *
   9.339 + * Note that "chronological order" means the order in which producers
   9.340 + * won additions to the processing list, which may not reflect the
   9.341 + * strict chronological order of the associated events if events are
   9.342 + * closely spaced in time and contend for the processing list at once.
   9.343 + */
   9.344 +
   9.345 +static struct mctelem_ent *dangling[MC_NCLASSES];
   9.346 +
   9.347 +static void mctelem_append_processing(mctelem_class_t which)
   9.348 +{
   9.349 +	mctelem_class_t target = which == MC_URGENT ?
   9.350 +	    MC_URGENT : MC_NONURGENT;
   9.351 +	struct mctelem_ent **commlp = &mctctl.mctc_committed[target];
   9.352 +	struct mctelem_ent **proclhp = &mctctl.mctc_processing_head[target];
   9.353 +	struct mctelem_ent **procltp = &mctctl.mctc_processing_tail[target];
   9.354 +	struct mctelem_ent *tep, *ltep;
   9.355 +
   9.356 +	/* Check for an empty list; no race since we hold the processing lock */
   9.357 +	if (*commlp == NULL)
   9.358 +		return;
   9.359 +
   9.360 +	/* Atomically unlink the committed list, and keep a pointer to
   9.361 +	 * the list we unlink in a well-known location so it can be
   9.362 +	 * picked up in panic code should we panic between this unlink
   9.363 +	 * and the append to the processing list. */
   9.364 +	for (;;) {
   9.365 +		dangling[target] = *commlp;
   9.366 +		wmb();
   9.367 +		if (cmpxchgptr(commlp, dangling[target], NULL) ==
   9.368 +		    dangling[target])
   9.369 +			break;
   9.370 +	}
   9.371 +
   9.372 +	if (dangling[target] == NULL)
   9.373 +		return;
   9.374 +
   9.375 +	/* Traverse the list following the previous pointers (reverse
   9.376 +	 * chronological order).  For each entry fill in the next pointer
   9.377 +	 * and transition the element state.  */
   9.378 +	for (tep = dangling[target], ltep = NULL; tep != NULL;
   9.379 +	    tep = tep->mcte_prev) {
   9.380 +		MCTE_TRANSITION_STATE(tep, COMMITTED, PROCESSING);
   9.381 +		tep->mcte_next = ltep;
   9.382 +		ltep = tep;
   9.383 +	}
   9.384 +
   9.385 +	/* ltep points to the head of a chronologically ordered linked
   9.386 +	 * list of telemetry entries ending at the most recent entry
   9.387 +	 * dangling[target] if mcte_next is followed; tack this on to
   9.388 +	 * the processing list.
   9.389 +	 */
   9.390 +	if (*proclhp == NULL) {
   9.391 +		*proclhp = ltep;
   9.392 +		*procltp = dangling[target];
   9.393 +	} else {
   9.394 +		(*procltp)->mcte_next = ltep;
   9.395 +		ltep->mcte_prev = *procltp;
   9.396 +		*procltp = dangling[target];
   9.397 +	}
   9.398 +	wmb();
   9.399 +	dangling[target] = NULL;
   9.400 +	wmb();
   9.401 +}
   9.402 +
   9.403 +mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
   9.404 +{
   9.405 +	mctelem_class_t target = (which == MC_URGENT) ?
   9.406 +	    MC_URGENT : MC_NONURGENT;
   9.407 +	struct mctelem_ent *tep;
   9.408 +
   9.409 +	spin_lock(&processing_lock);
   9.410 +	mctelem_append_processing(target);
   9.411 +	if ((tep = mctctl.mctc_processing_head[target]) == NULL) {
   9.412 +		spin_unlock(&processing_lock);
   9.413 +		return NULL;
   9.414 +	}
   9.415 +
   9.416 +	mctelem_processing_hold(tep);
   9.417 +	wmb();
   9.418 +	spin_unlock(&processing_lock);
   9.419 +	return MCTE2COOKIE(tep);
   9.420 +}
   9.421 +
   9.422 +void mctelem_consume_oldest_end(mctelem_cookie_t cookie)
   9.423 +{
   9.424 +	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
   9.425 +
   9.426 +	spin_lock(&processing_lock);
   9.427 +	mctelem_processing_release(tep);
   9.428 +	wmb();
   9.429 +	spin_unlock(&processing_lock);
   9.430 +}
   9.431 +
   9.432 +void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie)
   9.433 +{
   9.434 +	mctelem_class_t target = (which == MC_URGENT) ?
   9.435 +	    MC_URGENT : MC_NONURGENT;
   9.436 +	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
   9.437 +
   9.438 +	if (tep == NULL)
   9.439 +		return;
   9.440 +
   9.441 +	spin_lock(&processing_lock);
   9.442 +	if (tep == mctctl.mctc_processing_head[target])
   9.443 +		mctelem_processing_release(tep);
   9.444 +	wmb();
   9.445 +	spin_unlock(&processing_lock);
   9.446 +}
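
Producer side of the protocol implemented above, as a hedged sketch: this is essentially what the common logout code in mce.c does with a cookie. example_log_error is a hypothetical name and the bank-reading step is elided.

static void example_log_error(int urgent)
{
	mctelem_cookie_t mctc;
	struct mc_info *mci;

	mctc = mctelem_reserve(urgent ? MC_URGENT : MC_NONURGENT);
	if (mctc == NULL)
		return;			/* every entry in use; telemetry is dropped */

	mci = mctelem_dataptr(mctc);
	memset(mci, 0, sizeof(*mci));
	/* ... read the MCA banks and x86_mcinfo_add() records into *mci ... */

	if (x86_mcinfo_nentries(mci) != 0)
		mctelem_commit(mctc);	/* append to the committed list */
	else
		mctelem_dismiss(mctc);	/* nothing found; back to the freelist */
}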
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/xen/arch/x86/cpu/mcheck/mctelem.h	Tue Mar 17 14:22:50 2009 +0000
    10.3 @@ -0,0 +1,71 @@
    10.4 +/*
    10.5 + * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
    10.6 + * Use is subject to license terms.
    10.7 + *
    10.8 + * This program is free software; you can redistribute it and/or
    10.9 + * modify it under the terms of the GNU General Public License as
   10.10 + * published by the Free Software Foundation, version 2 of the
   10.11 + * License.
   10.12 + */
   10.13 +
   10.14 +#ifndef _MCTELEM_H
   10.15 +
   10.16 +#define	_MCTELEM_H
   10.17 +
   10.18 +#include <xen/init.h>
   10.19 +#include <xen/smp.h>
   10.20 +#include <asm/traps.h>
   10.21 +
   10.22 +/* Helper functions used for collecting error telemetry.
   10.23 + *
   10.24 + * mctelem_init preallocates a number of data areas for use during
   10.25 + * machine check data "logout".  Two classes are distinguished -
   10.26 + * urgent uses, intended for use from machine check exception handlers,
   10.27 + * and non-urgent uses intended for use from error pollers.
   10.28 + * Associated with each logout entry of whatever class is a data area
    10.29 + * sized per the single argument to mctelem_init.  mctelem_init should be
   10.30 + * called from MCA init code before anybody has the chance to change the
    10.31 + * machine check vector or to use mcheck_mca_logout.
   10.32 + *
   10.33 + * To reserve an entry of a given class for use in logout, call
   10.34 + * mctelem_reserve (or use the common handler functions which do all this
   10.35 + * for you).  This returns an opaque cookie, or NULL if no elements are
   10.36 + * available.  Elements are reserved with an atomic operation so no deadlock
   10.37 + * will occur if, for example, a machine check exception interrupts a
   10.38 + * scheduled error poll.  The implementation will raid free non-urgent
   10.39 + * entries if all urgent entries are in use when an urgent request is received.
   10.40 + * Once an entry is reserved the caller must eventually perform exactly
   10.41 + * one of two actions: mctelem_commit or mctelem_dismiss.
   10.42 + *
   10.43 + * On mctelem_commit the entry is appended to a processing list; mctelem_dismiss
   10.44 + * frees the element without processing.  After either call the cookie
   10.45 + * must not be referenced again.
   10.46 + *
   10.47 + * To consume committed telemetry call mctelem_consume_oldest_begin
   10.48 + * which will return a cookie referencing the oldest (first committed)
   10.49 + * entry of the requested class.  Access the associated data using
   10.50 + * mctelem_dataptr and when finished use mctelem_consume_oldest_end - in the
    10.51 + * begin .. end bracket you are guaranteed that the entry cannot be freed
    10.52 + * even if it is ack'd elsewhere.  Once the ultimate consumer of the
   10.53 + * telemetry has processed it to stable storage it should acknowledge
   10.54 + * the telemetry quoting the cookie id, at which point we will free
   10.55 + * the element from the processing list.
   10.56 + */
   10.57 +
   10.58 +typedef struct mctelem_cookie *mctelem_cookie_t;
   10.59 +
   10.60 +typedef enum mctelem_class {
   10.61 +	MC_URGENT,
   10.62 +	MC_NONURGENT
   10.63 +} mctelem_class_t;
   10.64 +
   10.65 +extern void mctelem_init(int);
   10.66 +extern mctelem_cookie_t mctelem_reserve(mctelem_class_t);
   10.67 +extern void *mctelem_dataptr(mctelem_cookie_t);
   10.68 +extern void mctelem_commit(mctelem_cookie_t);
   10.69 +extern void mctelem_dismiss(mctelem_cookie_t);
   10.70 +extern mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t);
   10.71 +extern void mctelem_consume_oldest_end(mctelem_cookie_t);
   10.72 +extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t);
   10.73 +
   10.74 +#endif
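
The intended lifecycle of a telemetry cookie, as a minimal sketch against the declarations above (the include, the "keep" decision and the bank-register fill are illustrative placeholders, not code from this changeset):

#include "mctelem.h"
#include "mce.h"	/* assumed to declare x86_mcinfo_dump() as used elsewhere in this patch */

/* Producer side (exception handler or error poller) -- sketch only. */
static void example_logout(int keep)
{
	mctelem_cookie_t mctc = mctelem_reserve(MC_URGENT);

	if (mctc == NULL)
		return;			/* no free telemetry entries */

	/* ... fill the struct mc_info at mctelem_dataptr(mctc) from the MCA banks ... */

	if (keep)
		mctelem_commit(mctc);	/* append to the processing list */
	else
		mctelem_dismiss(mctc);	/* discard without processing */
	/* Either way the cookie must not be referenced again. */
}

/* Consumer side (e.g. the fetch hypercall path) -- sketch only. */
static void example_consume(void)
{
	mctelem_cookie_t mctc = mctelem_consume_oldest_begin(MC_URGENT);

	if (mctc == NULL)
		return;			/* nothing committed yet */

	/* The entry cannot be freed while inside the begin..end bracket. */
	x86_mcinfo_dump(mctelem_dataptr(mctc));	/* or copy it out to the consumer */
	mctelem_consume_oldest_end(mctc);

	/* Once the data has reached stable storage, free the entry. */
	mctelem_ack(MC_URGENT, mctc);
}

Reserve/commit/dismiss on the producer side pairs with begin/end/ack on the consumer side, so a terminal-error entry stays pinned until dom0 has actually persisted it.
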
    11.1 --- a/xen/arch/x86/cpu/mcheck/non-fatal.c	Tue Mar 17 14:21:18 2009 +0000
    11.2 +++ b/xen/arch/x86/cpu/mcheck/non-fatal.c	Tue Mar 17 14:22:50 2009 +0000
    11.3 @@ -14,46 +14,76 @@
    11.4  #include <xen/smp.h>
    11.5  #include <xen/timer.h>
    11.6  #include <xen/errno.h>
    11.7 +#include <xen/event.h>
    11.8 +#include <xen/sched.h>
    11.9  #include <asm/processor.h> 
   11.10  #include <asm/system.h>
   11.11  #include <asm/msr.h>
   11.12  
   11.13  #include "mce.h"
   11.14 -#include "x86_mca.h"
   11.15 -int firstbank = 0;
   11.16 +
   11.17 +static cpu_banks_t bankmask;
   11.18  static struct timer mce_timer;
   11.19  
   11.20 -#define MCE_PERIOD MILLISECS(15000)
   11.21 +#define MCE_PERIOD MILLISECS(8000)
   11.22 +#define MCE_PERIOD_MIN MILLISECS(2000)
   11.23 +#define MCE_PERIOD_MAX MILLISECS(16000)
   11.24 +
   11.25 +static uint64_t period = MCE_PERIOD;
   11.26 +static int adjust = 0;
   11.27 +static int variable_period = 1;
   11.28  
   11.29  static void mce_checkregs (void *info)
   11.30  {
   11.31 -	u32 low, high;
   11.32 -	int i;
   11.33 -
   11.34 -	for (i=firstbank; i<nr_mce_banks; i++) {
   11.35 -		rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
   11.36 +	mctelem_cookie_t mctc;
   11.37 +	struct mca_summary bs;
   11.38 +	static uint64_t dumpcount = 0;
   11.39  
   11.40 -		if (high & (1<<31)) {
   11.41 -			printk(KERN_INFO "MCE: The hardware reports a non "
   11.42 -				"fatal, correctable incident occurred on "
   11.43 -				"CPU %d.\n",
   11.44 -				smp_processor_id());
   11.45 -			printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
   11.46 +	mctc = mcheck_mca_logout(MCA_POLLER, bankmask, &bs);
   11.47  
   11.48 -			/* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
   11.49 -			wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
   11.50 +	if (bs.errcnt && mctc != NULL) {
   11.51 +		adjust++;
   11.52  
   11.53 -			/* Serialize */
   11.54 -			wmb();
   11.55 -			add_taint(TAINT_MACHINE_CHECK);
   11.56 +		/* If Dom0 enabled the VIRQ_MCA event, then notify it.
   11.57 +		 * Otherwise, if dom0 has had plenty of time to register
    11.58 +		 * the virq handler but still hasn't, then dump telemetry
   11.59 +		 * to the Xen console.  The call count may be incremented
   11.60 +		 * on multiple cpus at once and is indicative only - just
   11.61 +		 * a simple-minded attempt to avoid spamming the console
   11.62 +		 * for corrected errors in early startup.
   11.63 +		 */
   11.64 +
   11.65 +		if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
   11.66 +			mctelem_commit(mctc);
   11.67 +			send_guest_global_virq(dom0, VIRQ_MCA);
   11.68 +		} else if (++dumpcount >= 10) {
   11.69 +			x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));
   11.70 +			mctelem_dismiss(mctc);
   11.71 +		} else {
   11.72 +			mctelem_dismiss(mctc);
   11.73  		}
   11.74 +	} else if (mctc != NULL) {
   11.75 +		mctelem_dismiss(mctc);
   11.76  	}
   11.77  }
   11.78  
   11.79  static void mce_work_fn(void *data)
   11.80  { 
   11.81  	on_each_cpu(mce_checkregs, NULL, 1, 1);
   11.82 -	set_timer(&mce_timer, NOW() + MCE_PERIOD);
   11.83 +
   11.84 +	if (variable_period) {
   11.85 +		if (adjust)
   11.86 +			period /= (adjust + 1);
   11.87 +		else
   11.88 +			period *= 2;
   11.89 +		if (period > MCE_PERIOD_MAX)
   11.90 +			period = MCE_PERIOD_MAX;
   11.91 +		if (period < MCE_PERIOD_MIN)
   11.92 +			period = MCE_PERIOD_MIN;
   11.93 +	}
   11.94 +
   11.95 +	set_timer(&mce_timer, NOW() + period);
   11.96 +	adjust = 0;
   11.97  }
   11.98  
   11.99  static int __init init_nonfatal_mce_checker(void)
  11.100 @@ -63,13 +93,17 @@ static int __init init_nonfatal_mce_chec
  11.101  	/* Check for MCE support */
  11.102  	if (!mce_available(c))
  11.103  		return -ENODEV;
  11.104 +
  11.105 +	memcpy(&bankmask, &mca_allbanks, sizeof (cpu_banks_t));
  11.106 +	if (mce_firstbank(c) == 1)
  11.107 +		clear_bit(0, bankmask);
  11.108 +
  11.109  	/*
  11.110  	 * Check for non-fatal errors every MCE_RATE s
  11.111  	 */
  11.112  	switch (c->x86_vendor) {
  11.113  	case X86_VENDOR_AMD:
  11.114  		if (c->x86 == 6) { /* K7 */
  11.115 -			firstbank = 1;
  11.116  			init_timer(&mce_timer, mce_work_fn, NULL, 0);
  11.117  			set_timer(&mce_timer, NOW() + MCE_PERIOD);
  11.118  			break;
  11.119 @@ -80,15 +114,14 @@ static int __init init_nonfatal_mce_chec
  11.120  		break;
  11.121  
  11.122  	case X86_VENDOR_INTEL:
  11.123 -		/* p5 family is different. P4/P6 and latest CPUs shares the
  11.124 -		 * same polling methods
  11.125 -		*/
  11.126 +		/*
  11.127 +		 * The P5 family is different. P4/P6 and latest CPUs share the
  11.128 +		 * same polling methods.
  11.129 +		 */
  11.130  		if ( c->x86 != 5 )
  11.131  		{
  11.132 -			/* some CPUs or banks don't support cmci, we need to 
  11.133 -			 * enable this feature anyway
  11.134 -			 */
  11.135 -			intel_mcheck_timer(c);
  11.136 +			init_timer(&mce_timer, mce_work_fn, NULL, 0);
  11.137 +			set_timer(&mce_timer, NOW() + MCE_PERIOD);
  11.138  		}
  11.139  		break;
  11.140  	}
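
The poller now adapts its interval: a pass that logged telemetry shrinks the period, a quiet pass doubles it, and the result is clamped to [MCE_PERIOD_MIN, MCE_PERIOD_MAX]. A standalone sketch of the same arithmetic (plain milliseconds instead of Xen's MILLISECS() ticks; the burst sequence is made up purely to show the convergence behaviour):

#include <stdio.h>
#include <stdint.h>

#define MCE_PERIOD      8000ULL		/* ms */
#define MCE_PERIOD_MIN  2000ULL
#define MCE_PERIOD_MAX 16000ULL

/* Same adjustment rule as mce_work_fn(): shrink when errors were seen,
 * back off when quiet, clamped to the [MIN, MAX] window. */
static uint64_t next_period(uint64_t period, int errors_seen)
{
	if (errors_seen)
		period /= (errors_seen + 1);
	else
		period *= 2;
	if (period > MCE_PERIOD_MAX)
		period = MCE_PERIOD_MAX;
	if (period < MCE_PERIOD_MIN)
		period = MCE_PERIOD_MIN;
	return period;
}

int main(void)
{
	uint64_t p = MCE_PERIOD;
	int bursts[] = { 0, 0, 3, 1, 0, 0, 0 };	/* hypothetical poll results */
	unsigned int i;

	for (i = 0; i < sizeof(bursts) / sizeof(bursts[0]); i++) {
		p = next_period(p, bursts[i]);
		printf("poll %u: %d errors -> next period %llums\n",
		       i, bursts[i], (unsigned long long)p);
	}
	return 0;
}

Bursty corrected-error load thus pulls the poll rate up toward once per 2s, while an idle machine backs off to 16s.
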
    12.1 --- a/xen/arch/x86/cpu/mcheck/p5.c	Tue Mar 17 14:21:18 2009 +0000
    12.2 +++ b/xen/arch/x86/cpu/mcheck/p5.c	Tue Mar 17 14:22:50 2009 +0000
    12.3 @@ -16,7 +16,7 @@
    12.4  #include "x86_mca.h"
    12.5  
    12.6  /* Machine check handler for Pentium class Intel */
    12.7 -static fastcall void pentium_machine_check(struct cpu_user_regs * regs, long error_code)
    12.8 +static void pentium_machine_check(struct cpu_user_regs * regs, long error_code)
    12.9  {
   12.10  	u32 loaddr, hi, lotype;
   12.11  	rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
   12.12 @@ -28,19 +28,14 @@ static fastcall void pentium_machine_che
   12.13  }
   12.14  
   12.15  /* Set up machine check reporting for processors with Intel style MCE */
   12.16 -void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
   12.17 +int intel_p5_mcheck_init(struct cpuinfo_x86 *c)
   12.18  {
   12.19  	u32 l, h;
   12.20  	
   12.21 -	/*Check for MCE support */
   12.22 -	if( !cpu_has(c, X86_FEATURE_MCE) )
   12.23 -		return;	
   12.24 -
   12.25  	/* Default P5 to off as its often misconnected */
   12.26  	if(mce_disabled != -1)
   12.27 -		return;
   12.28 -	machine_check_vector = pentium_machine_check;
   12.29 -	wmb();
   12.30 +		return 0;
   12.31 +	x86_mce_vector_register(pentium_machine_check);
   12.32  
   12.33  	/* Read registers before enabling */
   12.34  	rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
   12.35 @@ -50,4 +45,6 @@ void intel_p5_mcheck_init(struct cpuinfo
   12.36   	/* Enable MCE */
   12.37  	set_in_cr4(X86_CR4_MCE);
   12.38  	printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
   12.39 +
   12.40 +	return 1;
   12.41  }
    13.1 --- a/xen/arch/x86/cpu/mcheck/winchip.c	Tue Mar 17 14:21:18 2009 +0000
    13.2 +++ b/xen/arch/x86/cpu/mcheck/winchip.c	Tue Mar 17 14:22:50 2009 +0000
    13.3 @@ -16,22 +16,24 @@
    13.4  #include "mce.h"
    13.5  
    13.6  /* Machine check handler for WinChip C6 */
    13.7 -static fastcall void winchip_machine_check(struct cpu_user_regs * regs, long error_code)
    13.8 +static void winchip_machine_check(struct cpu_user_regs * regs, long error_code)
    13.9  {
   13.10  	printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
   13.11  	add_taint(TAINT_MACHINE_CHECK);
   13.12  }
   13.13  
   13.14  /* Set up machine check reporting on the Winchip C6 series */
   13.15 -void winchip_mcheck_init(struct cpuinfo_x86 *c)
   13.16 +int winchip_mcheck_init(struct cpuinfo_x86 *c)
   13.17  {
   13.18  	u32 lo, hi;
   13.19 -	machine_check_vector = winchip_machine_check;
   13.20 +
   13.21  	wmb();
   13.22 +	x86_mce_vector_register(winchip_machine_check);
   13.23  	rdmsr(MSR_IDT_FCR1, lo, hi);
   13.24  	lo|= (1<<2);	/* Enable EIERRINT (int 18 MCE) */
   13.25  	lo&= ~(1<<4);	/* Enable MCE */
   13.26  	wrmsr(MSR_IDT_FCR1, lo, hi);
   13.27  	set_in_cr4(X86_CR4_MCE);
   13.28  	printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
   13.29 +	return (1);
   13.30  }
    14.1 --- a/xen/arch/x86/cpu/mcheck/x86_mca.h	Tue Mar 17 14:21:18 2009 +0000
    14.2 +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Tue Mar 17 14:22:50 2009 +0000
    14.3 @@ -17,6 +17,10 @@
    14.4   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    14.5   */
    14.6  
    14.7 +#ifndef X86_MCA_H
    14.8 +
    14.9 +#define X86_MCA_H
   14.10 +
   14.11  
   14.12  /* The MCA/MCE MSRs should not be used anywhere else.
   14.13   * They are cpu family/model specific and are only for use
   14.14 @@ -73,6 +77,9 @@
   14.15  /* reserved bits */
   14.16  #define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
   14.17  
   14.18 +/* Bitfield of MSR_K8_HWCR register */
   14.19 +#define K8_HWCR_MCi_STATUS_WREN		(1ULL << 18)
   14.20 +
   14.21  /*Intel Specific bitfield*/
   14.22  #define CMCI_THRESHOLD			0x2
   14.23  
   14.24 @@ -87,3 +94,4 @@ extern int mce_disabled;
   14.25  extern unsigned int nr_mce_banks;
   14.26  extern int firstbank;
   14.27  
   14.28 +#endif /* X86_MCA_H */
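
The new K8_HWCR_MCi_STATUS_WREN definition names the McStatusWrEn bit (bit 18) of the AMD HWCR MSR; non-zero writes to the MCi_STATUS registers are only accepted while it is set. A hedged sketch of the usual idiom (the helper is hypothetical, and MSR_K8_HWCR is assumed to come from the MSR headers; neither is added by this patch):

/* Hypothetical helper: write a non-zero value to an MCi_STATUS MSR,
 * temporarily enabling HWCR.McStatusWrEn around the write. */
static void mci_status_write(unsigned int status_msr, uint64_t val)
{
	uint64_t hwcr;

	rdmsrl(MSR_K8_HWCR, hwcr);
	wrmsrl(MSR_K8_HWCR, hwcr | K8_HWCR_MCi_STATUS_WREN);
	wrmsrl(status_msr, val);
	wrmsrl(MSR_K8_HWCR, hwcr);	/* restore the previous setting */
}
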
    15.1 --- a/xen/include/asm-x86/traps.h	Tue Mar 17 14:21:18 2009 +0000
    15.2 +++ b/xen/include/asm-x86/traps.h	Tue Mar 17 14:22:50 2009 +0000
    15.3 @@ -28,7 +28,7 @@ struct softirq_trap {
    15.4  
    15.5  struct cpu_user_regs;
    15.6  
    15.7 -extern void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code);
    15.8 +extern void machine_check_vector(struct cpu_user_regs *regs, long error_code);
    15.9   
   15.10  /**
   15.11   * guest_has_trap_callback
    16.1 --- a/xen/include/public/arch-x86/xen-mca.h	Tue Mar 17 14:21:18 2009 +0000
    16.2 +++ b/xen/include/public/arch-x86/xen-mca.h	Tue Mar 17 14:22:50 2009 +0000
    16.3 @@ -56,13 +56,20 @@
    16.4  /* Hypercall */
    16.5  #define __HYPERVISOR_mca __HYPERVISOR_arch_0
    16.6  
    16.7 -#define XEN_MCA_INTERFACE_VERSION 0x03000002
    16.8 +/*
     16.9 + * The xen-unstable repo has interface version 0x03000001; our interface
   16.10 + * is incompatible with that and any future minor revisions, so we
   16.11 + * choose a different version number range that is numerically less
   16.12 + * than that used in xen-unstable.
   16.13 + */
   16.14 +#define XEN_MCA_INTERFACE_VERSION 0x01ecc002
   16.15  
   16.16 -/* IN: Dom0 calls hypercall from MC event handler. */
   16.17 -#define XEN_MC_CORRECTABLE  0x0
   16.18 -/* IN: Dom0/DomU calls hypercall from MC trap handler. */
   16.19 -#define XEN_MC_TRAP         0x1
   16.20 -/* XEN_MC_CORRECTABLE and XEN_MC_TRAP are mutually exclusive. */
   16.21 +/* IN: Dom0 calls hypercall to retrieve nonurgent telemetry */
   16.22 +#define XEN_MC_NONURGENT  0x0001
   16.23 +/* IN: Dom0/DomU calls hypercall to retrieve urgent telemetry */
   16.24 +#define XEN_MC_URGENT     0x0002
    16.25 +/* IN: Dom0 acknowledges previously-fetched telemetry */
   16.26 +#define XEN_MC_ACK        0x0004
   16.27  
   16.28  /* OUT: All is ok */
   16.29  #define XEN_MC_OK           0x0
   16.30 @@ -110,6 +117,7 @@ struct mcinfo_common {
   16.31  #define MC_FLAG_POLLED		(1 << 3)
   16.32  #define MC_FLAG_RESET		(1 << 4)
   16.33  #define MC_FLAG_CMCI		(1 << 5)
   16.34 +#define MC_FLAG_MCE		(1 << 6)
   16.35  /* contains global x86 mc information */
   16.36  struct mcinfo_global {
   16.37      struct mcinfo_common common;
   16.38 @@ -174,6 +182,7 @@ struct mc_info {
   16.39      uint8_t mi_data[MCINFO_MAXSIZE - sizeof(uint32_t)];
   16.40  };
   16.41  typedef struct mc_info mc_info_t;
   16.42 +DEFINE_XEN_GUEST_HANDLE(mc_info_t);
   16.43  
   16.44  #define __MC_MSR_ARRAYSIZE 8
   16.45  #define __MC_NMSRS 1
   16.46 @@ -274,14 +283,14 @@ DEFINE_XEN_GUEST_HANDLE(xen_mc_logical_c
   16.47  #define XEN_MC_fetch            1
   16.48  struct xen_mc_fetch {
   16.49      /* IN/OUT variables. */
   16.50 -    uint32_t flags;
   16.51 -
   16.52 -/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
   16.53 -/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */
   16.54 +    uint32_t flags;	/* IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
   16.55 +                           XEN_MC_ACK if ack'ing an earlier fetch */
   16.56 +			/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED,
   16.57 +			   XEN_MC_NODATA, XEN_MC_NOMATCH */
   16.58 +    uint64_t fetch_id;	/* OUT: id for ack, IN: id we are ack'ing */
   16.59  
   16.60      /* OUT variables. */
   16.61 -    uint32_t fetch_idx;  /* only useful for Dom0 for the notify hypercall */
   16.62 -    struct mc_info mc_info;
   16.63 +    XEN_GUEST_HANDLE(mc_info_t) data;
   16.64  };
   16.65  typedef struct xen_mc_fetch xen_mc_fetch_t;
   16.66  DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
   16.67 @@ -296,7 +305,6 @@ struct xen_mc_notifydomain {
   16.68      uint16_t mc_domid;    /* The unprivileged domain to notify. */
   16.69      uint16_t mc_vcpuid;   /* The vcpu in mc_domid to notify.
   16.70                             * Usually echo'd value from the fetch hypercall. */
   16.71 -    uint32_t fetch_idx;   /* echo'd value from the fetch hypercall. */
   16.72  
   16.73      /* IN/OUT variables. */
   16.74      uint32_t flags;
   16.75 @@ -316,15 +324,16 @@ struct xen_mc_physcpuinfo {
   16.76  	XEN_GUEST_HANDLE(xen_mc_logical_cpu_t) info;
   16.77  };
   16.78  
   16.79 +typedef union {
   16.80 +    struct xen_mc_fetch        mc_fetch;
   16.81 +    struct xen_mc_notifydomain mc_notifydomain;
   16.82 +    struct xen_mc_physcpuinfo  mc_physcpuinfo;
   16.83 +} xen_mc_arg_t;
   16.84 +
   16.85  struct xen_mc {
   16.86      uint32_t cmd;
   16.87      uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
   16.88 -    union {
   16.89 -        struct xen_mc_fetch        mc_fetch;
   16.90 -        struct xen_mc_notifydomain mc_notifydomain;
   16.91 -        struct xen_mc_physcpuinfo  mc_physcpuinfo;
   16.92 -        uint8_t pad[MCINFO_HYPERCALLSIZE];
   16.93 -    } u;
   16.94 +    xen_mc_arg_t u;
   16.95  };
   16.96  typedef struct xen_mc xen_mc_t;
   16.97  DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
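
With the reworked interface a dom0 consumer supplies its own buffer through the guest handle, reads back fetch_id, and later echoes it with XEN_MC_ACK. A rough sketch of that flow (do_mca_hypercall() stands in for however the caller issues __HYPERVISOR_mca, e.g. via privcmd from userspace; it is not part of this changeset):

#include <string.h>
/* plus the public xen-mca.h definitions shown above */

extern int do_mca_hypercall(struct xen_mc *mc);	/* hypothetical wrapper */

static int fetch_and_ack(struct mc_info *buf)
{
	struct xen_mc mc;

	memset(&mc, 0, sizeof(mc));
	mc.cmd = XEN_MC_fetch;
	mc.interface_version = XEN_MCA_INTERFACE_VERSION;
	mc.u.mc_fetch.flags = XEN_MC_URGENT;
	set_xen_guest_handle(mc.u.mc_fetch.data, buf);	/* telemetry lands here */

	if (do_mca_hypercall(&mc) || mc.u.mc_fetch.flags != XEN_MC_OK)
		return -1;		/* XEN_MC_NODATA, XEN_MC_FETCHFAILED, ... */

	/* ... hand *buf to the diagnosis engine / write it to stable storage ... */

	/* Acknowledge, echoing the id returned by the fetch, so the
	 * hypervisor can free the telemetry entry. */
	mc.u.mc_fetch.flags = XEN_MC_URGENT | XEN_MC_ACK;
	return do_mca_hypercall(&mc);	/* fetch_id still holds the echoed id */
}

Passing a guest handle instead of embedding struct mc_info in the hypercall argument decouples the telemetry record size from the hypercall ABI.
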