ia64/xen-unstable

changeset 17971:a49673cd23d2

x86: MCA support.
Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Jul 04 16:27:44 2008 +0100 (2008-07-04)
parents d133d452cb71
children 433d1b26fd51
files xen/arch/x86/cpu/mcheck/Makefile xen/arch/x86/cpu/mcheck/amd_f10.c xen/arch/x86/cpu/mcheck/amd_k8.c xen/arch/x86/cpu/mcheck/amd_nonfatal.c xen/arch/x86/cpu/mcheck/k7.c xen/arch/x86/cpu/mcheck/mce.c xen/arch/x86/cpu/mcheck/mce.h xen/arch/x86/cpu/mcheck/non-fatal.c xen/arch/x86/cpu/mcheck/x86_mca.h xen/arch/x86/nmi.c xen/arch/x86/traps.c xen/arch/x86/x86_32/asm-offsets.c xen/arch/x86/x86_32/entry.S xen/arch/x86/x86_32/traps.c xen/arch/x86/x86_64/asm-offsets.c xen/arch/x86/x86_64/compat/entry.S xen/arch/x86/x86_64/compat/traps.c xen/arch/x86/x86_64/entry.S xen/arch/x86/x86_64/traps.c xen/common/domain.c xen/common/event_channel.c xen/include/Makefile xen/include/asm-x86/event.h xen/include/asm-x86/mm.h xen/include/asm-x86/traps.h xen/include/public/arch-x86/xen-mca.h xen/include/public/arch-x86/xen.h xen/include/xen/event.h xen/include/xen/sched.h
line diff
     1.1 --- a/xen/arch/x86/cpu/mcheck/Makefile	Fri Jul 04 13:02:31 2008 +0100
     1.2 +++ b/xen/arch/x86/cpu/mcheck/Makefile	Fri Jul 04 16:27:44 2008 +0100
     1.3 @@ -1,4 +1,7 @@
     1.4 +obj-y += amd_nonfatal.o
     1.5  obj-y += k7.o
     1.6 +obj-y += amd_k8.o
     1.7 +obj-y += amd_f10.o
     1.8  obj-y += mce.o
     1.9  obj-y += non-fatal.o
    1.10  obj-y += p4.o
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/xen/arch/x86/cpu/mcheck/amd_f10.c	Fri Jul 04 16:27:44 2008 +0100
     2.3 @@ -0,0 +1,131 @@
     2.4 +/*
     2.5 + * MCA implementation for AMD Family10 CPUs
     2.6 + * Copyright (c) 2007 Advanced Micro Devices, Inc.
     2.7 + *
     2.8 + * This program is free software; you can redistribute it and/or modify
     2.9 + * it under the terms of the GNU General Public License as published by
    2.10 + * the Free Software Foundation; either version 2 of the License, or
    2.11 + * (at your option) any later version.
    2.12 + *
    2.13 + * This program is distributed in the hope that it will be useful,
    2.14 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    2.15 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    2.16 + * GNU General Public License for more details.
    2.17 + *
    2.18 + * You should have received a copy of the GNU General Public License
    2.19 + * along with this program; if not, write to the Free Software
    2.20 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    2.21 + */
    2.22 +
    2.23 +
    2.24 +/* K8 common MCA documentation published at
    2.25 + *
    2.26 + * AMD64 Architecture Programmer's Manual Volume 2:
    2.27 + * System Programming
    2.28 + * Publication # 24593 Revision: 3.12
    2.29 + * Issue Date: September 2006
    2.30 + */
    2.31 +
    2.32 +/* Family10 MCA documentation published at
    2.33 + *
    2.34 + * BIOS and Kernel Developer's Guide
    2.35 + * For AMD Family 10h Processors
    2.36 + * Publication # 31116 Revision: 1.08
     2.37 + * Issue Date: June 10, 2007
    2.38 + */
    2.39 +
    2.40 +
    2.41 +#include <xen/init.h>
    2.42 +#include <xen/types.h>
    2.43 +#include <xen/kernel.h>
    2.44 +#include <xen/config.h>
    2.45 +#include <xen/smp.h>
    2.46 +
    2.47 +#include <asm/processor.h>
    2.48 +#include <asm/system.h>
    2.49 +#include <asm/msr.h>
    2.50 +
    2.51 +#include "mce.h"
    2.52 +#include "x86_mca.h"
    2.53 +
    2.54 +
    2.55 +static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
    2.56 +{
    2.57 +	struct mcinfo_extended mc_ext;
    2.58 +
    2.59 +	/* Family 0x10 introduced additional MSR that belong to the
    2.60 +	 * northbridge bank (4). */
    2.61 +	if (bank != 4)
    2.62 +		return 0;
    2.63 +
    2.64 +	if (!(status & MCi_STATUS_VAL))
    2.65 +		return 0;
    2.66 +
    2.67 +	if (!(status & MCi_STATUS_MISCV))
    2.68 +		return 0;
    2.69 +
    2.70 +	memset(&mc_ext, 0, sizeof(mc_ext));
    2.71 +	mc_ext.common.type = MC_TYPE_EXTENDED;
    2.72 +	mc_ext.common.size = sizeof(mc_ext);
    2.73 +	mc_ext.mc_msrs = 3;
    2.74 +
    2.75 +	mc_ext.mc_msr[0].reg = MSR_F10_MC4_MISC1;
    2.76 +	mc_ext.mc_msr[1].reg = MSR_F10_MC4_MISC2;
    2.77 +	mc_ext.mc_msr[2].reg = MSR_F10_MC4_MISC3;
    2.78 +
    2.79 +	rdmsrl(MSR_F10_MC4_MISC1, mc_ext.mc_msr[0].value);
    2.80 +	rdmsrl(MSR_F10_MC4_MISC2, mc_ext.mc_msr[1].value);
    2.81 +	rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
    2.82 +	
    2.83 +	x86_mcinfo_add(mi, &mc_ext);
    2.84 +	return 1;
    2.85 +}
    2.86 +
    2.87 +
    2.88 +extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
    2.89 +
    2.90 +/* AMD Family10 machine check */
    2.91 +void amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
    2.92 +{ 
    2.93 +	uint64_t value;
    2.94 +	uint32_t i;
    2.95 +	int cpu_nr;
    2.96 +
    2.97 +	machine_check_vector = k8_machine_check;
    2.98 +	mc_callback_bank_extended = amd_f10_handler;
    2.99 +	cpu_nr = smp_processor_id();
   2.100 +	wmb();
   2.101 +
   2.102 +	rdmsrl(MSR_IA32_MCG_CAP, value);
   2.103 +	if (value & MCG_CTL_P)	/* Control register present ? */
   2.104 +		wrmsrl (MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
   2.105 +	nr_mce_banks = value & MCG_CAP_COUNT;
   2.106 +
   2.107 +	for (i = 0; i < nr_mce_banks; i++) {
   2.108 +		switch (i) {
   2.109 +		case 4: /* Northbridge */
   2.110 +			/* Enable error reporting of all errors,
   2.111 +			 * enable error checking and
   2.112 +			 * disable sync flooding */
   2.113 +			wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
   2.114 +			wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
   2.115 +
   2.116 +			/* XXX: We should write the value 0x1087821UL into
   2.117 +			 * to register F3x180 here, which sits in
   2.118 +			 * the PCI extended configuration space.
   2.119 +			 * Since this is not possible here, we can only hope,
   2.120 +			 * Dom0 is doing that.
   2.121 +			 */
   2.122 +			break;
   2.123 +
   2.124 +		default:
   2.125 +			/* Enable error reporting of all errors */
   2.126 +			wrmsrl(MSR_IA32_MC0_CTL + 4 * i, 0xffffffffffffffffULL);
   2.127 +			wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
   2.128 +			break;
   2.129 +		}
   2.130 +	}
   2.131 +
   2.132 +	set_in_cr4(X86_CR4_MCE);
   2.133 +	printk("CPU%i: AMD Family10h machine check reporting enabled.\n", cpu_nr);
   2.134 +}
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/xen/arch/x86/cpu/mcheck/amd_k8.c	Fri Jul 04 16:27:44 2008 +0100
     3.3 @@ -0,0 +1,324 @@
     3.4 +/*
     3.5 + * MCA implementation for AMD K8 CPUs
     3.6 + * Copyright (c) 2007 Advanced Micro Devices, Inc.
     3.7 + *
     3.8 + * This program is free software; you can redistribute it and/or modify
     3.9 + * it under the terms of the GNU General Public License as published by
    3.10 + * the Free Software Foundation; either version 2 of the License, or
    3.11 + * (at your option) any later version.
    3.12 + *
    3.13 + * This program is distributed in the hope that it will be useful,
    3.14 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    3.15 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    3.16 + * GNU General Public License for more details.
    3.17 + *
    3.18 + * You should have received a copy of the GNU General Public License
    3.19 + * along with this program; if not, write to the Free Software
    3.20 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    3.21 + */
    3.22 +
    3.23 +
    3.24 +/* K8 common MCA documentation published at
    3.25 + *
    3.26 + * AMD64 Architecture Programmer's Manual Volume 2:
    3.27 + * System Programming
    3.28 + * Publication # 24593 Revision: 3.12
    3.29 + * Issue Date: September 2006
    3.30 + *
    3.31 + * URL:
    3.32 + * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
    3.33 + */
    3.34 +
    3.35 +/* The related documentation for K8 Revisions A - E is:
    3.36 + *
    3.37 + * BIOS and Kernel Developer's Guide for
    3.38 + * AMD Athlon 64 and AMD Opteron Processors
    3.39 + * Publication # 26094 Revision: 3.30
    3.40 + * Issue Date: February 2006
    3.41 + *
    3.42 + * URL:
    3.43 + * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF
    3.44 + */
    3.45 +
    3.46 +/* The related documentation for K8 Revisions F - G is:
    3.47 + *
    3.48 + * BIOS and Kernel Developer's Guide for
    3.49 + * AMD NPT Family 0Fh Processors
    3.50 + * Publication # 32559 Revision: 3.04
    3.51 + * Issue Date: December 2006
    3.52 + *
    3.53 + * URL:
    3.54 + * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf
    3.55 + */
    3.56 +
    3.57 +
    3.58 +#include <xen/config.h>
    3.59 +#include <xen/init.h>
    3.60 +#include <xen/types.h>
    3.61 +#include <xen/kernel.h>
    3.62 +#include <xen/smp.h>
    3.63 +#include <xen/sched.h>
    3.64 +#include <xen/sched-if.h>
    3.65 +#include <xen/softirq.h>
    3.66 +
    3.67 +#include <asm/processor.h>
    3.68 +#include <asm/shared.h>
    3.69 +#include <asm/system.h>
    3.70 +#include <asm/msr.h>
    3.71 +
    3.72 +#include "mce.h"
    3.73 +#include "x86_mca.h"
    3.74 +
    3.75 +
    3.76 +/* Machine Check Handler for AMD K8 family series */
    3.77 +void k8_machine_check(struct cpu_user_regs *regs, long error_code)
    3.78 +{
    3.79 +	struct vcpu *vcpu = current;
    3.80 +	struct domain *curdom;
    3.81 +	struct mc_info *mc_data;
    3.82 +	struct mcinfo_global mc_global;
    3.83 +	struct mcinfo_bank mc_info;
    3.84 +	uint64_t status, addrv, miscv, uc;
    3.85 +	uint32_t i;
    3.86 +	unsigned int cpu_nr;
    3.87 +	uint32_t xen_impacted = 0;
    3.88 +#define DOM_NORMAL	0
    3.89 +#define DOM0_TRAP	1
    3.90 +#define DOMU_TRAP	2
    3.91 +#define DOMU_KILLED	4
    3.92 +	uint32_t dom_state = DOM_NORMAL;
    3.93 +
    3.94 +	/* This handler runs as interrupt gate. So IPIs from the
     3.95 +	 * polling service routine are deferred until we have finished.
    3.96 +	 */
    3.97 +
     3.98 +        /* Disable interrupts for the _vcpu_. It may not be re-scheduled to
     3.99 +	 * another physical CPU, or the impacted process in the guest
    3.100 +	 * continues running with corrupted data otherwise. */
   3.101 +        vcpu_schedule_lock_irq(vcpu);
   3.102 +
   3.103 +	mc_data = x86_mcinfo_getptr();
   3.104 +	cpu_nr = smp_processor_id();
   3.105 +	curdom = vcpu->domain;
   3.106 +
   3.107 +	memset(&mc_global, 0, sizeof(mc_global));
   3.108 +	mc_global.common.type = MC_TYPE_GLOBAL;
   3.109 +	mc_global.common.size = sizeof(mc_global);
   3.110 +
   3.111 +	mc_global.mc_domid = curdom->domain_id; /* impacted domain */
   3.112 +	mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
   3.113 +	BUG_ON(cpu_nr != vcpu->processor);
   3.114 +	mc_global.mc_core_threadid = 0;
   3.115 +	mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
   3.116 +#if 0 /* TODO: on which socket is this physical core?
   3.117 +         It's not clear to me how to figure this out. */
   3.118 +	mc_global.mc_socketid = ???;
   3.119 +#endif
   3.120 +	mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
   3.121 +	rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
   3.122 +
   3.123 +	/* Quick check, who is impacted */
   3.124 +	xen_impacted = is_idle_domain(curdom);
   3.125 +
   3.126 +	/* Dom0 */
   3.127 +	x86_mcinfo_clear(mc_data);
   3.128 +	x86_mcinfo_add(mc_data, &mc_global);
   3.129 +
   3.130 +	for (i = 0; i < nr_mce_banks; i++) {
   3.131 +		struct domain *d;
   3.132 +
   3.133 +		rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
   3.134 +
   3.135 +		if (!(status & MCi_STATUS_VAL))
   3.136 +			continue;
   3.137 +
   3.138 +		/* An error happened in this bank.
   3.139 +		 * This is expected to be an uncorrectable error,
   3.140 +		 * since correctable errors get polled.
   3.141 +		 */
   3.142 +		uc = status & MCi_STATUS_UC;
   3.143 +
   3.144 +		memset(&mc_info, 0, sizeof(mc_info));
   3.145 +		mc_info.common.type = MC_TYPE_BANK;
   3.146 +		mc_info.common.size = sizeof(mc_info);
   3.147 +		mc_info.mc_bank = i;
   3.148 +		mc_info.mc_status = status;
   3.149 +
   3.150 +		addrv = 0;
   3.151 +		if (status & MCi_STATUS_ADDRV) {
   3.152 +			rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
   3.153 +			
   3.154 +			d = maddr_get_owner(addrv);
   3.155 +			if (d != NULL)
   3.156 +				mc_info.mc_domid = d->domain_id;
   3.157 +		}
   3.158 +
   3.159 +		miscv = 0;
   3.160 +		if (status & MCi_STATUS_MISCV)
   3.161 +			rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
   3.162 +
   3.163 +		mc_info.mc_addr = addrv;
   3.164 +		mc_info.mc_misc = miscv;
   3.165 +
   3.166 +		x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
   3.167 +
   3.168 +		if (mc_callback_bank_extended)
   3.169 +			mc_callback_bank_extended(mc_data, i, status);
   3.170 +
   3.171 +		/* clear status */
   3.172 +		wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
   3.173 +		wmb();
   3.174 +		add_taint(TAINT_MACHINE_CHECK);
   3.175 +	}
   3.176 +
   3.177 +	status = mc_global.mc_gstatus;
   3.178 +
   3.179 +	/* clear MCIP or cpu enters shutdown state
   3.180 +	 * in case another MCE occurs. */
   3.181 +	status &= ~MCG_STATUS_MCIP;
   3.182 +	wrmsrl(MSR_IA32_MCG_STATUS, status);
   3.183 +	wmb();
   3.184 +
   3.185 +	/* For the details see the discussion "MCE/MCA concept" on xen-devel.
   3.186 +	 * The thread started here:
   3.187 +	 * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
   3.188 +	 */
   3.189 +
   3.190 +	/* MCG_STATUS_RIPV: 
   3.191 +	 * When this bit is not set, then the instruction pointer onto the stack
   3.192 +	 * to resume at is not valid. If xen is interrupted, then we panic anyway
   3.193 +	 * right below. Otherwise it is up to the guest to figure out if 
   3.194 +	 * guest kernel or guest userland is affected and should kill either
   3.195 +	 * itself or the affected process.
   3.196 +	 */
   3.197 +
   3.198 +	/* MCG_STATUS_EIPV:
   3.199 +	 * Evaluation of EIPV is the job of the guest.
   3.200 +	 */
   3.201 +
   3.202 +	if (xen_impacted) {
   3.203 +		/* Now we are going to panic anyway. Allow interrupts, so that
   3.204 +		 * printk on serial console can work. */
   3.205 +		vcpu_schedule_unlock_irq(vcpu);
   3.206 +
   3.207 +		/* Uh, that means, machine check exception
    3.208 +		 * inside Xen occurred. */
   3.209 +		printk("Machine check exception occured in Xen.\n");
   3.210 +
   3.211 +		/* if MCG_STATUS_EIPV indicates, the IP on the stack is related
   3.212 +		 * to the error then it makes sense to print a stack trace.
   3.213 +		 * That can be useful for more detailed error analysis and/or
   3.214 +		 * error case studies to figure out, if we can clear
   3.215 +		 * xen_impacted and kill a DomU instead
   3.216 +		 * (i.e. if a guest only control structure is affected, but then
   3.217 +		 * we must ensure the bad pages are not re-used again).
   3.218 +		 */
   3.219 +		if (status & MCG_STATUS_EIPV) {
   3.220 +			printk("MCE: Instruction Pointer is related to the error. "
   3.221 +				"Therefore, print the execution state.\n");
   3.222 +			show_execution_state(regs);
   3.223 +		}
   3.224 +		x86_mcinfo_dump(mc_data);
   3.225 +		panic("End of MCE. Use mcelog to decode above error codes.\n");
   3.226 +	}
   3.227 +
   3.228 +	/* If Dom0 registered a machine check handler, which is only possible
   3.229 +	 * with a PV MCA driver, then ... */
   3.230 +	if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
   3.231 +		dom_state = DOM0_TRAP;
   3.232 +
   3.233 +		/* ... deliver machine check trap to Dom0. */
   3.234 +		send_guest_trap(dom0, 0, TRAP_machine_check);
   3.235 +
   3.236 +		/* Xen may tell Dom0 now to notify the DomU.
   3.237 +		 * But this will happen through a hypercall. */
   3.238 +	} else
   3.239 +		/* Dom0 did not register a machine check handler, but if DomU
   3.240 +		 * did so, then... */
   3.241 +                if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) {
   3.242 +			dom_state = DOMU_TRAP;
   3.243 +
   3.244 +			/* ... deliver machine check trap to DomU */
   3.245 +			send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
   3.246 +	} else {
    3.247 +		/* hmm... no one feels responsible for handling the error.
   3.248 +		 * So, do a quick check if a DomU is impacted or not.
   3.249 +		 */
   3.250 +		if (curdom == dom0) {
    3.251 +			/* Dom0 is impacted. Since no one can handle
   3.252 +			 * this error, panic! */
   3.253 +			x86_mcinfo_dump(mc_data);
   3.254 +			panic("MCE occured in Dom0, which it can't handle\n");
   3.255 +
   3.256 +			/* UNREACHED */
   3.257 +		} else {
   3.258 +			dom_state = DOMU_KILLED;
   3.259 +
   3.260 +			/* Enable interrupts. This basically results in
   3.261 +			 * calling sti on the *physical* cpu. But after
   3.262 +			 * domain_crash() the vcpu pointer is invalid.
   3.263 +			 * Therefore, we must unlock the irqs before killing
   3.264 +			 * it. */
   3.265 +			vcpu_schedule_unlock_irq(vcpu);
   3.266 +
   3.267 +			/* DomU is impacted. Kill it and continue. */
   3.268 +			domain_crash(curdom);
   3.269 +		}
   3.270 +	}
   3.271 +
   3.272 +
   3.273 +	switch (dom_state) {
   3.274 +	case DOM0_TRAP:
   3.275 +	case DOMU_TRAP:
   3.276 +		/* Enable interrupts. */
   3.277 +		vcpu_schedule_unlock_irq(vcpu);
   3.278 +
   3.279 +		/* guest softirqs and event callbacks are scheduled
   3.280 +		 * immediately after this handler exits. */
   3.281 +		break;
   3.282 +	case DOMU_KILLED:
   3.283 +		/* Nothing to do here. */
   3.284 +		break;
   3.285 +	default:
   3.286 +		BUG();
   3.287 +	}
   3.288 +}
   3.289 +
   3.290 +
   3.291 +/* AMD K8 machine check */
   3.292 +void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
   3.293 +{
   3.294 +	uint64_t value;
   3.295 +	uint32_t i;
   3.296 +	int cpu_nr;
   3.297 +
   3.298 +	machine_check_vector = k8_machine_check;
   3.299 +	cpu_nr = smp_processor_id();
   3.300 +	wmb();
   3.301 +
   3.302 +	rdmsrl(MSR_IA32_MCG_CAP, value);
   3.303 +	if (value & MCG_CTL_P)	/* Control register present ? */
   3.304 +		wrmsrl (MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
   3.305 +	nr_mce_banks = value & MCG_CAP_COUNT;
   3.306 +
   3.307 +	for (i = 0; i < nr_mce_banks; i++) {
   3.308 +		switch (i) {
   3.309 +		case 4: /* Northbridge */
   3.310 +			/* Enable error reporting of all errors,
   3.311 +			 * enable error checking and
   3.312 +			 * disable sync flooding */
   3.313 +			wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
   3.314 +			wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
   3.315 +			break;
   3.316 +
   3.317 +		default:
   3.318 +			/* Enable error reporting of all errors */
   3.319 +			wrmsrl(MSR_IA32_MC0_CTL + 4 * i, 0xffffffffffffffffULL);
   3.320 +			wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
   3.321 +			break;
   3.322 +		}
   3.323 +	}
   3.324 +
   3.325 +	set_in_cr4(X86_CR4_MCE);
   3.326 +	printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr);
   3.327 +}
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c	Fri Jul 04 16:27:44 2008 +0100
     4.3 @@ -0,0 +1,303 @@
     4.4 +/*
     4.5 + * MCA implementation for AMD CPUs
     4.6 + * Copyright (c) 2007 Advanced Micro Devices, Inc.
     4.7 + *
     4.8 + * This program is free software; you can redistribute it and/or modify
     4.9 + * it under the terms of the GNU General Public License as published by
    4.10 + * the Free Software Foundation; either version 2 of the License, or
    4.11 + * (at your option) any later version.
    4.12 + *
    4.13 + * This program is distributed in the hope that it will be useful,
    4.14 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    4.15 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    4.16 + * GNU General Public License for more details.
    4.17 + *
    4.18 + * You should have received a copy of the GNU General Public License
    4.19 + * along with this program; if not, write to the Free Software
    4.20 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    4.21 + */
    4.22 +
    4.23 +
    4.24 +/* K8 common MCA documentation published at
    4.25 + *
    4.26 + * AMD64 Architecture Programmer's Manual Volume 2:
    4.27 + * System Programming
    4.28 + * Publication # 24593 Revision: 3.12
    4.29 + * Issue Date: September 2006
    4.30 + *
    4.31 + * URL:
    4.32 + * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
    4.33 + */
    4.34 +
    4.35 +/* The related documentation for K8 Revisions A - E is:
    4.36 + *
    4.37 + * BIOS and Kernel Developer's Guide for
    4.38 + * AMD Athlon 64 and AMD Opteron Processors
    4.39 + * Publication # 26094 Revision: 3.30
    4.40 + * Issue Date: February 2006
    4.41 + *
    4.42 + * URL:
    4.43 + * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF
    4.44 + */
    4.45 +
    4.46 +/* The related documentation for K8 Revisions F - G is:
    4.47 + *
    4.48 + * BIOS and Kernel Developer's Guide for
    4.49 + * AMD NPT Family 0Fh Processors
    4.50 + * Publication # 32559 Revision: 3.04
    4.51 + * Issue Date: December 2006
    4.52 + *
    4.53 + * URL:
    4.54 + * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf
    4.55 + */
    4.56 +
    4.57 +#include <xen/config.h>
    4.58 +#include <xen/init.h>
    4.59 +#include <xen/types.h>
    4.60 +#include <xen/kernel.h>
    4.61 +#include <xen/smp.h>
    4.62 +#include <xen/timer.h>
    4.63 +#include <xen/event.h>
    4.64 +#include <asm/processor.h> 
    4.65 +#include <asm/system.h>
    4.66 +#include <asm/msr.h>
    4.67 +
    4.68 +#include "mce.h"
    4.69 +#include "x86_mca.h"
    4.70 +
    4.71 +static struct timer mce_timer;
    4.72 +
    4.73 +#define MCE_PERIOD MILLISECS(15000)
    4.74 +#define MCE_MIN    MILLISECS(2000)
    4.75 +#define MCE_MAX    MILLISECS(30000)
    4.76 +
    4.77 +static s_time_t period = MCE_PERIOD;
    4.78 +static int hw_threshold = 0;
    4.79 +static int adjust = 0;
    4.80 +
    4.81 +/* The polling service routine:
    4.82 + * Collects information of correctable errors and notifies
    4.83 + * Dom0 via an event.
    4.84 + */
    4.85 +void mce_amd_checkregs(void *info)
    4.86 +{
    4.87 +	struct vcpu *vcpu = current;
    4.88 +	struct mc_info *mc_data;
    4.89 +	struct mcinfo_global mc_global;
    4.90 +	struct mcinfo_bank mc_info;
    4.91 +	uint64_t status, addrv, miscv;
    4.92 +	unsigned int i;
    4.93 +	unsigned int event_enabled;
    4.94 +	unsigned int cpu_nr;
    4.95 +	int error_found;
    4.96 +
    4.97 +	/* We don't need a slot yet. Only allocate one on error. */
    4.98 +	mc_data = NULL;
    4.99 +
   4.100 +	cpu_nr = smp_processor_id();
   4.101 +	event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
   4.102 +	error_found = 0;
   4.103 +
   4.104 +	memset(&mc_global, 0, sizeof(mc_global));
   4.105 +	mc_global.common.type = MC_TYPE_GLOBAL;
   4.106 +	mc_global.common.size = sizeof(mc_global);
   4.107 +
   4.108 +	mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
   4.109 +	mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
   4.110 +	BUG_ON(cpu_nr != vcpu->processor);
   4.111 +	mc_global.mc_core_threadid = 0;
   4.112 +	mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
   4.113 +#if 0 /* TODO: on which socket is this physical core?
   4.114 +         It's not clear to me how to figure this out. */
   4.115 +	mc_global.mc_socketid = ???;
   4.116 +#endif
   4.117 +	mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
   4.118 +	rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
   4.119 +
   4.120 +	for (i = 0; i < nr_mce_banks; i++) {
   4.121 +		struct domain *d;
   4.122 +
   4.123 +		rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
   4.124 +
   4.125 +		if (!(status & MCi_STATUS_VAL))
   4.126 +			continue;
   4.127 +
   4.128 +		if (mc_data == NULL) {
   4.129 +			/* Now we need a slot to fill in error telemetry. */
   4.130 +			mc_data = x86_mcinfo_getptr();
   4.131 +			BUG_ON(mc_data == NULL);
   4.132 +			x86_mcinfo_clear(mc_data);
   4.133 +			x86_mcinfo_add(mc_data, &mc_global);
   4.134 +		}
   4.135 +
   4.136 +		memset(&mc_info, 0, sizeof(mc_info));
   4.137 +		mc_info.common.type = MC_TYPE_BANK;
   4.138 +		mc_info.common.size = sizeof(mc_info);
   4.139 +		mc_info.mc_bank = i;
   4.140 +		mc_info.mc_status = status;
   4.141 +
   4.142 +		/* Increase polling frequency */
   4.143 +		error_found = 1;
   4.144 +
   4.145 +		addrv = 0;
   4.146 +		if (status & MCi_STATUS_ADDRV) {
   4.147 +			rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
   4.148 +
   4.149 +			d = maddr_get_owner(addrv);
   4.150 +			if (d != NULL)
   4.151 +				mc_info.mc_domid = d->domain_id;
   4.152 +		}
   4.153 +
   4.154 +		miscv = 0;
   4.155 +		if (status & MCi_STATUS_MISCV)
   4.156 +			rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
   4.157 +
   4.158 +		mc_info.mc_addr = addrv;
   4.159 +		mc_info.mc_misc = miscv;
   4.160 +		x86_mcinfo_add(mc_data, &mc_info);
   4.161 +
   4.162 +		if (mc_callback_bank_extended)
   4.163 +			mc_callback_bank_extended(mc_data, i, status);
   4.164 +
   4.165 +		/* clear status */
   4.166 +		wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
   4.167 +		wmb();
   4.168 +	}
   4.169 +
   4.170 +	if (error_found > 0) {
   4.171 +		/* If Dom0 enabled the VIRQ_MCA event, then ... */
   4.172 +		if (event_enabled)
   4.173 +			/* ... notify it. */
   4.174 +			send_guest_global_virq(dom0, VIRQ_MCA);
   4.175 +		else
   4.176 +			/* ... or dump it */
   4.177 +			x86_mcinfo_dump(mc_data);
   4.178 +	}
   4.179 +
   4.180 +	adjust += error_found;
   4.181 +}
   4.182 +
   4.183 +/* polling service routine invoker:
   4.184 + * Adjust poll frequency at runtime. No error means slow polling frequency,
   4.185 + * an error means higher polling frequency.
   4.186 + * It uses hw threshold register introduced in AMD K8 RevF to detect
   4.187 + * multiple correctable errors between two polls. In that case,
   4.188 + * increase polling frequency higher than normal.
   4.189 + */
   4.190 +static void mce_amd_work_fn(void *data)
   4.191 +{
   4.192 +	on_each_cpu(mce_amd_checkregs, data, 1, 1);
   4.193 +
   4.194 +	if (adjust > 0) {
   4.195 +		if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
   4.196 +			/* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
   4.197 +			printk("MCE: polling routine found correctable error. "
   4.198 +				" Use mcelog to parse above error output.\n");
   4.199 +		}
   4.200 +	}
   4.201 +
   4.202 +	if (hw_threshold) {
   4.203 +		uint64_t value;
   4.204 +		uint32_t counter;
   4.205 +
   4.206 +		rdmsrl(MSR_IA32_MC4_MISC, value);
   4.207 +		/* Only the error counter field is of interest
   4.208 +		 * Bit field is described in AMD K8 BKDG chapter 6.4.5.5
   4.209 +		 */
   4.210 +		counter = (value & 0xFFF00000000ULL) >> 32U;
   4.211 +
   4.212 +		/* HW does not count *all* kinds of correctable errors.
    4.213 +		 * Thus it is possible that the polling routine finds a
    4.214 +		 * correctable error even if the HW reports nothing.
   4.215 +		 * However, the other way around is not possible (= BUG).
   4.216 +		 */ 
   4.217 +		if (counter > 0) {
   4.218 +			/* HW reported correctable errors,
   4.219 +			 * the polling routine did not find...
   4.220 +			 */
   4.221 +			BUG_ON(adjust == 0);
   4.222 +			/* subtract 1 to not double count the error 
   4.223 +			 * from the polling service routine */ 
   4.224 +			adjust += (counter - 1);
   4.225 +
   4.226 +			/* Restart counter */
   4.227 +			/* No interrupt, reset counter value */
   4.228 +			value &= ~(0x60FFF00000000ULL);
   4.229 +			/* Counter enable */
   4.230 +			value |= (1ULL << 51);
   4.231 +			wrmsrl(MSR_IA32_MC4_MISC, value);
   4.232 +			wmb();
   4.233 +		}
   4.234 +	}
   4.235 +
   4.236 +	if (adjust > 0) {
   4.237 +		/* Increase polling frequency */
   4.238 +		adjust++; /* adjust == 1 must have an effect */
   4.239 +		period /= adjust;
   4.240 +	} else {
   4.241 +		/* Decrease polling frequency */
   4.242 +		period *= 2;
   4.243 +	}
   4.244 +	if (period > MCE_MAX) {
   4.245 +		/* limit: Poll at least every 30s */
   4.246 +		period = MCE_MAX;
   4.247 +	}
   4.248 +	if (period < MCE_MIN) {
   4.249 +		/* limit: Poll every 2s.
   4.250 +		 * When this is reached an uncorrectable error
   4.251 +		 * is expected to happen, if Dom0 does nothing.
   4.252 +		 */
   4.253 +		period = MCE_MIN;
   4.254 +	}
   4.255 +
   4.256 +	set_timer(&mce_timer, NOW() + period);
   4.257 +	adjust = 0;
   4.258 +}
   4.259 +
   4.260 +void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
   4.261 +{
   4.262 +	if (c->x86_vendor != X86_VENDOR_AMD)
   4.263 +		return;
   4.264 +
   4.265 +	/* Assume we are on K8 or newer AMD CPU here */
   4.266 +
   4.267 +	/* The threshold bitfields in MSR_IA32_MC4_MISC has
   4.268 +	 * been introduced along with the SVME feature bit. */
   4.269 +	if (cpu_has(c, X86_FEATURE_SVME)) {
   4.270 +		uint64_t value;
   4.271 +
   4.272 +		/* hw threshold registers present */
   4.273 +		hw_threshold = 1;
   4.274 +		rdmsrl(MSR_IA32_MC4_MISC, value);
   4.275 +
   4.276 +		if (value & (1ULL << 61)) { /* Locked bit */
   4.277 +			/* Locked by BIOS. Not available for use */
   4.278 +			hw_threshold = 0;
   4.279 +		}
   4.280 +		if (!(value & (1ULL << 63))) { /* Valid bit */
   4.281 +			/* No CtrP present */
   4.282 +			hw_threshold = 0;
   4.283 +		} else {
   4.284 +			if (!(value & (1ULL << 62))) { /* Counter Bit */
   4.285 +				/* No counter field present */
   4.286 +				hw_threshold = 0;
   4.287 +			}
   4.288 +		}
   4.289 +
   4.290 +		if (hw_threshold) {
   4.291 +			/* No interrupt, reset counter value */
   4.292 +			value &= ~(0x60FFF00000000ULL);
   4.293 +			/* Counter enable */
   4.294 +			value |= (1ULL << 51);
   4.295 +			wrmsrl(MSR_IA32_MC4_MISC, value);
   4.296 +			/* serialize */
   4.297 +			wmb();
   4.298 +			printk(XENLOG_INFO "MCA: Use hw thresholding to adjust polling frequency\n");
   4.299 +		}
   4.300 +	}
   4.301 +
   4.302 +	init_timer(&mce_timer, mce_amd_work_fn, NULL, 0);
   4.303 +	set_timer(&mce_timer, NOW() + period);
   4.304 +
   4.305 +	return;
   4.306 +}
     5.1 --- a/xen/arch/x86/cpu/mcheck/k7.c	Fri Jul 04 13:02:31 2008 +0100
     5.2 +++ b/xen/arch/x86/cpu/mcheck/k7.c	Fri Jul 04 16:27:44 2008 +0100
     5.3 @@ -66,8 +66,8 @@ static fastcall void k7_machine_check(st
     5.4  }
     5.5  
     5.6  
     5.7 -/* AMD K7 machine check is Intel like */
     5.8 -void amd_mcheck_init(struct cpuinfo_x86 *c)
     5.9 +/* AMD K7 machine check */
    5.10 +void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
    5.11  {
    5.12  	u32 l, h;
    5.13  	int i;
    5.14 @@ -75,7 +75,6 @@ void amd_mcheck_init(struct cpuinfo_x86 
    5.15  	machine_check_vector = k7_machine_check;
    5.16  	wmb();
    5.17  
    5.18 -	printk (KERN_INFO "Intel machine check architecture supported.\n");
    5.19  	rdmsr (MSR_IA32_MCG_CAP, l, h);
    5.20  	if (l & (1<<8))	/* Control register present ? */
    5.21  		wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
    5.22 @@ -90,6 +89,6 @@ void amd_mcheck_init(struct cpuinfo_x86 
    5.23  	}
    5.24  
    5.25  	set_in_cr4 (X86_CR4_MCE);
    5.26 -	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
    5.27 +	printk (KERN_INFO "CPU%d: AMD K7 machine check reporting enabled.\n",
    5.28  		smp_processor_id());
    5.29  }
     6.1 --- a/xen/arch/x86/cpu/mcheck/mce.c	Fri Jul 04 13:02:31 2008 +0100
     6.2 +++ b/xen/arch/x86/cpu/mcheck/mce.c	Fri Jul 04 16:27:44 2008 +0100
     6.3 @@ -8,73 +8,151 @@
     6.4  #include <xen/kernel.h>
     6.5  #include <xen/config.h>
     6.6  #include <xen/smp.h>
     6.7 +#include <xen/errno.h>
     6.8  
     6.9  #include <asm/processor.h> 
    6.10  #include <asm/system.h>
    6.11  
    6.12  #include "mce.h"
    6.13 +#include "x86_mca.h"
    6.14  
    6.15  int mce_disabled = 0;
    6.16 -int nr_mce_banks;
    6.17 +unsigned int nr_mce_banks;
    6.18  
    6.19  EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
    6.20  
    6.21 +/* XXX For now a fixed array is used. Later this should be changed
    6.22 + * to a dynamically allocated array with the size calculated in relation
    6.23 + * to physical cpus present in the machine.
    6.24 + * The more physical cpus are available, the more entries you need.
    6.25 + */
    6.26 +#define MAX_MCINFO	10
    6.27 +
    6.28 +struct mc_machine_notify {
    6.29 +	struct mc_info mc;
    6.30 +	uint32_t fetch_idx;
    6.31 +	uint32_t valid;
    6.32 +};
    6.33 +
    6.34 +struct mc_machine {
    6.35 +
    6.36 +	/* Array structure used for collecting machine check error telemetry. */
    6.37 +	struct mc_info mc[MAX_MCINFO];
    6.38 +
    6.39 +	/* We handle multiple machine check reports lockless by
    6.40 +	 * iterating through the array using the producer/consumer concept.
    6.41 +	 */
    6.42 +	/* Producer array index to fill with machine check error data.
    6.43 +	 * Index must be increased atomically. */
    6.44 +	uint32_t error_idx;
    6.45 +
    6.46 +	/* Consumer array index to fetch machine check error data from.
    6.47 +	 * Index must be increased atomically. */
    6.48 +	uint32_t fetch_idx;
    6.49 +
     6.50 +	/* Integer array holding the indices of the mc array that allows
    6.51 +         * a Dom0 to notify a DomU to re-fetch the same machine check error
    6.52 +         * data. The notification and refetch also uses its own 
    6.53 +	 * producer/consumer mechanism, because Dom0 may decide to not report
    6.54 +	 * every error to the impacted DomU.
    6.55 +	 */
    6.56 +	struct mc_machine_notify notify[MAX_MCINFO];
    6.57 +
    6.58 +	/* Array index to get fetch_idx from.
    6.59 +	 * Index must be increased atomically. */
    6.60 +	uint32_t notifyproducer_idx;
    6.61 +	uint32_t notifyconsumer_idx;
    6.62 +};
    6.63 +
    6.64 +/* Global variable with machine check information. */
    6.65 +struct mc_machine mc_data;
    6.66 +
    6.67  /* Handle unconfigured int18 (should never happen) */
    6.68 -static fastcall void unexpected_machine_check(struct cpu_user_regs * regs, long error_code)
    6.69 +static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
    6.70  {	
    6.71 -	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
    6.72 +	printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
    6.73 +		smp_processor_id());
    6.74  }
    6.75  
    6.76 +
    6.77  /* Call the installed machine check handler for this CPU setup. */
    6.78 -void fastcall (*machine_check_vector)(struct cpu_user_regs *, long error_code) = unexpected_machine_check;
    6.79 +void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code) = unexpected_machine_check;
    6.80 +
    6.81 +/* Init machine check callback handler
    6.82 + * It is used to collect additional information provided by newer
    6.83 + * CPU families/models without the need to duplicate the whole handler.
    6.84 + * This avoids having many handlers doing nearly the same thing, each
    6.85 + * with its own tweaks and bugs. */
    6.86 +int (*mc_callback_bank_extended)(struct mc_info *, uint16_t, uint64_t) = NULL;
    6.87 +
    6.88 +
    6.89 +static void amd_mcheck_init(struct cpuinfo_x86 *ci)
    6.90 +{
    6.91 +
    6.92 +	switch (ci->x86) {
    6.93 +	case 6:
    6.94 +		amd_k7_mcheck_init(ci);
    6.95 +		break;
    6.96 +
    6.97 +	case 0xf:
    6.98 +		amd_k8_mcheck_init(ci);
    6.99 +		break;
   6.100 +
   6.101 +	case 0x10:
   6.102 +		amd_f10_mcheck_init(ci);
   6.103 +		break;
   6.104 +
   6.105 +	default:
   6.106 +		/* Assume that machine check support is available.
   6.107 +		 * The minimum provided support is at least the K8. */
   6.108 +		amd_k8_mcheck_init(ci);
   6.109 +	}
   6.110 +}
   6.111  
   6.112  /* This has to be run for each processor */
   6.113  void mcheck_init(struct cpuinfo_x86 *c)
   6.114  {
   6.115 -	if (mce_disabled==1)
   6.116 +	if (mce_disabled == 1) {
   6.117 +		printk(XENLOG_INFO "MCE support disabled by bootparam\n");
   6.118  		return;
   6.119 +	}
   6.120 +
   6.121 +	if (!cpu_has(c, X86_FEATURE_MCE)) {
   6.122 +		printk(XENLOG_INFO "CPU%i: No machine check support available\n",
   6.123 +			smp_processor_id());
   6.124 +		return;
   6.125 +	}
   6.126 +
   6.127 +	memset(&mc_data, 0, sizeof(struct mc_machine));
   6.128  
   6.129  	switch (c->x86_vendor) {
   6.130 -		case X86_VENDOR_AMD:
   6.131 -			amd_mcheck_init(c);
   6.132 -			break;
   6.133 +	case X86_VENDOR_AMD:
   6.134 +		amd_mcheck_init(c);
   6.135 +		break;
   6.136  
   6.137 -		case X86_VENDOR_INTEL:
   6.138 +	case X86_VENDOR_INTEL:
   6.139  #ifndef CONFIG_X86_64
   6.140 -			if (c->x86==5)
   6.141 -				intel_p5_mcheck_init(c);
   6.142 -			if (c->x86==6)
   6.143 -				intel_p6_mcheck_init(c);
   6.144 +		if (c->x86==5)
   6.145 +			intel_p5_mcheck_init(c);
   6.146 +		if (c->x86==6)
   6.147 +			intel_p6_mcheck_init(c);
   6.148  #endif
   6.149 -			if (c->x86==15)
   6.150 -				intel_p4_mcheck_init(c);
   6.151 -			break;
   6.152 +		if (c->x86==15)
   6.153 +			intel_p4_mcheck_init(c);
   6.154 +		break;
   6.155  
   6.156  #ifndef CONFIG_X86_64
   6.157 -		case X86_VENDOR_CENTAUR:
   6.158 -			if (c->x86==5)
   6.159 -				winchip_mcheck_init(c);
   6.160 -			break;
   6.161 +	case X86_VENDOR_CENTAUR:
   6.162 +		if (c->x86==5)
   6.163 +			winchip_mcheck_init(c);
   6.164 +		break;
   6.165  #endif
   6.166  
   6.167 -		default:
   6.168 -			break;
   6.169 +	default:
   6.170 +		break;
   6.171  	}
   6.172  }
   6.173  
   6.174 -static unsigned long old_cr4 __initdata;
   6.175 -
   6.176 -void __init stop_mce(void)
   6.177 -{
   6.178 -	old_cr4 = read_cr4();
   6.179 -	clear_in_cr4(X86_CR4_MCE);
   6.180 -}
   6.181 -
   6.182 -void __init restart_mce(void)
   6.183 -{
   6.184 -	if (old_cr4 & X86_CR4_MCE)
   6.185 -		set_in_cr4(X86_CR4_MCE);
   6.186 -}
   6.187  
   6.188  static void __init mcheck_disable(char *str)
   6.189  {
   6.190 @@ -88,3 +166,411 @@ static void __init mcheck_enable(char *s
   6.191  
   6.192  custom_param("nomce", mcheck_disable);
   6.193  custom_param("mce", mcheck_enable);
   6.194 +
   6.195 +
   6.196 +#include <xen/guest_access.h>
   6.197 +#include <asm/traps.h>
   6.198 +
   6.199 +struct mc_info *x86_mcinfo_getptr(void)
   6.200 +{
   6.201 +	struct mc_info *mi;
   6.202 +	uint32_t entry, next;
   6.203 +
   6.204 +	for (;;) {
   6.205 +		entry = mc_data.error_idx;
   6.206 +		smp_rmb();
   6.207 +		next = entry + 1;
   6.208 +		if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
   6.209 +			break;
   6.210 +	}
   6.211 +
   6.212 +	mi = &(mc_data.mc[(entry % MAX_MCINFO)]);
   6.213 +	BUG_ON(mc_data.error_idx < mc_data.fetch_idx);
   6.214 +
   6.215 +	return mi;
   6.216 +}
   6.217 +
   6.218 +static int x86_mcinfo_matches_guest(const struct mc_info *mi,
   6.219 +			const struct domain *d, const struct vcpu *v)
   6.220 +{
   6.221 +	struct mcinfo_common *mic;
   6.222 +	struct mcinfo_global *mig;
   6.223 +
   6.224 +	x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
   6.225 +	mig = (struct mcinfo_global *)mic;
   6.226 +	if (mig == NULL)
   6.227 +		return 0;
   6.228 +
   6.229 +	if (d->domain_id != mig->mc_domid)
   6.230 +		return 0;
   6.231 +
   6.232 +	if (v->vcpu_id != mig->mc_vcpuid)
   6.233 +		return 0;
   6.234 +
   6.235 +	return 1;
   6.236 +}
   6.237 +
   6.238 +
   6.239 +#define x86_mcinfo_mcdata(idx) (mc_data.mc[(idx % MAX_MCINFO)])
   6.240 +
   6.241 +static struct mc_info *x86_mcinfo_getfetchptr(uint32_t *fetch_idx,
   6.242 +				const struct domain *d, const struct vcpu *v)
   6.243 +{
   6.244 +	struct mc_info *mi;
   6.245 +
   6.246 +	/* This function is called from the fetch hypercall with
   6.247 +	 * the mc_lock spinlock held. Thus, no need for locking here.
   6.248 +	 */
   6.249 +	mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
   6.250 +	if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
   6.251 +		/* Bogus domU command detected. */
   6.252 +		*fetch_idx = 0;
   6.253 +		return NULL;
   6.254 +	}
   6.255 +
   6.256 +	*fetch_idx = mc_data.fetch_idx;
   6.257 +	mc_data.fetch_idx++;
   6.258 +	BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
   6.259 +
   6.260 +	return mi;
   6.261 +}
   6.262 +
   6.263 +
   6.264 +static void x86_mcinfo_marknotified(struct xen_mc_notifydomain *mc_notifydomain)
   6.265 +{
   6.266 +	struct mc_machine_notify *mn;
   6.267 +	struct mcinfo_common *mic = NULL;
   6.268 +	struct mcinfo_global *mig;
   6.269 +	struct domain *d;
   6.270 +	int i;
   6.271 +
   6.272 +	/* This function is called from the notifier hypercall with
   6.273 +	 * the mc_notify_lock spinlock held. Thus, no need for locking here.
   6.274 +	 */
   6.275 +
   6.276 +	/* First invalidate entries for guests that disappeared after
   6.277 +	 * notification (e.g. shutdown/crash). This step prevents the
    6.278 +	 * notification array from filling up with stale/leaking entries.
   6.279 +	 */
   6.280 +	for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
   6.281 +		mn = &(mc_data.notify[(i % MAX_MCINFO)]);
   6.282 +		x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
   6.283 +		BUG_ON(mic == NULL);
   6.284 +		mig = (struct mcinfo_global *)mic;
   6.285 +		d = get_domain_by_id(mig->mc_domid);
   6.286 +		if (d == NULL) {
   6.287 +			/* Domain does not exist. */
   6.288 +			mn->valid = 0;
   6.289 +		}
   6.290 +		if ((!mn->valid) && (i == mc_data.notifyconsumer_idx))
   6.291 +			mc_data.notifyconsumer_idx++;
   6.292 +	}
   6.293 +
   6.294 +	/* Now put in the error telemetry. Since all error data fetchable
   6.295 +	 * by domUs are uncorrectable errors, they are very important.
   6.296 +	 * So we dump them before overriding them. When a guest takes that long,
   6.297 +	 * then we can assume something bad already happened (crash, hang, etc.)
   6.298 +	 */
   6.299 +	mn = &(mc_data.notify[(mc_data.notifyproducer_idx % MAX_MCINFO)]);
   6.300 +
   6.301 +	if (mn->valid) {
   6.302 +		struct mcinfo_common *mic = NULL;
   6.303 +		struct mcinfo_global *mig;
   6.304 +
    6.305 +		/* To not lose the information, we dump it. */
   6.306 +		x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
   6.307 +		BUG_ON(mic == NULL);
   6.308 +		mig = (struct mcinfo_global *)mic;
   6.309 +		printk(XENLOG_WARNING "Domain ID %u was notified by Dom0 to "
   6.310 +			"fetch machine check error telemetry. But Domain ID "
   6.311 +			"did not do that in time.\n",
   6.312 +			mig->mc_domid);
   6.313 +		x86_mcinfo_dump(&mn->mc);
   6.314 +	}
   6.315 +
   6.316 +	memcpy(&mn->mc, &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)),
   6.317 +		sizeof(struct mc_info));
   6.318 +	mn->fetch_idx = mc_notifydomain->fetch_idx;
   6.319 +	mn->valid = 1;
   6.320 +
   6.321 +	mc_data.notifyproducer_idx++;
   6.322 +
   6.323 +	/* By design there can never be more notifies than machine check errors.
   6.324 +	 * If that ever happens, then we hit a bug. */
   6.325 +	BUG_ON(mc_data.notifyproducer_idx > mc_data.fetch_idx);
   6.326 +	BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
   6.327 +}
   6.328 +
   6.329 +static struct mc_info *x86_mcinfo_getnotifiedptr(uint32_t *fetch_idx,
   6.330 +				const struct domain *d, const struct vcpu *v)
   6.331 +{
   6.332 +	struct mc_machine_notify *mn = NULL;
   6.333 +	uint32_t i;
   6.334 +	int found;
   6.335 +
   6.336 +	/* This function is called from the fetch hypercall with
   6.337 +	 * the mc_notify_lock spinlock held. Thus, no need for locking here.
   6.338 +	 */
   6.339 +
   6.340 +	/* The notifier data is filled in the order guests get notified, but
   6.341 +	 * guests may fetch them in a different order. That's why we need
   6.342 +	 * the game with valid/invalid entries. */
   6.343 +	found = 0;
   6.344 +	for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
   6.345 +		mn = &(mc_data.notify[(i % MAX_MCINFO)]);
   6.346 +		if (!mn->valid) {
   6.347 +			if (i == mc_data.notifyconsumer_idx)
   6.348 +				mc_data.notifyconsumer_idx++;
   6.349 +			continue;
   6.350 +		}
   6.351 +		if (x86_mcinfo_matches_guest(&mn->mc, d, v)) {
   6.352 +			found = 1;
   6.353 +			break;
   6.354 +		}
   6.355 +	}
   6.356 +
   6.357 +	if (!found) {
   6.358 +		/* This domain has never been notified. This must be
   6.359 +		 * a bogus domU command. */
   6.360 +		*fetch_idx = 0;
   6.361 +		return NULL;
   6.362 +	}
   6.363 +
   6.364 +	BUG_ON(mn == NULL);
   6.365 +	*fetch_idx = mn->fetch_idx;
   6.366 +	mn->valid = 0;
   6.367 +
   6.368 +	BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
   6.369 +	return &mn->mc;
   6.370 +}
   6.371 +
   6.372 +
   6.373 +void x86_mcinfo_clear(struct mc_info *mi)
   6.374 +{
   6.375 +	memset(mi, 0, sizeof(struct mc_info));
   6.376 +	x86_mcinfo_nentries(mi) = 0;
   6.377 +}
   6.378 +
   6.379 +
   6.380 +int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
   6.381 +{
   6.382 +	int i;
   6.383 +	unsigned long end1, end2;
   6.384 +	struct mcinfo_common *mic, *mic_base, *mic_index;
   6.385 +
   6.386 +	mic = (struct mcinfo_common *)mcinfo;
   6.387 +	mic_index = mic_base = x86_mcinfo_first(mi);
   6.388 +
   6.389 +	/* go to first free entry */
   6.390 +	for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
   6.391 +		mic_index = x86_mcinfo_next(mic_index);
   6.392 +	}
   6.393 +
   6.394 +	/* check if there is enough size */
   6.395 +	end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
   6.396 +	end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
   6.397 +
   6.398 +	if (end1 < end2)
   6.399 +		return -ENOSPC; /* No space. Can't add entry. */
   6.400 +
   6.401 +	/* there's enough space. add entry. */
   6.402 +	memcpy(mic_index, mic, mic->size);
   6.403 +	x86_mcinfo_nentries(mi)++;
   6.404 +
   6.405 +	return 0;
   6.406 +}
   6.407 +
   6.408 +
   6.409 +/* Dump machine check information in a format,
   6.410 + * mcelog can parse. This is used only when
   6.411 + * Dom0 does not take the notification. */
   6.412 +void x86_mcinfo_dump(struct mc_info *mi)
   6.413 +{
   6.414 +	struct mcinfo_common *mic = NULL;
   6.415 +	struct mcinfo_global *mc_global;
   6.416 +	struct mcinfo_bank *mc_bank;
   6.417 +
   6.418 +	/* first print the global info */
   6.419 +	x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
   6.420 +	if (mic == NULL)
   6.421 +		return;
   6.422 +	mc_global = (struct mcinfo_global *)mic;
   6.423 +	if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
   6.424 +		printk(XENLOG_WARNING
   6.425 +			"CPU%d: Machine Check Exception: %16"PRIx64"\n",
   6.426 +			mc_global->mc_coreid, mc_global->mc_gstatus);
   6.427 +	} else {
   6.428 +		printk(XENLOG_WARNING "MCE: The hardware reports a non "
   6.429 +			"fatal, correctable incident occured on "
   6.430 +			"CPU %d.\n",
   6.431 +			mc_global->mc_coreid);
   6.432 +	}
   6.433 +
   6.434 +	/* then the bank information */
   6.435 +	x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
   6.436 +	do {
   6.437 +		if (mic == NULL)
   6.438 +			return;
   6.439 +		if (mic->type != MC_TYPE_BANK)
   6.440 +			continue;
   6.441 +
   6.442 +		mc_bank = (struct mcinfo_bank *)mic;
   6.443 +	
   6.444 +		printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
   6.445 +			mc_bank->mc_bank,
   6.446 +			mc_bank->mc_status);
   6.447 +		if (mc_bank->mc_status & MCi_STATUS_MISCV)
   6.448 +			printk("[%16"PRIx64"]", mc_bank->mc_misc);
   6.449 +		if (mc_bank->mc_status & MCi_STATUS_ADDRV)
   6.450 +			printk(" at %16"PRIx64, mc_bank->mc_addr);
   6.451 +
   6.452 +		printk("\n");
   6.453 +		mic = x86_mcinfo_next(mic); /* next entry */
   6.454 +		if ((mic == NULL) || (mic->size == 0))
   6.455 +			break;
   6.456 +	} while (1);
   6.457 +}
   6.458 +
   6.459 +
   6.460 +
   6.461 +/* Machine Check Architecture Hypercall */
   6.462 +long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
   6.463 +{
   6.464 +	long ret = 0;
   6.465 +	struct xen_mc curop, *op = &curop;
   6.466 +	struct vcpu *v = current;
   6.467 +	struct domain *domU;
   6.468 +	struct xen_mc_fetch *mc_fetch;
   6.469 +	struct xen_mc_notifydomain *mc_notifydomain;
   6.470 +	struct mc_info *mi;
   6.471 +	uint32_t flags;
   6.472 +	uint32_t fetch_idx;
   6.473 +        uint16_t vcpuid;
   6.474 +	/* Use a different lock for the notify hypercall in order to allow
   6.475 +	 * a DomU to fetch mc data while Dom0 notifies another DomU. */
   6.476 +	static DEFINE_SPINLOCK(mc_lock);
   6.477 +	static DEFINE_SPINLOCK(mc_notify_lock);
   6.478 +
   6.479 +	if ( copy_from_guest(op, u_xen_mc, 1) )
   6.480 +		return -EFAULT;
   6.481 +
   6.482 +	if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
   6.483 +		return -EACCES;
   6.484 +
   6.485 +	switch ( op->cmd ) {
   6.486 +	case XEN_MC_fetch:
   6.487 +		/* This hypercall is for any domain */
   6.488 +		mc_fetch = &op->u.mc_fetch;
   6.489 +
   6.490 +		switch (mc_fetch->flags) {
   6.491 +		case XEN_MC_CORRECTABLE:
   6.492 +			/* But polling mode is Dom0 only, because
   6.493 +			 * correctable errors are reported to Dom0 only */
   6.494 +			if ( !IS_PRIV(v->domain) )
   6.495 +				return -EPERM;
   6.496 +			break;
   6.497 +
   6.498 +		case XEN_MC_TRAP:
   6.499 +			break;
   6.500 +		default:
   6.501 +			return -EFAULT;
   6.502 +		}
   6.503 +
   6.504 +		flags = XEN_MC_OK;
   6.505 +		spin_lock(&mc_lock);
   6.506 +
   6.507 +		if ( IS_PRIV(v->domain) ) {
   6.508 +			/* this must be Dom0. So a notify hypercall
   6.509 +			 * can't have happened before. */
   6.510 +			mi = x86_mcinfo_getfetchptr(&fetch_idx, dom0, v);
   6.511 +		} else {
   6.512 +			/* Hypercall comes from an unprivileged domain */
   6.513 +			domU = v->domain;
   6.514 +			if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
   6.515 +				/* Dom0 must have notified this DomU before
   6.516 +				 * via the notify hypercall. */
   6.517 +				mi = x86_mcinfo_getnotifiedptr(&fetch_idx, domU, v);
   6.518 +			} else {
   6.519 +				/* Xen notified the DomU. */
   6.520 +				mi = x86_mcinfo_getfetchptr(&fetch_idx, domU, v);
   6.521 +			}
   6.522 +		}
   6.523 +
   6.524 +		if (mi) {
   6.525 +			memcpy(&mc_fetch->mc_info, mi,
   6.526 +				sizeof(struct mc_info));
   6.527 +		} else {
   6.528 +			/* There is no data for a bogus DomU command. */
   6.529 +			flags |= XEN_MC_NODATA;
   6.530 +			memset(&mc_fetch->mc_info, 0, sizeof(struct mc_info));
   6.531 +		}
   6.532 +
   6.533 +		mc_fetch->flags = flags;
   6.534 +		mc_fetch->fetch_idx = fetch_idx;
   6.535 +
   6.536 +		if ( copy_to_guest(u_xen_mc, op, 1) )
   6.537 +			ret = -EFAULT;
   6.538 +
   6.539 +		spin_unlock(&mc_lock);
   6.540 +		break;
   6.541 +
   6.542 +	case XEN_MC_notifydomain:
   6.543 +		/* This hypercall is for Dom0 only */
   6.544 +		if ( !IS_PRIV(v->domain) )
   6.545 +			return -EPERM;
   6.546 +
   6.547 +		spin_lock(&mc_notify_lock);
   6.548 +
   6.549 +		mc_notifydomain = &op->u.mc_notifydomain;
   6.550 +		domU = get_domain_by_id(mc_notifydomain->mc_domid);
   6.551 +		vcpuid = mc_notifydomain->mc_vcpuid;
   6.552 +
   6.553 +		if ((domU == NULL) || (domU == dom0)) {
   6.554 +			/* It's not possible to notify a non-existent domain
   6.555 +			 * or the dom0. */
   6.556 +			spin_unlock(&mc_notify_lock);
   6.557 +			return -EACCES;
   6.558 +		}
   6.559 +
   6.560 +		if (vcpuid >= MAX_VIRT_CPUS) {
    6.561 +			/* It's not possible to notify a vcpu that Xen
    6.562 +			 * can't assign to a domain. */
   6.563 +			spin_unlock(&mc_notify_lock);
   6.564 +			return -EACCES;
   6.565 +		}
   6.566 +
   6.567 +		mc_notifydomain->flags = XEN_MC_OK;
   6.568 +
   6.569 +		mi = &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx));
   6.570 +		if (!x86_mcinfo_matches_guest(mi, domU, domU->vcpu[vcpuid])) {
    6.571 +			/* The error telemetry is not for the guest Dom0
    6.572 +			 * wants to notify. */
   6.573 +			mc_notifydomain->flags |= XEN_MC_NOMATCH;
   6.574 +		} else if ( guest_has_trap_callback(domU, vcpuid,
   6.575 +						TRAP_machine_check) )
   6.576 +		{
   6.577 +			/* Send notification */
   6.578 +			if ( send_guest_trap(domU, vcpuid, TRAP_machine_check) )
   6.579 +				mc_notifydomain->flags |= XEN_MC_NOTDELIVERED;
   6.580 +		} else
   6.581 +			mc_notifydomain->flags |= XEN_MC_CANNOTHANDLE;
   6.582 +
   6.583 +#ifdef DEBUG
   6.584 +		/* sanity check - these two flags are mutually exclusive */
   6.585 +		if ((flags & XEN_MC_CANNOTHANDLE) && (flags & XEN_MC_NOTDELIVERED))
   6.586 +			BUG();
   6.587 +#endif
   6.588 +
   6.589 +		if ( copy_to_guest(u_xen_mc, op, 1) )
   6.590 +			ret = -EFAULT;
   6.591 +
   6.592 +		if (ret == 0) {
   6.593 +			x86_mcinfo_marknotified(mc_notifydomain);
   6.594 +		}
   6.595 +
   6.596 +		spin_unlock(&mc_notify_lock);
   6.597 +		break;
   6.598 +	}
   6.599 +
   6.600 +	return ret;
   6.601 +}
     7.1 --- a/xen/arch/x86/cpu/mcheck/mce.h	Fri Jul 04 13:02:31 2008 +0100
     7.2 +++ b/xen/arch/x86/cpu/mcheck/mce.h	Fri Jul 04 16:27:44 2008 +0100
     7.3 @@ -1,14 +1,30 @@
     7.4  #include <xen/init.h>
     7.5 +#include <asm/traps.h>
     7.6  
     7.7 -void amd_mcheck_init(struct cpuinfo_x86 *c);
     7.8 +/* Init functions */
     7.9 +void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
    7.10 +void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
    7.11 +void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
    7.12 +void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
    7.13  void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
    7.14  void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
    7.15  void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
    7.16  void winchip_mcheck_init(struct cpuinfo_x86 *c);
    7.17  
    7.18 -/* Call the installed machine check handler for this CPU setup. */
    7.19 -extern fastcall void (*machine_check_vector)(struct cpu_user_regs *, long error_code);
    7.20 +/* Function pointer used in the handlers to collect additional information
    7.21 + * provided by newer CPU families/models without the need to duplicate
    7.22 + * the whole handler resulting in various handlers each with its own
    7.23 + * tweaks and bugs */
    7.24 +extern int (*mc_callback_bank_extended)(struct mc_info *mi,
    7.25 +		uint16_t bank, uint64_t status);
    7.26  
    7.27 +
    7.28 +/* Helper functions used for collecting error telemetry */
    7.29 +struct mc_info *x86_mcinfo_getptr(void);
    7.30 +void x86_mcinfo_clear(struct mc_info *mi);
    7.31 +int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
    7.32 +void x86_mcinfo_dump(struct mc_info *mi);
    7.33 +
    7.34 +/* Global variables */
    7.35  extern int mce_disabled __initdata;
    7.36 -extern int nr_mce_banks;
    7.37 -
    7.38 +extern unsigned int nr_mce_banks;
     8.1 --- a/xen/arch/x86/cpu/mcheck/non-fatal.c	Fri Jul 04 13:02:31 2008 +0100
     8.2 +++ b/xen/arch/x86/cpu/mcheck/non-fatal.c	Fri Jul 04 16:27:44 2008 +0100
     8.3 @@ -68,19 +68,29 @@ static int __init init_nonfatal_mce_chec
     8.4  	if (!cpu_has(c, X86_FEATURE_MCA))
     8.5  		return -ENODEV;
     8.6  
     8.7 -	/* Some Athlons misbehave when we frob bank 0 */
     8.8 -	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
     8.9 -		boot_cpu_data.x86 == 6)
    8.10 -			firstbank = 1;
    8.11 -	else
    8.12 -			firstbank = 0;
    8.13 -
    8.14  	/*
    8.15  	 * Check for non-fatal errors every MCE_RATE s
    8.16  	 */
    8.17 -	init_timer(&mce_timer, mce_work_fn, NULL, 0);
    8.18 -	set_timer(&mce_timer, NOW() + MCE_PERIOD);
    8.19 -	printk(KERN_INFO "Machine check exception polling timer started.\n");
    8.20 +	switch (c->x86_vendor) {
    8.21 +	case X86_VENDOR_AMD:
    8.22 +		if (c->x86 == 6) { /* K7 */
    8.23 +			firstbank = 1;
    8.24 +			init_timer(&mce_timer, mce_work_fn, NULL, 0);
    8.25 +			set_timer(&mce_timer, NOW() + MCE_PERIOD);
    8.26 +			break;
    8.27 +		}
    8.28 +
    8.29 +		/* Assume we are on K8 or newer AMD CPU here */
    8.30 +		amd_nonfatal_mcheck_init(c);
    8.31 +		break;
    8.32 +
    8.33 +	case X86_VENDOR_INTEL:
    8.34 +		init_timer(&mce_timer, mce_work_fn, NULL, 0);
    8.35 +		set_timer(&mce_timer, NOW() + MCE_PERIOD);
    8.36 +		break;
    8.37 +	}
    8.38 +
    8.39 +	printk(KERN_INFO "MCA: Machine check polling timer started.\n");
    8.40  	return 0;
    8.41  }
    8.42  __initcall(init_nonfatal_mce_checker);
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Fri Jul 04 16:27:44 2008 +0100
     9.3 @@ -0,0 +1,72 @@
     9.4 +/*
     9.5 + * MCA implementation for AMD K7/K8 CPUs
     9.6 + * Copyright (c) 2007 Advanced Micro Devices, Inc. 
     9.7 + *
     9.8 + * This program is free software; you can redistribute it and/or modify
     9.9 + * it under the terms of the GNU General Public License as published by
    9.10 + * the Free Software Foundation; either version 2 of the License, or
    9.11 + * (at your option) any later version.
    9.12 + *
    9.13 + * This program is distributed in the hope that it will be useful,
    9.14 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    9.15 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    9.16 + * GNU General Public License for more details.
    9.17 + *
    9.18 + * You should have received a copy of the GNU General Public License
    9.19 + * along with this program; if not, write to the Free Software
    9.20 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    9.21 + */
    9.22 +
    9.23 +
    9.24 +/* The MCA/MCE MSRs should not be used anywhere else.
    9.25 + * They are cpu family/model specific and are only for use
    9.26 + * in terms of machine check handling.
    9.27 + * So we define them here rather in <asm/msr.h>.
    9.28 + */
    9.29 +
    9.30 +
    9.31 +/* Bitfield of the MSR_IA32_MCG_CAP register */
    9.32 +#define MCG_CAP_COUNT           0x00000000000000ffULL
    9.33 +#define MCG_CTL_P               0x0000000000000100ULL
    9.34 +/* Bits 9-63 are reserved */
    9.35 +
    9.36 +/* Bitfield of the MSR_IA32_MCG_STATUS register */
    9.37 +#define MCG_STATUS_RIPV         0x0000000000000001ULL
    9.38 +#define MCG_STATUS_EIPV         0x0000000000000002ULL
    9.39 +#define MCG_STATUS_MCIP         0x0000000000000004ULL
    9.40 +/* Bits 3-63 are reserved */
    9.41 +
    9.42 +/* Bitfield of MSR_K8_MCi_STATUS registers */
    9.43 +/* MCA error code */
    9.44 +#define MCi_STATUS_MCA          0x000000000000ffffULL
    9.45 +/* model-specific error code */
    9.46 +#define MCi_STATUS_MSEC         0x00000000ffff0000ULL
    9.47 +/* Other information */
    9.48 +#define MCi_STATUS_OTHER        0x01ffffff00000000ULL
    9.49 +/* processor context corrupt */
    9.50 +#define MCi_STATUS_PCC          0x0200000000000000ULL
    9.51 +/* MSR_K8_MCi_ADDR register valid */
    9.52 +#define MCi_STATUS_ADDRV        0x0400000000000000ULL
    9.53 +/* MSR_K8_MCi_MISC register valid */
    9.54 +#define MCi_STATUS_MISCV        0x0800000000000000ULL
    9.55 +/* error condition enabled */
    9.56 +#define MCi_STATUS_EN           0x1000000000000000ULL
    9.57 +/* uncorrected error */
    9.58 +#define MCi_STATUS_UC           0x2000000000000000ULL
    9.59 +/* status register overflow */
    9.60 +#define MCi_STATUS_OVER         0x4000000000000000ULL
    9.61 +/* valid */
    9.62 +#define MCi_STATUS_VAL          0x8000000000000000ULL
    9.63 +
    9.64 +/* Bitfield of MSi_STATUS_OTHER field */
    9.65 +/* reserved bits */
    9.66 +#define MCi_STATUS_OTHER_RESERVED1      0x00001fff00000000ULL
    9.67 +/* uncorrectable ECC error */
    9.68 +#define MCi_STATUS_OTEHR_UC_ECC         0x0000200000000000ULL
    9.69 +/* correctable ECC error */
    9.70 +#define MCi_STATUS_OTHER_C_ECC          0x0000400000000000ULL
    9.71 +/* ECC syndrome of an ECC error */
    9.72 +#define MCi_STATUS_OTHER_ECC_SYNDROME   0x007f800000000000ULL
    9.73 +/* reserved bits */
    9.74 +#define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
    9.75 +
    10.1 --- a/xen/arch/x86/nmi.c	Fri Jul 04 13:02:31 2008 +0100
    10.2 +++ b/xen/arch/x86/nmi.c	Fri Jul 04 16:27:44 2008 +0100
    10.3 @@ -457,10 +457,10 @@ static void do_nmi_stats(unsigned char k
    10.4      if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
    10.5          return;
    10.6  
    10.7 -    if ( v->nmi_pending || v->nmi_masked )
    10.8 +    if ( v->nmi_pending || (v->trap_priority >= VCPU_TRAP_NMI) )
    10.9          printk("dom0 vpu0: NMI %s%s\n",
   10.10                 v->nmi_pending ? "pending " : "",
   10.11 -               v->nmi_masked  ? "masked " : "");
   10.12 +               (v->trap_priority >= VCPU_TRAP_NMI)  ? "masked " : "");
   10.13      else
   10.14          printk("dom0 vcpu0: NMI neither pending nor masked\n");
   10.15  }
    11.1 --- a/xen/arch/x86/traps.c	Fri Jul 04 13:02:31 2008 +0100
    11.2 +++ b/xen/arch/x86/traps.c	Fri Jul 04 16:27:44 2008 +0100
    11.3 @@ -61,6 +61,7 @@
    11.4  #include <asm/msr.h>
    11.5  #include <asm/shared.h>
    11.6  #include <asm/x86_emulate.h>
    11.7 +#include <asm/traps.h>
    11.8  #include <asm/hvm/vpt.h>
    11.9  #include <public/arch-x86/cpuid.h>
   11.10  
   11.11 @@ -486,6 +487,20 @@ static unsigned int check_guest_io_break
   11.12  }
   11.13  
   11.14  /*
   11.15 + * Called from asm to set up the MCE trapbounce info.
   11.16 + * Returns 0 if no callback is set up, else 1.
   11.17 + */
   11.18 +asmlinkage int set_guest_machinecheck_trapbounce(void)
   11.19 +{
   11.20 +    struct vcpu *v = current;
   11.21 +    struct trap_bounce *tb = &v->arch.trap_bounce;
   11.22 + 
   11.23 +    do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0);
   11.24 +    tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */
   11.25 +    return !null_trap_bounce(v, tb);
   11.26 +}
   11.27 +
   11.28 +/*
   11.29   * Called from asm to set up the NMI trapbounce info.
   11.30   * Returns 0 if no callback is set up, else 1.
   11.31   */
   11.32 @@ -904,8 +919,6 @@ asmlinkage void do_int3(struct cpu_user_
   11.33  
   11.34  asmlinkage void do_machine_check(struct cpu_user_regs *regs)
   11.35  {
   11.36 -    extern fastcall void (*machine_check_vector)(
   11.37 -        struct cpu_user_regs *, long error_code);
   11.38      machine_check_vector(regs, regs->error_code);
   11.39  }
   11.40  
   11.41 @@ -2678,25 +2691,51 @@ asmlinkage void do_general_protection(st
   11.42      panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code);
   11.43  }
   11.44  
   11.45 +static DEFINE_PER_CPU(struct softirq_trap, softirq_trap);
   11.46 +
   11.47  static void nmi_mce_softirq(void)
   11.48  {
   11.49 -    /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
   11.50 -    vcpu_kick(dom0->vcpu[0]);
   11.51 +    int cpu = smp_processor_id();
   11.52 +    struct softirq_trap *st = &per_cpu(softirq_trap, cpu);
   11.53 +    cpumask_t affinity;
   11.54 +
   11.55 +    BUG_ON(st == NULL);
   11.56 +    BUG_ON(st->vcpu == NULL);
   11.57 +
   11.58 +    /* Set the tmp value unconditionally, so that
   11.59 +     * the check in the iret hypercall works. */
   11.60 +    st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity;
   11.61 +
   11.62 +    if ((cpu != st->processor)
   11.63 +       || (st->processor != st->vcpu->processor))
   11.64 +    {
   11.65 +        /* We are on a different physical cpu.
   11.66 +         * Make sure to wakeup the vcpu on the
   11.67 +         * specified processor.
   11.68 +         */
   11.69 +        cpus_clear(affinity);
   11.70 +        cpu_set(st->processor, affinity);
   11.71 +        vcpu_set_affinity(st->vcpu, &affinity);
   11.72 +
   11.73 +        /* Affinity is restored in the iret hypercall. */
   11.74 +    }
   11.75 +
   11.76 +    /* Only used to defer wakeup of domain/vcpu to
   11.77 +     * a safe (non-NMI/MCE) context.
   11.78 +     */
   11.79 +    vcpu_kick(st->vcpu);
   11.80  }
   11.81  
   11.82  static void nmi_dom0_report(unsigned int reason_idx)
   11.83  {
   11.84 -    struct domain *d;
   11.85 -    struct vcpu   *v;
   11.86 -
   11.87 -    if ( ((d = dom0) == NULL) || ((v = d->vcpu[0]) == NULL) )
   11.88 +    struct domain *d = dom0;
   11.89 +
   11.90 +    if ( (d == NULL) || (d->vcpu[0] == NULL) )
   11.91          return;
   11.92  
   11.93      set_bit(reason_idx, nmi_reason(d));
   11.94  
   11.95 -    /* Not safe to wake a vcpu here, or even to schedule a tasklet! */
   11.96 -    if ( !test_and_set_bool(v->nmi_pending) )
   11.97 -        raise_softirq(NMI_MCE_SOFTIRQ);
   11.98 +    send_guest_trap(d, 0, TRAP_nmi);
   11.99  }
  11.100  
  11.101  asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
  11.102 @@ -3010,6 +3049,70 @@ long unregister_guest_nmi_callback(void)
  11.103      return 0;
  11.104  }
  11.105  
  11.106 +int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
  11.107 +{
  11.108 +    struct vcpu *v;
  11.109 +    struct trap_info *t;
  11.110 +
  11.111 +    BUG_ON(d == NULL);
  11.112 +    BUG_ON(vcpuid >= MAX_VIRT_CPUS);
  11.113 +
  11.114 +    /* Sanity check - XXX should be more fine grained. */
  11.115 +    BUG_ON(trap_nr > TRAP_syscall);
  11.116 +
  11.117 +    v = d->vcpu[vcpuid];
  11.118 +    t = &v->arch.guest_context.trap_ctxt[trap_nr];
  11.119 +
  11.120 +    return (t->address != 0);
  11.121 +}
  11.122 +
  11.123 +
  11.124 +int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
  11.125 +{
  11.126 +    struct vcpu *v;
  11.127 +    struct softirq_trap *st;
  11.128 +
  11.129 +    BUG_ON(d == NULL);
  11.130 +    BUG_ON(vcpuid >= MAX_VIRT_CPUS);
  11.131 +    v = d->vcpu[vcpuid];
  11.132 +
  11.133 +    switch (trap_nr) {
  11.134 +    case TRAP_nmi:
  11.135 +        if ( !test_and_set_bool(v->nmi_pending) ) {
  11.136 +               st = &per_cpu(softirq_trap, smp_processor_id());
  11.137 +               st->domain = dom0;
  11.138 +               st->vcpu = dom0->vcpu[0];
  11.139 +               st->processor = st->vcpu->processor;
  11.140 +
  11.141 +               /* not safe to wake up a vcpu here */
  11.142 +               raise_softirq(NMI_MCE_SOFTIRQ);
  11.143 +               return 0;
  11.144 +        }
  11.145 +        break;
  11.146 +
  11.147 +    case TRAP_machine_check:
  11.148 +
  11.149 +        /* We are called by the machine check (exception or polling) handlers
  11.150 +         * on the physical CPU that reported a machine check error. */
  11.151 +
  11.152 +        if ( !test_and_set_bool(v->mce_pending) ) {
  11.153 +                st = &per_cpu(softirq_trap, smp_processor_id());
  11.154 +                st->domain = d;
  11.155 +                st->vcpu = v;
  11.156 +                st->processor = v->processor;
  11.157 +
  11.158 +                /* not safe to wake up a vcpu here */
  11.159 +                raise_softirq(NMI_MCE_SOFTIRQ);
  11.160 +                return 0;
  11.161 +        }
  11.162 +        break;
  11.163 +    }
  11.164 +
  11.165 +    /* delivery failed */
  11.166 +    return -EIO;
  11.167 +}
  11.168 +
  11.169 +
  11.170  long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
  11.171  {
  11.172      struct trap_info cur;
    12.1 --- a/xen/arch/x86/x86_32/asm-offsets.c	Fri Jul 04 13:02:31 2008 +0100
    12.2 +++ b/xen/arch/x86/x86_32/asm-offsets.c	Fri Jul 04 16:27:44 2008 +0100
    12.3 @@ -67,7 +67,11 @@ void __dummy__(void)
    12.4             arch.guest_context.kernel_sp);
    12.5      OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags);
    12.6      OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending);
    12.7 -    OFFSET(VCPU_nmi_masked, struct vcpu, nmi_masked);
    12.8 +    OFFSET(VCPU_mce_pending, struct vcpu, mce_pending);
    12.9 +    OFFSET(VCPU_old_trap_priority, struct vcpu, old_trap_priority);
   12.10 +    OFFSET(VCPU_trap_priority, struct vcpu, trap_priority);
   12.11 +    DEFINE(VCPU_TRAP_NMI, VCPU_TRAP_NMI);
   12.12 +    DEFINE(VCPU_TRAP_MCE, VCPU_TRAP_MCE);
   12.13      DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events);
   12.14      BLANK();
   12.15  
    13.1 --- a/xen/arch/x86/x86_32/entry.S	Fri Jul 04 13:02:31 2008 +0100
    13.2 +++ b/xen/arch/x86/x86_32/entry.S	Fri Jul 04 16:27:44 2008 +0100
    13.3 @@ -229,6 +229,8 @@ test_all_events:
    13.4          shl  $IRQSTAT_shift,%eax
    13.5          test %ecx,irq_stat(%eax,1)
    13.6          jnz  process_softirqs
    13.7 +        testb $1,VCPU_mce_pending(%ebx)
    13.8 +        jnz  process_mce
    13.9          testb $1,VCPU_nmi_pending(%ebx)
   13.10          jnz  process_nmi
   13.11  test_guest_events:
   13.12 @@ -255,15 +257,35 @@ process_softirqs:
   13.13          jmp  test_all_events
   13.14  
   13.15          ALIGN
   13.16 +/* %ebx: struct vcpu */
   13.17 +process_mce:
   13.18 +        cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%ebx)
   13.19 +        jae  test_guest_events
   13.20 +        sti
   13.21 +        movb $0,VCPU_mce_pending(%ebx)
   13.22 +        call set_guest_machinecheck_trapbounce
   13.23 +        test %eax,%eax
   13.24 +        jz   test_all_events
   13.25 +        movw VCPU_trap_priority(%ebx),%dx           # safe priority for the
   13.26 +        movw %dx,VCPU_old_trap_priority(%ebx)       # iret hypercall
   13.27 +        movw $VCPU_TRAP_MCE,VCPU_trap_priority(%ebx)
   13.28 +        jmp process_trap
   13.29 +
   13.30 +        ALIGN
   13.31 +/* %ebx: struct vcpu */
   13.32  process_nmi:
   13.33 -        testb $1,VCPU_nmi_masked(%ebx)
   13.34 -        jnz  test_guest_events
   13.35 +        cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%ebx)
   13.36 +        jae  test_guest_events
   13.37          sti
   13.38          movb $0,VCPU_nmi_pending(%ebx)
   13.39          call set_guest_nmi_trapbounce
   13.40          test %eax,%eax
   13.41          jz   test_all_events
   13.42 -        movb $1,VCPU_nmi_masked(%ebx)
   13.43 +        movw VCPU_trap_priority(%ebx),%dx           # safe priority for the
   13.44 +        movw %dx,VCPU_old_trap_priority(%ebx)       # iret hypercall
   13.45 +        movw $VCPU_TRAP_NMI,VCPU_trap_priority(%ebx)
   13.46 +        /* FALLTHROUGH */
   13.47 +process_trap:
   13.48          leal VCPU_trap_bounce(%ebx),%edx
   13.49          call create_bounce_frame
   13.50          jmp  test_all_events
   13.51 @@ -681,6 +703,10 @@ ENTRY(hypercall_table)
   13.52          .long do_sysctl             /* 35 */
   13.53          .long do_domctl
   13.54          .long do_kexec_op
   13.55 +        .rept __HYPERVISOR_arch_0-((.-hypercall_table)/4)
   13.56 +        .long do_ni_hypercall
   13.57 +        .endr
   13.58 +        .long do_mca                /* 48 */
   13.59          .rept NR_hypercalls-((.-hypercall_table)/4)
   13.60          .long do_ni_hypercall
   13.61          .endr
   13.62 @@ -724,6 +750,10 @@ ENTRY(hypercall_args_table)
   13.63          .byte 1 /* do_sysctl            */  /* 35 */
   13.64          .byte 1 /* do_domctl            */
   13.65          .byte 2 /* do_kexec_op          */
   13.66 +        .rept __HYPERVISOR_arch_0-(.-hypercall_args_table)
   13.67 +        .byte 0 /* do_ni_hypercall      */
   13.68 +        .endr
   13.69 +        .byte 1 /* do_mca               */  /* 48 */
   13.70          .rept NR_hypercalls-(.-hypercall_args_table)
   13.71          .byte 0 /* do_ni_hypercall      */
   13.72          .endr
    14.1 --- a/xen/arch/x86/x86_32/traps.c	Fri Jul 04 13:02:31 2008 +0100
    14.2 +++ b/xen/arch/x86/x86_32/traps.c	Fri Jul 04 16:27:44 2008 +0100
    14.3 @@ -255,8 +255,13 @@ unsigned long do_iret(void)
    14.4              goto exit_and_crash;
    14.5      }
    14.6  
    14.7 -    /* No longer in NMI context. */
    14.8 -    v->nmi_masked = 0;
    14.9 +    /* Restore affinity.  */
   14.10 +    if ((v->trap_priority >= VCPU_TRAP_NMI)
   14.11 +       && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
   14.12 +        vcpu_set_affinity(v, &v->cpu_affinity_tmp);
   14.13 +
   14.14 +    /* Restore previous trap priority */
   14.15 +    v->trap_priority = v->old_trap_priority;
   14.16  
   14.17      /* Restore upcall mask from supplied EFLAGS.IF. */
   14.18      vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF);
    15.1 --- a/xen/arch/x86/x86_64/asm-offsets.c	Fri Jul 04 13:02:31 2008 +0100
    15.2 +++ b/xen/arch/x86/x86_64/asm-offsets.c	Fri Jul 04 16:27:44 2008 +0100
    15.3 @@ -92,7 +92,11 @@ void __dummy__(void)
    15.4      OFFSET(VCPU_kernel_ss, struct vcpu, arch.guest_context.kernel_ss);
    15.5      OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags);
    15.6      OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending);
    15.7 -    OFFSET(VCPU_nmi_masked, struct vcpu, nmi_masked);
    15.8 +    OFFSET(VCPU_mce_pending, struct vcpu, mce_pending);
    15.9 +    OFFSET(VCPU_old_trap_priority, struct vcpu, old_trap_priority);
   15.10 +    OFFSET(VCPU_trap_priority, struct vcpu, trap_priority);
   15.11 +    DEFINE(VCPU_TRAP_NMI, VCPU_TRAP_NMI);
   15.12 +    DEFINE(VCPU_TRAP_MCE, VCPU_TRAP_MCE);
   15.13      DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events);
   15.14      DEFINE(_VGCF_syscall_disables_events,  _VGCF_syscall_disables_events);
   15.15      BLANK();
    16.1 --- a/xen/arch/x86/x86_64/compat/entry.S	Fri Jul 04 13:02:31 2008 +0100
    16.2 +++ b/xen/arch/x86/x86_64/compat/entry.S	Fri Jul 04 16:27:44 2008 +0100
    16.3 @@ -101,6 +101,8 @@ ENTRY(compat_test_all_events)
    16.4          leaq  irq_stat(%rip),%rcx
    16.5          testl $~0,(%rcx,%rax,1)
    16.6          jnz   compat_process_softirqs
    16.7 +        testb $1,VCPU_mce_pending(%rbx)
    16.8 +        jnz   compat_process_mce
    16.9          testb $1,VCPU_nmi_pending(%rbx)
   16.10          jnz   compat_process_nmi
   16.11  compat_test_guest_events:
   16.12 @@ -129,15 +131,34 @@ compat_process_softirqs:
   16.13  
   16.14  	ALIGN
   16.15  /* %rbx: struct vcpu */
   16.16 +compat_process_mce:
   16.17 +        cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx)
   16.18 +        jae  compat_test_guest_events
   16.19 +        sti
   16.20 +        movb $0,VCPU_mce_pending(%rbx)
   16.21 +        call set_guest_machinecheck_trapbounce
   16.22 +        testl %eax,%eax
   16.23 +        jz    compat_test_all_events
   16.24 +        movw VCPU_trap_priority(%rbx),%dx           # safe priority for the
   16.25 +        movw %dx,VCPU_old_trap_priority(%rbx)       # iret hypercall
   16.26 +        movw  $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx)
   16.27 +        jmp   compat_process_trap
   16.28 +
   16.29 +	ALIGN
   16.30 +/* %rbx: struct vcpu */
   16.31  compat_process_nmi:
   16.32 -        testb $1,VCPU_nmi_masked(%rbx)
   16.33 -        jnz   compat_test_guest_events
   16.34 +        cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx)
   16.35 +        jae   compat_test_guest_events
   16.36          sti
   16.37          movb  $0,VCPU_nmi_pending(%rbx)
   16.38          call  set_guest_nmi_trapbounce
   16.39          testl %eax,%eax
   16.40          jz    compat_test_all_events
   16.41 -        movb  $1,VCPU_nmi_masked(%rbx)
   16.42 +        movw VCPU_trap_priority(%rbx),%dx           # safe priority for the
   16.43 +        movw %dx,VCPU_old_trap_priority(%rbx)       # iret hypercall
   16.44 +        movw  $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx)
   16.45 +        /* FALLTHROUGH */
   16.46 +compat_process_trap:
   16.47          leaq  VCPU_trap_bounce(%rbx),%rdx
   16.48          call  compat_create_bounce_frame
   16.49          jmp   compat_test_all_events
   16.50 @@ -386,6 +407,10 @@ ENTRY(compat_hypercall_table)
   16.51          .quad do_sysctl                 /* 35 */
   16.52          .quad do_domctl
   16.53          .quad compat_kexec_op
   16.54 +        .rept __HYPERVISOR_arch_0-((.-compat_hypercall_table)/8)
   16.55 +        .quad compat_ni_hypercall
   16.56 +        .endr
   16.57 +        .quad do_mca                    /* 48 */
   16.58          .rept NR_hypercalls-((.-compat_hypercall_table)/8)
   16.59          .quad compat_ni_hypercall
   16.60          .endr
   16.61 @@ -429,6 +454,10 @@ ENTRY(compat_hypercall_args_table)
   16.62          .byte 1 /* do_sysctl                */  /* 35 */
   16.63          .byte 1 /* do_domctl                */
   16.64          .byte 2 /* compat_kexec_op          */
   16.65 +        .rept __HYPERVISOR_arch_0-(.-compat_hypercall_args_table)
   16.66 +        .byte 0 /* compat_ni_hypercall      */
   16.67 +        .endr
   16.68 +        .byte 1 /* do_mca                   */
   16.69          .rept NR_hypercalls-(.-compat_hypercall_args_table)
   16.70          .byte 0 /* compat_ni_hypercall      */
   16.71          .endr
    17.1 --- a/xen/arch/x86/x86_64/compat/traps.c	Fri Jul 04 13:02:31 2008 +0100
    17.2 +++ b/xen/arch/x86/x86_64/compat/traps.c	Fri Jul 04 16:27:44 2008 +0100
    17.3 @@ -121,8 +121,13 @@ unsigned int compat_iret(void)
    17.4      else
    17.5          regs->_esp += 16;
    17.6  
    17.7 -    /* No longer in NMI context. */
    17.8 -    v->nmi_masked = 0;
    17.9 +    /* Restore affinity.  */
   17.10 +    if ((v->trap_priority >= VCPU_TRAP_NMI)
   17.11 +       && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
   17.12 +        vcpu_set_affinity(v, &v->cpu_affinity_tmp);
   17.13 +
   17.14 +    /* Restore previous trap priority */
   17.15 +    v->trap_priority = v->old_trap_priority;
   17.16  
   17.17      /* Restore upcall mask from supplied EFLAGS.IF. */
   17.18      vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF);
    18.1 --- a/xen/arch/x86/x86_64/entry.S	Fri Jul 04 13:02:31 2008 +0100
    18.2 +++ b/xen/arch/x86/x86_64/entry.S	Fri Jul 04 16:27:44 2008 +0100
    18.3 @@ -205,6 +205,8 @@ test_all_events:
    18.4          leaq  irq_stat(%rip),%rcx
    18.5          testl $~0,(%rcx,%rax,1)
    18.6          jnz   process_softirqs
    18.7 +        testb $1,VCPU_mce_pending(%rbx)
    18.8 +        jnz   process_mce
    18.9          testb $1,VCPU_nmi_pending(%rbx)
   18.10          jnz   process_nmi
   18.11  test_guest_events:
   18.12 @@ -231,15 +233,34 @@ process_softirqs:
   18.13  
   18.14          ALIGN
   18.15  /* %rbx: struct vcpu */
   18.16 +process_mce:
   18.17 +        cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx)
   18.18 +        jae  test_guest_events
   18.19 +        sti
   18.20 +        movb $0,VCPU_mce_pending(%rbx)
   18.21 +        call set_guest_machinecheck_trapbounce
   18.22 +        test %eax,%eax
   18.23 +        jz   test_all_events
   18.24 +        movw VCPU_trap_priority(%rbx),%dx           # safe priority for the
   18.25 +        movw %dx,VCPU_old_trap_priority(%rbx)       # iret hypercall
   18.26 +        movw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx)
   18.27 +        jmp  process_trap
   18.28 +
   18.29 +        ALIGN
   18.30 +/* %rbx: struct vcpu */
   18.31  process_nmi:
   18.32 -        testb $1,VCPU_nmi_masked(%rbx)
   18.33 -        jnz  test_guest_events
   18.34 +        cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx)
   18.35 +        jae  test_guest_events
   18.36          sti
   18.37          movb $0,VCPU_nmi_pending(%rbx)
   18.38          call set_guest_nmi_trapbounce
   18.39          test %eax,%eax
   18.40          jz   test_all_events
   18.41 -        movb $1,VCPU_nmi_masked(%rbx)
   18.42 +        movw VCPU_trap_priority(%rbx),%dx           # safe priority for the
   18.43 +        movw %dx,VCPU_old_trap_priority(%rbx)       # iret hypercall
   18.44 +        movw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx)
   18.45 +        /* FALLTHROUGH */
   18.46 +process_trap:
   18.47          leaq VCPU_trap_bounce(%rbx),%rdx
   18.48          call create_bounce_frame
   18.49          jmp  test_all_events
   18.50 @@ -671,6 +692,10 @@ ENTRY(hypercall_table)
   18.51          .quad do_sysctl             /* 35 */
   18.52          .quad do_domctl
   18.53          .quad do_kexec_op
   18.54 +        .rept __HYPERVISOR_arch_0-((.-hypercall_table)/8)
   18.55 +        .quad do_ni_hypercall
   18.56 +        .endr
   18.57 +        .quad do_mca                /* 48 */
   18.58          .rept NR_hypercalls-((.-hypercall_table)/8)
   18.59          .quad do_ni_hypercall
   18.60          .endr
   18.61 @@ -715,6 +740,10 @@ ENTRY(hypercall_args_table)
   18.62          .byte 1 /* do_domctl            */
   18.63          .byte 2 /* do_kexec             */
   18.64          .byte 1 /* do_xsm_op            */
   18.65 +        .rept __HYPERVISOR_arch_0-(.-hypercall_args_table)
   18.66 +        .byte 0 /* do_ni_hypercall      */
   18.67 +        .endr
   18.68 +        .byte 1 /* do_mca               */  /* 48 */
   18.69          .rept NR_hypercalls-(.-hypercall_args_table)
   18.70          .byte 0 /* do_ni_hypercall      */
   18.71          .endr
    19.1 --- a/xen/arch/x86/x86_64/traps.c	Fri Jul 04 13:02:31 2008 +0100
    19.2 +++ b/xen/arch/x86/x86_64/traps.c	Fri Jul 04 16:27:44 2008 +0100
    19.3 @@ -288,8 +288,13 @@ unsigned long do_iret(void)
    19.4          regs->rcx = iret_saved.rcx;
    19.5      }
    19.6  
    19.7 -    /* No longer in NMI context. */
    19.8 -    v->nmi_masked = 0;
    19.9 +    /* Restore affinity.  */
   19.10 +    if ((v->trap_priority >= VCPU_TRAP_NMI)
   19.11 +       && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
   19.12 +        vcpu_set_affinity(v, &v->cpu_affinity_tmp);
   19.13 +
   19.14 +    /* Restore previous trap priority */
   19.15 +    v->trap_priority = v->old_trap_priority;
   19.16  
   19.17      /* Restore upcall mask from supplied EFLAGS.IF. */
   19.18      vcpu_info(v, evtchn_upcall_mask) = !(iret_saved.rflags & EF_IE);
    20.1 --- a/xen/common/domain.c	Fri Jul 04 13:02:31 2008 +0100
    20.2 +++ b/xen/common/domain.c	Fri Jul 04 16:27:44 2008 +0100
    20.3 @@ -654,7 +654,9 @@ void vcpu_reset(struct vcpu *v)
    20.4      v->is_polling      = 0;
    20.5      v->is_initialised  = 0;
    20.6      v->nmi_pending     = 0;
    20.7 -    v->nmi_masked      = 0;
    20.8 +    v->mce_pending     = 0;
    20.9 +    v->old_trap_priority = VCPU_TRAP_NONE;
   20.10 +    v->trap_priority   = VCPU_TRAP_NONE;
   20.11      clear_bit(_VPF_blocked, &v->pause_flags);
   20.12  
   20.13      domain_unlock(v->domain);
    21.1 --- a/xen/common/event_channel.c	Fri Jul 04 13:02:31 2008 +0100
    21.2 +++ b/xen/common/event_channel.c	Fri Jul 04 16:27:44 2008 +0100
    21.3 @@ -587,6 +587,21 @@ void send_guest_vcpu_virq(struct vcpu *v
    21.4      evtchn_set_pending(v, port);
    21.5  }
    21.6  
    21.7 +int guest_enabled_event(struct vcpu *v, int virq)
    21.8 +{
    21.9 +    int port;
   21.10 +
   21.11 +    if ( unlikely(v == NULL) )
   21.12 +        return 0;
   21.13 +
   21.14 +    port = v->virq_to_evtchn[virq];
   21.15 +    if ( port == 0 )
   21.16 +        return 0;
   21.17 +
   21.18 +    /* virq is in use */
   21.19 +    return 1;
   21.20 +}
   21.21 +
   21.22  void send_guest_global_virq(struct domain *d, int virq)
   21.23  {
   21.24      int port;
    22.1 --- a/xen/include/Makefile	Fri Jul 04 13:02:31 2008 +0100
    22.2 +++ b/xen/include/Makefile	Fri Jul 04 16:27:44 2008 +0100
    22.3 @@ -20,6 +20,7 @@ headers-y := \
    22.4      compat/xen.h \
    22.5      compat/xencomm.h \
    22.6      compat/xenoprof.h
    22.7 +headers-$(CONFIG_X86)     += compat/arch-x86/xen-mca.h
    22.8  headers-$(CONFIG_X86)     += compat/arch-x86/xen.h
    22.9  headers-$(CONFIG_X86)     += compat/arch-x86/xen-$(compat-arch-y).h
   22.10  headers-y                 += compat/arch-$(compat-arch-y).h compat/xlat.h
    23.1 --- a/xen/include/asm-x86/event.h	Fri Jul 04 13:02:31 2008 +0100
    23.2 +++ b/xen/include/asm-x86/event.h	Fri Jul 04 16:27:44 2008 +0100
    23.3 @@ -69,7 +69,12 @@ static inline void local_event_delivery_
    23.4  /* No arch specific virq definition now. Default to global. */
    23.5  static inline int arch_virq_is_global(int virq)
    23.6  {
    23.7 -    return 1;
    23.8 +    switch (virq) {
    23.9 +    case VIRQ_MCA:
   23.10 +        return 1;
   23.11 +    default:
   23.12 +        return 1;
   23.13 +    }
   23.14  }
   23.15  
   23.16  #endif
    24.1 --- a/xen/include/asm-x86/mm.h	Fri Jul 04 13:02:31 2008 +0100
    24.2 +++ b/xen/include/asm-x86/mm.h	Fri Jul 04 16:27:44 2008 +0100
    24.3 @@ -141,6 +141,9 @@ static inline u32 pickle_domptr(struct d
    24.4  #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
    24.5  #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
    24.6  
    24.7 +#define maddr_get_owner(ma)   (page_get_owner(maddr_to_page((ma))))
    24.8 +#define vaddr_get_owner(va)   (page_get_owner(virt_to_page((va))))
    24.9 +
   24.10  #define XENSHARE_writable 0
   24.11  #define XENSHARE_readonly 1
   24.12  extern void share_xen_page_with_guest(
    25.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    25.2 +++ b/xen/include/asm-x86/traps.h	Fri Jul 04 16:27:44 2008 +0100
    25.3 @@ -0,0 +1,50 @@
    25.4 +/*
    25.5 + * Copyright (c) 2007, 2008 Advanced Micro Devices, Inc.
    25.6 + * Author: Christoph Egger <Christoph.Egger@amd.com>
    25.7 + *
    25.8 + * This program is free software; you can redistribute it and/or modify
    25.9 + * it under the terms of the GNU General Public License as published by
   25.10 + * the Free Software Foundation; either version 2 of the License, or
   25.11 + * (at your option) any later version.
   25.12 + *
   25.13 + * This program is distributed in the hope that it will be useful,
   25.14 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   25.15 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   25.16 + * GNU General Public License for more details.
   25.17 + *
   25.18 + * You should have received a copy of the GNU General Public License
   25.19 + * along with this program; if not, write to the Free Software
   25.20 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   25.21 + */
   25.22 +
   25.23 +#ifndef ASM_TRAP_H
   25.24 +#define ASM_TRAP_H
   25.25 +
   25.26 +struct softirq_trap {
   25.27 +	struct domain *domain;  /* domain to inject trap */
   25.28 +	struct vcpu *vcpu;	/* vcpu to inject trap */
   25.29 +	int processor;		/* physical cpu to inject trap */
   25.30 +};
   25.31 +
   25.32 +struct cpu_user_regs;
   25.33 +
   25.34 +extern void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code);
   25.35 + 
   25.36 +/**
   25.37 + * guest_has_trap_callback
   25.38 + *
   25.39 + * returns true (non-zero) if guest registered a trap handler
   25.40 + */
   25.41 +extern int guest_has_trap_callback(struct domain *d, uint16_t vcpuid,
   25.42 +				unsigned int trap_nr);
   25.43 +
   25.44 +/**
   25.45 + * send_guest_trap
   25.46 + *
   25.47 + * delivers trap to guest analogous to send_guest_global_virq
   25.48 + * return 0 on successful delivery
   25.49 + */
   25.50 +extern int send_guest_trap(struct domain *d, uint16_t vcpuid,
   25.51 +				unsigned int trap_nr);
   25.52 +
   25.53 +#endif /* ASM_TRAP_H */
    26.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    26.2 +++ b/xen/include/public/arch-x86/xen-mca.h	Fri Jul 04 16:27:44 2008 +0100
    26.3 @@ -0,0 +1,279 @@
    26.4 +/******************************************************************************
    26.5 + * arch-x86/xen-mca.h
    26.6 + * 
    26.7 + * Contributed by Advanced Micro Devices, Inc.
    26.8 + * Author: Christoph Egger <Christoph.Egger@amd.com>
    26.9 + *
   26.10 + * Guest OS machine check interface to x86 Xen.
   26.11 + * 
   26.12 + * Permission is hereby granted, free of charge, to any person obtaining a copy
   26.13 + * of this software and associated documentation files (the "Software"), to
   26.14 + * deal in the Software without restriction, including without limitation the
   26.15 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
   26.16 + * sell copies of the Software, and to permit persons to whom the Software is
   26.17 + * furnished to do so, subject to the following conditions:
   26.18 + *
   26.19 + * The above copyright notice and this permission notice shall be included in
   26.20 + * all copies or substantial portions of the Software.
   26.21 + *
   26.22 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   26.23 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   26.24 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   26.25 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   26.26 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   26.27 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
   26.28 + * DEALINGS IN THE SOFTWARE.
   26.29 + */
   26.30 +
   26.31 +/* Full MCA functionality has the following Usecases from the guest side:
   26.32 + *
   26.33 + * Must have's:
   26.34 + * 1. Dom0 and DomU register machine check trap callback handlers
   26.35 + *    (already done via "set_trap_table" hypercall)
   26.36 + * 2. Dom0 registers machine check event callback handler
   26.37 + *    (doable via EVTCHNOP_bind_virq)
   26.38 + * 3. Dom0 and DomU fetches machine check data
   26.39 + * 4. Dom0 wants Xen to notify a DomU
   26.40 + * 5. Dom0 gets DomU ID from physical address
   26.41 + * 6. Dom0 wants Xen to kill DomU (already done for "xm destroy")
   26.42 + *
   26.43 + * Nice to have's:
   26.44 + * 7. Dom0 wants Xen to deactivate a physical CPU
   26.45 + *    This is better done as separate task, physical CPU hotplugging,
   26.46 + *    and hypercall(s) should be sysctl's
   26.47 + * 8. Page migration proposed from Xen NUMA work, where Dom0 can tell Xen to
   26.48 + *    move a DomU (or Dom0 itself) away from a malicious page
   26.49 + *    producing correctable errors.
   26.50 + * 9. offlining physical page:
   26.51 + *    Xen free's and never re-uses a certain physical page.
   26.52 + * 10. Testfacility: Allow Dom0 to write values into machine check MSR's
   26.53 + *     and tell Xen to trigger a machine check
   26.54 + */
   26.55 +
   26.56 +#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__
   26.57 +#define __XEN_PUBLIC_ARCH_X86_MCA_H__
   26.58 +
   26.59 +/* Hypercall */
   26.60 +#define __HYPERVISOR_mca __HYPERVISOR_arch_0
   26.61 +
   26.62 +#define XEN_MCA_INTERFACE_VERSION 0x03000001
   26.63 +
   26.64 +/* IN: Dom0 calls hypercall from MC event handler. */
   26.65 +#define XEN_MC_CORRECTABLE  0x0
   26.66 +/* IN: Dom0/DomU calls hypercall from MC trap handler. */
   26.67 +#define XEN_MC_TRAP         0x1
   26.68 +/* XEN_MC_CORRECTABLE and XEN_MC_TRAP are mutually exclusive. */
   26.69 +
   26.70 +/* OUT: All is ok */
   26.71 +#define XEN_MC_OK           0x0
   26.72 +/* OUT: Domain could not fetch data. */
   26.73 +#define XEN_MC_FETCHFAILED  0x1
   26.74 +/* OUT: There was no machine check data to fetch. */
   26.75 +#define XEN_MC_NODATA       0x2
    26.76 +/* OUT: Between notification time and this hypercall another
    26.77 + *  (most likely) correctable error happened. The fetched data
    26.78 + *  does not match the original machine check data. */
   26.79 +#define XEN_MC_NOMATCH      0x4
   26.80 +
   26.81 +/* OUT: DomU did not register MC NMI handler. Try something else. */
   26.82 +#define XEN_MC_CANNOTHANDLE 0x8
   26.83 +/* OUT: Notifying DomU failed. Retry later or try something else. */
   26.84 +#define XEN_MC_NOTDELIVERED 0x10
   26.85 +/* Note, XEN_MC_CANNOTHANDLE and XEN_MC_NOTDELIVERED are mutually exclusive. */
   26.86 +
   26.87 +
   26.88 +#ifndef __ASSEMBLY__
   26.89 +
   26.90 +#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */
   26.91 +
   26.92 +/*
    26.93 + * Machine Check Architecture:
   26.94 + * structs are read-only and used to report all kinds of
   26.95 + * correctable and uncorrectable errors detected by the HW.
   26.96 + * Dom0 and DomU: register a handler to get notified.
   26.97 + * Dom0 only: Correctable errors are reported via VIRQ_MCA
   26.98 + * Dom0 and DomU: Uncorrectable errors are reported via nmi handlers
   26.99 + */
  26.100 +#define MC_TYPE_GLOBAL          0
  26.101 +#define MC_TYPE_BANK            1
  26.102 +#define MC_TYPE_EXTENDED        2
  26.103 +
  26.104 +struct mcinfo_common {
  26.105 +    uint16_t type;      /* structure type */
  26.106 +    uint16_t size;      /* size of this struct in bytes */
  26.107 +};
  26.108 +
  26.109 +
  26.110 +#define MC_FLAG_CORRECTABLE     (1 << 0)
  26.111 +#define MC_FLAG_UNCORRECTABLE   (1 << 1)
  26.112 +
  26.113 +/* contains global x86 mc information */
  26.114 +struct mcinfo_global {
  26.115 +    struct mcinfo_common common;
  26.116 +
  26.117 +    /* running domain at the time in error (most likely the impacted one) */
  26.118 +    uint16_t mc_domid;
  26.119 +    uint32_t mc_socketid; /* physical socket of the physical core */
  26.120 +    uint16_t mc_coreid; /* physical impacted core */
  26.121 +    uint16_t mc_core_threadid; /* core thread of physical core */
  26.122 +    uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
  26.123 +    uint64_t mc_gstatus; /* global status */
  26.124 +    uint32_t mc_flags;
  26.125 +};
  26.126 +
  26.127 +/* contains bank local x86 mc information */
  26.128 +struct mcinfo_bank {
  26.129 +    struct mcinfo_common common;
  26.130 +
  26.131 +    uint16_t mc_bank; /* bank nr */
  26.132 +    uint16_t mc_domid; /* Usecase 5: domain referenced by mc_addr on dom0
  26.133 +                        * and if mc_addr is valid. Never valid on DomU. */
  26.134 +    uint64_t mc_status; /* bank status */
  26.135 +    uint64_t mc_addr;   /* bank address, only valid
  26.136 +                         * if addr bit is set in mc_status */
  26.137 +    uint64_t mc_misc;
  26.138 +};
  26.139 +
  26.140 +
  26.141 +struct mcinfo_msr {
  26.142 +    uint64_t reg;   /* MSR */
  26.143 +    uint64_t value; /* MSR value */
  26.144 +};
  26.145 +
  26.146 +/* contains mc information from other
  26.147 + * or additional mc MSRs */ 
  26.148 +struct mcinfo_extended {
  26.149 +    struct mcinfo_common common;
  26.150 +
  26.151 +    /* You can fill up to five registers.
  26.152 +     * If you need more, then use this structure
  26.153 +     * multiple times. */
  26.154 +
  26.155 +    uint32_t mc_msrs; /* Number of msr with valid values. */
  26.156 +    struct mcinfo_msr mc_msr[5];
  26.157 +};
  26.158 +
  26.159 +#define MCINFO_HYPERCALLSIZE	1024
  26.160 +#define MCINFO_MAXSIZE		768
  26.161 +
  26.162 +struct mc_info {
  26.163 +    /* Number of mcinfo_* entries in mi_data */
  26.164 +    uint32_t mi_nentries;
  26.165 +
  26.166 +    uint8_t mi_data[MCINFO_MAXSIZE - sizeof(uint32_t)];
  26.167 +};
  26.168 +typedef struct mc_info mc_info_t;
  26.169 +
  26.170 +
  26.171 +
  26.172 +/* 
  26.173 + * OS's should use these instead of writing their own lookup function
  26.174 + * each with its own bugs and drawbacks.
  26.175 + * We use macros instead of static inline functions to allow guests
  26.176 + * to include this header in assembly files (*.S).
  26.177 + */
  26.178 +/* Prototype:
  26.179 + *    uint32_t x86_mcinfo_nentries(struct mc_info *mi);
  26.180 + */
  26.181 +#define x86_mcinfo_nentries(_mi)    \
  26.182 +    (_mi)->mi_nentries
  26.183 +/* Prototype:
  26.184 + *    struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
  26.185 + */
  26.186 +#define x86_mcinfo_first(_mi)       \
  26.187 +    (struct mcinfo_common *)((_mi)->mi_data)
  26.188 +/* Prototype:
  26.189 + *    struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
  26.190 + */
  26.191 +#define x86_mcinfo_next(_mic)       \
  26.192 +    (struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size)
  26.193 +
  26.194 +/* Prototype:
  26.195 + *    void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type);
  26.196 + */
  26.197 +#define x86_mcinfo_lookup(_ret, _mi, _type)    \
  26.198 +    do {                                                        \
  26.199 +        uint32_t found, i;                                      \
  26.200 +        struct mcinfo_common *_mic;                             \
  26.201 +                                                                \
  26.202 +        found = 0;                                              \
  26.203 +	(_ret) = NULL;						\
  26.204 +	if (_mi == NULL) break;					\
  26.205 +        _mic = x86_mcinfo_first(_mi);                           \
  26.206 +        for (i = 0; i < x86_mcinfo_nentries(_mi); i++) {        \
  26.207 +            if (_mic->type == (_type)) {                        \
  26.208 +                found = 1;                                      \
  26.209 +                break;                                          \
  26.210 +            }                                                   \
  26.211 +            _mic = x86_mcinfo_next(_mic);                       \
  26.212 +        }                                                       \
  26.213 +        (_ret) = found ? _mic : NULL;                           \
  26.214 +    } while (0)
  26.215 +
  26.216 +
  26.217 +/* Usecase 1
  26.218 + * Register machine check trap callback handler
  26.219 + *    (already done via "set_trap_table" hypercall)
  26.220 + */
  26.221 +
  26.222 +/* Usecase 2
  26.223 + * Dom0 registers machine check event callback handler
  26.224 + * done by EVTCHNOP_bind_virq
  26.225 + */
  26.226 +
  26.227 +/* Usecase 3
  26.228 + * Fetch machine check data from hypervisor.
  26.229 + * Note, this hypercall is special, because both Dom0 and DomU must use this.
  26.230 + */
  26.231 +#define XEN_MC_fetch            1
  26.232 +struct xen_mc_fetch {
  26.233 +    /* IN/OUT variables. */
  26.234 +    uint32_t flags;
  26.235 +
  26.236 +/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
  26.237 +/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */
  26.238 +
  26.239 +    /* OUT variables. */
  26.240 +    uint32_t fetch_idx;  /* only useful for Dom0 for the notify hypercall */
  26.241 +    struct mc_info mc_info;
  26.242 +};
  26.243 +typedef struct xen_mc_fetch xen_mc_fetch_t;
  26.244 +DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
  26.245 +
  26.246 +
  26.247 +/* Usecase 4
  26.248 + * This tells the hypervisor to notify a DomU about the machine check error
  26.249 + */
  26.250 +#define XEN_MC_notifydomain     2
  26.251 +struct xen_mc_notifydomain {
  26.252 +    /* IN variables. */
  26.253 +    uint16_t mc_domid;    /* The unprivileged domain to notify. */
  26.254 +    uint16_t mc_vcpuid;   /* The vcpu in mc_domid to notify.
  26.255 +                           * Usually echo'd value from the fetch hypercall. */
  26.256 +    uint32_t fetch_idx;   /* echo'd value from the fetch hypercall. */
  26.257 +
  26.258 +    /* IN/OUT variables. */
  26.259 +    uint32_t flags;
  26.260 +
  26.261 +/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
  26.262 +/* OUT: XEN_MC_OK, XEN_MC_CANNOTHANDLE, XEN_MC_NOTDELIVERED, XEN_MC_NOMATCH */
  26.263 +};
  26.264 +typedef struct xen_mc_notifydomain xen_mc_notifydomain_t;
  26.265 +DEFINE_XEN_GUEST_HANDLE(xen_mc_notifydomain_t);
  26.266 +
  26.267 +
  26.268 +struct xen_mc {
  26.269 +    uint32_t cmd;
  26.270 +    uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
  26.271 +    union {
  26.272 +        struct xen_mc_fetch        mc_fetch;
  26.273 +        struct xen_mc_notifydomain mc_notifydomain;
  26.274 +        uint8_t pad[MCINFO_HYPERCALLSIZE];
  26.275 +    } u;
  26.276 +};
  26.277 +typedef struct xen_mc xen_mc_t;
  26.278 +DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
  26.279 +
  26.280 +#endif /* __ASSEMBLY__ */
  26.281 +
  26.282 +#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */
    27.1 --- a/xen/include/public/arch-x86/xen.h	Fri Jul 04 13:02:31 2008 +0100
    27.2 +++ b/xen/include/public/arch-x86/xen.h	Fri Jul 04 16:27:44 2008 +0100
    27.3 @@ -76,6 +76,10 @@ typedef unsigned long xen_pfn_t;
    27.4  /* Maximum number of virtual CPUs in multi-processor guests. */
    27.5  #define MAX_VIRT_CPUS 32
    27.6  
    27.7 +
    27.8 +/* Machine check support */
    27.9 +#include "xen-mca.h"
   27.10 +
   27.11  #ifndef __ASSEMBLY__
   27.12  
   27.13  typedef unsigned long xen_ulong_t;
    28.1 --- a/xen/include/xen/event.h	Fri Jul 04 13:02:31 2008 +0100
    28.2 +++ b/xen/include/xen/event.h	Fri Jul 04 16:27:44 2008 +0100
    28.3 @@ -50,6 +50,9 @@ int alloc_unbound_xen_event_channel(
    28.4  void free_xen_event_channel(
    28.5      struct vcpu *local_vcpu, int port);
    28.6  
    28.7 +/* Query if event channel is in use by the guest */
    28.8 +int guest_enabled_event(struct vcpu *v, int virq);
    28.9 +
   28.10  /* Notify remote end of a Xen-attached event channel.*/
   28.11  void notify_via_xen_event_channel(int lport);
   28.12  
    29.1 --- a/xen/include/xen/sched.h	Fri Jul 04 13:02:31 2008 +0100
    29.2 +++ b/xen/include/xen/sched.h	Fri Jul 04 16:27:44 2008 +0100
    29.3 @@ -112,10 +112,21 @@ struct vcpu
    29.4      bool_t           is_initialised;
    29.5      /* Currently running on a CPU? */
    29.6      bool_t           is_running;
    29.7 +    /* MCE callback pending for this VCPU? */
    29.8 +    bool_t           mce_pending;
    29.9      /* NMI callback pending for this VCPU? */
   29.10      bool_t           nmi_pending;
   29.11 -    /* Avoid NMI reentry by allowing NMIs to be masked for short periods. */
   29.12 -    bool_t           nmi_masked;
   29.13 +
   29.14 +    /* Higher priorized traps may interrupt lower priorized traps,
   29.15 +     * lower priorized traps wait until higher priorized traps finished.
   29.16 +     * Note: This concept is known as "system priority level" (spl)
   29.17 +     * in the UNIX world. */
   29.18 +    uint16_t         old_trap_priority;
   29.19 +    uint16_t         trap_priority;
   29.20 +#define VCPU_TRAP_NONE    0
   29.21 +#define VCPU_TRAP_NMI     1
   29.22 +#define VCPU_TRAP_MCE     2
   29.23 +
   29.24      /* Require shutdown to be deferred for some asynchronous operation? */
   29.25      bool_t           defer_shutdown;
   29.26      /* VCPU is paused following shutdown request (d->is_shutting_down)? */