ia64/xen-unstable

changeset 19427:c44c963ea162

x86: Core support for Intel MCA support

These patches are based on AMD's and SUN's MCA-related work, and
have been rebased on top of SUN's latest improvements.
Follow-up patches will add recovery actions later. This is a
basic framework for Intel.

Some implementation notes:
1) When an error happens, if it is fatal (pcc = 1) or cannot be
recovered (pcc = 0, but no good recovery method exists),
we reset the machine immediately to avoid losing logs in DOM0.
Most MCA MSRs are sticky, so after reboot the
MCA polling mechanism will send a vIRQ to DOM0 for logging.
2) When MCE# happens, all CPUs enter MCA context. The first CPU that
reads and clears the error MSR bank becomes the owner of this
MCE#. Locks/synchronization are used to determine the
owner and to select the most severe error.
3) For convenience, we select the CPU with the most severe error to do
most of the processing and recovery work.
4) When MCE# happens, we do three jobs:
a. Send a vIRQ to DOM0 for logging
b. Send vMCE# to the impacted guest (currently only injected into an
impacted DOM0)
c. Guest vMCE MSR virtualization
5) Further improvements/additions for newer CPUs may be done later:
a) Connection with recovery actions (cpu/memory online/offline)
b) More software-recovery identification in severity_scan
c) More refinement and testing for HVM may be done when needed.

This patch enables basic MCA support for Intel.

Signed-off-by: Jiang, Yunhong<yunhong.jiang@intel.com>
Signed-off-by: Ke, Liping <Liping.ke@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Mar 20 17:24:29 2009 +0000 (2009-03-20)
parents 11650ecdd31e
children cc60defe5b96
files xen/arch/x86/cpu/mcheck/mce_intel.c xen/arch/x86/cpu/mcheck/x86_mca.h xen/arch/x86/domain.c xen/arch/x86/x86_64/traps.c xen/include/asm-x86/domain.h xen/include/asm-x86/softirq.h
line diff
     1.1 --- a/xen/arch/x86/cpu/mcheck/mce_intel.c	Fri Mar 20 15:38:46 2009 +0000
     1.2 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Fri Mar 20 17:24:29 2009 +0000
     1.3 @@ -3,6 +3,7 @@
     1.4  #include <xen/irq.h>
     1.5  #include <xen/event.h>
     1.6  #include <xen/kernel.h>
     1.7 +#include <xen/delay.h>
     1.8  #include <xen/smp.h>
     1.9  #include <asm/processor.h> 
    1.10  #include <asm/system.h>
    1.11 @@ -158,9 +159,378 @@ intel_get_extended_msrs(struct mc_info *
    1.12      return MCA_EXTINFO_GLOBAL;
    1.13  }
    1.14  
    1.15 +/* Below are for MCE handling */
    1.16 +
    1.17 +/* Record the worst error severity and the offending CPU;
    1.18 + * that CPU is picked for further processing in softirq */
    1.19 +static int severity_cpu = -1;
    1.20 +static int worst = 0;
    1.21 +
    1.22 +/* Lock of entry@second round scanning in MCE# handler */
    1.23 +static cpumask_t scanned_cpus;
    1.24 +/* Lock for entry@Critical Section in MCE# handler */
    1.25 +static bool_t mce_enter_lock = 0;
    1.26 +/* Record how many CPUs impacted in this MCE# */
    1.27 +static cpumask_t impact_map;
    1.28 +
    1.29 +/* Lock of softirq rendezvous entering point */
    1.30 +static cpumask_t mced_cpus;
    1.31 +/*Lock of softirq rendezvous leaving point */
    1.32 +static cpumask_t finished_cpus;
    1.33 +/* Lock for picking one processing CPU */
    1.34 +static bool_t mce_process_lock = 0;
    1.35 +
    1.36 +/* Spinlock for vMCE# MSR virtualization data */
    1.37 +static DEFINE_SPINLOCK(mce_locks);
    1.38 +
    1.39 +/* Local buffer for holding MCE# data temporarily, shared between the mce
    1.40 + * handler and the softirq handler. The data is finally committed
    1.41 + * for the DOM0 log and copied to per-domain data for guest vMCE#
    1.42 + * MSR virtualization.
    1.43 + * Note: if another MCA arrives while the local buffer is still being
    1.44 + * processed in softirq, simply panic.
    1.45 + */
    1.46 +
    1.47 +struct mc_local_t
    1.48 +{
    1.49 +    bool_t in_use; /* set while mce_actions() processes the buffer */
    1.50 +    mctelem_cookie_t mctc[NR_CPUS]; /* per-CPU cookie from mcheck_mca_logout() */
    1.51 +};
    1.52 +static struct mc_local_t mc_local;
    1.53 +
    1.54 +/* Allocate one zeroed node for a domain's impact_header list, which
    1.55 + * records the error banks impacting that domain for guest vMCE# MSR
    1.56 + * virtualization. A node is appended to the tail of the per-domain
    1.57 + * list when an MCE# bank impacts the domain and is deleted once the
    1.58 + * guest has finished processing the corresponding vMCE# injection.
    1.59 + * Returns the new node, or NULL (after logging) if allocation fails.
    1.60 + */
    1.61 +static struct bank_entry* alloc_bank_entry(void) {
    1.62 +    struct bank_entry *entry = xmalloc(struct bank_entry);
    1.63 +
    1.64 +    if (!entry) {
    1.65 +        printk(KERN_ERR "MCE: malloc bank_entry failed\n");
    1.66 +        return NULL;
    1.67 +    }
    1.68 +    /* Zero the whole structure; sizeof(entry) is only the pointer size */
    1.69 +    memset(entry, 0x0, sizeof(*entry));
    1.70 +    INIT_LIST_HEAD(&entry->list);
    1.71 +    return entry;
    1.72 +}
    1.73 +
    1.74 +/* Fill error bank info for #vMCE injection and GUEST vMCE#
    1.75 + * MSR virtualization data
    1.76 + * 1) Log down how many nr_injections of the impacted.
    1.77 + * 2) Copy MCE# error bank to impacted DOM node list, 
    1.78 +      for vMCE# MSRs virtualization
    1.79 +*/
    1.80 +
    1.81 +static int fill_vmsr_data(int cpu, struct mcinfo_bank *mc_bank, 
    1.82 +        uint64_t gstatus) {
    1.83 +    struct domain *d;
    1.84 +    struct bank_entry *entry;
    1.85 +
    1.86 +    /* This error bank impacts one domain, we need to fill domain related
    1.87 +     * data for vMCE MSRs virtualization and vMCE# injection */
    1.88 +    if (mc_bank->mc_domid != (uint16_t)~0) {
    1.89 +        d = get_domain_by_id(mc_bank->mc_domid);
    1.90 +
    1.91 +        /* Not impact a valid domain, skip this error of the bank */
    1.92 +        if (!d) {
    1.93 +            printk(KERN_DEBUG "MCE: Not found valid impacted DOM\n");
    1.94 +            return 0;
    1.95 +        }
    1.96 +
    1.97 +        entry = alloc_bank_entry();
    1.98 +        entry->mci_status = mc_bank->mc_status;
    1.99 +        entry->mci_addr = mc_bank->mc_addr;
   1.100 +        entry->mci_misc = mc_bank->mc_misc;
   1.101 +        entry->cpu = cpu;
   1.102 +        entry->bank = mc_bank->mc_bank;
   1.103 +
   1.104 +        /* New error Node, insert to the tail of the per_dom data */
   1.105 +        list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
   1.106 +        /* Fill MSR global status */
   1.107 +        d->arch.vmca_msrs.mcg_status = gstatus;
   1.108 +        /* New node impact the domain, need another vMCE# injection*/
   1.109 +        d->arch.vmca_msrs.nr_injection++;
   1.110 +
   1.111 +        printk(KERN_DEBUG "MCE: Found error @[CPU%d BANK%d "
   1.112 +                "status %lx addr %lx domid %d]\n ",
   1.113 +                entry->cpu, mc_bank->mc_bank,
   1.114 +                mc_bank->mc_status, mc_bank->mc_addr, mc_bank->mc_domid);
   1.115 +    }
   1.116 +    return 0;
   1.117 +}
   1.118 +
   1.119 +static int mce_actions(void) {
   1.120 +    int32_t cpu, ret;
   1.121 +    struct mc_info *local_mi;
   1.122 +    struct mcinfo_common *mic = NULL;
   1.123 +    struct mcinfo_global *mc_global;
   1.124 +    struct mcinfo_bank *mc_bank;
   1.125 +
   1.126 +    /* For every CPU in impact_map, feed its logged banks to fill_vmsr_data()
   1.127 +     * and then commit the telemetry entries; returns 0 on success, -1 on
   1.128 +     * failure. mce_locks serialises access to the per-domain vMCE# data. */
   1.129 +    spin_lock(&mce_locks);
   1.130 +
   1.131 +    /*
   1.132 +     * Mark the local buffer busy; intel_machine_check() panics if a new
   1.133 +     * MCE# arrives while in_use is set. The return value is not checked here.
   1.134 +     */
   1.135 +    test_and_set_bool(mc_local.in_use);
   1.136 +
   1.137 +    for_each_cpu_mask(cpu, impact_map) {
   1.138 +        if (mc_local.mctc[cpu] == NULL) {
   1.139 +            printk(KERN_ERR "MCE: get reserved entry failed\n ");
   1.140 +            ret = -1;
   1.141 +            goto end;
   1.142 +        }
   1.143 +        local_mi = (struct mc_info*)mctelem_dataptr(mc_local.mctc[cpu]);
   1.144 +        x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
   1.145 +        if (mic == NULL) {
   1.146 +            printk(KERN_ERR "MCE: get local buffer entry failed\n ");
   1.147 +            ret = -1;
   1.148 +       	    goto end;
   1.149 +        }
   1.150 +
   1.151 +        mc_global = (struct mcinfo_global *)mic;
   1.152 +
   1.153 +        /* Look up the first bank record, then walk the remaining records */
   1.154 +        x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
   1.155 +
   1.156 +        for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
   1.157 +            if (mic->type != MC_TYPE_BANK) {
   1.158 +                continue;
   1.159 +            }
   1.160 +            mc_bank = (struct mcinfo_bank*)mic;
   1.161 +            /* Fill vMCE# injection and vMCE# MSR virtualization related data */
   1.162 +            if (fill_vmsr_data(cpu, mc_bank, mc_global->mc_gstatus) == -1) {
   1.163 +                ret = -1;
   1.164 +                goto end;
   1.165 +            }
   1.166 +
   1.167 +            /* TODO: Add recovery actions here, such as page-offline, etc */
   1.168 +        }
   1.169 +    } /* end of impact_map loop */
   1.170 +
   1.171 +    ret = 0;
   1.172 +
   1.173 +end:
   1.174 +    /* Success and failure paths both commit whatever entries exist */
   1.175 +    for_each_cpu_mask(cpu, impact_map) {
   1.176 +        /* This reserved entry is processed, commit it */
   1.177 +        if (mc_local.mctc[cpu] != NULL) {
   1.178 +            mctelem_commit(mc_local.mctc[cpu]);
   1.179 +            printk(KERN_DEBUG "MCE: Commit one URGENT ENTRY\n");
   1.180 +        }
   1.181 +    }
   1.182 +
   1.183 +    test_and_clear_bool(mc_local.in_use);
   1.184 +    spin_unlock(&mce_locks);
   1.185 +    return ret;
   1.186 +}
   1.187 +
   1.188 +/* Softirq handler for MCE#: rendezvous, severity_cpu processes, rendezvous */
   1.189 +static void mce_softirq(void)
   1.190 +{
   1.191 +    int cpu = smp_processor_id();
   1.192 +    cpumask_t affinity;
   1.193 +
   1.194 +    /* Entry rendezvous: wait until every online CPU is set in mced_cpus */
   1.195 +    while ( cpus_weight(mced_cpus) != num_online_cpus() ) {
   1.196 +        cpu_relax();
   1.197 +    }
   1.198 +    /* No CPU recorded a worst error in the MCE# handler: unexpected, panic */
   1.199 +    if (severity_cpu == -1) {
   1.200 +        printk(KERN_WARNING "MCE: not found severity_cpu!\n");
   1.201 +        mc_panic("MCE: not found severity_cpu!");
   1.202 +        return;
   1.203 +    }
   1.204 +    /* Only severity_cpu does the processing; other CPUs go to the exit wait */
   1.205 +    if (severity_cpu == cpu) {
   1.206 +
   1.207 +        /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
   1.208 +         * vMCE MSRs virtualization buffer
   1.209 +         */
   1.210 +        if (mce_actions())
   1.211 +            mc_panic("MCE recovery actions or Filling vMCE MSRS "
   1.212 +                     "virtualization data failed!\n");
   1.213 +
   1.214 +        /* Step2: Send Log to DOM0 through vIRQ */
   1.215 +        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
   1.216 +            printk(KERN_DEBUG "MCE: send MCE# to DOM0 through virq\n");
   1.217 +            send_guest_global_virq(dom0, VIRQ_MCA);
   1.218 +        }
   1.219 +
   1.220 +        /* Step3: inject vMCE into DOM0 only (NOTE(review): dom0 unchecked) */
   1.221 +        if (guest_has_trap_callback
   1.222 +               (dom0, 0, TRAP_machine_check) &&
   1.223 +                 !test_and_set_bool(dom0->vcpu[0]->mce_pending)) {
   1.224 +            dom0->vcpu[0]->cpu_affinity_tmp = 
   1.225 +                    dom0->vcpu[0]->cpu_affinity;
   1.226 +            cpus_clear(affinity);
   1.227 +            cpu_set(cpu, affinity);
   1.228 +            printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu,
   1.229 +                dom0->vcpu[0]->processor);
   1.230 +            vcpu_set_affinity(dom0->vcpu[0], &affinity);
   1.231 +            vcpu_kick(dom0->vcpu[0]);
   1.232 +        }
   1.233 +
   1.234 +        /* Reset shared MCE# state (NOTE(review): mced_cpus clear may race) */
   1.235 +        test_and_clear_bool(mce_process_lock);
   1.236 +        cpus_clear(impact_map);
   1.237 +        cpus_clear(scanned_cpus);
   1.238 +        worst = 0;
   1.239 +        cpus_clear(mced_cpus);
   1.240 +        memset(&mc_local, 0x0, sizeof(mc_local));
   1.241 +    }
   1.242 +
   1.243 +    cpu_set(cpu, finished_cpus);
   1.244 +    wmb();
   1.245 +    /* Exit rendezvous: wait until every online CPU is set in finished_cpus */
   1.246 +    while ( cpus_weight(finished_cpus) != num_online_cpus() ) {
   1.247 +        cpu_relax();
   1.248 +    }
   1.249 +    /* NOTE(review): every CPU clears the mask; may race with the wait above */
   1.250 +    cpus_clear(finished_cpus);
   1.251 +    severity_cpu = -1;
   1.252 +    printk(KERN_DEBUG "CPU%d exit softirq \n", cpu);
   1.253 +}
   1.254 +
   1.255 +/* Machine Check owner judging algorithm:
   1.256 + * When an error happens, all CPUs serially read their MSR banks.
   1.257 + * The first CPU that fetches the error bank's info will clear
   1.258 + * this bank. Later readers can't get the info again.
   1.259 + * The first CPU is the actual mce_owner.
   1.260 + *
   1.261 + * A fatal (pcc=1) error might cause a machine crash
   1.262 + * before we're able to log. To avoid losing the log, we adopt
   1.263 + * two-round scanning:
   1.264 + * Round1: simply scan. If pcc = 1 or ripv = 0 is found, simply reset.
   1.265 + * All MCE banks are sticky; when booting up, the MCE polling mechanism
   1.266 + * will help to collect and log those MCE errors.
   1.267 + * Round2: Do all MCE processing logic as normal.
   1.268 + */
   1.269 +
   1.270 +/* First-round scan of this CPU's MCi_STATUS MSRs: panic as soon as a
   1.271 + * valid, enabled, uncorrected error with PCC set is found, to avoid
   1.272 + * losing the log. Banks are only read here, not cleared. */
   1.273 +static void severity_scan(void)
   1.274 +{
   1.275 +    uint64_t status;
   1.276 +    int32_t i;
   1.277 +
   1.278 +    /* TODO: For PCC = 0, further judgement is needed. If the error can't be
   1.279 +     * recovered, we need to RESET to avoid losing the DOM0 log
   1.280 +     */
   1.281 +    for ( i = 0; i < nr_mce_banks; i++) {
   1.282 +        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i , status);
   1.283 +        if ( !(status & MCi_STATUS_VAL) )
   1.284 +            continue;
   1.285 +        /* MCE handler only handles UC error */
   1.286 +        if ( !(status & MCi_STATUS_UC) )
   1.287 +            continue;
   1.288 +        if ( !(status & MCi_STATUS_EN) )
   1.289 +            continue;
   1.290 +        if (status & MCi_STATUS_PCC)
   1.291 +            mc_panic("pcc = 1, cpu unable to continue\n");
   1.292 +    }
   1.293 +
   1.294 +    /* TODO: Further judgement for later CPUs here, maybe need MCACOD assistance */
   1.295 +    /* EIPV and RIPV are not a reliable way to judge the error severity */
   1.296 +
   1.297 +}
   1.298 +
   1.299 +/* MCE# entry: scan banks, log them, elect severity_cpu, raise the softirq */
   1.300  static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
   1.301  {
   1.302 -	mcheck_cmn_handler(regs, error_code, mca_allbanks);
   1.303 +    unsigned int cpu = smp_processor_id();
   1.304 +    int32_t severity = 0;
   1.305 +    uint64_t gstatus;
   1.306 +    mctelem_cookie_t mctc = NULL;
   1.307 +    struct mca_summary bs;
   1.308 +
   1.309 +    /* First round scanning: severity_scan() panics on a PCC=1 bank */
   1.310 +    severity_scan();
   1.311 +    cpu_set(cpu, scanned_cpus);
   1.312 +    while (cpus_weight(scanned_cpus) < num_online_cpus())
   1.313 +        cpu_relax();
   1.314 +
   1.315 +    wmb();
   1.316 +    /* All CPUs Finished first round scanning */
   1.317 +    if (mc_local.in_use != 0) {
   1.318 +        mc_panic("MCE: Local buffer is being processed, can't handle new MCE!\n");
   1.319 +        return;
   1.320 +    }
   1.321 +
   1.322 +    /* Enter Critical Section */
   1.323 +    while (test_and_set_bool(mce_enter_lock)) {
   1.324 +        udelay (1);
   1.325 +    }
   1.326 +
   1.327 +    mctc = mcheck_mca_logout(MCA_MCE_HANDLER, mca_allbanks, &bs);
   1.328 +     /* No error logged on this CPU: drop any reserved entry, join the
   1.329 +      * rendezvous mask and leave the processing to the softirq */
   1.330 +    if (!bs.errcnt) {
   1.331 +        if (mctc != NULL)
   1.332 +            mctelem_dismiss(mctc);
   1.333 +        mc_local.mctc[cpu] = NULL;
   1.334 +        cpu_set(cpu, mced_cpus);
   1.335 +        test_and_clear_bool(mce_enter_lock);
   1.336 +        raise_softirq(MACHINE_CHECK_SOFTIRQ);
   1.337 +        return;
   1.338 +    }
   1.339 +    else if ( mctc != NULL) {
   1.340 +        mc_local.mctc[cpu] = mctc;
   1.341 +    }
   1.342 +
   1.343 +    if (bs.uc || bs.pcc)
   1.344 +        add_taint(TAINT_MACHINE_CHECK);
   1.345 +
   1.346 +    if (bs.pcc) {
   1.347 +        printk(KERN_WARNING "PCC=1 should have caused reset\n");
   1.348 +        severity = 3;
   1.349 +    }
   1.350 +    else if (bs.uc) {
   1.351 +        severity = 2;
   1.352 +    }
   1.353 +    else {
   1.354 +        printk(KERN_WARNING "We should skip Correctable Error\n");
   1.355 +        severity = 1;
   1.356 +    }
   1.357 +    /* This is the offending cpu! */
   1.358 +    cpu_set(cpu, impact_map);
   1.359 +
   1.360 +    if ( severity > worst) {
   1.361 +        worst = severity;
   1.362 +        severity_cpu = cpu;
   1.363 +    }
   1.364 +    cpu_set(cpu, mced_cpus);
   1.365 +    test_and_clear_bool(mce_enter_lock);
   1.366 +    wmb();
   1.367 +    /* Wait for all cpus Leave Critical */
   1.368 +    while (cpus_weight(mced_cpus) < num_online_cpus())
   1.369 +        cpu_relax();
   1.370 +    /* Print MCE error, if a telemetry entry was actually reserved */
   1.371 +    if (mctc != NULL)
   1.372 +        x86_mcinfo_dump(mctelem_dataptr(mctc));
   1.373 +
   1.374 +    /* Pick one CPU to clear MCIP */
   1.375 +    if (!test_and_set_bool(mce_process_lock)) {
   1.376 +        rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
   1.377 +        wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
   1.378 +
   1.379 +        if (worst >= 3) {
   1.380 +            printk(KERN_WARNING "worst=3 should have caused RESET\n");
   1.381 +            mc_panic("worst=3 should have caused RESET");
   1.382 +        }
   1.383 +        else {
   1.384 +            printk(KERN_DEBUG "MCE: trying to recover\n");
   1.385 +        }
   1.386 +    }
   1.387 +    raise_softirq(MACHINE_CHECK_SOFTIRQ);
   1.388  }
   1.389  
   1.390  static DEFINE_SPINLOCK(cmci_discover_lock);
   1.391 @@ -227,7 +597,7 @@ static void cmci_discover(void)
   1.392          } else {
   1.393              x86_mcinfo_dump(mctelem_dataptr(mctc));
   1.394              mctelem_dismiss(mctc);
   1.395 -       }
   1.396 +        }
   1.397      } else if (mctc != NULL)
   1.398          mctelem_dismiss(mctc);
   1.399  
   1.400 @@ -337,11 +707,12 @@ fastcall void smp_cmci_interrupt(struct 
   1.401      if (bs.errcnt && mctc != NULL) {
   1.402          if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
   1.403              mctelem_commit(mctc);
   1.404 +            printk(KERN_DEBUG "CMCI: send CMCI to DOM0 through virq\n");
   1.405              send_guest_global_virq(dom0, VIRQ_MCA);
   1.406          } else {
   1.407              x86_mcinfo_dump(mctelem_dataptr(mctc));
   1.408              mctelem_dismiss(mctc);
   1.409 -        }
   1.410 +       }
   1.411      } else if (mctc != NULL)
   1.412          mctelem_dismiss(mctc);
   1.413  
   1.414 @@ -357,11 +728,15 @@ void mce_intel_feature_init(struct cpuin
   1.415      intel_init_cmci(c);
   1.416  }
   1.417  
   1.418 +uint64_t g_mcg_cap;
   1.419  static void mce_cap_init(struct cpuinfo_x86 *c)
   1.420  {
   1.421      u32 l, h;
   1.422  
   1.423      rdmsr (MSR_IA32_MCG_CAP, l, h);
   1.424 +    /* For Guest vMCE usage */
   1.425 +    g_mcg_cap = ((u64)h << 32 | l) & (~MCG_CMCI_P);
   1.426 +
   1.427      if ((l & MCG_CMCI_P) && cpu_has_apic)
   1.428          cmci_support = 1;
   1.429  
   1.430 @@ -434,5 +809,6 @@ int intel_mcheck_init(struct cpuinfo_x86
   1.431      mce_intel_feature_init(c);
   1.432      mce_set_owner();
   1.433  
   1.434 +    open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
   1.435      return 1;
   1.436  }
     2.1 --- a/xen/arch/x86/cpu/mcheck/x86_mca.h	Fri Mar 20 15:38:46 2009 +0000
     2.2 +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Fri Mar 20 17:24:29 2009 +0000
     2.3 @@ -83,9 +83,7 @@
     2.4  /*Intel Specific bitfield*/
     2.5  #define CMCI_THRESHOLD			0x2
     2.6  
     2.7 -
     2.8 -#define MAX_NR_BANKS 128
     2.9 -
    2.10 +#include <asm/domain.h>
    2.11  typedef DECLARE_BITMAP(cpu_banks_t, MAX_NR_BANKS);
    2.12  DECLARE_PER_CPU(cpu_banks_t, mce_banks_owned);
    2.13  
     3.1 --- a/xen/arch/x86/domain.c	Fri Mar 20 15:38:46 2009 +0000
     3.2 +++ b/xen/arch/x86/domain.c	Fri Mar 20 17:24:29 2009 +0000
     3.3 @@ -373,6 +373,7 @@ void vcpu_destroy(struct vcpu *v)
     3.4          hvm_vcpu_destroy(v);
     3.5  }
     3.6  
     3.7 +extern uint64_t g_mcg_cap;
     3.8  int arch_domain_create(struct domain *d, unsigned int domcr_flags)
     3.9  {
    3.10  #ifdef __x86_64__
    3.11 @@ -455,6 +456,16 @@ int arch_domain_create(struct domain *d,
    3.12  
    3.13          if ( (rc = iommu_domain_init(d)) != 0 )
    3.14              goto fail;
    3.15 +
    3.16 +        /* For Guest vMCE MSRs virtualization */
    3.17 +        d->arch.vmca_msrs.mcg_status = 0x0;
    3.18 +        d->arch.vmca_msrs.mcg_cap = g_mcg_cap;
    3.19 +        d->arch.vmca_msrs.mcg_ctl = (uint64_t)~0x0;
    3.20 +        d->arch.vmca_msrs.nr_injection = 0;
    3.21 +        memset(d->arch.vmca_msrs.mci_ctl, 0x1,
    3.22 +            sizeof(d->arch.vmca_msrs.mci_ctl));
    3.23 +        INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
    3.24 +
    3.25      }
    3.26  
    3.27      if ( is_hvm_domain(d) )
     4.1 --- a/xen/arch/x86/x86_64/traps.c	Fri Mar 20 15:38:46 2009 +0000
     4.2 +++ b/xen/arch/x86/x86_64/traps.c	Fri Mar 20 17:24:29 2009 +0000
     4.3 @@ -14,6 +14,8 @@
     4.4  #include <xen/nmi.h>
     4.5  #include <asm/current.h>
     4.6  #include <asm/flushtlb.h>
     4.7 +#include <asm/traps.h>
     4.8 +#include <asm/event.h>
     4.9  #include <asm/msr.h>
    4.10  #include <asm/page.h>
    4.11  #include <asm/shared.h>
    4.12 @@ -265,6 +267,9 @@ unsigned long do_iret(void)
    4.13      struct cpu_user_regs *regs = guest_cpu_user_regs();
    4.14      struct iret_context iret_saved;
    4.15      struct vcpu *v = current;
    4.16 +    struct domain *d = v->domain;
    4.17 +    struct bank_entry *entry;
    4.18 +    int cpu = smp_processor_id();
    4.19  
    4.20      if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp,
    4.21                                   sizeof(iret_saved))) )
    4.22 @@ -304,6 +309,48 @@ unsigned long do_iret(void)
    4.23         && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
    4.24          vcpu_set_affinity(v, &v->cpu_affinity_tmp);
    4.25  
    4.26 +   /*Currently, only inject vMCE to DOM0.*/
    4.27 +    if (v->trap_priority >= VCPU_TRAP_NMI) {
    4.28 +        printk(KERN_DEBUG "MCE: Return from vMCE# trap!");
    4.29 +        if (d->domain_id == 0 && v->vcpu_id == 0) {
    4.30 +            if ( !d->arch.vmca_msrs.nr_injection ) {
    4.31 +                printk(KERN_WARNING "MCE: Ret from vMCE#, nr_injection is 0\n");
    4.32 +                goto end;
    4.33 +            }
    4.34 +
    4.35 +            d->arch.vmca_msrs.nr_injection--;
    4.36 +            if (!list_empty(&d->arch.vmca_msrs.impact_header)) {
    4.37 +                entry = list_entry(d->arch.vmca_msrs.impact_header.next,
    4.38 +                    struct bank_entry, list);
    4.39 +                printk(KERN_DEBUG "MCE: Delete last injection Node\n");
    4.40 +                list_del(&entry->list);
    4.41 +            }
    4.42 +            else
    4.43 +                printk(KERN_DEBUG "MCE: Not found last injection "
    4.44 +                    "Node, something Wrong!\n");
    4.45 +
    4.46 +            /* futher injection*/
    4.47 +            if ( d->arch.vmca_msrs.nr_injection > 0) {
    4.48 +                if ( d->arch.vmca_msrs.nr_injection > 0 &&
    4.49 +                        guest_has_trap_callback(d, v->vcpu_id,
    4.50 +                            TRAP_machine_check) &&
    4.51 +                        !test_and_set_bool(dom0->vcpu[0]->mce_pending)) {
    4.52 +                    cpumask_t affinity;
    4.53 +
    4.54 +                    dom0->vcpu[0]->cpu_affinity_tmp =
    4.55 +                            dom0->vcpu[0]->cpu_affinity;
    4.56 +                    cpus_clear(affinity);
    4.57 +                    cpu_set(cpu, affinity);
    4.58 +                    printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu,
    4.59 +                        dom0->vcpu[0]->processor);
    4.60 +                    vcpu_set_affinity(dom0->vcpu[0], &affinity);
    4.61 +                    vcpu_kick(dom0->vcpu[0]);
    4.62 +                }
    4.63 +            }
    4.64 +        }
    4.65 +    } /* end of outer-if */
    4.66 +
    4.67 +end:
    4.68      /* Restore previous trap priority */
    4.69      v->trap_priority = v->old_trap_priority;
    4.70  
     5.1 --- a/xen/include/asm-x86/domain.h	Fri Mar 20 15:38:46 2009 +0000
     5.2 +++ b/xen/include/asm-x86/domain.h	Fri Mar 20 17:24:29 2009 +0000
     5.3 @@ -203,6 +203,31 @@ typedef xen_domctl_cpuid_t cpuid_input_t
     5.4  
     5.5  struct p2m_domain;
     5.6  
     5.7 +/* Define for GUEST MCA handling */
     5.8 +#define MAX_NR_BANKS 30
     5.9 +
    5.10 +/* One error bank impacting a domain; nodes of this type are linked
    5.11 + * into the domain's impact_header list. */
    5.12 +struct bank_entry {
    5.13 +    struct list_head list; /* link in domain_mca_msrs.impact_header */
    5.14 +    int32_t cpu; /* CPU on which the bank was logged */
    5.15 +    uint16_t bank; /* bank number (mcinfo_bank.mc_bank) */
    5.16 +    uint64_t mci_status; /* copy of mcinfo_bank.mc_status */
    5.17 +    uint64_t mci_addr; /* copy of mcinfo_bank.mc_addr */
    5.18 +    uint64_t mci_misc; /* copy of mcinfo_bank.mc_misc */
    5.19 +};
    5.20 +
    5.21 +struct domain_mca_msrs
    5.22 +{
    5.23 +    /* Guest should not change the values below after the domain boots */
    5.24 +    uint64_t mcg_cap; /* set from g_mcg_cap in arch_domain_create() */
    5.25 +    uint64_t mcg_ctl; /* set to all ones in arch_domain_create() */
    5.26 +    uint64_t mcg_status; /* global status stored by fill_vmsr_data() */
    5.27 +    uint64_t mci_ctl[MAX_NR_BANKS]; /* every byte set to 0x1 at creation */
    5.28 +    uint16_t nr_injection; /* ++ per impacting bank, -- in do_iret() */
    5.29 +    struct list_head impact_header; /* list of struct bank_entry nodes */
    5.30 +};
    5.31 +
    5.32  struct arch_domain
    5.33  {
    5.34      l1_pgentry_t *mm_perdomain_pt;
    5.35 @@ -269,6 +294,9 @@ struct arch_domain
    5.36      struct page_list_head relmem_list;
    5.37  
    5.38      cpuid_input_t cpuids[MAX_CPUID_INPUT];
    5.39 +
    5.40 +    /* For Guest vMCA handling */
    5.41 +    struct domain_mca_msrs vmca_msrs;
    5.42  } __cacheline_aligned;
    5.43  
    5.44  #define has_arch_pdevs(d)    (!list_empty(&(d)->arch.pdev_list))
     6.1 --- a/xen/include/asm-x86/softirq.h	Fri Mar 20 15:38:46 2009 +0000
     6.2 +++ b/xen/include/asm-x86/softirq.h	Fri Mar 20 17:24:29 2009 +0000
     6.3 @@ -5,6 +5,7 @@
     6.4  #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
     6.5  #define VCPU_KICK_SOFTIRQ      (NR_COMMON_SOFTIRQS + 2)
     6.6  
     6.7 -#define NR_ARCH_SOFTIRQS       3
     6.8 +#define MACHINE_CHECK_SOFTIRQ  (NR_COMMON_SOFTIRQS + 3)
     6.9 +#define NR_ARCH_SOFTIRQS       4
    6.10  
    6.11  #endif /* __ASM_SOFTIRQ_H__ */