#include <asm/msr.h>
#include "mce.h"
+#include "barrier.h"
+#include "util.h"
bool_t __read_mostly mce_disabled;
invbool_param("mce", mce_disabled);
mc_need_clearbank_scan = cbfunc;
}
+
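+/*
+ * Rendezvous barriers: mce_trap_bar synchronizes all CPUs inside the
+ * #MC trap handler; mce_inside_bar and mce_severity_bar do the same
+ * for the softirq handler. The usage pattern throughout this file is:
+ *
+ *     mce_barrier_enter(&bar);
+ *     ... work every CPU must finish before any CPU proceeds ...
+ *     mce_barrier_exit(&bar);
+ */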
+static struct mce_softirq_barrier mce_inside_bar, mce_severity_bar;
+static struct mce_softirq_barrier mce_trap_bar;
+
+/*
+ * mce_logout_lock should only be used in the trap handler,
+ * while MCIP has not been cleared yet in the global status
+ * register. Other use is not safe, since an MCE trap can
+ * happen at any moment, which would cause lock recursion.
+ */
+static DEFINE_SPINLOCK(mce_logout_lock);
+
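+/*
+ * severity_cpu: the CPU elected to do the checking/clean-up work
+ * (-1 until an MCE is seen). found_error: set once any CPU logs a
+ * valid error. mce_fatal_cpus: the set of CPUs whose error is fatal.
+ */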
+static atomic_t severity_cpu = ATOMIC_INIT(-1);
+static atomic_t found_error = ATOMIC_INIT(0);
+static cpumask_t mce_fatal_cpus;
+
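+/*
+ * "d" handlers run delayed, from softirq context (mce_action() with
+ * regs == NULL); "u" handlers run urgently, in #MC trap context.
+ */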
+const struct mca_error_handler *__read_mostly mce_dhandlers;
+const struct mca_error_handler *__read_mostly mce_uhandlers;
+unsigned int __read_mostly mce_dhandler_num;
+unsigned int __read_mostly mce_uhandler_num;
+
static void mca_init_bank(enum mca_source who,
- struct mc_info *mi, int bank)
+ struct mc_info *mi, int bank)
{
struct mcinfo_bank *mib;
 * For the latest Intel CPUs, whether to clear the error bank status needs to
* be judged by the callback function defined above.
*/
-mctelem_cookie_t mcheck_mca_logout(enum mca_source who, struct mca_banks *bankmask,
- struct mca_summary *sp, struct mca_banks* clear_bank)
+mctelem_cookie_t
+mcheck_mca_logout(enum mca_source who, struct mca_banks *bankmask,
+ struct mca_summary *sp, struct mca_banks *clear_bank)
{
uint64_t gstatus, status;
struct mcinfo_global *mig = NULL; /* on stack */
    /* If no mc_recovery_scan callback handler is registered,
* this error is not recoverable
*/
- recover = (mc_recoverable_scan)? 1: 0;
+ recover = (mc_recoverable_scan) ? 1 : 0;
for (i = 0; i < nr_mce_banks; i++) {
/* Skip bank if corresponding bit in bankmask is clear */
/* If this is the first bank with valid MCA DATA, then
* try to reserve an entry from the urgent/nonurgent queue
- * depending on whethere we are called from an exception or
+ * depending on whether we are called from an exception or
* a poller; this can fail (for example dom0 may not
* yet have consumed past telemetry). */
if (errcnt++ == 0) {
if ( (mctc = mctelem_reserve(which)) != NULL ) {
mci = mctelem_dataptr(mctc);
mcinfo_clear(mci);
- mig = (struct mcinfo_global*)x86_mcinfo_reserve
- (mci, sizeof(struct mcinfo_global));
+ mig = x86_mcinfo_reserve(mci, sizeof(struct mcinfo_global));
            /* mc_info should at least hold the global information */
ASSERT(mig);
mca_init_global(mc_flags, mig);
return mci != NULL ? mctc : NULL; /* may be NULL */
}
-#define DOM_NORMAL 0
-#define DOM0_TRAP 1
-#define DOMU_TRAP 2
-#define DOMU_KILLED 4
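+
+/*
+ * Use a trylock loop rather than spin_lock(), so that a CPU waiting
+ * for the lock keeps checking (via mce_panic_check()) whether another
+ * CPU has already started an MCE panic, instead of deadlocking behind
+ * a dead lock holder.
+ */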
+static void mce_spin_lock(spinlock_t *lk)
+{
+ while (!spin_trylock(lk)) {
+ cpu_relax();
+ mce_panic_check();
+ }
+}
-/* Shared #MC handler. */
-void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
- struct mca_banks *bankmask)
+static void mce_spin_unlock(spinlock_t *lk)
{
- int xen_state_lost, dom0_state_lost, domU_state_lost;
- struct vcpu *v = current;
- struct domain *curdom = v->domain;
- domid_t domid = curdom->domain_id;
- int ctx_xen, ctx_dom0, ctx_domU;
- uint32_t dom_state = DOM_NORMAL;
- mctelem_cookie_t mctc = NULL;
- struct mca_summary bs;
- struct mc_info *mci = NULL;
- int irqlocked = 0;
- uint64_t gstatus;
- int ripv;
+ spin_unlock(lk);
+}
- /* This handler runs as interrupt gate. So IPIs from the
- * polling service routine are defered until we're finished.
- */
+static enum mce_result mce_action(struct cpu_user_regs *regs,
+ mctelem_cookie_t mctc);
- /* Disable interrupts for the _vcpu_. It may not re-scheduled to
- * another physical CPU. */
- vcpu_schedule_lock_irq(v);
- irqlocked = 1;
+/*
+ * Return:
+ * -1: if system can't be recovered
+ * 0: Continue to next step
+ */
+static int mce_urgent_action(struct cpu_user_regs *regs,
+ mctelem_cookie_t mctc)
+{
+ uint64_t gstatus;
+
+    if ( mctc == NULL )
+ return 0;
- /* Read global status; if it does not indicate machine check
- * in progress then bail as long as we have a valid ip to return to. */
gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
- ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
- if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
- add_taint(TAINT_MACHINE_CHECK); /* questionable */
- vcpu_schedule_unlock_irq(v);
- irqlocked = 0;
- goto cmn_handler_done;
- }
- /* Go and grab error telemetry. We must choose whether to commit
- * for logging or dismiss the cookie that is returned, and must not
- * reference the cookie after that action.
+ /*
+     * FIXME: when RIPV = EIPV = 0 things are a little bit tricky. The
+     * error may be asynchronous, and we currently have no way to
+     * locate precisely whether it occurred in the guest or in the
+     * hypervisor. To avoid handling the error the wrong way, we treat
+     * it as unrecoverable.
+     *
+     * Another unrecoverable case is RIPV = 0 while in the hypervisor,
+     * since Xen is not preemptible.
*/
- mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs, NULL);
- if (mctc != NULL)
- mci = (struct mc_info *)mctelem_dataptr(mctc);
+ if ( !(gstatus & MCG_STATUS_RIPV) &&
+ (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)) )
+ return -1;
- /* Clear MCIP or another #MC will enter shutdown state */
- gstatus &= ~MCG_STATUS_MCIP;
- mca_wrmsr(MSR_IA32_MCG_STATUS, gstatus);
- wmb();
+ return mce_action(regs, mctc) == MCER_RESET ? -1 : 0;
+}
- /* If no valid errors and our stack is intact, we're done */
- if (ripv && bs.errcnt == 0) {
- vcpu_schedule_unlock_irq(v);
- irqlocked = 0;
- goto cmn_handler_done;
- }
+/* Shared #MC handler. */
+void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
+ struct mca_banks *bankmask, struct mca_banks *clear_bank)
+{
+ uint64_t gstatus;
+ mctelem_cookie_t mctc = NULL;
+ struct mca_summary bs;
- if (bs.uc || bs.pcc)
- add_taint(TAINT_MACHINE_CHECK);
+ mce_spin_lock(&mce_logout_lock);
- /* Machine check exceptions will usually be for UC and/or PCC errors,
- * but it is possible to configure machine check for some classes
- * of corrected error.
- *
- * UC errors could compromise any domain or the hypervisor
- * itself - for example a cache writeback of modified data that
- * turned out to be bad could be for data belonging to anyone, not
- * just the current domain. In the absence of known data poisoning
- * to prevent consumption of such bad data in the system we regard
- * all UC errors as terminal. It may be possible to attempt some
- * heuristics based on the address affected, which guests have
- * mappings to that mfn etc.
- *
- * PCC errors apply to the current context.
- *
- * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
- * and not PCC is terminal - the return instruction pointer
- * pushed onto the stack is bogus. If the interrupt context is
- * the hypervisor or dom0 the game is over, otherwise we can
- * limit the impact to a single domU but only if we trampoline
- * somewhere safely - we can't return and unwind the stack.
- * Since there is no trampoline in place we will treat !RIPV
- * as terminal for any context.
- */
- ctx_xen = SEG_PL(regs->cs) == 0;
- ctx_dom0 = !ctx_xen && (domid == 0);
- ctx_domU = !ctx_xen && !ctx_dom0;
-
- xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
- !ripv;
- dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
- domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));
-
- if (xen_state_lost) {
- /* Now we are going to panic anyway. Allow interrupts, so that
- * printk on serial console can work. */
- vcpu_schedule_unlock_irq(v);
- irqlocked = 0;
-
- printk("Terminal machine check exception occurred in "
- "hypervisor context.\n");
-
- /* If MCG_STATUS_EIPV indicates, the IP on the stack is related
- * to the error then it makes sense to print a stack trace.
- * That can be useful for more detailed error analysis and/or
- * error case studies to figure out, if we can clear
- * xen_impacted and kill a DomU instead
- * (i.e. if a guest only control structure is affected, but then
- * we must ensure the bad pages are not re-used again).
- */
- if (bs.eipv & MCG_STATUS_EIPV) {
- printk("MCE: Instruction Pointer is related to the "
- "error, therefore print the execution state.\n");
- show_execution_state(regs);
- }
-
- /* Commit the telemetry so that panic flow can find it. */
- if (mctc != NULL) {
- x86_mcinfo_dump(mci);
- mctelem_commit(mctc);
- }
- mc_panic("Hypervisor state lost due to machine check "
- "exception.\n");
- /*NOTREACHED*/
+ if (clear_bank != NULL) {
+        memset(clear_bank->bank_map, 0x0,
+               sizeof(long) * BITS_TO_LONGS(clear_bank->num));
}
+ mctc = mcheck_mca_logout(MCA_MCE_SCAN, bankmask, &bs, clear_bank);
- /*
- * Xen hypervisor state is intact. If dom0 state is lost then
- * give it a chance to decide what to do if it has registered
- * a handler for this event, otherwise panic.
- *
- * XXFM Could add some Solaris dom0 contract kill here?
- */
- if (dom0_state_lost) {
- if (dom0 && dom0->max_vcpus && dom0->vcpu[0] &&
- guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
- dom_state = DOM0_TRAP;
- send_guest_trap(dom0, 0, TRAP_machine_check);
- /* XXFM case of return with !ripv ??? */
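+    /*
+     * bs now summarizes the scan: error count (errcnt), uncorrected
+     * errors (uc), processor context corruption (pcc) and software
+     * recoverability (recoverable).
+     */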
+ if (bs.errcnt) {
+ /*
+ * Uncorrected errors must be dealt with in softirq context.
+ */
+ if (bs.uc || bs.pcc) {
+ add_taint(TAINT_MACHINE_CHECK);
+ if (mctc != NULL)
+ mctelem_defer(mctc);
+ /*
+             * If PCC = 1, or the error is otherwise unrecoverable,
+             * context is lost, so reboot now without clearing the
+             * banks, and deal with the telemetry after reboot (the
+             * MSRs are sticky).
+ */
+ if (bs.pcc || !bs.recoverable)
+ cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
} else {
- /* Commit telemetry for panic flow. */
- if (mctc != NULL) {
- x86_mcinfo_dump(mci);
+ if (mctc != NULL)
mctelem_commit(mctc);
- }
- mc_panic("Dom0 state lost due to machine check "
- "exception\n");
- /*NOTREACHED*/
}
+ atomic_set(&found_error, 1);
+
+        /* The last CPU to arrive here takes care of the check/clean-up etc. */
+ atomic_set(&severity_cpu, smp_processor_id());
+
+        if (clear_bank != NULL) {
+            mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
+                       *((unsigned long *)clear_bank), smp_processor_id());
+            mcheck_mca_clearbanks(clear_bank);
+        }
+ } else {
+ if (mctc != NULL)
+ mctelem_dismiss(mctc);
}
+ mce_spin_unlock(&mce_logout_lock);
+
+ mce_barrier_enter(&mce_trap_bar);
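+    /* Rendezvous, then let each CPU evaluate its own urgent action. */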
+    if ( mctc != NULL && mce_urgent_action(regs, mctc) )
+ cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
+ mce_barrier_exit(&mce_trap_bar);
/*
- * If a domU has lost state then send it a trap if it has registered
- * a handler, otherwise crash the domain.
- * XXFM Revisit this functionality.
+ * Wait until everybody has processed the trap.
*/
- if (domU_state_lost) {
- if (guest_has_trap_callback(v->domain, v->vcpu_id,
- TRAP_machine_check)) {
- dom_state = DOMU_TRAP;
- send_guest_trap(curdom, v->vcpu_id,
- TRAP_machine_check);
- } else {
- dom_state = DOMU_KILLED;
- /* Enable interrupts. This basically results in
- * calling sti on the *physical* cpu. But after
- * domain_crash() the vcpu pointer is invalid.
- * Therefore, we must unlock the irqs before killing
- * it. */
- vcpu_schedule_unlock_irq(v);
- irqlocked = 0;
-
- /* DomU is impacted. Kill it and continue. */
- domain_crash(curdom);
+ mce_barrier_enter(&mce_trap_bar);
+ if (atomic_read(&severity_cpu) == smp_processor_id())
+ {
+        /* According to the SDM, if no error bank is found on any CPU,
+         * something unexpected is happening and we can't do any
+         * recovery work; all we can do is reset the system.
+ */
+ if (atomic_read(&found_error) == 0)
+            mc_panic("MCE: No CPU found a valid MCE, reset needed\n");
+ if (!cpumask_empty(&mce_fatal_cpus))
+ {
+ char *ebufp, ebuf[96] = "MCE: Fatal error happened on CPUs ";
+ ebufp = ebuf + strlen(ebuf);
+ cpumask_scnprintf(ebufp, 95 - strlen(ebuf), &mce_fatal_cpus);
+ mc_panic(ebuf);
}
+ atomic_set(&found_error, 0);
}
+ mce_barrier_exit(&mce_trap_bar);
- switch (dom_state) {
- case DOM0_TRAP:
- case DOMU_TRAP:
- /* Enable interrupts. */
- vcpu_schedule_unlock_irq(v);
- irqlocked = 0;
-
- /* guest softirqs and event callbacks are scheduled
- * immediately after this handler exits. */
- break;
- case DOMU_KILLED:
- /* Nothing to do here. */
- break;
-
- case DOM_NORMAL:
- vcpu_schedule_unlock_irq(v);
- irqlocked = 0;
- break;
+    /*
+     * Clear MCIP only after the fatal check above: while MCIP is set,
+     * another #MC would send the processor into shutdown state.
+     */
+ mce_barrier_enter(&mce_trap_bar);
+ gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
+ if ((gstatus & MCG_STATUS_MCIP) != 0) {
+        mce_printk(MCE_CRITICAL, "MCE: Clear MCIP at last step\n");
+ mca_wrmsr(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
}
+ mce_barrier_exit(&mce_trap_bar);
- cmn_handler_done:
- BUG_ON(irqlocked);
- BUG_ON(!ripv);
-
- if (bs.errcnt) {
- /* Not panicing, so forward telemetry to dom0 now if it
- * is interested. */
- if (dom0_vmce_enabled()) {
- if (mctc != NULL)
- mctelem_commit(mctc);
- send_global_virq(VIRQ_MCA);
- } else {
- x86_mcinfo_dump(mci);
- if (mctc != NULL)
- mctelem_dismiss(mctc);
- }
- } else if (mctc != NULL) {
- mctelem_dismiss(mctc);
- }
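+    /* Defer logging and recovery work to softirq context (mce_softirq()). */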
+ raise_softirq(MACHINE_CHECK_SOFTIRQ);
}
void mcheck_mca_clearbanks(struct mca_banks *bankmask)
mc_panic_dump();
panic("HARDWARE ERROR");
}
+
+/* Machine check ownership algorithm:
+ * When an error happens, all CPUs serially read their MSR banks.
+ * The first CPU to fetch an error bank's info clears that bank;
+ * later readers see nothing. That first CPU is the actual mce_owner.
+ *
+ * A fatal (pcc = 1) error might crash the machine before we are able
+ * to log it. To avoid losing the log, we adopt a two-round scan:
+ * Round1: simply scan; if pcc = 1 or ripv = 0 is found, simply reset.
+ *         All MCE banks are sticky, so on reboot the MCE polling
+ *         mechanism will collect and log those errors.
+ * Round2: do all the MCE processing logic as normal.
+ */
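+
+/*
+ * Here Round1 roughly corresponds to mcheck_cmn_handler() marking
+ * fatal CPUs in mce_fatal_cpus and panicking without clearing the
+ * banks, and Round2 to the deferred processing in mce_softirq().
+ */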
+
+/* May be called in #MC context: take no locks, do no printk. */
+static enum mce_result mce_action(struct cpu_user_regs *regs,
+ mctelem_cookie_t mctc)
+{
+ struct mc_info *local_mi;
+ enum mce_result bank_result = MCER_NOERROR;
+ enum mce_result worst_result = MCER_NOERROR;
+ struct mcinfo_common *mic = NULL;
+ struct mca_binfo binfo;
+ const struct mca_error_handler *handlers = mce_dhandlers;
+ unsigned int i, handler_num = mce_dhandler_num;
+
+ /* When in mce context, regs is valid */
+ if (regs)
+ {
+ handler_num = mce_uhandler_num;
+ handlers = mce_uhandlers;
+ }
+
+    /* At least a default handler should be registered */
+ ASSERT(handler_num);
+
+    local_mi = (struct mc_info *)mctelem_dataptr(mctc);
+ x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
+ if (mic == NULL) {
+        printk(KERN_ERR "MCE: failed to get local buffer entry\n");
+ return MCER_CONTINUE;
+ }
+
+ memset(&binfo, 0, sizeof(binfo));
+ binfo.mig = (struct mcinfo_global *)mic;
+ binfo.mi = local_mi;
+
+ /* Processing bank information */
+ x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
+
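+    /* Dispatch each bank record to the first handler that owns the
+     * error; stop early if a handler demands a reset. */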
+ for ( ; bank_result != MCER_RESET && mic && mic->size;
+ mic = x86_mcinfo_next(mic) )
+ {
+ if (mic->type != MC_TYPE_BANK) {
+ continue;
+ }
+        binfo.mib = (struct mcinfo_bank *)mic;
+ binfo.bank = binfo.mib->mc_bank;
+ bank_result = MCER_NOERROR;
+ for ( i = 0; i < handler_num; i++ ) {
+ if (handlers[i].owned_error(binfo.mib->mc_status))
+ {
+ handlers[i].recovery_handler(&binfo, &bank_result, regs);
+ if (worst_result < bank_result)
+ worst_result = bank_result;
+ break;
+ }
+ }
+ ASSERT(i != handler_num);
+ }
+
+ return worst_result;
+}
+
+/*
+ * Called from mctelem_process_deferred. Return 1 if the telemetry
+ * should be committed for dom0 consumption, 0 if it should be
+ * dismissed.
+ */
+static int mce_delayed_action(mctelem_cookie_t mctc)
+{
+ enum mce_result result;
+ int ret = 0;
+
+ result = mce_action(NULL, mctc);
+
+ switch (result)
+ {
+ case MCER_RESET:
+ dprintk(XENLOG_ERR, "MCE delayed action failed\n");
+ is_mc_panic = 1;
+ x86_mcinfo_dump(mctelem_dataptr(mctc));
+ panic("MCE: Software recovery failed for the UCR\n");
+ break;
+ case MCER_RECOVERED:
+ dprintk(XENLOG_INFO, "MCE: Error is successfully recovered\n");
+ ret = 1;
+ break;
+ case MCER_CONTINUE:
+ dprintk(XENLOG_INFO, "MCE: Error can't be recovered, "
+ "system is tainted\n");
+ x86_mcinfo_dump(mctelem_dataptr(mctc));
+ ret = 1;
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/* Softirq handler for #MC processing */
+static void mce_softirq(void)
+{
+ int cpu = smp_processor_id();
+ unsigned int workcpu;
+
+ mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);
+
+ mce_barrier_enter(&mce_inside_bar);
+
+ /*
+ * Everybody is here. Now let's see who gets to do the
+ * recovery work. Right now we just see if there's a CPU
+ * that did not have any problems, and pick that one.
+ *
+     * First, just set a default value: the last CPU to reach this
+     * point overwrites the value and becomes the default.
+ */
+
+ atomic_set(&severity_cpu, cpu);
+
+ mce_barrier_enter(&mce_severity_bar);
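+    /* Prefer a CPU that has no deferred telemetry of its own. */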
+ if (!mctelem_has_deferred(cpu))
+ atomic_set(&severity_cpu, cpu);
+ mce_barrier_exit(&mce_severity_bar);
+
+ /* We choose severity_cpu for further processing */
+ if (atomic_read(&severity_cpu) == cpu) {
+
+ mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);
+
+        /* Step 1: fill the DOM0 log buffer, the vMCE injection buffer
+         * and the vMCE MSR virtualization buffer.
+ */
+ for_each_online_cpu(workcpu) {
+ mctelem_process_deferred(workcpu, mce_delayed_action);
+ }
+
+        /* Step 2: send the log to DOM0 through a vIRQ */
+ if (dom0_vmce_enabled()) {
+ mce_printk(MCE_VERBOSE, "MCE: send MCE# to DOM0 through virq\n");
+ send_global_virq(VIRQ_MCA);
+ }
+ }
+
+ mce_barrier_exit(&mce_inside_bar);
+}
+
+void mce_handler_init(void)
+{
+ if (smp_processor_id() != 0)
+ return;
+
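+    /* The barriers, lock and softirq below are host-global; the boot
+     * CPU initializes them once for all CPUs. */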
+    /* Callback registration; do we really need so many callbacks? */
+    /* MCE handler data initialization */
+ mce_barrier_init(&mce_inside_bar);
+ mce_barrier_init(&mce_severity_bar);
+ mce_barrier_init(&mce_trap_bar);
+ spin_lock_init(&mce_logout_lock);
+ open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
+}
}
#endif /* CONFIG_X86_MCE_THERMAL */
-static struct mce_softirq_barrier mce_inside_bar, mce_severity_bar;
-static struct mce_softirq_barrier mce_trap_bar;
-
-/*
- * mce_logout_lock should only be used in the trap handler,
- * while MCIP has not been cleared yet in the global status
- * register. Other use is not safe, since an MCE trap can
- * happen at any moment, which would cause lock recursion.
- */
-static DEFINE_SPINLOCK(mce_logout_lock);
-
-static atomic_t severity_cpu = ATOMIC_INIT(-1);
-static atomic_t found_error = ATOMIC_INIT(0);
-static cpumask_t mce_fatal_cpus;
-
-static const struct mca_error_handler *__read_mostly mce_dhandlers;
-static const struct mca_error_handler *__read_mostly mce_uhandlers;
-static unsigned int __read_mostly mce_dhandler_num;
-static unsigned int __read_mostly mce_uhandler_num;
-
-/* Maybe called in MCE context, no lock, no printk */
-static enum mce_result mce_action(struct cpu_user_regs *regs,
- mctelem_cookie_t mctc)
-{
- struct mc_info *local_mi;
- enum mce_result bank_result = MCER_NOERROR;
- enum mce_result worst_result = MCER_NOERROR;
- struct mcinfo_common *mic = NULL;
- struct mca_binfo binfo;
- const struct mca_error_handler *handlers = mce_dhandlers;
- unsigned int i, handler_num = mce_dhandler_num;
-
- /* When in mce context, regs is valid */
- if (regs)
- {
- handler_num = mce_uhandler_num;
- handlers = mce_uhandlers;
- }
-
- /* At least a default handler should be registerd */
- ASSERT(handler_num);
-
- local_mi = (struct mc_info*)mctelem_dataptr(mctc);
- x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
- if (mic == NULL) {
- printk(KERN_ERR "MCE: get local buffer entry failed\n ");
- return MCER_CONTINUE;
- }
-
- memset(&binfo, 0, sizeof(binfo));
- binfo.mig = (struct mcinfo_global *)mic;
- binfo.mi = local_mi;
-
- /* Processing bank information */
- x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
-
- for ( ; bank_result != MCER_RESET && mic && mic->size;
- mic = x86_mcinfo_next(mic) )
- {
- if (mic->type != MC_TYPE_BANK) {
- continue;
- }
- binfo.mib = (struct mcinfo_bank*)mic;
- binfo.bank = binfo.mib->mc_bank;
- bank_result = MCER_NOERROR;
- for ( i = 0; i < handler_num; i++ ) {
- if (handlers[i].owned_error(binfo.mib->mc_status))
- {
- handlers[i].recovery_handler(&binfo, &bank_result, regs);
- if (worst_result < bank_result)
- worst_result = bank_result;
- break;
- }
- }
- ASSERT(i != handler_num);
- }
-
- return worst_result;
-}
-
-/*
- * Called from mctelem_process_deferred. Return 1 if the telemetry
- * should be committed for dom0 consumption, 0 if it should be
- * dismissed.
- */
-static int mce_delayed_action(mctelem_cookie_t mctc)
-{
- enum mce_result result;
- int ret = 0;
-
- result = mce_action(NULL, mctc);
-
- switch (result)
- {
- case MCER_RESET:
- dprintk(XENLOG_ERR, "MCE delayed action failed\n");
- is_mc_panic = 1;
- x86_mcinfo_dump(mctelem_dataptr(mctc));
- panic("MCE: Software recovery failed for the UCR\n");
- break;
- case MCER_RECOVERED:
- dprintk(XENLOG_INFO, "MCE: Error is successfully recovered\n");
- ret = 1;
- break;
- case MCER_CONTINUE:
- dprintk(XENLOG_INFO, "MCE: Error can't be recovered, "
- "system is tainted\n");
- x86_mcinfo_dump(mctelem_dataptr(mctc));
- ret = 1;
- break;
- default:
- ret = 0;
- break;
- }
- return ret;
-}
-
-/* Softirq Handler for this MCE# processing */
-static void mce_softirq(void)
-{
- int cpu = smp_processor_id();
- unsigned int workcpu;
-
- mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);
-
- mce_barrier_enter(&mce_inside_bar);
-
- /*
- * Everybody is here. Now let's see who gets to do the
- * recovery work. Right now we just see if there's a CPU
- * that did not have any problems, and pick that one.
- *
- * First, just set a default value: the last CPU who reaches this
- * will overwrite the value and become the default.
- */
-
- atomic_set(&severity_cpu, cpu);
-
- mce_barrier_enter(&mce_severity_bar);
- if (!mctelem_has_deferred(cpu))
- atomic_set(&severity_cpu, cpu);
- mce_barrier_exit(&mce_severity_bar);
-
- /* We choose severity_cpu for further processing */
- if (atomic_read(&severity_cpu) == cpu) {
-
- mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);
-
- /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
- * vMCE MSRs virtualization buffer
- */
- for_each_online_cpu(workcpu) {
- mctelem_process_deferred(workcpu, mce_delayed_action);
- }
-
- /* Step2: Send Log to DOM0 through vIRQ */
- if (dom0_vmce_enabled()) {
- mce_printk(MCE_VERBOSE, "MCE: send MCE# to DOM0 through virq\n");
- send_global_virq(VIRQ_MCA);
- }
- }
-
- mce_barrier_exit(&mce_inside_bar);
-}
-
-/*
- * Return:
- * -1: if system can't be recoved
- * 0: Continoue to next step
- */
-static int mce_urgent_action(struct cpu_user_regs *regs,
- mctelem_cookie_t mctc)
-{
- uint64_t gstatus;
-
- if ( mctc == NULL)
- return 0;
-
- gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
-
- /*
- * FIXME: When RIPV = EIPV = 0, it's a little bit tricky. It may be an
- * asynchronic error, currently we have no way to precisely locate
- * whether the error occur at guest or hypervisor.
- * To avoid handling error in wrong way, we treat it as unrecovered.
- *
- * Another unrecovered case is RIPV = 0 while in hypervisor
- * since Xen is not pre-emptible.
- */
- if ( !(gstatus & MCG_STATUS_RIPV) &&
- (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)) )
- return -1;
-
- return mce_action(regs, mctc) == MCER_RESET ? -1 : 0;
-}
-
-/* Machine Check owner judge algorithm:
- * When error happens, all cpus serially read its msr banks.
- * The first CPU who fetches the error bank's info will clear
- * this bank. Later readers can't get any infor again.
- * The first CPU is the actual mce_owner
- *
- * For Fatal (pcc=1) error, it might cause machine crash
- * before we're able to log. For avoiding log missing, we adopt two
- * round scanning:
- * Round1: simply scan. If found pcc = 1 or ripv = 0, simply reset.
- * All MCE banks are sticky, when boot up, MCE polling mechanism
- * will help to collect and log those MCE errors.
- * Round2: Do all MCE processing logic as normal.
- */
-
-static void mce_handler_init(void)
-{
- if (smp_processor_id() != 0)
- return;
-
- /* callback register, do we really need so many callback? */
- /* mce handler data initialization */
- mce_barrier_init(&mce_inside_bar);
- mce_barrier_init(&mce_severity_bar);
- mce_barrier_init(&mce_trap_bar);
- spin_lock_init(&mce_logout_lock);
- open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
-}
-
-static void mce_spin_lock(spinlock_t *lk)
-{
- while (!spin_trylock(lk)) {
- cpu_relax();
- mce_panic_check();
- }
-}
-
-static void mce_spin_unlock(spinlock_t *lk)
-{
- spin_unlock(lk);
-}
-
/* Intel MCE handler */
static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr)
{
static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
{
- uint64_t gstatus;
- mctelem_cookie_t mctc = NULL;
- struct mca_summary bs;
- struct mca_banks *clear_bank;
-
- mce_spin_lock(&mce_logout_lock);
-
- clear_bank = __get_cpu_var(mce_clear_banks);
- memset( clear_bank->bank_map, 0x0,
- sizeof(long) * BITS_TO_LONGS(clear_bank->num));
- mctc = mcheck_mca_logout(MCA_MCE_SCAN, mca_allbanks, &bs, clear_bank);
-
- if (bs.errcnt) {
- /*
- * Uncorrected errors must be dealth with in softirq context.
- */
- if (bs.uc || bs.pcc) {
- add_taint(TAINT_MACHINE_CHECK);
- if (mctc != NULL)
- mctelem_defer(mctc);
- /*
- * For PCC=1 and can't be recovered, context is lost, so reboot now without
- * clearing the banks, and deal with the telemetry after reboot
- * (the MSRs are sticky)
- */
- if (bs.pcc || !bs.recoverable)
- cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
- } else {
- if (mctc != NULL)
- mctelem_commit(mctc);
- }
- atomic_set(&found_error, 1);
-
- /* The last CPU will be take check/clean-up etc */
- atomic_set(&severity_cpu, smp_processor_id());
-
- mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
- *((unsigned long*)clear_bank), smp_processor_id());
- mcheck_mca_clearbanks(clear_bank);
- } else {
- if (mctc != NULL)
- mctelem_dismiss(mctc);
- }
- mce_spin_unlock(&mce_logout_lock);
-
- mce_barrier_enter(&mce_trap_bar);
- if ( mctc != NULL && mce_urgent_action(regs, mctc))
- cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
- mce_barrier_exit(&mce_trap_bar);
- /*
- * Wait until everybody has processed the trap.
- */
- mce_barrier_enter(&mce_trap_bar);
- if (atomic_read(&severity_cpu) == smp_processor_id())
- {
- /* According to SDM, if no error bank found on any cpus,
- * something unexpected happening, we can't do any
- * recovery job but to reset the system.
- */
- if (atomic_read(&found_error) == 0)
- mc_panic("MCE: No CPU found valid MCE, need reset\n");
- if (!cpumask_empty(&mce_fatal_cpus))
- {
- char *ebufp, ebuf[96] = "MCE: Fatal error happened on CPUs ";
- ebufp = ebuf + strlen(ebuf);
- cpumask_scnprintf(ebufp, 95 - strlen(ebuf), &mce_fatal_cpus);
- mc_panic(ebuf);
- }
- atomic_set(&found_error, 0);
- }
- mce_barrier_exit(&mce_trap_bar);
-
- /* Clear flags after above fatal check */
- mce_barrier_enter(&mce_trap_bar);
- gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
- if ((gstatus & MCG_STATUS_MCIP) != 0) {
- mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step");
- mca_wrmsr(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
- }
- mce_barrier_exit(&mce_trap_bar);
-
- raise_softirq(MACHINE_CHECK_SOFTIRQ);
+ mcheck_cmn_handler(regs, error_code, mca_allbanks,
+ __get_cpu_var(mce_clear_banks));
}
/* According to MCA OS writer guide, CMCI handler need to clear bank when