ia64/xen-unstable

changeset 19528:0ed8616b99d6

x86 mce: fix a few possible issues

Do not share synchronization variables between the trap handler and
the softirq handler: an MCE can arrive at any moment, so shared state
could be clobbered mid-processing. Abstract the synchronization bits
into functions. Make the synchronization code aware of a panic, so
that spinning with interrupts disabled is avoided.
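
For reference, a minimal sketch of the panic-aware rendezvous barrier
this change introduces (mce_barrier_enter/exit), written here with C11
atomics standing in for Xen's atomic_t and mb(); it illustrates the
technique and is not the literal patch code:

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool is_mc_panic;     /* set by mc_panic() in the patch */
    static int num_online_cpus = 4;     /* placeholder for the Xen helper */

    struct mce_barrier {
        atomic_int val;      /* CPUs currently inside the barrier */
        atomic_int ingen;    /* generation bumped on every enter */
        atomic_int outgen;   /* generation bumped on every exit */
    };

    static void mce_barrier_enter(struct mce_barrier *bar)
    {
        atomic_fetch_add(&bar->ingen, 1);
        int gen = atomic_load(&bar->outgen);
        atomic_fetch_add(&bar->val, 1);
        /* Wait for every CPU, but stop spinning if a panic has been
         * declared (the patch instead halts the CPU in mce_panic_check())
         * or another CPU has already started a new exit generation. */
        while (atomic_load(&bar->val) != num_online_cpus &&
               atomic_load(&bar->outgen) == gen) {
            if (atomic_load(&is_mc_panic))
                break;
        }
    }

    static void mce_barrier_exit(struct mce_barrier *bar)
    {
        atomic_fetch_add(&bar->outgen, 1);
        int gen = atomic_load(&bar->ingen);
        atomic_fetch_sub(&bar->val, 1);
        while (atomic_load(&bar->val) != 0 &&
               atomic_load(&bar->ingen) == gen) {
            if (atomic_load(&is_mc_panic))
                break;
        }
    }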

To avoid problems with MCEs happening while we're doing recovery
actions in the softirq handler, implement a deferred list of telemetry
structures, using the mctelem interfaces. This list will get updated
atomically, so any additional MCEs will not cause error telemetry to
be missed or not handled.
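
The deferred list is a per-CPU, singly linked list whose head is only
ever swapped with compare-and-swap, so the trap handler can push
telemetry at any time and the softirq handler can detach the whole
list in one step. A simplified sketch of that idea, using C11 atomics
rather than Xen's cmpxchg()-based mctelem_xchg_head(), and showing a
single list head for brevity:

    #include <stdatomic.h>
    #include <stddef.h>

    struct mctelem_ent {
        struct mctelem_ent *next;
        /* ... telemetry payload ... */
    };

    /* Per-CPU head of deferred telemetry (one head shown here). */
    static _Atomic(struct mctelem_ent *) deferred_head;

    /* Trap-handler side: push with compare-and-swap so a concurrent
     * MCE on another CPU can never lose or corrupt an entry. */
    static void telem_defer(struct mctelem_ent *tep)
    {
        struct mctelem_ent *old = atomic_load(&deferred_head);
        do {
            tep->next = old;
        } while (!atomic_compare_exchange_weak(&deferred_head, &old, tep));
    }

    /* Softirq side: detach the entire list in one atomic swap, then
     * walk and commit/dismiss each entry without further locking. */
    static struct mctelem_ent *telem_take_deferred(void)
    {
        return atomic_exchange(&deferred_head, NULL);
    }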

Signed-off-by: Frank van der Linden <frank.vanderlinden@sun.com>
Signed-off-by: Liping Ke <liping.ke@intel.com>
Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Apr 09 08:41:28 2009 +0100 (2009-04-09)
parents 0e24e9674ded
children 90d5fb694620
files xen/arch/x86/cpu/mcheck/mce.c xen/arch/x86/cpu/mcheck/mce.h xen/arch/x86/cpu/mcheck/mce_intel.c xen/arch/x86/cpu/mcheck/mctelem.c xen/arch/x86/cpu/mcheck/mctelem.h xen/include/asm-x86/domain.h
line diff
     1.1 --- a/xen/arch/x86/cpu/mcheck/mce.c	Wed Apr 08 19:13:04 2009 +0100
     1.2 +++ b/xen/arch/x86/cpu/mcheck/mce.c	Thu Apr 09 08:41:28 2009 +0100
     1.3 @@ -23,6 +23,7 @@
     1.4  #include "mce.h"
     1.5  
     1.6  int mce_disabled = 0;
     1.7 +int is_mc_panic = 0;
     1.8  unsigned int nr_mce_banks;
     1.9  
    1.10  EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
    1.11 @@ -124,6 +125,7 @@ mctelem_cookie_t mcheck_mca_logout(enum 
    1.12  
    1.13  	switch (who) {
    1.14  	case MCA_MCE_HANDLER:
    1.15 +	case MCA_MCE_SCAN:
    1.16  		mcg.mc_flags = MC_FLAG_MCE;
    1.17  		which = MC_URGENT;
    1.18  		break;
    1.19 @@ -219,8 +221,9 @@ mctelem_cookie_t mcheck_mca_logout(enum 
    1.20  			cbret = mc_callback_bank_extended(mci, i, status);
    1.21  		}
    1.22  
    1.23 -		/* Clear status */
    1.24 -		mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
    1.25 +		if (who != MCA_MCE_SCAN)
    1.26 +			/* Clear status */
    1.27 +			mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
    1.28  		wmb();
    1.29  	}
    1.30  
    1.31 @@ -469,6 +472,21 @@ cmn_handler_done:
    1.32  	}
    1.33  }
    1.34  
    1.35 +void mcheck_mca_clearbanks(cpu_banks_t bankmask)
    1.36 +{
    1.37 +	int i;
    1.38 +	uint64_t status;
    1.39 +
    1.40 +	for (i = 0; i < 32 && i < nr_mce_banks; i++) {
    1.41 +		if (!test_bit(i, bankmask))
    1.42 +			continue;
    1.43 +		mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
    1.44 +		if (!(status & MCi_STATUS_VAL))
    1.45 +			continue;
    1.46 +		mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
    1.47 +	}
    1.48 +}
    1.49 +
    1.50  static int amd_mcheck_init(struct cpuinfo_x86 *ci)
    1.51  {
    1.52  	int rc = 0;
    1.53 @@ -1207,9 +1225,8 @@ long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u
    1.54  
    1.55  		add_taint(TAINT_ERROR_INJECT);
    1.56  
    1.57 -		on_selected_cpus(cpumask_of_cpu(target),
    1.58 -		    x86_mc_mceinject, mc_mceinject, 1, 1);
    1.59 -
    1.60 +		on_selected_cpus(cpumask_of_cpu(target), x86_mc_mceinject,
    1.61 +		    mc_mceinject, 1, 1);
    1.62  		break;
    1.63  
    1.64  	default:
    1.65 @@ -1233,6 +1250,7 @@ void set_poll_bankmask(struct cpuinfo_x8
    1.66  }
    1.67  void mc_panic(char *s)
    1.68  {
    1.69 +    is_mc_panic = 1;
    1.70      console_start_sync();
    1.71      printk("Fatal machine check: %s\n", s);
    1.72      printk("\n"
     2.1 --- a/xen/arch/x86/cpu/mcheck/mce.h	Wed Apr 08 19:13:04 2009 +0100
     2.2 +++ b/xen/arch/x86/cpu/mcheck/mce.h	Thu Apr 09 08:41:28 2009 +0100
     2.3 @@ -70,7 +70,8 @@ enum mca_source {
     2.4  	MCA_MCE_HANDLER,
     2.5  	MCA_POLLER,
     2.6  	MCA_CMCI_HANDLER,
     2.7 -	MCA_RESET
     2.8 +	MCA_RESET,
     2.9 +	MCA_MCE_SCAN
    2.10  };
    2.11  
    2.12  enum mca_extinfo {
    2.13 @@ -92,6 +93,8 @@ void set_poll_bankmask(struct cpuinfo_x8
    2.14  DECLARE_PER_CPU(cpu_banks_t, poll_bankmask);
    2.15  DECLARE_PER_CPU(cpu_banks_t, no_cmci_banks);
    2.16  extern int cmci_support;
    2.17 +extern int is_mc_panic;
    2.18 +extern void mcheck_mca_clearbanks(cpu_banks_t);
    2.19  
    2.20  extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, cpu_banks_t,
    2.21      struct mca_summary *);
     3.1 --- a/xen/arch/x86/cpu/mcheck/mce_intel.c	Wed Apr 08 19:13:04 2009 +0100
     3.2 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Thu Apr 09 08:41:28 2009 +0100
     3.3 @@ -18,6 +18,29 @@ int cmci_support = 0;
     3.4  static int nr_intel_ext_msrs = 0;
     3.5  static int firstbank;
     3.6  
     3.7 +/* Below are for MCE handling */
     3.8 +struct mce_softirq_barrier {
     3.9 +	atomic_t val;
    3.10 +	atomic_t ingen;
    3.11 +	atomic_t outgen;
    3.12 +};
    3.13 +
    3.14 +static struct mce_softirq_barrier mce_inside_bar, mce_severity_bar;
    3.15 +static struct mce_softirq_barrier mce_trap_bar;
    3.16 +
    3.17 +/*
    3.18 + * mce_logout_lock should only be used in the trap handler,
    3.19 + * while MCIP has not been cleared yet in the global status
    3.20 + * register. Other use is not safe, since an MCE trap can
    3.21 + * happen at any moment, which would cause lock recursion.
    3.22 + */
    3.23 +static DEFINE_SPINLOCK(mce_logout_lock);
    3.24 +
    3.25 +static atomic_t severity_cpu = ATOMIC_INIT(-1);
    3.26 +
    3.27 +static void mce_barrier_enter(struct mce_softirq_barrier *);
    3.28 +static void mce_barrier_exit(struct mce_softirq_barrier *);
    3.29 +
    3.30  #ifdef CONFIG_X86_MCE_THERMAL
    3.31  static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
    3.32  {
    3.33 @@ -123,7 +146,7 @@ static inline void intel_get_extended_ms
    3.34      if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr)
    3.35           && msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs ) {
    3.36          ext->mc_msr[ext->mc_msrs].reg = msr;
    3.37 -        rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
    3.38 +        mca_rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
    3.39          ++ext->mc_msrs;
    3.40      }
    3.41  }
    3.42 @@ -169,45 +192,6 @@ intel_get_extended_msrs(struct mc_info *
    3.43      return MCA_EXTINFO_GLOBAL;
    3.44  }
    3.45  
    3.46 -/* Below are for MCE handling */
    3.47 -
    3.48 -/* Log worst error severity and offending CPU.,
    3.49 - * Pick this CPU for further processing in softirq */
    3.50 -static int severity_cpu = -1;
    3.51 -static int worst = 0;
    3.52 -
    3.53 -/* Lock of entry@second round scanning in MCE# handler */
    3.54 -static cpumask_t scanned_cpus;
    3.55 -/* Lock for entry@Critical Section in MCE# handler */
    3.56 -static bool_t mce_enter_lock = 0;
    3.57 -/* Record how many CPUs impacted in this MCE# */
    3.58 -static cpumask_t impact_map;
    3.59 -
    3.60 -/* Lock of softirq rendezvous entering point */
    3.61 -static cpumask_t mced_cpus;
    3.62 -/*Lock of softirq rendezvous leaving point */
    3.63 -static cpumask_t finished_cpus;
    3.64 -/* Lock for picking one processing CPU */
    3.65 -static bool_t mce_process_lock = 0;
    3.66 -
    3.67 -/* Spinlock for vMCE# MSR virtualization data */
    3.68 -static DEFINE_SPINLOCK(mce_locks);
    3.69 -
    3.70 -/* Local buffer for holding MCE# data temporarily, sharing between mce
    3.71 - * handler and softirq handler. Those data will be finally committed
    3.72 - * for DOM0 Log and coped to per_dom related data for guest vMCE#
    3.73 - * MSR virtualization.
    3.74 - * Note: When local buffer is still in processing in softirq, another
    3.75 - * MCA comes, simply panic.
    3.76 - */
    3.77 -
    3.78 -struct mc_local_t
    3.79 -{
    3.80 -    bool_t in_use;
    3.81 -    mctelem_cookie_t mctc[NR_CPUS];
    3.82 -};
    3.83 -static struct mc_local_t mc_local;
    3.84 -
    3.85  /* This node list records errors impacting a domain. when one
    3.86   * MCE# happens, one error bank impacts a domain. This error node
    3.87   * will be inserted to the tail of the per_dom data for vMCE# MSR
    3.88 @@ -252,18 +236,22 @@ static int fill_vmsr_data(int cpu, struc
    3.89          }
    3.90  
    3.91          entry = alloc_bank_entry();
    3.92 +        if (entry == NULL)
    3.93 +	    return -1;
    3.94          entry->mci_status = mc_bank->mc_status;
    3.95          entry->mci_addr = mc_bank->mc_addr;
    3.96          entry->mci_misc = mc_bank->mc_misc;
    3.97          entry->cpu = cpu;
    3.98          entry->bank = mc_bank->mc_bank;
    3.99  
   3.100 +	spin_lock(&d->arch.vmca_msrs.lock);
   3.101          /* New error Node, insert to the tail of the per_dom data */
   3.102          list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
   3.103          /* Fill MSR global status */
   3.104          d->arch.vmca_msrs.mcg_status = gstatus;
   3.105          /* New node impact the domain, need another vMCE# injection*/
   3.106          d->arch.vmca_msrs.nr_injection++;
   3.107 +	spin_unlock(&d->arch.vmca_msrs.lock);
   3.108  
   3.109          printk(KERN_DEBUG "MCE: Found error @[CPU%d BANK%d "
   3.110                  "status %"PRIx64" addr %"PRIx64" domid %d]\n ",
   3.111 @@ -273,100 +261,83 @@ static int fill_vmsr_data(int cpu, struc
   3.112      return 0;
   3.113  }
   3.114  
   3.115 -static int mce_actions(void) {
   3.116 -    int32_t cpu, ret;
   3.117 +/*
   3.118 + * Called from mctelem_process_deferred. Return 1 if the telemetry
   3.119 + * should be committed for dom0 consumption, 0 if it should be
   3.120 + * dismissed.
   3.121 + */
   3.122 +static int mce_action(unsigned int cpu, mctelem_cookie_t mctc)
   3.123 +{
   3.124      struct mc_info *local_mi;
   3.125      struct mcinfo_common *mic = NULL;
   3.126      struct mcinfo_global *mc_global;
   3.127      struct mcinfo_bank *mc_bank;
   3.128  
   3.129 -    /* Spinlock is used for exclusive read/write of vMSR virtualization
   3.130 -     * (per_dom vMCE# data)
   3.131 -     */
   3.132 -    spin_lock(&mce_locks);
   3.133 -
   3.134 -    /*
   3.135 -     * If softirq is filling this buffer while another MCE# comes,
   3.136 -     * simply panic
   3.137 -     */
   3.138 -    test_and_set_bool(mc_local.in_use);
   3.139 -
   3.140 -    for_each_cpu_mask(cpu, impact_map) {
   3.141 -        if (mc_local.mctc[cpu] == NULL) {
   3.142 -            printk(KERN_ERR "MCE: get reserved entry failed\n ");
   3.143 -            ret = -1;
   3.144 -            goto end;
   3.145 -        }
   3.146 -        local_mi = (struct mc_info*)mctelem_dataptr(mc_local.mctc[cpu]);
   3.147 -        x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
   3.148 -        if (mic == NULL) {
   3.149 -            printk(KERN_ERR "MCE: get local buffer entry failed\n ");
   3.150 -            ret = -1;
   3.151 -            goto end;
   3.152 -        }
   3.153 -
   3.154 -        mc_global = (struct mcinfo_global *)mic;
   3.155 -
   3.156 -        /* Processing bank information */
   3.157 -        x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
   3.158 -
   3.159 -        for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
   3.160 -            if (mic->type != MC_TYPE_BANK) {
   3.161 -                continue;
   3.162 -            }
   3.163 -            mc_bank = (struct mcinfo_bank*)mic;
   3.164 -            /* Fill vMCE# injection and vMCE# MSR virtualization related data */
   3.165 -            if (fill_vmsr_data(cpu, mc_bank, mc_global->mc_gstatus) == -1) {
   3.166 -                ret = -1;
   3.167 -                goto end;
   3.168 -            }
   3.169 -
   3.170 -            /* TODO: Add recovery actions here, such as page-offline, etc */
   3.171 -        }
   3.172 -    } /* end of impact_map loop */
   3.173 -
   3.174 -    ret = 0;
   3.175 -
   3.176 -end:
   3.177 -
   3.178 -    for_each_cpu_mask(cpu, impact_map) {
   3.179 -        /* This reserved entry is processed, commit it */
   3.180 -        if (mc_local.mctc[cpu] != NULL) {
   3.181 -            mctelem_commit(mc_local.mctc[cpu]);
   3.182 -            printk(KERN_DEBUG "MCE: Commit one URGENT ENTRY\n");
   3.183 -        }
   3.184 +    local_mi = (struct mc_info*)mctelem_dataptr(mctc);
   3.185 +    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
   3.186 +    if (mic == NULL) {
   3.187 +        printk(KERN_ERR "MCE: get local buffer entry failed\n ");
   3.188 +        return 0;
   3.189      }
   3.190  
   3.191 -    test_and_clear_bool(mc_local.in_use);
   3.192 -    spin_unlock(&mce_locks);
   3.193 -    return ret;
   3.194 +    mc_global = (struct mcinfo_global *)mic;
   3.195 +
   3.196 +    /* Processing bank information */
   3.197 +    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
   3.198 +
   3.199 +    for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
   3.200 +        if (mic->type != MC_TYPE_BANK) {
   3.201 +            continue;
   3.202 +        }
   3.203 +        mc_bank = (struct mcinfo_bank*)mic;
   3.204 +        /* Fill vMCE# injection and vMCE# MSR virtualization related data */
   3.205 +        if (fill_vmsr_data(cpu, mc_bank, mc_global->mc_gstatus) == -1)
   3.206 +             break;
   3.207 +
   3.208 +       /* TODO: Add recovery actions here, such as page-offline, etc */
   3.209 +    }
   3.210 +
   3.211 +    return 1;
   3.212  }
   3.213  
   3.214  /* Softirq Handler for this MCE# processing */
   3.215  static void mce_softirq(void)
   3.216  {
   3.217      int cpu = smp_processor_id();
   3.218 +    unsigned int workcpu;
   3.219      cpumask_t affinity;
   3.220  
   3.221 -    /* Wait until all cpus entered softirq */
   3.222 -    while ( cpus_weight(mced_cpus) != num_online_cpus() ) {
   3.223 -        cpu_relax();
   3.224 -    }
   3.225 -    /* Not Found worst error on severity_cpu, it's weird */
   3.226 -    if (severity_cpu == -1) {
   3.227 -        printk(KERN_WARNING "MCE: not found severity_cpu!\n");
   3.228 -        mc_panic("MCE: not found severity_cpu!");
   3.229 -        return;
   3.230 -    }
   3.231 +    printk(KERN_DEBUG "CPU%d enter softirq\n", cpu);
   3.232 +
   3.233 +    mce_barrier_enter(&mce_inside_bar);
   3.234 +
   3.235 +    /*
   3.236 +     * Everybody is here. Now let's see who gets to do the
   3.237 +     * recovery work. Right now we just see if there's a CPU
   3.238 +     * that did not have any problems, and pick that one.
   3.239 +     *
    3.240 +     * First, just set a default value: the last CPU that reaches this
    3.241 +     * point will overwrite the value and become the default.
   3.242 +     */
   3.243 +
   3.244 +    atomic_set(&severity_cpu, cpu);
   3.245 +
   3.246 +    mce_barrier_enter(&mce_severity_bar);
   3.247 +    if (!mctelem_has_deferred(cpu))
   3.248 +        atomic_set(&severity_cpu, cpu);
   3.249 +    mce_barrier_exit(&mce_severity_bar);
   3.250 +
   3.251      /* We choose severity_cpu for further processing */
   3.252 -    if (severity_cpu == cpu) {
   3.253 +    if (atomic_read(&severity_cpu) == cpu) {
   3.254 +
   3.255 +        printk(KERN_DEBUG "CPU%d handling errors\n", cpu);
   3.256  
   3.257          /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
   3.258           * vMCE MSRs virtualization buffer
   3.259           */
   3.260 -        if (mce_actions())
   3.261 -            mc_panic("MCE recovery actions or Filling vMCE MSRS "
   3.262 -                     "virtualization data failed!\n");
   3.263 +        for_each_online_cpu(workcpu) {
   3.264 +	    mctelem_process_deferred(workcpu, mce_action);
   3.265 +        }
   3.266  
   3.267          /* Step2: Send Log to DOM0 through vIRQ */
   3.268          if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
   3.269 @@ -387,26 +358,9 @@ static void mce_softirq(void)
   3.270              vcpu_set_affinity(dom0->vcpu[0], &affinity);
   3.271              vcpu_kick(dom0->vcpu[0]);
   3.272          }
   3.273 -
   3.274 -        /* Clean Data */
   3.275 -        test_and_clear_bool(mce_process_lock);
   3.276 -        cpus_clear(impact_map);
   3.277 -        cpus_clear(scanned_cpus);
   3.278 -        worst = 0;
   3.279 -        cpus_clear(mced_cpus);
   3.280 -        memset(&mc_local, 0x0, sizeof(mc_local));
   3.281      }
   3.282  
   3.283 -    cpu_set(cpu, finished_cpus);
   3.284 -    wmb();
   3.285 -   /* Leave until all cpus finished recovery actions in softirq */
   3.286 -    while ( cpus_weight(finished_cpus) != num_online_cpus() ) {
   3.287 -        cpu_relax();
   3.288 -    }
   3.289 -
   3.290 -    cpus_clear(finished_cpus);
   3.291 -    severity_cpu = -1;
   3.292 -    printk(KERN_DEBUG "CPU%d exit softirq \n", cpu);
   3.293 +    mce_barrier_exit(&mce_inside_bar);
   3.294  }
   3.295  
   3.296  /* Machine Check owner judge algorithm:
   3.297 @@ -424,127 +378,157 @@ static void mce_softirq(void)
   3.298   * Round2: Do all MCE processing logic as normal.
   3.299   */
   3.300  
   3.301 -/* Simple Scan. Panic when found non-recovery errors. Doing this for
   3.302 - * avoiding LOG missing
   3.303 - */
   3.304 -static void severity_scan(void)
   3.305 +static void mce_panic_check(void)
   3.306  {
   3.307 -    uint64_t status;
   3.308 -    int32_t i;
   3.309 -
   3.310 -    /* TODO: For PCC = 0, we need to have further judge. If it is can't be
   3.311 -     * recovered, we need to RESET for avoiding DOM0 LOG missing
   3.312 -     */
   3.313 -    for ( i = 0; i < nr_mce_banks; i++) {
   3.314 -        mca_rdmsrl(MSR_IA32_MC0_STATUS + 4 * i , status);
   3.315 -        if ( !(status & MCi_STATUS_VAL) )
   3.316 -            continue;
   3.317 -        /* MCE handler only handles UC error */
   3.318 -        if ( !(status & MCi_STATUS_UC) )
   3.319 -            continue;
   3.320 -        if ( !(status & MCi_STATUS_EN) )
   3.321 -            continue;
   3.322 -        /*
   3.323 -         * If this was an injected error, keep going, since the
   3.324 -         * interposed value will be lost at reboot.
   3.325 -         */
   3.326 -        if (status & MCi_STATUS_PCC && intpose_lookup(smp_processor_id(),
   3.327 -          MSR_IA32_MC0_STATUS + 4 * i, NULL) == NULL)
   3.328 -            mc_panic("pcc = 1, cpu unable to continue\n");
   3.329 -    }
   3.330 -
   3.331 -    /* TODO: Further judgement for later CPUs here, maybe need MCACOD assistence */
   3.332 -    /* EIPV and RIPV is not a reliable way to judge the error severity */
   3.333 -
   3.334 +      if (is_mc_panic) {
   3.335 +              local_irq_enable();
   3.336 +              for ( ; ; )
   3.337 +                      halt();
   3.338 +      }
   3.339  }
   3.340  
   3.341 +/*
   3.342 + * Initialize a barrier. Just set it to 0.
   3.343 + */
   3.344 +static void mce_barrier_init(struct mce_softirq_barrier *bar)
   3.345 +{
   3.346 +      atomic_set(&bar->val, 0);
   3.347 +      atomic_set(&bar->ingen, 0);
   3.348 +      atomic_set(&bar->outgen, 0);
   3.349 +}
   3.350 +
   3.351 +#if 0
   3.352 +/*
   3.353 + * This function will need to be used when offlining a CPU in the
   3.354 + * recovery actions.
   3.355 + *
   3.356 + * Decrement a barrier only. Needed for cases where the CPU
   3.357 + * in question can't do it itself (e.g. it is being offlined).
   3.358 + */
   3.359 +static void mce_barrier_dec(struct mce_softirq_barrier *bar)
   3.360 +{
   3.361 +      atomic_inc(&bar->outgen);
   3.362 +      wmb();
   3.363 +      atomic_dec(&bar->val);
   3.364 +}
   3.365 +#endif
   3.366 +
   3.367 +static void mce_spin_lock(spinlock_t *lk)
   3.368 +{
   3.369 +      while (!spin_trylock(lk)) {
   3.370 +              cpu_relax();
   3.371 +              mce_panic_check();
   3.372 +      }
   3.373 +}
   3.374 +
   3.375 +static void mce_spin_unlock(spinlock_t *lk)
   3.376 +{
   3.377 +      spin_unlock(lk);
   3.378 +}
   3.379 +
   3.380 +/*
   3.381 + * Increment the generation number and the value. The generation number
   3.382 + * is incremented when entering a barrier. This way, it can be checked
   3.383 + * on exit if a CPU is trying to re-enter the barrier. This can happen
   3.384 + * if the first CPU to make it out immediately exits or re-enters, while
   3.385 + * another CPU that is still in the loop becomes otherwise occupied
   3.386 + * (e.g. it needs to service an interrupt, etc), missing the value
   3.387 + * it's waiting for.
   3.388 + *
   3.389 + * These barrier functions should always be paired, so that the
   3.390 + * counter value will reach 0 again after all CPUs have exited.
   3.391 + */
   3.392 +static void mce_barrier_enter(struct mce_softirq_barrier *bar)
   3.393 +{
   3.394 +      int gen;
   3.395 +
   3.396 +      atomic_inc(&bar->ingen);
   3.397 +      gen = atomic_read(&bar->outgen);
   3.398 +      mb();
   3.399 +      atomic_inc(&bar->val);
   3.400 +      while ( atomic_read(&bar->val) != num_online_cpus() &&
   3.401 +          atomic_read(&bar->outgen) == gen) {
   3.402 +              mb();
   3.403 +              mce_panic_check();
   3.404 +      }
   3.405 +}
   3.406 +
   3.407 +static void mce_barrier_exit(struct mce_softirq_barrier *bar)
   3.408 +{
   3.409 +      int gen;
   3.410 +
   3.411 +      atomic_inc(&bar->outgen);
   3.412 +      gen = atomic_read(&bar->ingen);
   3.413 +      mb();
   3.414 +      atomic_dec(&bar->val);
   3.415 +      while ( atomic_read(&bar->val) != 0 &&
   3.416 +          atomic_read(&bar->ingen) == gen ) {
   3.417 +              mb();
   3.418 +              mce_panic_check();
   3.419 +      }
   3.420 +}
   3.421 +
   3.422 +static void mce_barrier(struct mce_softirq_barrier *bar)
   3.423 +{
   3.424 +      mce_barrier_enter(bar);
   3.425 +      mce_barrier_exit(bar);
   3.426 +}
   3.427  
   3.428  static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
   3.429  {
   3.430 -    unsigned int cpu = smp_processor_id();
   3.431 -    int32_t severity = 0;
   3.432      uint64_t gstatus;
   3.433      mctelem_cookie_t mctc = NULL;
   3.434      struct mca_summary bs;
   3.435  
   3.436 -    /* First round scanning */
   3.437 -    severity_scan();
   3.438 -    cpu_set(cpu, scanned_cpus);
   3.439 -    while (cpus_weight(scanned_cpus) < num_online_cpus())
   3.440 -        cpu_relax();
   3.441 +    mce_spin_lock(&mce_logout_lock);
   3.442  
   3.443 -    wmb();
   3.444 -    /* All CPUs Finished first round scanning */
   3.445 -    if (mc_local.in_use != 0) {
   3.446 -        mc_panic("MCE: Local buffer is being processed, can't handle new MCE!\n");
   3.447 -        return;
   3.448 -    }
   3.449 +    mctc = mcheck_mca_logout(MCA_MCE_SCAN, mca_allbanks, &bs);
   3.450  
   3.451 -    /* Enter Critical Section */
   3.452 -    while (test_and_set_bool(mce_enter_lock)) {
   3.453 -        udelay (1);
   3.454 -    }
   3.455 -
   3.456 -    mctc = mcheck_mca_logout(MCA_MCE_HANDLER, mca_allbanks, &bs);
   3.457 -     /* local data point to the reserved entry, let softirq to
   3.458 -      * process the local data */
   3.459 -    if (!bs.errcnt) {
   3.460 +    if (bs.errcnt) {
   3.461 +        /*
    3.462 +         * Uncorrected errors must be dealt with in softirq context.
   3.463 +         */
   3.464 +        if (bs.uc || bs.pcc) {
   3.465 +            add_taint(TAINT_MACHINE_CHECK);
   3.466 +            if (mctc != NULL)
   3.467 +                mctelem_defer(mctc);
   3.468 +            /*
   3.469 +             * For PCC=1, context is lost, so reboot now without clearing
   3.470 +             * the banks, and deal with the telemetry after reboot
   3.471 +             * (the MSRs are sticky)
   3.472 +             */
   3.473 +            if (bs.pcc)
   3.474 +                mc_panic("State lost due to machine check exception.\n");
   3.475 +        } else {
   3.476 +            if (mctc != NULL)
   3.477 +                mctelem_commit(mctc);
   3.478 +        }
   3.479 +        mcheck_mca_clearbanks(mca_allbanks);
   3.480 +    } else {
   3.481          if (mctc != NULL)
   3.482              mctelem_dismiss(mctc);
   3.483 -        mc_local.mctc[cpu] = NULL;
   3.484 -        cpu_set(cpu, mced_cpus);
   3.485 -        test_and_clear_bool(mce_enter_lock);
   3.486 -        raise_softirq(MACHINE_CHECK_SOFTIRQ);
   3.487 -        return;
   3.488 -    }
   3.489 -    else if ( mctc != NULL) {
   3.490 -        mc_local.mctc[cpu] = mctc;
   3.491      }
   3.492  
   3.493 -    if (bs.uc || bs.pcc)
   3.494 -        add_taint(TAINT_MACHINE_CHECK);
   3.495 +    mce_spin_unlock(&mce_logout_lock);
   3.496  
   3.497 -    if (bs.pcc) {
   3.498 -        printk(KERN_WARNING "PCC=1 should have caused reset\n");
   3.499 -        severity = 3;
   3.500 -    }
   3.501 -    else if (bs.uc) {
   3.502 -        severity = 2;
   3.503 -    }
   3.504 -    else {
   3.505 -        printk(KERN_WARNING "We should skip Correctable Error\n");
   3.506 -        severity = 1;
   3.507 -    }
   3.508 -    /* This is the offending cpu! */
   3.509 -    cpu_set(cpu, impact_map);
   3.510 +    /*
   3.511 +     * Wait until everybody has processed the trap.
   3.512 +     */
   3.513 +    mce_barrier(&mce_trap_bar);
   3.514  
   3.515 -    if ( severity > worst) {
   3.516 -        worst = severity;
   3.517 -        severity_cpu = cpu;
   3.518 +    /*
   3.519 +     * Clear MCIP if it wasn't already. There is a small
   3.520 +     * chance that more than 1 CPU will end up doing this,
   3.521 +     * but that's OK.
   3.522 +     */
   3.523 +    if (bs.errcnt) {
   3.524 +        mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
   3.525 +        if ((gstatus & MCG_STATUS_MCIP) != 0)
   3.526 +            mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
   3.527 +        /* Print MCE error */
   3.528 +        x86_mcinfo_dump(mctelem_dataptr(mctc));
   3.529      }
   3.530 -    cpu_set(cpu, mced_cpus);
   3.531 -    test_and_clear_bool(mce_enter_lock);
   3.532 -    wmb();
   3.533 -
   3.534 -    /* Wait for all cpus Leave Critical */
   3.535 -    while (cpus_weight(mced_cpus) < num_online_cpus())
   3.536 -        cpu_relax();
   3.537 -    /* Print MCE error */
   3.538 -    x86_mcinfo_dump(mctelem_dataptr(mctc));
   3.539  
   3.540 -    /* Pick one CPU to clear MCIP */
   3.541 -    if (!test_and_set_bool(mce_process_lock)) {
   3.542 -        mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
   3.543 -        mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
   3.544 -
   3.545 -        if (worst >= 3) {
   3.546 -            printk(KERN_WARNING "worst=3 should have caused RESET\n");
   3.547 -            mc_panic("worst=3 should have caused RESET");
   3.548 -        }
   3.549 -        else {
   3.550 -            printk(KERN_DEBUG "MCE: trying to recover\n");
   3.551 -        }
   3.552 -    }
   3.553      raise_softirq(MACHINE_CHECK_SOFTIRQ);
   3.554  }
   3.555  
   3.556 @@ -778,6 +762,11 @@ static void mce_init(void)
   3.557  
   3.558      clear_in_cr4(X86_CR4_MCE);
   3.559  
   3.560 +    mce_barrier_init(&mce_inside_bar);
   3.561 +    mce_barrier_init(&mce_severity_bar);
   3.562 +    mce_barrier_init(&mce_trap_bar);
   3.563 +    spin_lock_init(&mce_logout_lock);
   3.564 +
   3.565      /* log the machine checks left over from the previous reset.
   3.566       * This also clears all registers*/
   3.567  
   3.568 @@ -840,6 +829,7 @@ void intel_mce_init_msr(struct domain *d
   3.569      memset(d->arch.vmca_msrs.mci_ctl, ~0,
   3.570             sizeof(d->arch.vmca_msrs.mci_ctl));
   3.571      INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
   3.572 +    spin_lock_init(&d->arch.vmca_msrs.lock);
   3.573  }
   3.574  
   3.575  int intel_mce_wrmsr(u32 msr, u64 value)
   3.576 @@ -849,7 +839,7 @@ int intel_mce_wrmsr(u32 msr, u64 value)
   3.577      unsigned int bank;
   3.578      int ret = 1;
   3.579  
   3.580 -    spin_lock(&mce_locks);
   3.581 +    spin_lock(&d->arch.vmca_msrs.lock);
   3.582      switch(msr)
   3.583      {
   3.584      case MSR_IA32_MCG_CTL:
   3.585 @@ -924,7 +914,7 @@ int intel_mce_wrmsr(u32 msr, u64 value)
   3.586          ret = 0;
   3.587          break;
   3.588      }
   3.589 -    spin_unlock(&mce_locks);
   3.590 +    spin_unlock(&d->arch.vmca_msrs.lock);
   3.591      return ret;
   3.592  }
   3.593  
   3.594 @@ -936,7 +926,7 @@ int intel_mce_rdmsr(u32 msr, u32 *lo, u3
   3.595      struct bank_entry *entry = NULL;
   3.596  
   3.597      *lo = *hi = 0x0;
   3.598 -    spin_lock(&mce_locks);
   3.599 +    spin_lock(&d->arch.vmca_msrs.lock);
   3.600      switch(msr)
   3.601      {
   3.602      case MSR_IA32_MCG_STATUS:
   3.603 @@ -1022,7 +1012,7 @@ int intel_mce_rdmsr(u32 msr, u32 *lo, u3
   3.604          ret = 0;
   3.605          break;
   3.606      }
   3.607 -    spin_unlock(&mce_locks);
   3.608 +    spin_unlock(&d->arch.vmca_msrs.lock);
   3.609      return ret;
   3.610  }
   3.611  
     4.1 --- a/xen/arch/x86/cpu/mcheck/mctelem.c	Wed Apr 08 19:13:04 2009 +0100
     4.2 +++ b/xen/arch/x86/cpu/mcheck/mctelem.c	Thu Apr 09 08:41:28 2009 +0100
     4.3 @@ -109,6 +109,14 @@ static struct mc_telem_ctl {
     4.4  	 * Telemetry array
     4.5  	 */
     4.6  	struct mctelem_ent *mctc_elems;
     4.7 +	/*
     4.8 +	 * Per-CPU processing lists, used for deferred (softirq)
     4.9 +	 * processing of telemetry. mctc_cpu is indexed by the
    4.10 +	 * CPU that the telemetry belongs to. mctc_cpu_processing
    4.11 +	 * is indexed by the CPU that is processing the telemetry.
    4.12 +	 */
    4.13 +	struct mctelem_ent *mctc_cpu[NR_CPUS];
    4.14 +	struct mctelem_ent *mctc_cpu_processing[NR_CPUS];
    4.15  } mctctl;
    4.16  
    4.17  /* Lock protecting all processing lists */
    4.18 @@ -123,6 +131,82 @@ static void *cmpxchgptr(void *ptr, void 
    4.19  	return (void *)cmpxchg(ulp, a, b);
    4.20  }
    4.21  
    4.22 +static void mctelem_xchg_head(struct mctelem_ent **headp,
    4.23 +				struct mctelem_ent **old,
    4.24 +				struct mctelem_ent *new)
    4.25 +{
    4.26 +	for (;;) {
    4.27 +		*old = *headp;
    4.28 +		wmb();
    4.29 +		if (cmpxchgptr(headp, *old, new) == *old)
    4.30 +			break;
    4.31 +	}
    4.32 +}
    4.33 +
    4.34 +
    4.35 +void mctelem_defer(mctelem_cookie_t cookie)
    4.36 +{
    4.37 +	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
    4.38 +
    4.39 +	mctelem_xchg_head(&mctctl.mctc_cpu[smp_processor_id()],
    4.40 +	    &tep->mcte_next, tep);
    4.41 +}
    4.42 +
    4.43 +void mctelem_process_deferred(unsigned int cpu,
    4.44 +			      int (*fn)(unsigned int, mctelem_cookie_t))
    4.45 +{
    4.46 +	struct mctelem_ent *tep;
    4.47 +	struct mctelem_ent *head, *prev;
    4.48 +	int ret;
    4.49 +
    4.50 +	/*
    4.51 +	 * First, unhook the list of telemetry structures, and	
    4.52 +	 * hook it up to the processing list head for this CPU.
    4.53 +	 */
    4.54 +	mctelem_xchg_head(&mctctl.mctc_cpu[cpu],
    4.55 +	    &mctctl.mctc_cpu_processing[smp_processor_id()], NULL);
    4.56 +
    4.57 +	head = mctctl.mctc_cpu_processing[smp_processor_id()];
    4.58 +
    4.59 +	/*
    4.60 +	 * Then, fix up the list to include prev pointers, to make
    4.61 +	 * things a little easier, as the list must be traversed in
    4.62 +	 * chronological order, which is backward from the order they
    4.63 +	 * are in.
    4.64 +	 */
    4.65 +	for (tep = head, prev = NULL; tep != NULL; tep = tep->mcte_next) {
    4.66 +		tep->mcte_prev = prev;
    4.67 +		prev = tep;
    4.68 +	}
    4.69 +
    4.70 +	/*
    4.71 +	 * Now walk the list of telemetry structures, handling each
    4.72 +	 * one of them. Unhooking the structure here does not need to
    4.73 +	 * be atomic, as this list is only accessed from a softirq
    4.74 +	 * context; the MCE handler does not touch it.
    4.75 +	 */
    4.76 +	for (tep = prev; tep != NULL; tep = prev) {
    4.77 +		prev = tep->mcte_prev;
    4.78 +		tep->mcte_next = tep->mcte_prev = NULL;
    4.79 +
    4.80 +		ret = fn(cpu, MCTE2COOKIE(tep));
    4.81 +		if (prev != NULL)
    4.82 +			prev->mcte_next = NULL;
    4.83 +		tep->mcte_prev = tep->mcte_next = NULL;
    4.84 +		if (ret != 0)
    4.85 +			mctelem_commit(MCTE2COOKIE(tep));
    4.86 +		else
    4.87 +			mctelem_dismiss(MCTE2COOKIE(tep));
    4.88 +	}
    4.89 +}
    4.90 +
    4.91 +int mctelem_has_deferred(unsigned int cpu)
    4.92 +{
    4.93 +	if (mctctl.mctc_cpu[cpu] != NULL)
    4.94 +		return 1;
    4.95 +	return 0;
    4.96 +}
    4.97 +
    4.98  /* Free an entry to its native free list; the entry must not be linked on
    4.99   * any list.
   4.100   */
   4.101 @@ -130,21 +214,12 @@ static void mctelem_free(struct mctelem_
   4.102  {
   4.103  	mctelem_class_t target = MCTE_HOME(tep) == MCTE_F_HOME_URGENT ?
   4.104  	    MC_URGENT : MC_NONURGENT;
   4.105 -	struct mctelem_ent **freelp;
   4.106 -	struct mctelem_ent *oldhead;
   4.107  
   4.108  	BUG_ON(tep->mcte_refcnt != 0);
   4.109  	BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);
   4.110  
   4.111  	tep->mcte_prev = NULL;
   4.112 -	freelp = &mctctl.mctc_free[target];
   4.113 -	for (;;) {
   4.114 -		oldhead = *freelp;
   4.115 -		tep->mcte_next = oldhead;
   4.116 -		wmb();
   4.117 -		if (cmpxchgptr(freelp, oldhead, tep) == oldhead)
   4.118 -			break;
   4.119 -	}
   4.120 +	mctelem_xchg_head(&mctctl.mctc_free[target], &tep->mcte_next, tep);
   4.121  }
   4.122  
   4.123  /* Increment the reference count of an entry that is not linked on to
   4.124 @@ -308,22 +383,13 @@ void mctelem_dismiss(mctelem_cookie_t co
   4.125  void mctelem_commit(mctelem_cookie_t cookie)
   4.126  {
   4.127  	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
   4.128 -	struct mctelem_ent **commlp;
   4.129 -	struct mctelem_ent *oldhead;
   4.130  	mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
   4.131  	    MC_URGENT : MC_NONURGENT;
   4.132  
   4.133  	BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL);
   4.134  	MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED);
   4.135  
   4.136 -	commlp = &mctctl.mctc_committed[target];
   4.137 -	for (;;) {
   4.138 -		oldhead = *commlp;
   4.139 -		tep->mcte_prev = oldhead;
   4.140 -		wmb();
   4.141 -		if (cmpxchgptr(commlp, oldhead, tep) == oldhead)
   4.142 -			break;
   4.143 -	}
   4.144 +	mctelem_xchg_head(&mctctl.mctc_committed[target], &tep->mcte_prev, tep);
   4.145  }
   4.146  
   4.147  /* Move telemetry from committed list to processing list, reversing the
   4.148 @@ -358,13 +424,7 @@ static void mctelem_append_processing(mc
   4.149  	 * the list we unlink in a well-known location so it can be
   4.150  	 * picked up in panic code should we panic between this unlink
   4.151  	 * and the append to the processing list. */
   4.152 -	for (;;) {
   4.153 -		dangling[target] = *commlp;
   4.154 -		wmb();
   4.155 -		if (cmpxchgptr(commlp, dangling[target], NULL) ==
   4.156 -		    dangling[target])
   4.157 -			break;
   4.158 -	}
   4.159 +	mctelem_xchg_head(commlp, &dangling[target], NULL);
   4.160  
   4.161  	if (dangling[target] == NULL)
   4.162  		return;
     5.1 --- a/xen/arch/x86/cpu/mcheck/mctelem.h	Wed Apr 08 19:13:04 2009 +0100
     5.2 +++ b/xen/arch/x86/cpu/mcheck/mctelem.h	Thu Apr 09 08:41:28 2009 +0100
     5.3 @@ -67,5 +67,9 @@ extern void mctelem_dismiss(mctelem_cook
     5.4  extern mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t);
     5.5  extern void mctelem_consume_oldest_end(mctelem_cookie_t);
     5.6  extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t);
     5.7 +extern void mctelem_defer(mctelem_cookie_t);
     5.8 +extern void mctelem_process_deferred(unsigned int,
     5.9 +    int (*)(unsigned int, mctelem_cookie_t));
    5.10 +int mctelem_has_deferred(unsigned int);
    5.11  
    5.12  #endif
     6.1 --- a/xen/include/asm-x86/domain.h	Wed Apr 08 19:13:04 2009 +0100
     6.2 +++ b/xen/include/asm-x86/domain.h	Thu Apr 09 08:41:28 2009 +0100
     6.3 @@ -226,6 +226,7 @@ struct domain_mca_msrs
     6.4      uint64_t mci_ctl[MAX_NR_BANKS];
     6.5      uint16_t nr_injection;
     6.6      struct list_head impact_header;
     6.7 +    spinlock_t lock;
     6.8  };
     6.9  
    6.10  struct arch_domain