From: Keir Fraser Date: Fri, 20 Mar 2009 17:25:29 +0000 (+0000) Subject: MCA interfaces between XEN/DOM0, let DOM0 know the MCA recovery action X-Git-Tag: 3.4.0-rc1~100^2~2 X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=c8596872e035392185e014a736169ea522d30530;p=xen.git MCA interfaces between XEN/DOM0, let DOM0 know the MCA recovery action Signed-off-by: Jiang, yunhong Signed-off-by: Ke, liping --- diff --git a/xen/arch/x86/cpu/mcheck/x86_mca.h b/xen/arch/x86/cpu/mcheck/x86_mca.h index a84c9dc100..ac98744932 100644 --- a/xen/arch/x86/cpu/mcheck/x86_mca.h +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h @@ -87,6 +87,53 @@ typedef DECLARE_BITMAP(cpu_banks_t, MAX_NR_BANKS); DECLARE_PER_CPU(cpu_banks_t, mce_banks_owned); +/* Below interfaces are defined for MCA internal processing: + * a. pre_handler will be called early in MCA ISR context, mainly for early + * need_reset detection to avoid losing logs. Also, it is used to determine + * the impacted DOMAIN if possible. + * b. mca_error_handler is actually a (error_action_index, + * recovery_handler pointer) pair. The defined recovery_handler + * performs the actual recovery operations such as page_offline, cpu_offline + * in softIRQ context when the per_bank MCA error matches the corresponding + * mca_code index. If pre_handler can't determine the impacted domain, + * recovery_handler must figure it out. 
+*/ + +/* MCA error has been recovered successfully by the recovery action (bit 0) */ +#define MCA_RECOVERED (0x1 << 0) +/* MCA error impacts the specified DOMAIN in owner field below (bit 1) */ +#define MCA_OWNER (0x1 << 1) +/* MCA error can't be recovered and needs reset (bit 2) */ +#define MCA_NEED_RESET (0x1 << 2) +/* MCA error needs further actions in softIRQ context for recovery (bit 3) */ +#define MCA_MORE_ACTION (0x1 << 3) + +struct mca_handle_result +{ + uint32_t result; + /* Used when result & MCA_OWNER */ + domid_t owner; + /* Used by mca_error_handler, when result & MCA_RECOVERED */ + struct recovery_action *action; +}; + +extern void (*mca_prehandler)( struct cpu_user_regs *regs, + struct mca_handle_result *result); + +struct mca_error_handler +{ + /* Assume corresponding recovery action could be uniquely + * identified by mca_code. Otherwise, we might need to have + * a separate function to decode the corresponding actions + * for the particular mca error later. + */ + uint16_t mca_code; + void (*recovery_handler)( struct mcinfo_bank *bank, + struct mcinfo_global *global, + struct mcinfo_extended *extension, + struct mca_handle_result *result); +}; + /* Global variables */ extern int mce_disabled; extern unsigned int nr_mce_banks; diff --git a/xen/include/public/arch-x86/xen-mca.h b/xen/include/public/arch-x86/xen-mca.h index 521864b448..b02ebf0e07 100644 --- a/xen/include/public/arch-x86/xen-mca.h +++ b/xen/include/public/arch-x86/xen-mca.h @@ -104,6 +104,7 @@ #define MC_TYPE_GLOBAL 0 #define MC_TYPE_BANK 1 #define MC_TYPE_EXTENDED 2 +#define MC_TYPE_RECOVERY 3 struct mcinfo_common { uint16_t type; /* structure type */ @@ -172,6 +173,68 @@ struct mcinfo_extended { struct mcinfo_msr mc_msr[10]; }; +/* Recovery Action flags. 
Giving recovery result information to DOM0 */ + +/* Xen takes successful recovery action, the error is recovered */ +#define REC_ACTION_RECOVERED (0x1 << 0) +/* No action is performed by XEN */ +#define REC_ACTION_NONE (0x1 << 1) +/* It's possible DOM0 might take action ownership in some cases */ +#define REC_ACTION_NEED_RESET (0x1 << 2) + +/* Different Recovery Action types. If the action is performed successfully, + * the REC_ACTION_RECOVERED flag will be returned. + */ + +/* Page Offline Action */ +#define MC_ACTION_PAGE_OFFLINE (0x1 << 0) +/* CPU offline Action */ +#define MC_ACTION_CPU_OFFLINE (0x1 << 1) +/* L3 cache disable Action */ +#define MC_ACTION_CACHE_SHRINK (0x1 << 2) + +/* The interface below is used between XEN/DOM0 for passing XEN's recovery action + * information to DOM0. + * Usage scenario: After offlining a broken page, XEN might pass its page offline + * recovery action result to DOM0. DOM0 will save the information in + * non-volatile memory for further proactive actions, such as offlining the + * error-prone page earlier during the next reboot. +*/ +struct page_offline_action +{ + /* Params for passing the offlined page number to DOM0 */ + uint64_t mfn; + uint64_t status; +}; + +struct cpu_offline_action +{ + /* Params for passing the identity of the offlined CPU to DOM0 */ + uint32_t mc_socketid; + uint16_t mc_coreid; + uint16_t mc_core_threadid; +}; + +#define MAX_UNION_SIZE 16 +struct mc_recovery +{ + uint16_t mc_bank; /* bank nr */ + uint8_t action_flags; /* presumably REC_ACTION_* bits - TODO confirm */ + uint8_t action_types; /* presumably MC_ACTION_* bits - TODO confirm */ + union { + struct page_offline_action page_retire; + struct cpu_offline_action cpu_offline; + uint8_t pad[MAX_UNION_SIZE]; /* union is at least MAX_UNION_SIZE bytes */ + } action_info; +}; + +struct mcinfo_recovery +{ + struct mcinfo_common common; + struct mc_recovery mc_action; +}; + + #define MCINFO_HYPERCALLSIZE 1024 #define MCINFO_MAXSIZE 768