amd64_edac: cleanup/complete NB MCE decoding
* don't dump info which mcheck already does
* update to newest BKDG
* mv amd64_process_error_info -> amd64_decode_nb_mce
* shorten error struct names
* remove redundant info ptr in amd64_process_error_info
* remove unused ErrorCodeExt[19:16] (MCx_STATUS) defines

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
This commit is contained in:
parent
ef44cc4c22
commit
5110dbdeab
4 changed files with 57 additions and 100 deletions
|
@ -2355,62 +2355,47 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
|
|||
"Error Overflow set");
|
||||
}
|
||||
|
||||
int amd64_process_error_info(struct mem_ctl_info *mci,
|
||||
struct err_regs *regs,
|
||||
void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
|
||||
int handle_errors)
|
||||
{
|
||||
struct amd64_pvt *pvt;
|
||||
u32 err_code, ext_ec;
|
||||
int gart_tlb_error = 0;
|
||||
|
||||
pvt = mci->pvt_info;
|
||||
struct amd64_pvt *pvt = mci->pvt_info;
|
||||
int ecc;
|
||||
u32 ec = ERROR_CODE(regs->nbsl);
|
||||
u32 xec = EXT_ERROR_CODE(regs->nbsl);
|
||||
|
||||
if (!handle_errors)
|
||||
return 1;
|
||||
return;
|
||||
|
||||
debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
|
||||
debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n",
|
||||
pvt->mc_node_id, regs->nbeah, regs->nbeal);
|
||||
debugf1(" nbsh(0x%.8x) nbsl(0x%.8x)\n",
|
||||
regs->nbsh, regs->nbsl);
|
||||
debugf1(" Valid Error=%s Overflow=%s\n",
|
||||
(regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False",
|
||||
(regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False");
|
||||
debugf1(" Err Uncorrected=%s MCA Error Reporting=%s\n",
|
||||
(regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ?
|
||||
"True" : "False",
|
||||
(regs->nbsh & K8_NBSH_ERR_ENABLE) ?
|
||||
"True" : "False");
|
||||
debugf1(" MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
|
||||
(regs->nbsh & K8_NBSH_MISC_ERR_VALID) ?
|
||||
"True" : "False",
|
||||
(regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ?
|
||||
"True" : "False",
|
||||
(regs->nbsh & K8_NBSH_PCC) ?
|
||||
"True" : "False");
|
||||
debugf1(" CECC=%s UECC=%s Found by Scruber=%s\n",
|
||||
(regs->nbsh & K8_NBSH_CECC) ?
|
||||
"True" : "False",
|
||||
(regs->nbsh & K8_NBSH_UECC) ?
|
||||
"True" : "False",
|
||||
(regs->nbsh & K8_NBSH_ERR_SCRUBER) ?
|
||||
"True" : "False");
|
||||
debugf1(" CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
|
||||
(regs->nbsh & K8_NBSH_CORE0) ? "True" : "False",
|
||||
(regs->nbsh & K8_NBSH_CORE1) ? "True" : "False",
|
||||
(regs->nbsh & K8_NBSH_CORE2) ? "True" : "False",
|
||||
(regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");
|
||||
pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id);
|
||||
|
||||
|
||||
err_code = ERROR_CODE(regs->nbsl);
|
||||
|
||||
/* Determine which error type:
|
||||
* 1) GART errors - non-fatal, developmental events
|
||||
* 2) MEMORY errors
|
||||
* 3) BUS errors
|
||||
* 4) Unknown error
|
||||
/*
|
||||
* F10h, revD can disable ErrCpu[3:0] so check that first and also the
|
||||
* value encoding has changed so interpret those differently
|
||||
*/
|
||||
if (TLB_ERROR(err_code)) {
|
||||
if ((boot_cpu_data.x86 == 0x10) &&
|
||||
(boot_cpu_data.x86_model > 8)) {
|
||||
if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
|
||||
pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
|
||||
} else {
|
||||
pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
|
||||
}
|
||||
|
||||
pr_emerg(" Error: %sorrected",
|
||||
((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
|
||||
pr_cont(", Report Error: %s",
|
||||
((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
|
||||
pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
|
||||
((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
|
||||
((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no"));
|
||||
|
||||
/* do the two bits[14:13] together */
|
||||
ecc = regs->nbsh & (0x3 << 13);
|
||||
if (ecc)
|
||||
pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
|
||||
|
||||
pr_cont("\n");
|
||||
|
||||
if (TLB_ERROR(ec)) {
|
||||
/*
|
||||
* GART errors are intended to help graphics driver developers
|
||||
* to detect bad GART PTEs. It is recommended by AMD to disable
|
||||
|
@ -2423,53 +2408,35 @@ int amd64_process_error_info(struct mem_ctl_info *mci,
|
|||
* [1] section 13.10.1 on BIOS and Kernel Developers Guide for
|
||||
* AMD NPT family 0Fh processors
|
||||
*/
|
||||
if (report_gart_errors == 0)
|
||||
return 1;
|
||||
if (!report_gart_errors)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Only if GART error reporting is requested should we generate
|
||||
* any logs.
|
||||
*/
|
||||
gart_tlb_error = 1;
|
||||
|
||||
debugf1("GART TLB error\n");
|
||||
pr_emerg("GART TLB error\n");
|
||||
amd64_decode_gart_tlb_error(mci, regs);
|
||||
} else if (MEM_ERROR(err_code)) {
|
||||
debugf1("Memory/Cache error\n");
|
||||
} else if (MEM_ERROR(ec)) {
|
||||
pr_emerg("Memory/Cache error\n");
|
||||
amd64_decode_mem_cache_error(mci, regs);
|
||||
} else if (BUS_ERROR(err_code)) {
|
||||
debugf1("Bus (Link/DRAM) error\n");
|
||||
} else if (BUS_ERROR(ec)) {
|
||||
pr_emerg("Bus (Link/DRAM) error\n");
|
||||
amd64_decode_bus_error(mci, regs);
|
||||
} else {
|
||||
/* shouldn't reach here! */
|
||||
amd64_mc_printk(mci, KERN_WARNING,
|
||||
"%s(): unknown MCE error 0x%x\n", __func__,
|
||||
err_code);
|
||||
ec);
|
||||
}
|
||||
|
||||
ext_ec = EXT_ERROR_CODE(regs->nbsl);
|
||||
amd64_mc_printk(mci, KERN_ERR,
|
||||
"ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
|
||||
pr_emerg("%s.\n", EXT_ERR_MSG(xec));
|
||||
|
||||
/*
|
||||
* Check the UE bit of the NB status high register, if set generate some
|
||||
* logs. If NOT a GART error, then process the event as a NO-INFO event.
|
||||
* If it was a GART error, skip that process.
|
||||
*/
|
||||
if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) {
|
||||
amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n");
|
||||
if (!gart_tlb_error)
|
||||
edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
|
||||
if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
|
||||
edac_mc_handle_ue_no_info(mci, "UE bit is set");
|
||||
}
|
||||
|
||||
if (regs->nbsh & K8_NBSH_PCC)
|
||||
amd64_mc_printk(mci, KERN_CRIT,
|
||||
"PCC (processor context corrupt) set\n");
|
||||
|
||||
return 1;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(amd64_process_error_info);
|
||||
|
||||
/*
|
||||
* The main polling 'check' function, called FROM the edac core to perform the
|
||||
* error checking and if an error is encountered, error processing.
|
||||
|
@ -2479,7 +2446,7 @@ static void amd64_check(struct mem_ctl_info *mci)
|
|||
struct err_regs regs;
|
||||
|
||||
if (amd64_get_error_info(mci, &regs))
|
||||
amd64_process_error_info(mci, &regs, 1);
|
||||
amd64_decode_nb_mce(mci, &regs, 1);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -306,16 +306,7 @@ enum {
|
|||
|
||||
/* Family F10h: Normalized Extended Error Codes */
|
||||
#define F10_NBSL_EXT_ERR_RES 0x0
|
||||
#define F10_NBSL_EXT_ERR_CRC 0x1
|
||||
#define F10_NBSL_EXT_ERR_SYNC 0x2
|
||||
#define F10_NBSL_EXT_ERR_MST 0x3
|
||||
#define F10_NBSL_EXT_ERR_TGT 0x4
|
||||
#define F10_NBSL_EXT_ERR_GART 0x5
|
||||
#define F10_NBSL_EXT_ERR_RMW 0x6
|
||||
#define F10_NBSL_EXT_ERR_WDT 0x7
|
||||
#define F10_NBSL_EXT_ERR_ECC 0x8
|
||||
#define F10_NBSL_EXT_ERR_DEV 0x9
|
||||
#define F10_NBSL_EXT_ERR_LINK_DATA 0xA
|
||||
|
||||
/* Next two are overloaded values */
|
||||
#define F10_NBSL_EXT_ERR_LINK_PROTO 0xB
|
||||
|
@ -360,18 +351,15 @@ enum {
|
|||
|
||||
#define K8_NBSH_VALID_BIT BIT(31)
|
||||
#define K8_NBSH_OVERFLOW BIT(30)
|
||||
#define K8_NBSH_UNCORRECTED_ERR BIT(29)
|
||||
#define K8_NBSH_ERR_ENABLE BIT(28)
|
||||
#define K8_NBSH_MISC_ERR_VALID BIT(27)
|
||||
#define K8_NBSH_UC_ERR BIT(29)
|
||||
#define K8_NBSH_ERR_EN BIT(28)
|
||||
#define K8_NBSH_MISCV BIT(27)
|
||||
#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
|
||||
#define K8_NBSH_PCC BIT(25)
|
||||
#define K8_NBSH_ERR_CPU_VAL BIT(24)
|
||||
#define K8_NBSH_CECC BIT(14)
|
||||
#define K8_NBSH_UECC BIT(13)
|
||||
#define K8_NBSH_ERR_SCRUBER BIT(8)
|
||||
#define K8_NBSH_CORE3 BIT(3)
|
||||
#define K8_NBSH_CORE2 BIT(2)
|
||||
#define K8_NBSH_CORE1 BIT(1)
|
||||
#define K8_NBSH_CORE0 BIT(0)
|
||||
|
||||
#define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF)
|
||||
|
||||
|
@ -622,8 +610,8 @@ static inline struct low_ops *family_ops(int index)
|
|||
#define F10_MIN_SCRUB_RATE_BITS 0x5
|
||||
#define F11_MIN_SCRUB_RATE_BITS 0x6
|
||||
|
||||
int amd64_process_error_info(struct mem_ctl_info *mci,
|
||||
struct err_regs *info,
|
||||
void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info,
|
||||
int handle_errors);
|
||||
|
||||
int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
|
||||
u64 *hole_offset, u64 *hole_size);
|
||||
|
|
|
@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
|
|||
|
||||
/* Process the Mapping request */
|
||||
/* TODO: Add race prevention */
|
||||
amd64_process_error_info(mci, &pvt->ctl_error_info, 1);
|
||||
amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
#define ERROR_CODE(x) ((x) & 0xffff)
|
||||
#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
|
||||
#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
|
||||
|
||||
#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
|
||||
#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
|
||||
|
||||
|
|
Loading…
Reference in a new issue