Merge branch 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac
Pull EDAC fixes and ghes-edac from Mauro Carvalho Chehab: "For: - Some fixes at edac drivers (i7core_edac, sb_edac, i3200_edac); - error injection support for i5100, when EDAC debug is enabled; - fix edac when it is loaded builtin (early init for the subsystem); - a "Firmware First" EDAC driver, allowing ghes to report errors via EDAC (ghes-edac). With regards to ghes-edac, this fixes a longstanding BZ at Red Hat that happens with Nehalem and Sandy Bridge CPUs: when both GHES and i7core_edac or sb_edac are running, the error reports are unpredictable, as both BIOS and OS race to access the registers. With ghes-edac, the EDAC core will refuse to register any other concurrent memory error driver. This patchset moves the ghes struct definitions to a separate header file (include/acpi/ghes.h) and adds 3 hooks at apei/ghes.c to register/unregister and to report errors via ghes-edac. Those changes were acked by ghes driver maintainer (Huang)." * 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac: (30 commits) i5100_edac: convert to use simple_open() ghes_edac: fix to use list_for_each_entry_safe() when delete list items ghes_edac: Fix RAS tracing ghes_edac: Make it compliant with UEFI spec 2.3.1 ghes_edac: Improve driver's printk messages ghes_edac: Don't credit the same memory dimm twice ghes_edac: do a better job of filling EDAC DIMM info ghes_edac: add support for reporting errors via EDAC ghes_edac: Register at EDAC core the BIOS report ghes: add the needed hooks for EDAC error report ghes: move structures/enum to a header file edac: add support for error type "Info" edac: add support for raw error reports edac: reduce stack pressure by using a pre-allocated buffer edac: lock module owner to avoid error report conflicts edac: remove proc_name from mci structure edac: add a new memory layer type edac: initialize the core earlier edac: better report error conditions in debug mode i5100_edac: Remove two checkpatch warnings ...
This commit is contained in:
commit
ad6c2c2eb3
18 changed files with 1078 additions and 141 deletions
|
@ -2904,6 +2904,13 @@ W: bluesmoke.sourceforge.net
|
||||||
S: Maintained
|
S: Maintained
|
||||||
F: drivers/edac/e7xxx_edac.c
|
F: drivers/edac/e7xxx_edac.c
|
||||||
|
|
||||||
|
EDAC-GHES
|
||||||
|
M: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||||
|
L: linux-edac@vger.kernel.org
|
||||||
|
W: bluesmoke.sourceforge.net
|
||||||
|
S: Maintained
|
||||||
|
F: drivers/edac/ghes-edac.c
|
||||||
|
|
||||||
EDAC-I82443BXGX
|
EDAC-I82443BXGX
|
||||||
M: Tim Small <tim@buttersideup.com>
|
M: Tim Small <tim@buttersideup.com>
|
||||||
L: linux-edac@vger.kernel.org
|
L: linux-edac@vger.kernel.org
|
||||||
|
|
|
@ -48,8 +48,8 @@
|
||||||
#include <linux/genalloc.h>
|
#include <linux/genalloc.h>
|
||||||
#include <linux/pci.h>
|
#include <linux/pci.h>
|
||||||
#include <linux/aer.h>
|
#include <linux/aer.h>
|
||||||
#include <acpi/apei.h>
|
|
||||||
#include <acpi/hed.h>
|
#include <acpi/ghes.h>
|
||||||
#include <asm/mce.h>
|
#include <asm/mce.h>
|
||||||
#include <asm/tlbflush.h>
|
#include <asm/tlbflush.h>
|
||||||
#include <asm/nmi.h>
|
#include <asm/nmi.h>
|
||||||
|
@ -84,42 +84,6 @@
|
||||||
((struct acpi_hest_generic_status *) \
|
((struct acpi_hest_generic_status *) \
|
||||||
((struct ghes_estatus_node *)(estatus_node) + 1))
|
((struct ghes_estatus_node *)(estatus_node) + 1))
|
||||||
|
|
||||||
/*
|
|
||||||
* One struct ghes is created for each generic hardware error source.
|
|
||||||
* It provides the context for APEI hardware error timer/IRQ/SCI/NMI
|
|
||||||
* handler.
|
|
||||||
*
|
|
||||||
* estatus: memory buffer for error status block, allocated during
|
|
||||||
* HEST parsing.
|
|
||||||
*/
|
|
||||||
#define GHES_TO_CLEAR 0x0001
|
|
||||||
#define GHES_EXITING 0x0002
|
|
||||||
|
|
||||||
struct ghes {
|
|
||||||
struct acpi_hest_generic *generic;
|
|
||||||
struct acpi_hest_generic_status *estatus;
|
|
||||||
u64 buffer_paddr;
|
|
||||||
unsigned long flags;
|
|
||||||
union {
|
|
||||||
struct list_head list;
|
|
||||||
struct timer_list timer;
|
|
||||||
unsigned int irq;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ghes_estatus_node {
|
|
||||||
struct llist_node llnode;
|
|
||||||
struct acpi_hest_generic *generic;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ghes_estatus_cache {
|
|
||||||
u32 estatus_len;
|
|
||||||
atomic_t count;
|
|
||||||
struct acpi_hest_generic *generic;
|
|
||||||
unsigned long long time_in;
|
|
||||||
struct rcu_head rcu;
|
|
||||||
};
|
|
||||||
|
|
||||||
bool ghes_disable;
|
bool ghes_disable;
|
||||||
module_param_named(disable, ghes_disable, bool, 0);
|
module_param_named(disable, ghes_disable, bool, 0);
|
||||||
|
|
||||||
|
@ -333,13 +297,6 @@ static void ghes_fini(struct ghes *ghes)
|
||||||
apei_unmap_generic_address(&ghes->generic->error_status_address);
|
apei_unmap_generic_address(&ghes->generic->error_status_address);
|
||||||
}
|
}
|
||||||
|
|
||||||
enum {
|
|
||||||
GHES_SEV_NO = 0x0,
|
|
||||||
GHES_SEV_CORRECTED = 0x1,
|
|
||||||
GHES_SEV_RECOVERABLE = 0x2,
|
|
||||||
GHES_SEV_PANIC = 0x3,
|
|
||||||
};
|
|
||||||
|
|
||||||
static inline int ghes_severity(int severity)
|
static inline int ghes_severity(int severity)
|
||||||
{
|
{
|
||||||
switch (severity) {
|
switch (severity) {
|
||||||
|
@ -452,7 +409,8 @@ static void ghes_clear_estatus(struct ghes *ghes)
|
||||||
ghes->flags &= ~GHES_TO_CLEAR;
|
ghes->flags &= ~GHES_TO_CLEAR;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ghes_do_proc(const struct acpi_hest_generic_status *estatus)
|
static void ghes_do_proc(struct ghes *ghes,
|
||||||
|
const struct acpi_hest_generic_status *estatus)
|
||||||
{
|
{
|
||||||
int sev, sec_sev;
|
int sev, sec_sev;
|
||||||
struct acpi_hest_generic_data *gdata;
|
struct acpi_hest_generic_data *gdata;
|
||||||
|
@ -464,6 +422,8 @@ static void ghes_do_proc(const struct acpi_hest_generic_status *estatus)
|
||||||
CPER_SEC_PLATFORM_MEM)) {
|
CPER_SEC_PLATFORM_MEM)) {
|
||||||
struct cper_sec_mem_err *mem_err;
|
struct cper_sec_mem_err *mem_err;
|
||||||
mem_err = (struct cper_sec_mem_err *)(gdata+1);
|
mem_err = (struct cper_sec_mem_err *)(gdata+1);
|
||||||
|
ghes_edac_report_mem_error(ghes, sev, mem_err);
|
||||||
|
|
||||||
#ifdef CONFIG_X86_MCE
|
#ifdef CONFIG_X86_MCE
|
||||||
apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
|
apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
|
||||||
mem_err);
|
mem_err);
|
||||||
|
@ -682,7 +642,7 @@ static int ghes_proc(struct ghes *ghes)
|
||||||
if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
|
if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
|
||||||
ghes_estatus_cache_add(ghes->generic, ghes->estatus);
|
ghes_estatus_cache_add(ghes->generic, ghes->estatus);
|
||||||
}
|
}
|
||||||
ghes_do_proc(ghes->estatus);
|
ghes_do_proc(ghes, ghes->estatus);
|
||||||
out:
|
out:
|
||||||
ghes_clear_estatus(ghes);
|
ghes_clear_estatus(ghes);
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -775,7 +735,7 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
|
||||||
estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
|
estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
|
||||||
len = apei_estatus_len(estatus);
|
len = apei_estatus_len(estatus);
|
||||||
node_len = GHES_ESTATUS_NODE_LEN(len);
|
node_len = GHES_ESTATUS_NODE_LEN(len);
|
||||||
ghes_do_proc(estatus);
|
ghes_do_proc(estatus_node->ghes, estatus);
|
||||||
if (!ghes_estatus_cached(estatus)) {
|
if (!ghes_estatus_cached(estatus)) {
|
||||||
generic = estatus_node->generic;
|
generic = estatus_node->generic;
|
||||||
if (ghes_print_estatus(NULL, generic, estatus))
|
if (ghes_print_estatus(NULL, generic, estatus))
|
||||||
|
@ -864,6 +824,7 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
|
||||||
estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool,
|
estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool,
|
||||||
node_len);
|
node_len);
|
||||||
if (estatus_node) {
|
if (estatus_node) {
|
||||||
|
estatus_node->ghes = ghes;
|
||||||
estatus_node->generic = ghes->generic;
|
estatus_node->generic = ghes->generic;
|
||||||
estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
|
estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
|
||||||
memcpy(estatus, ghes->estatus, len);
|
memcpy(estatus, ghes->estatus, len);
|
||||||
|
@ -942,6 +903,11 @@ static int ghes_probe(struct platform_device *ghes_dev)
|
||||||
ghes = NULL;
|
ghes = NULL;
|
||||||
goto err;
|
goto err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
rc = ghes_edac_register(ghes, &ghes_dev->dev);
|
||||||
|
if (rc < 0)
|
||||||
|
goto err;
|
||||||
|
|
||||||
switch (generic->notify.type) {
|
switch (generic->notify.type) {
|
||||||
case ACPI_HEST_NOTIFY_POLLED:
|
case ACPI_HEST_NOTIFY_POLLED:
|
||||||
ghes->timer.function = ghes_poll_func;
|
ghes->timer.function = ghes_poll_func;
|
||||||
|
@ -954,13 +920,13 @@ static int ghes_probe(struct platform_device *ghes_dev)
|
||||||
if (acpi_gsi_to_irq(generic->notify.vector, &ghes->irq)) {
|
if (acpi_gsi_to_irq(generic->notify.vector, &ghes->irq)) {
|
||||||
pr_err(GHES_PFX "Failed to map GSI to IRQ for generic hardware error source: %d\n",
|
pr_err(GHES_PFX "Failed to map GSI to IRQ for generic hardware error source: %d\n",
|
||||||
generic->header.source_id);
|
generic->header.source_id);
|
||||||
goto err;
|
goto err_edac_unreg;
|
||||||
}
|
}
|
||||||
if (request_irq(ghes->irq, ghes_irq_func,
|
if (request_irq(ghes->irq, ghes_irq_func,
|
||||||
0, "GHES IRQ", ghes)) {
|
0, "GHES IRQ", ghes)) {
|
||||||
pr_err(GHES_PFX "Failed to register IRQ for generic hardware error source: %d\n",
|
pr_err(GHES_PFX "Failed to register IRQ for generic hardware error source: %d\n",
|
||||||
generic->header.source_id);
|
generic->header.source_id);
|
||||||
goto err;
|
goto err_edac_unreg;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case ACPI_HEST_NOTIFY_SCI:
|
case ACPI_HEST_NOTIFY_SCI:
|
||||||
|
@ -986,6 +952,8 @@ static int ghes_probe(struct platform_device *ghes_dev)
|
||||||
platform_set_drvdata(ghes_dev, ghes);
|
platform_set_drvdata(ghes_dev, ghes);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
err_edac_unreg:
|
||||||
|
ghes_edac_unregister(ghes);
|
||||||
err:
|
err:
|
||||||
if (ghes) {
|
if (ghes) {
|
||||||
ghes_fini(ghes);
|
ghes_fini(ghes);
|
||||||
|
@ -1038,6 +1006,9 @@ static int ghes_remove(struct platform_device *ghes_dev)
|
||||||
}
|
}
|
||||||
|
|
||||||
ghes_fini(ghes);
|
ghes_fini(ghes);
|
||||||
|
|
||||||
|
ghes_edac_unregister(ghes);
|
||||||
|
|
||||||
kfree(ghes);
|
kfree(ghes);
|
||||||
|
|
||||||
platform_set_drvdata(ghes_dev, NULL);
|
platform_set_drvdata(ghes_dev, NULL);
|
||||||
|
|
|
@ -80,6 +80,29 @@ config EDAC_MM_EDAC
|
||||||
occurred so that a particular failing memory module can be
|
occurred so that a particular failing memory module can be
|
||||||
replaced. If unsure, select 'Y'.
|
replaced. If unsure, select 'Y'.
|
||||||
|
|
||||||
|
config EDAC_GHES
|
||||||
|
bool "Output ACPI APEI/GHES BIOS detected errors via EDAC"
|
||||||
|
depends on ACPI_APEI_GHES && (EDAC_MM_EDAC=y)
|
||||||
|
default y
|
||||||
|
help
|
||||||
|
Not all machines support hardware-driven error report. Some of those
|
||||||
|
provide a BIOS-driven error report mechanism via ACPI, using the
|
||||||
|
APEI/GHES driver. By enabling this option, the error reports provided
|
||||||
|
by GHES are sent to userspace via the EDAC API.
|
||||||
|
|
||||||
|
When this option is enabled, it will disable the hardware-driven
|
||||||
|
mechanisms, if a GHES BIOS is detected, entering into the
|
||||||
|
"Firmware First" mode.
|
||||||
|
|
||||||
|
Note that keeping both GHES and a hardware-driven
|
||||||
|
error mechanism won't work well, as BIOS will race with OS, while
|
||||||
|
reading the error registers. So, if you do not want to use the "Firmware
|
||||||
|
first" GHES error mechanism, you should disable GHES either at
|
||||||
|
compilation time or by passing "ghes.disable=1" Kernel parameter
|
||||||
|
at boot time.
|
||||||
|
|
||||||
|
If in doubt, say 'Y'.
|
||||||
|
|
||||||
config EDAC_AMD64
|
config EDAC_AMD64
|
||||||
tristate "AMD64 (Opteron, Athlon64) K8, F10h"
|
tristate "AMD64 (Opteron, Athlon64) K8, F10h"
|
||||||
depends on EDAC_MM_EDAC && AMD_NB && X86_64 && EDAC_DECODE_MCE
|
depends on EDAC_MM_EDAC && AMD_NB && X86_64 && EDAC_DECODE_MCE
|
||||||
|
|
|
@ -16,6 +16,7 @@ ifdef CONFIG_PCI
|
||||||
edac_core-y += edac_pci.o edac_pci_sysfs.o
|
edac_core-y += edac_pci.o edac_pci_sysfs.o
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
obj-$(CONFIG_EDAC_GHES) += ghes_edac.o
|
||||||
obj-$(CONFIG_EDAC_MCE_INJ) += mce_amd_inj.o
|
obj-$(CONFIG_EDAC_MCE_INJ) += mce_amd_inj.o
|
||||||
|
|
||||||
edac_mce_amd-y := mce_amd.o
|
edac_mce_amd-y := mce_amd.o
|
||||||
|
|
|
@ -453,6 +453,11 @@ extern struct mem_ctl_info *find_mci_by_dev(struct device *dev);
|
||||||
extern struct mem_ctl_info *edac_mc_del_mc(struct device *dev);
|
extern struct mem_ctl_info *edac_mc_del_mc(struct device *dev);
|
||||||
extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
|
extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
|
||||||
unsigned long page);
|
unsigned long page);
|
||||||
|
|
||||||
|
void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
|
||||||
|
struct mem_ctl_info *mci,
|
||||||
|
struct edac_raw_error_desc *e);
|
||||||
|
|
||||||
void edac_mc_handle_error(const enum hw_event_mc_err_type type,
|
void edac_mc_handle_error(const enum hw_event_mc_err_type type,
|
||||||
struct mem_ctl_info *mci,
|
struct mem_ctl_info *mci,
|
||||||
const u16 error_count,
|
const u16 error_count,
|
||||||
|
|
|
@ -42,6 +42,12 @@
|
||||||
static DEFINE_MUTEX(mem_ctls_mutex);
|
static DEFINE_MUTEX(mem_ctls_mutex);
|
||||||
static LIST_HEAD(mc_devices);
|
static LIST_HEAD(mc_devices);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Used to lock EDAC MC to just one module, avoiding two drivers e.g.
|
||||||
|
* apei/ghes and i7core_edac to be used at the same time.
|
||||||
|
*/
|
||||||
|
static void const *edac_mc_owner;
|
||||||
|
|
||||||
unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
|
unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
|
||||||
unsigned len)
|
unsigned len)
|
||||||
{
|
{
|
||||||
|
@ -441,13 +447,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
|
||||||
|
|
||||||
mci->op_state = OP_ALLOC;
|
mci->op_state = OP_ALLOC;
|
||||||
|
|
||||||
/* at this point, the root kobj is valid, and in order to
|
|
||||||
* 'free' the object, then the function:
|
|
||||||
* edac_mc_unregister_sysfs_main_kobj() must be called
|
|
||||||
* which will perform kobj unregistration and the actual free
|
|
||||||
* will occur during the kobject callback operation
|
|
||||||
*/
|
|
||||||
|
|
||||||
return mci;
|
return mci;
|
||||||
|
|
||||||
error:
|
error:
|
||||||
|
@ -666,9 +665,9 @@ static int add_mc_to_global_list(struct mem_ctl_info *mci)
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void del_mc_from_global_list(struct mem_ctl_info *mci)
|
static int del_mc_from_global_list(struct mem_ctl_info *mci)
|
||||||
{
|
{
|
||||||
atomic_dec(&edac_handlers);
|
int handlers = atomic_dec_return(&edac_handlers);
|
||||||
list_del_rcu(&mci->link);
|
list_del_rcu(&mci->link);
|
||||||
|
|
||||||
/* these are for safe removal of devices from global list while
|
/* these are for safe removal of devices from global list while
|
||||||
|
@ -676,6 +675,8 @@ static void del_mc_from_global_list(struct mem_ctl_info *mci)
|
||||||
*/
|
*/
|
||||||
synchronize_rcu();
|
synchronize_rcu();
|
||||||
INIT_LIST_HEAD(&mci->link);
|
INIT_LIST_HEAD(&mci->link);
|
||||||
|
|
||||||
|
return handlers;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -719,6 +720,7 @@ EXPORT_SYMBOL(edac_mc_find);
|
||||||
/* FIXME - should a warning be printed if no error detection? correction? */
|
/* FIXME - should a warning be printed if no error detection? correction? */
|
||||||
int edac_mc_add_mc(struct mem_ctl_info *mci)
|
int edac_mc_add_mc(struct mem_ctl_info *mci)
|
||||||
{
|
{
|
||||||
|
int ret = -EINVAL;
|
||||||
edac_dbg(0, "\n");
|
edac_dbg(0, "\n");
|
||||||
|
|
||||||
#ifdef CONFIG_EDAC_DEBUG
|
#ifdef CONFIG_EDAC_DEBUG
|
||||||
|
@ -749,6 +751,11 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
|
||||||
#endif
|
#endif
|
||||||
mutex_lock(&mem_ctls_mutex);
|
mutex_lock(&mem_ctls_mutex);
|
||||||
|
|
||||||
|
if (edac_mc_owner && edac_mc_owner != mci->mod_name) {
|
||||||
|
ret = -EPERM;
|
||||||
|
goto fail0;
|
||||||
|
}
|
||||||
|
|
||||||
if (add_mc_to_global_list(mci))
|
if (add_mc_to_global_list(mci))
|
||||||
goto fail0;
|
goto fail0;
|
||||||
|
|
||||||
|
@ -775,6 +782,8 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
|
||||||
edac_mc_printk(mci, KERN_INFO, "Giving out device to '%s' '%s':"
|
edac_mc_printk(mci, KERN_INFO, "Giving out device to '%s' '%s':"
|
||||||
" DEV %s\n", mci->mod_name, mci->ctl_name, edac_dev_name(mci));
|
" DEV %s\n", mci->mod_name, mci->ctl_name, edac_dev_name(mci));
|
||||||
|
|
||||||
|
edac_mc_owner = mci->mod_name;
|
||||||
|
|
||||||
mutex_unlock(&mem_ctls_mutex);
|
mutex_unlock(&mem_ctls_mutex);
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
@ -783,7 +792,7 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
|
||||||
|
|
||||||
fail0:
|
fail0:
|
||||||
mutex_unlock(&mem_ctls_mutex);
|
mutex_unlock(&mem_ctls_mutex);
|
||||||
return 1;
|
return ret;
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(edac_mc_add_mc);
|
EXPORT_SYMBOL_GPL(edac_mc_add_mc);
|
||||||
|
|
||||||
|
@ -809,7 +818,8 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
del_mc_from_global_list(mci);
|
if (!del_mc_from_global_list(mci))
|
||||||
|
edac_mc_owner = NULL;
|
||||||
mutex_unlock(&mem_ctls_mutex);
|
mutex_unlock(&mem_ctls_mutex);
|
||||||
|
|
||||||
/* flush workq processes */
|
/* flush workq processes */
|
||||||
|
@ -907,6 +917,7 @@ const char *edac_layer_name[] = {
|
||||||
[EDAC_MC_LAYER_CHANNEL] = "channel",
|
[EDAC_MC_LAYER_CHANNEL] = "channel",
|
||||||
[EDAC_MC_LAYER_SLOT] = "slot",
|
[EDAC_MC_LAYER_SLOT] = "slot",
|
||||||
[EDAC_MC_LAYER_CHIP_SELECT] = "csrow",
|
[EDAC_MC_LAYER_CHIP_SELECT] = "csrow",
|
||||||
|
[EDAC_MC_LAYER_ALL_MEM] = "memory",
|
||||||
};
|
};
|
||||||
EXPORT_SYMBOL_GPL(edac_layer_name);
|
EXPORT_SYMBOL_GPL(edac_layer_name);
|
||||||
|
|
||||||
|
@ -1054,7 +1065,46 @@ static void edac_ue_error(struct mem_ctl_info *mci,
|
||||||
edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
|
edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define OTHER_LABEL " or "
|
/**
|
||||||
|
* edac_raw_mc_handle_error - reports a memory event to userspace without doing
|
||||||
|
* anything to discover the error location
|
||||||
|
*
|
||||||
|
* @type: severity of the error (CE/UE/Fatal)
|
||||||
|
* @mci: a struct mem_ctl_info pointer
|
||||||
|
* @e: error description
|
||||||
|
*
|
||||||
|
* This raw function is used internally by edac_mc_handle_error(). It should
|
||||||
|
* only be called directly when the hardware error comes directly from BIOS,
|
||||||
|
* like in the case of APEI GHES driver.
|
||||||
|
*/
|
||||||
|
void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
|
||||||
|
struct mem_ctl_info *mci,
|
||||||
|
struct edac_raw_error_desc *e)
|
||||||
|
{
|
||||||
|
char detail[80];
|
||||||
|
int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer };
|
||||||
|
|
||||||
|
/* Memory type dependent details about the error */
|
||||||
|
if (type == HW_EVENT_ERR_CORRECTED) {
|
||||||
|
snprintf(detail, sizeof(detail),
|
||||||
|
"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
|
||||||
|
e->page_frame_number, e->offset_in_page,
|
||||||
|
e->grain, e->syndrome);
|
||||||
|
edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label,
|
||||||
|
detail, e->other_detail, e->enable_per_layer_report,
|
||||||
|
e->page_frame_number, e->offset_in_page, e->grain);
|
||||||
|
} else {
|
||||||
|
snprintf(detail, sizeof(detail),
|
||||||
|
"page:0x%lx offset:0x%lx grain:%ld",
|
||||||
|
e->page_frame_number, e->offset_in_page, e->grain);
|
||||||
|
|
||||||
|
edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label,
|
||||||
|
detail, e->other_detail, e->enable_per_layer_report);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(edac_raw_mc_handle_error);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* edac_mc_handle_error - reports a memory event to userspace
|
* edac_mc_handle_error - reports a memory event to userspace
|
||||||
|
@ -1086,19 +1136,27 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
|
||||||
const char *msg,
|
const char *msg,
|
||||||
const char *other_detail)
|
const char *other_detail)
|
||||||
{
|
{
|
||||||
/* FIXME: too much for stack: move it to some pre-alocated area */
|
|
||||||
char detail[80], location[80];
|
|
||||||
char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms];
|
|
||||||
char *p;
|
char *p;
|
||||||
int row = -1, chan = -1;
|
int row = -1, chan = -1;
|
||||||
int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
|
int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
|
||||||
int i;
|
int i, n_labels = 0;
|
||||||
long grain;
|
|
||||||
bool enable_per_layer_report = false;
|
|
||||||
u8 grain_bits;
|
u8 grain_bits;
|
||||||
|
struct edac_raw_error_desc *e = &mci->error_desc;
|
||||||
|
|
||||||
edac_dbg(3, "MC%d\n", mci->mc_idx);
|
edac_dbg(3, "MC%d\n", mci->mc_idx);
|
||||||
|
|
||||||
|
/* Fills the error report buffer */
|
||||||
|
memset(e, 0, sizeof (*e));
|
||||||
|
e->error_count = error_count;
|
||||||
|
e->top_layer = top_layer;
|
||||||
|
e->mid_layer = mid_layer;
|
||||||
|
e->low_layer = low_layer;
|
||||||
|
e->page_frame_number = page_frame_number;
|
||||||
|
e->offset_in_page = offset_in_page;
|
||||||
|
e->syndrome = syndrome;
|
||||||
|
e->msg = msg;
|
||||||
|
e->other_detail = other_detail;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check if the event report is consistent and if the memory
|
* Check if the event report is consistent and if the memory
|
||||||
* location is known. If it is known, enable_per_layer_report will be
|
* location is known. If it is known, enable_per_layer_report will be
|
||||||
|
@ -1121,7 +1179,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
|
||||||
pos[i] = -1;
|
pos[i] = -1;
|
||||||
}
|
}
|
||||||
if (pos[i] >= 0)
|
if (pos[i] >= 0)
|
||||||
enable_per_layer_report = true;
|
e->enable_per_layer_report = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1135,8 +1193,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
|
||||||
* where each memory belongs to a separate channel within the same
|
* where each memory belongs to a separate channel within the same
|
||||||
* branch.
|
* branch.
|
||||||
*/
|
*/
|
||||||
grain = 0;
|
p = e->label;
|
||||||
p = label;
|
|
||||||
*p = '\0';
|
*p = '\0';
|
||||||
|
|
||||||
for (i = 0; i < mci->tot_dimms; i++) {
|
for (i = 0; i < mci->tot_dimms; i++) {
|
||||||
|
@ -1150,8 +1207,8 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* get the max grain, over the error match range */
|
/* get the max grain, over the error match range */
|
||||||
if (dimm->grain > grain)
|
if (dimm->grain > e->grain)
|
||||||
grain = dimm->grain;
|
e->grain = dimm->grain;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the error is memory-controller wide, there's no need to
|
* If the error is memory-controller wide, there's no need to
|
||||||
|
@ -1159,8 +1216,13 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
|
||||||
* channel/memory controller/... may be affected.
|
* channel/memory controller/... may be affected.
|
||||||
* Also, don't show errors for empty DIMM slots.
|
* Also, don't show errors for empty DIMM slots.
|
||||||
*/
|
*/
|
||||||
if (enable_per_layer_report && dimm->nr_pages) {
|
if (e->enable_per_layer_report && dimm->nr_pages) {
|
||||||
if (p != label) {
|
if (n_labels >= EDAC_MAX_LABELS) {
|
||||||
|
e->enable_per_layer_report = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
n_labels++;
|
||||||
|
if (p != e->label) {
|
||||||
strcpy(p, OTHER_LABEL);
|
strcpy(p, OTHER_LABEL);
|
||||||
p += strlen(OTHER_LABEL);
|
p += strlen(OTHER_LABEL);
|
||||||
}
|
}
|
||||||
|
@ -1187,12 +1249,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!enable_per_layer_report) {
|
if (!e->enable_per_layer_report) {
|
||||||
strcpy(label, "any memory");
|
strcpy(e->label, "any memory");
|
||||||
} else {
|
} else {
|
||||||
edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
|
edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
|
||||||
if (p == label)
|
if (p == e->label)
|
||||||
strcpy(label, "unknown memory");
|
strcpy(e->label, "unknown memory");
|
||||||
if (type == HW_EVENT_ERR_CORRECTED) {
|
if (type == HW_EVENT_ERR_CORRECTED) {
|
||||||
if (row >= 0) {
|
if (row >= 0) {
|
||||||
mci->csrows[row]->ce_count += error_count;
|
mci->csrows[row]->ce_count += error_count;
|
||||||
|
@ -1205,7 +1267,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Fill the RAM location data */
|
/* Fill the RAM location data */
|
||||||
p = location;
|
p = e->location;
|
||||||
|
|
||||||
for (i = 0; i < mci->n_layers; i++) {
|
for (i = 0; i < mci->n_layers; i++) {
|
||||||
if (pos[i] < 0)
|
if (pos[i] < 0)
|
||||||
|
@ -1215,32 +1277,16 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
|
||||||
edac_layer_name[mci->layers[i].type],
|
edac_layer_name[mci->layers[i].type],
|
||||||
pos[i]);
|
pos[i]);
|
||||||
}
|
}
|
||||||
if (p > location)
|
if (p > e->location)
|
||||||
*(p - 1) = '\0';
|
*(p - 1) = '\0';
|
||||||
|
|
||||||
/* Report the error via the trace interface */
|
/* Report the error via the trace interface */
|
||||||
grain_bits = fls_long(grain) + 1;
|
grain_bits = fls_long(e->grain) + 1;
|
||||||
trace_mc_event(type, msg, label, error_count,
|
trace_mc_event(type, e->msg, e->label, e->error_count,
|
||||||
mci->mc_idx, top_layer, mid_layer, low_layer,
|
mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
|
||||||
PAGES_TO_MiB(page_frame_number) | offset_in_page,
|
PAGES_TO_MiB(e->page_frame_number) | e->offset_in_page,
|
||||||
grain_bits, syndrome, other_detail);
|
grain_bits, e->syndrome, e->other_detail);
|
||||||
|
|
||||||
/* Memory type dependent details about the error */
|
edac_raw_mc_handle_error(type, mci, e);
|
||||||
if (type == HW_EVENT_ERR_CORRECTED) {
|
|
||||||
snprintf(detail, sizeof(detail),
|
|
||||||
"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
|
|
||||||
page_frame_number, offset_in_page,
|
|
||||||
grain, syndrome);
|
|
||||||
edac_ce_error(mci, error_count, pos, msg, location, label,
|
|
||||||
detail, other_detail, enable_per_layer_report,
|
|
||||||
page_frame_number, offset_in_page, grain);
|
|
||||||
} else {
|
|
||||||
snprintf(detail, sizeof(detail),
|
|
||||||
"page:0x%lx offset:0x%lx grain:%ld",
|
|
||||||
page_frame_number, offset_in_page, grain);
|
|
||||||
|
|
||||||
edac_ue_error(mci, error_count, pos, msg, location, label,
|
|
||||||
detail, other_detail, enable_per_layer_report);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
|
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
*
|
*
|
||||||
* Written Doug Thompson <norsk5@xmission.com> www.softwarebitmaker.com
|
* Written Doug Thompson <norsk5@xmission.com> www.softwarebitmaker.com
|
||||||
*
|
*
|
||||||
* (c) 2012 - Mauro Carvalho Chehab <mchehab@redhat.com>
|
* (c) 2012-2013 - Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||||
* The entire API were re-written, and ported to use struct device
|
* The entire API were re-written, and ported to use struct device
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
@ -429,8 +429,12 @@ static int edac_create_csrow_objects(struct mem_ctl_info *mci)
|
||||||
if (!nr_pages_per_csrow(csrow))
|
if (!nr_pages_per_csrow(csrow))
|
||||||
continue;
|
continue;
|
||||||
err = edac_create_csrow_object(mci, mci->csrows[i], i);
|
err = edac_create_csrow_object(mci, mci->csrows[i], i);
|
||||||
if (err < 0)
|
if (err < 0) {
|
||||||
|
edac_dbg(1,
|
||||||
|
"failure: create csrow objects for csrow %d\n",
|
||||||
|
i);
|
||||||
goto error;
|
goto error;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
@ -677,9 +681,6 @@ static ssize_t mci_sdram_scrub_rate_store(struct device *dev,
|
||||||
unsigned long bandwidth = 0;
|
unsigned long bandwidth = 0;
|
||||||
int new_bw = 0;
|
int new_bw = 0;
|
||||||
|
|
||||||
if (!mci->set_sdram_scrub_rate)
|
|
||||||
return -ENODEV;
|
|
||||||
|
|
||||||
if (strict_strtoul(data, 10, &bandwidth) < 0)
|
if (strict_strtoul(data, 10, &bandwidth) < 0)
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
|
@ -703,9 +704,6 @@ static ssize_t mci_sdram_scrub_rate_show(struct device *dev,
|
||||||
struct mem_ctl_info *mci = to_mci(dev);
|
struct mem_ctl_info *mci = to_mci(dev);
|
||||||
int bandwidth = 0;
|
int bandwidth = 0;
|
||||||
|
|
||||||
if (!mci->get_sdram_scrub_rate)
|
|
||||||
return -ENODEV;
|
|
||||||
|
|
||||||
bandwidth = mci->get_sdram_scrub_rate(mci);
|
bandwidth = mci->get_sdram_scrub_rate(mci);
|
||||||
if (bandwidth < 0) {
|
if (bandwidth < 0) {
|
||||||
edac_printk(KERN_DEBUG, EDAC_MC, "Error reading scrub rate\n");
|
edac_printk(KERN_DEBUG, EDAC_MC, "Error reading scrub rate\n");
|
||||||
|
@ -866,8 +864,7 @@ DEVICE_ATTR(ce_count, S_IRUGO, mci_ce_count_show, NULL);
|
||||||
DEVICE_ATTR(max_location, S_IRUGO, mci_max_location_show, NULL);
|
DEVICE_ATTR(max_location, S_IRUGO, mci_max_location_show, NULL);
|
||||||
|
|
||||||
/* memory scrubber attribute file */
|
/* memory scrubber attribute file */
|
||||||
DEVICE_ATTR(sdram_scrub_rate, S_IRUGO | S_IWUSR, mci_sdram_scrub_rate_show,
|
DEVICE_ATTR(sdram_scrub_rate, 0, NULL, NULL);
|
||||||
mci_sdram_scrub_rate_store);
|
|
||||||
|
|
||||||
static struct attribute *mci_attrs[] = {
|
static struct attribute *mci_attrs[] = {
|
||||||
&dev_attr_reset_counters.attr,
|
&dev_attr_reset_counters.attr,
|
||||||
|
@ -878,7 +875,6 @@ static struct attribute *mci_attrs[] = {
|
||||||
&dev_attr_ce_noinfo_count.attr,
|
&dev_attr_ce_noinfo_count.attr,
|
||||||
&dev_attr_ue_count.attr,
|
&dev_attr_ue_count.attr,
|
||||||
&dev_attr_ce_count.attr,
|
&dev_attr_ce_count.attr,
|
||||||
&dev_attr_sdram_scrub_rate.attr,
|
|
||||||
&dev_attr_max_location.attr,
|
&dev_attr_max_location.attr,
|
||||||
NULL
|
NULL
|
||||||
};
|
};
|
||||||
|
@ -1007,11 +1003,28 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci)
|
||||||
edac_dbg(0, "creating device %s\n", dev_name(&mci->dev));
|
edac_dbg(0, "creating device %s\n", dev_name(&mci->dev));
|
||||||
err = device_add(&mci->dev);
|
err = device_add(&mci->dev);
|
||||||
if (err < 0) {
|
if (err < 0) {
|
||||||
|
edac_dbg(1, "failure: create device %s\n", dev_name(&mci->dev));
|
||||||
bus_unregister(&mci->bus);
|
bus_unregister(&mci->bus);
|
||||||
kfree(mci->bus.name);
|
kfree(mci->bus.name);
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (mci->set_sdram_scrub_rate || mci->get_sdram_scrub_rate) {
|
||||||
|
if (mci->get_sdram_scrub_rate) {
|
||||||
|
dev_attr_sdram_scrub_rate.attr.mode |= S_IRUGO;
|
||||||
|
dev_attr_sdram_scrub_rate.show = &mci_sdram_scrub_rate_show;
|
||||||
|
}
|
||||||
|
if (mci->set_sdram_scrub_rate) {
|
||||||
|
dev_attr_sdram_scrub_rate.attr.mode |= S_IWUSR;
|
||||||
|
dev_attr_sdram_scrub_rate.store = &mci_sdram_scrub_rate_store;
|
||||||
|
}
|
||||||
|
err = device_create_file(&mci->dev,
|
||||||
|
&dev_attr_sdram_scrub_rate);
|
||||||
|
if (err) {
|
||||||
|
edac_dbg(1, "failure: create sdram_scrub_rate\n");
|
||||||
|
goto fail2;
|
||||||
|
}
|
||||||
|
}
|
||||||
/*
|
/*
|
||||||
* Create the dimm/rank devices
|
* Create the dimm/rank devices
|
||||||
*/
|
*/
|
||||||
|
@ -1056,6 +1069,7 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci)
|
||||||
continue;
|
continue;
|
||||||
device_unregister(&dimm->dev);
|
device_unregister(&dimm->dev);
|
||||||
}
|
}
|
||||||
|
fail2:
|
||||||
device_unregister(&mci->dev);
|
device_unregister(&mci->dev);
|
||||||
bus_unregister(&mci->bus);
|
bus_unregister(&mci->bus);
|
||||||
kfree(mci->bus.name);
|
kfree(mci->bus.name);
|
||||||
|
|
|
@ -146,7 +146,7 @@ static void __exit edac_exit(void)
|
||||||
/*
|
/*
|
||||||
* Inform the kernel of our entry and exit points
|
* Inform the kernel of our entry and exit points
|
||||||
*/
|
*/
|
||||||
module_init(edac_init);
|
subsys_initcall(edac_init);
|
||||||
module_exit(edac_exit);
|
module_exit(edac_exit);
|
||||||
|
|
||||||
MODULE_LICENSE("GPL");
|
MODULE_LICENSE("GPL");
|
||||||
|
|
|
@ -429,8 +429,8 @@ static void edac_pci_main_kobj_teardown(void)
|
||||||
if (atomic_dec_return(&edac_pci_sysfs_refcount) == 0) {
|
if (atomic_dec_return(&edac_pci_sysfs_refcount) == 0) {
|
||||||
edac_dbg(0, "called kobject_put on main kobj\n");
|
edac_dbg(0, "called kobject_put on main kobj\n");
|
||||||
kobject_put(edac_pci_top_main_kobj);
|
kobject_put(edac_pci_top_main_kobj);
|
||||||
|
edac_put_sysfs_subsys();
|
||||||
}
|
}
|
||||||
edac_put_sysfs_subsys();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
537
drivers/edac/ghes_edac.c
Normal file
537
drivers/edac/ghes_edac.c
Normal file
|
@ -0,0 +1,537 @@
|
||||||
|
/*
|
||||||
|
* GHES/EDAC Linux driver
|
||||||
|
*
|
||||||
|
* This file may be distributed under the terms of the GNU General Public
|
||||||
|
* License version 2.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2013 by Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||||
|
*
|
||||||
|
* Red Hat Inc. http://www.redhat.com
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||||
|
|
||||||
|
#include <acpi/ghes.h>
|
||||||
|
#include <linux/edac.h>
|
||||||
|
#include <linux/dmi.h>
|
||||||
|
#include "edac_core.h"
|
||||||
|
#include <ras/ras_event.h>
|
||||||
|
|
||||||
|
#define GHES_EDAC_REVISION " Ver: 1.0.0"
|
||||||
|
|
||||||
|
/*
 * Driver-private state of one GHES-backed memory controller.
 * An instance is obtained from mci->pvt_info and linked into ghes_reglist.
 */
struct ghes_edac_pvt {
	struct list_head list;		/* node in ghes_reglist */
	struct ghes *ghes;		/* GHES source matched on error reports */
	struct mem_ctl_info *mci;	/* EDAC controller this data belongs to */

	/* Scratch buffers filled while formatting a single error report */
	char detail_location[240];	/* "APEI location: ..." text for the trace event */
	char other_detail[160];		/* status / requestor / responder / target text */
	char msg[80];			/* human readable error type */
};
|
||||||
|
|
||||||
|
/* Registered instances, chained through ghes_edac_pvt.list */
static LIST_HEAD(ghes_reglist);

/* Held by ghes_edac_register() across controller allocation and registration */
static DEFINE_MUTEX(ghes_edac_lock);

/* Index used for the next controller; incremented after a successful registration */
static int ghes_edac_mc_num;
|
||||||
|
|
||||||
|
|
||||||
|
/* Memory Device - Type 17 of SMBIOS spec */
|
||||||
|
/*
 * Raw layout of a DMI "Memory Device" record as handed to the dmi_walk()
 * callbacks.  The structure is packed and the field order mirrors the
 * firmware table, so members must not be reordered or resized.
 */
struct memdev_dmi_entry {
	/* Record header */
	u8 type;
	u8 length;
	u16 handle;

	u16 phys_mem_array_handle;
	u16 mem_err_info_handle;
	u16 total_width;	/* compared with data_width to detect ECC bits */
	u16 data_width;
	u16 size;		/* 0xffff: unknown, 0x7fff: see extended_size */
	u8 form_factor;
	u8 device_set;
	u8 device_locator;
	u8 bank_locator;
	u8 memory_type;		/* decoded into dimm->mtype */
	u16 type_detail;	/* bit flags refining memory_type */
	u16 speed;
	u8 manufacturer;
	u8 serial_number;
	u8 asset_tag;
	u8 part_number;
	u8 attributes;
	u32 extended_size;	/* used when size == 0x7fff */
	u16 conf_mem_clk_speed;
} __attribute__((__packed__));
|
||||||
|
|
||||||
|
/* Cursor passed through dmi_walk() to ghes_edac_dmidecode() */
struct ghes_edac_dimm_fill {
	struct mem_ctl_info *mci;	/* controller whose DIMM table is being filled */
	unsigned count;			/* index of the next DIMM slot to fill */
};
|
||||||
|
|
||||||
|
/*
 * Printable names for enum mem_type values, used only by the debug output
 * of this file.  Kept static and const so the generic name does not leak
 * into the global kernel namespace and the table cannot be modified.
 */
static const char * const memory_type[] = {
	[MEM_EMPTY] = "EMPTY",
	[MEM_RESERVED] = "RESERVED",
	[MEM_UNKNOWN] = "UNKNOWN",
	[MEM_FPM] = "FPM",
	[MEM_EDO] = "EDO",
	[MEM_BEDO] = "BEDO",
	[MEM_SDR] = "SDR",
	[MEM_RDR] = "RDR",
	[MEM_DDR] = "DDR",
	[MEM_RDDR] = "RDDR",
	[MEM_RMBS] = "RMBS",
	[MEM_DDR2] = "DDR2",
	[MEM_FB_DDR2] = "FB_DDR2",
	[MEM_RDDR2] = "RDDR2",
	[MEM_XDR] = "XDR",
	[MEM_DDR3] = "DDR3",
	[MEM_RDDR3] = "RDDR3",
};
|
||||||
|
|
||||||
|
/*
 * dmi_walk() callback: bump the int counter behind @arg once for every
 * memory device record found.
 */
static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
{
	int *dimm_counter = arg;

	if (dh->type != DMI_ENTRY_MEM_DEVICE)
		return;

	*dimm_counter += 1;
}
|
||||||
|
|
||||||
|
/*
 * dmi_walk() callback: translate one memory device record into the next
 * free dimm_info slot of the controller referenced by @arg
 * (a struct ghes_edac_dimm_fill).  Records of any other type are ignored.
 */
static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
{
	struct ghes_edac_dimm_fill *dimm_fill = arg;
	struct mem_ctl_info *mci = dimm_fill->mci;
	struct memdev_dmi_entry *entry;
	struct dimm_info *dimm;

	if (dh->type != DMI_ENTRY_MEM_DEVICE)
		return;

	entry = (struct memdev_dmi_entry *)dh;
	dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
			     dimm_fill->count, 0, 0);

	if (entry->size == 0xffff) {
		pr_info("Can't get DIMM%i size\n",
			dimm_fill->count);
		dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
	} else if (entry->size == 0x7fff) {
		dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
	} else if (entry->size & 1 << 15) {
		/*
		 * Bit 15 set: the low 15 bits are expressed in KiB, not MiB,
		 * so convert KiB straight to pages instead of scaling up.
		 */
		dimm->nr_pages = (entry->size & 0x7fff) >> (PAGE_SHIFT - 10);
	} else {
		dimm->nr_pages = MiB_TO_PAGES(entry->size);
	}

	switch (entry->memory_type) {
	case 0x12:
		if (entry->type_detail & 1 << 13)
			dimm->mtype = MEM_RDDR;
		else
			dimm->mtype = MEM_DDR;
		break;
	case 0x13:
		if (entry->type_detail & 1 << 13)
			dimm->mtype = MEM_RDDR2;
		else
			dimm->mtype = MEM_DDR2;
		break;
	case 0x14:
		dimm->mtype = MEM_FB_DDR2;
		break;
	case 0x18:
		if (entry->type_detail & 1 << 13)
			dimm->mtype = MEM_RDDR3;
		else
			dimm->mtype = MEM_DDR3;
		break;
	default:
		/* No direct mapping: fall back to the type_detail flags */
		if (entry->type_detail & 1 << 6)
			dimm->mtype = MEM_RMBS;
		else if ((entry->type_detail & ((1 << 7) | (1 << 13)))
			 == ((1 << 7) | (1 << 13)))
			dimm->mtype = MEM_RDR;
		else if (entry->type_detail & 1 << 7)
			dimm->mtype = MEM_SDR;
		else if (entry->type_detail & 1 << 9)
			dimm->mtype = MEM_EDO;
		else
			dimm->mtype = MEM_UNKNOWN;
	}

	/*
	 * Actually, we can only detect if the memory has bits for
	 * checksum or not
	 */
	if (entry->total_width == entry->data_width)
		dimm->edac_mode = EDAC_NONE;
	else
		dimm->edac_mode = EDAC_SECDED;

	dimm->dtype = DEV_UNKNOWN;
	dimm->grain = 128;		/* Likely, worst case */

	/*
	 * FIXME: It shouldn't be hard to also fill the DIMM labels
	 */

	if (dimm->nr_pages) {
		edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
			 dimm_fill->count, memory_type[dimm->mtype],
			 PAGES_TO_MiB(dimm->nr_pages),
			 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
		/* "width" is the data width, "total" includes the check bits */
		edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
			 entry->memory_type, entry->type_detail,
			 entry->data_width, entry->total_width);
	}

	dimm_fill->count++;
}
|
||||||
|
|
||||||
|
/* Names of the memory error types, indexed by cper_sec_mem_err.error_type */
static const char * const ghes_edac_err_type_str[] = {
	"Unknown",
	"No error",
	"Single-bit ECC",
	"Multi-bit ECC",
	"Single-symbol ChipKill ECC",
	"Multi-symbol ChipKill ECC",
	"Master abort",
	"Target abort",
	"Parity Error",
	"Watchdog timeout",
	"Invalid address",
	"Mirror Broken",
	"Memory Sparing",
	"Scrub corrected error",
	"Scrub uncorrected error",
	"Physical Memory Map-out event",
};

/*
 * Text for bits 15:8 of cper_sec_mem_err.error_status.  Entries left NULL
 * (and codes past the end of the table) are reported as "reserved ".
 */
static const char * const ghes_edac_status_str[] = {
	[1]  = "Error detected internal to the component ",
	[4]  = "Storage error in DRAM memory ",
	[5]  = "Storage error in TLB ",
	[6]  = "Storage error in cache ",
	[7]  = "Error in one or more functional units ",
	[8]  = "component failed self test ",
	[9]  = "Overflow or undervalue of internal queue ",
	[16] = "Error detected in the bus ",
	[17] = "Virtual address not found on IO-TLB or IO-PDIR ",
	[18] = "Improper access error ",
	[19] = "Access to a memory address which is not mapped to any component ",
	[20] = "Loss of Lockstep ",
	[21] = "Response not associated with a request ",
	[22] = "Bus parity error - must also set the A, C, or D Bits ",
	[23] = "Detection of a PATH_ERROR ",
	[25] = "Bus operation timeout ",
	[26] = "A read was issued to data that has been poisoned ",
};

/*
 * ghes_edac_report_mem_error() - forward a GHES memory error to EDAC
 * @ghes:    GHES source the error came from; used to find the controller
 * @sev:     GHES_SEV_* severity, mapped to an HW_EVENT_ERR_* type
 * @mem_err: decoded memory error section; only the fields flagged in
 *           validation_bits are used
 *
 * Fills the controller's pre-allocated error descriptor, emits the
 * mc_event trace point and hands the descriptor to the EDAC core.
 * The error is dropped with a message if @ghes was never registered.
 */
void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
				struct cper_sec_mem_err *mem_err)
{
	enum hw_event_mc_err_type type;
	struct edac_raw_error_desc *e;
	struct mem_ctl_info *mci;
	struct ghes_edac_pvt *pvt = NULL, *cur;
	char *p, *end;
	u8 grain_bits;

	/*
	 * The loop cursor is never NULL after list_for_each_entry(), so the
	 * match has to be recorded in a separate pointer.
	 */
	list_for_each_entry(cur, &ghes_reglist, list) {
		if (ghes == cur->ghes) {
			pvt = cur;
			break;
		}
	}
	if (!pvt) {
		pr_err("Internal error: Can't find EDAC structure\n");
		return;
	}
	mci = pvt->mci;
	e = &mci->error_desc;

	/* Cleans the error report buffer */
	memset(e, 0, sizeof(*e));
	e->error_count = 1;
	strcpy(e->label, "unknown label");
	e->msg = pvt->msg;
	e->other_detail = pvt->other_detail;
	e->top_layer = -1;
	e->mid_layer = -1;
	e->low_layer = -1;
	*pvt->other_detail = '\0';
	*pvt->msg = '\0';

	switch (sev) {
	case GHES_SEV_CORRECTED:
		type = HW_EVENT_ERR_CORRECTED;
		break;
	case GHES_SEV_RECOVERABLE:
		type = HW_EVENT_ERR_UNCORRECTED;
		break;
	case GHES_SEV_PANIC:
		type = HW_EVENT_ERR_FATAL;
		break;
	default:
	case GHES_SEV_NO:
		type = HW_EVENT_ERR_INFO;
	}

	edac_dbg(1, "error validation_bits: 0x%08llx\n",
		 (long long)mem_err->validation_bits);

	/* Error type, mapped on e->msg */
	if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
		if (mem_err->error_type < ARRAY_SIZE(ghes_edac_err_type_str))
			scnprintf(pvt->msg, sizeof(pvt->msg), "%s",
				  ghes_edac_err_type_str[mem_err->error_type]);
		else
			scnprintf(pvt->msg, sizeof(pvt->msg),
				  "reserved error (%d)",
				  mem_err->error_type);
	} else {
		strcpy(pvt->msg, "unknown error");
	}

	/* Error address */
	if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
		e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
		e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
	}

	/* Error grain */
	/* NOTE(review): mask-to-grain conversion kept as is - confirm */
	if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS_MASK) {
		e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
	}

	/*
	 * Memory error location, mapped on e->location.  Every write is
	 * bounded: with all fields valid the text can fill the buffer.
	 */
	p = e->location;
	end = p + sizeof(e->location);
	if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
		p += scnprintf(p, end - p, "node:%d ", mem_err->node);
	if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
		p += scnprintf(p, end - p, "card:%d ", mem_err->card);
	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
		p += scnprintf(p, end - p, "module:%d ", mem_err->module);
	if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
		p += scnprintf(p, end - p, "bank:%d ", mem_err->bank);
	if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
		p += scnprintf(p, end - p, "row:%d ", mem_err->row);
	if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
		p += scnprintf(p, end - p, "col:%d ", mem_err->column);
	if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
		p += scnprintf(p, end - p, "bit_pos:%d ", mem_err->bit_pos);
	/* Drop the trailing separator */
	if (p > e->location)
		*(p - 1) = '\0';

	/* All other fields are mapped on e->other_detail */
	p = pvt->other_detail;
	end = p + sizeof(pvt->other_detail);
	if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
		u64 status = mem_err->error_status;
		unsigned int code = (status >> 8) & 0xff;
		const char *txt = NULL;

		if (code < ARRAY_SIZE(ghes_edac_status_str))
			txt = ghes_edac_status_str[code];

		p += scnprintf(p, end - p, "status(0x%016llx): ",
			       (long long)status);
		p += scnprintf(p, end - p, "%s", txt ? txt : "reserved ");
	}
	if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
		p += scnprintf(p, end - p, "requestorID: 0x%016llx ",
			       (long long)mem_err->requestor_id);
	if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
		p += scnprintf(p, end - p, "responderID: 0x%016llx ",
			       (long long)mem_err->responder_id);
	if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID)
		p += scnprintf(p, end - p, "targetID: 0x%016llx ",
			       (long long)mem_err->target_id);
	if (p > pvt->other_detail)
		*(p - 1) = '\0';

	/* Generate the trace event */
	grain_bits = fls_long(e->grain);
	scnprintf(pvt->detail_location, sizeof(pvt->detail_location),
		  "APEI location: %s %s", e->location, e->other_detail);
	/* The trace point takes a byte address: rebuild it from pfn + offset */
	trace_mc_event(type, e->msg, e->label, e->error_count,
		       mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
		       (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
		       grain_bits, e->syndrome, pvt->detail_location);

	/* Report the error via EDAC API */
	edac_raw_mc_handle_error(type, mci, e);
}
EXPORT_SYMBOL_GPL(ghes_edac_report_mem_error);
|
||||||
|
|
||||||
|
/*
 * ghes_edac_register() - create an EDAC memory controller for a GHES source
 * @ghes: GHES source that will report errors through this controller
 * @dev:  device stored as mci->pdev
 *
 * Counts the DMI memory devices, allocates a single-layer controller with
 * one slot per device (or one fake slot when DMI lists none) and registers
 * it with the EDAC core.  DIMM details are filled from DMI only for the
 * first controller.
 *
 * Returns 0 on success, -ENOMEM if the controller cannot be allocated and
 * -ENODEV if the EDAC core refuses the registration.
 */
int ghes_edac_register(struct ghes *ghes, struct device *dev)
{
	bool fake = false;
	int rc, num_dimm = 0;
	struct mem_ctl_info *mci;
	struct edac_mc_layer layers[1];
	struct ghes_edac_pvt *pvt;
	struct ghes_edac_dimm_fill dimm_fill;

	/* Get the number of DIMMs */
	dmi_walk(ghes_edac_count_dimms, &num_dimm);

	/* Check if we've got a bogus BIOS */
	if (num_dimm == 0) {
		fake = true;
		num_dimm = 1;
	}

	layers[0].type = EDAC_MC_LAYER_ALL_MEM;
	layers[0].size = num_dimm;
	layers[0].is_virt_csrow = true;

	/*
	 * We need to serialize edac_mc_alloc() and edac_mc_add_mc(),
	 * to avoid duplicated memory controller numbers
	 */
	mutex_lock(&ghes_edac_lock);
	mci = edac_mc_alloc(ghes_edac_mc_num, ARRAY_SIZE(layers), layers,
			    sizeof(*pvt));
	if (!mci) {
		pr_info("Can't allocate memory for EDAC data\n");
		rc = -ENOMEM;
		goto unlock;
	}

	pvt = mci->pvt_info;
	memset(pvt, 0, sizeof(*pvt));
	list_add_tail(&pvt->list, &ghes_reglist);
	pvt->ghes = ghes;
	pvt->mci = mci;
	mci->pdev = dev;

	mci->mtype_cap = MEM_FLAG_EMPTY;
	mci->edac_ctl_cap = EDAC_FLAG_NONE;
	mci->edac_cap = EDAC_FLAG_NONE;
	mci->mod_name = "ghes_edac.c";
	mci->mod_ver = GHES_EDAC_REVISION;
	mci->ctl_name = "ghes_edac";
	mci->dev_name = "ghes";

	/* Only the first controller prints the banner */
	if (!ghes_edac_mc_num) {
		if (!fake) {
			pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
			pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
			pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
			pr_info("If you find incorrect reports, please contact your hardware vendor\n");
			pr_info("to correct its BIOS.\n");
			pr_info("This system has %d DIMM sockets.\n",
				num_dimm);
		} else {
			pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
			pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
			pr_info("work on such system. Use this driver with caution\n");
		}
	}

	if (!fake) {
		/*
		 * Fill DIMM info from DMI for the memory controller #0
		 *
		 * Keep it in blank for the other memory controllers, as
		 * there's no reliable way to properly credit each DIMM to
		 * the memory controller, as different BIOSes fill the
		 * DMI bank location fields on different ways
		 */
		if (!ghes_edac_mc_num) {
			dimm_fill.count = 0;
			dimm_fill.mci = mci;
			dmi_walk(ghes_edac_dmidecode, &dimm_fill);
		}
	} else {
		/* No DMI data: describe a single placeholder DIMM */
		struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
						       mci->n_layers, 0, 0, 0);

		dimm->nr_pages = 1;
		dimm->grain = 128;
		dimm->mtype = MEM_UNKNOWN;
		dimm->dtype = DEV_UNKNOWN;
		dimm->edac_mode = EDAC_SECDED;
	}

	rc = edac_mc_add_mc(mci);
	if (rc < 0) {
		pr_info("Can't register at EDAC core\n");
		/*
		 * pvt is part of the storage obtained from edac_mc_alloc()
		 * (mci->pvt_info): unlink it before that storage is released,
		 * otherwise ghes_reglist keeps a dangling node.
		 */
		list_del(&pvt->list);
		edac_mc_free(mci);
		rc = -ENODEV;
		goto unlock;
	}

	ghes_edac_mc_num++;
	rc = 0;
unlock:
	mutex_unlock(&ghes_edac_lock);
	return rc;
}
EXPORT_SYMBOL_GPL(ghes_edac_register);
|
||||||
|
|
||||||
|
/*
 * ghes_edac_unregister() - tear down the controller(s) bound to @ghes
 * @ghes: GHES source passed earlier to ghes_edac_register()
 *
 * Every matching entry of ghes_reglist is unlinked, removed from the EDAC
 * core and freed.  Nothing happens if @ghes was never registered.
 */
void ghes_edac_unregister(struct ghes *ghes)
{
	struct mem_ctl_info *mci;
	struct ghes_edac_pvt *pvt, *tmp;

	list_for_each_entry_safe(pvt, tmp, &ghes_reglist, list) {
		if (ghes != pvt->ghes)
			continue;

		mci = pvt->mci;
		/*
		 * pvt is mci->pvt_info, i.e. it belongs to the mci storage:
		 * unlink it while it is still valid, before edac_mc_free().
		 */
		list_del(&pvt->list);
		edac_mc_del_mc(mci->pdev);
		edac_mc_free(mci);
	}
}
EXPORT_SYMBOL_GPL(ghes_edac_unregister);
|
|
@ -106,16 +106,26 @@ static int nr_channels;
|
||||||
|
|
||||||
static int how_many_channels(struct pci_dev *pdev)
|
static int how_many_channels(struct pci_dev *pdev)
|
||||||
{
|
{
|
||||||
|
int n_channels;
|
||||||
|
|
||||||
unsigned char capid0_8b; /* 8th byte of CAPID0 */
|
unsigned char capid0_8b; /* 8th byte of CAPID0 */
|
||||||
|
|
||||||
pci_read_config_byte(pdev, I3200_CAPID0 + 8, &capid0_8b);
|
pci_read_config_byte(pdev, I3200_CAPID0 + 8, &capid0_8b);
|
||||||
|
|
||||||
if (capid0_8b & 0x20) { /* check DCD: Dual Channel Disable */
|
if (capid0_8b & 0x20) { /* check DCD: Dual Channel Disable */
|
||||||
edac_dbg(0, "In single channel mode\n");
|
edac_dbg(0, "In single channel mode\n");
|
||||||
return 1;
|
n_channels = 1;
|
||||||
} else {
|
} else {
|
||||||
edac_dbg(0, "In dual channel mode\n");
|
edac_dbg(0, "In dual channel mode\n");
|
||||||
return 2;
|
n_channels = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (capid0_8b & 0x10) /* check if both channels are filled */
|
||||||
|
edac_dbg(0, "2 DIMMS per channel disabled\n");
|
||||||
|
else
|
||||||
|
edac_dbg(0, "2 DIMMS per channel enabled\n");
|
||||||
|
|
||||||
|
return n_channels;
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned long eccerrlog_syndrome(u64 log)
|
static unsigned long eccerrlog_syndrome(u64 log)
|
||||||
|
@ -290,6 +300,8 @@ static void i3200_get_drbs(void __iomem *window,
|
||||||
for (i = 0; i < I3200_RANKS_PER_CHANNEL; i++) {
|
for (i = 0; i < I3200_RANKS_PER_CHANNEL; i++) {
|
||||||
drbs[0][i] = readw(window + I3200_C0DRB + 2*i) & I3200_DRB_MASK;
|
drbs[0][i] = readw(window + I3200_C0DRB + 2*i) & I3200_DRB_MASK;
|
||||||
drbs[1][i] = readw(window + I3200_C1DRB + 2*i) & I3200_DRB_MASK;
|
drbs[1][i] = readw(window + I3200_C1DRB + 2*i) & I3200_DRB_MASK;
|
||||||
|
|
||||||
|
edac_dbg(0, "drb[0][%d] = %d, drb[1][%d] = %d\n", i, drbs[0][i], i, drbs[1][i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -311,6 +323,9 @@ static unsigned long drb_to_nr_pages(
|
||||||
int n;
|
int n;
|
||||||
|
|
||||||
n = drbs[channel][rank];
|
n = drbs[channel][rank];
|
||||||
|
if (!n)
|
||||||
|
return 0;
|
||||||
|
|
||||||
if (rank > 0)
|
if (rank > 0)
|
||||||
n -= drbs[channel][rank - 1];
|
n -= drbs[channel][rank - 1];
|
||||||
if (stacked && (channel == 1) &&
|
if (stacked && (channel == 1) &&
|
||||||
|
@ -377,19 +392,19 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
|
||||||
* cumulative; the last one will contain the total memory
|
* cumulative; the last one will contain the total memory
|
||||||
* contained in all ranks.
|
* contained in all ranks.
|
||||||
*/
|
*/
|
||||||
for (i = 0; i < mci->nr_csrows; i++) {
|
for (i = 0; i < I3200_DIMMS; i++) {
|
||||||
unsigned long nr_pages;
|
unsigned long nr_pages;
|
||||||
struct csrow_info *csrow = mci->csrows[i];
|
|
||||||
|
|
||||||
nr_pages = drb_to_nr_pages(drbs, stacked,
|
|
||||||
i / I3200_RANKS_PER_CHANNEL,
|
|
||||||
i % I3200_RANKS_PER_CHANNEL);
|
|
||||||
|
|
||||||
if (nr_pages == 0)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
for (j = 0; j < nr_channels; j++) {
|
for (j = 0; j < nr_channels; j++) {
|
||||||
struct dimm_info *dimm = csrow->channels[j]->dimm;
|
struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
|
||||||
|
mci->n_layers, i, j, 0);
|
||||||
|
|
||||||
|
nr_pages = drb_to_nr_pages(drbs, stacked, j, i);
|
||||||
|
if (nr_pages == 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
edac_dbg(0, "csrow %d, channel %d%s, size = %ld Mb\n", i, j,
|
||||||
|
stacked ? " (stacked)" : "", PAGES_TO_MiB(nr_pages));
|
||||||
|
|
||||||
dimm->nr_pages = nr_pages;
|
dimm->nr_pages = nr_pages;
|
||||||
dimm->grain = nr_pages << PAGE_SHIFT;
|
dimm->grain = nr_pages << PAGE_SHIFT;
|
||||||
|
|
|
@ -27,6 +27,7 @@
|
||||||
#include <linux/edac.h>
|
#include <linux/edac.h>
|
||||||
#include <linux/delay.h>
|
#include <linux/delay.h>
|
||||||
#include <linux/mmzone.h>
|
#include <linux/mmzone.h>
|
||||||
|
#include <linux/debugfs.h>
|
||||||
|
|
||||||
#include "edac_core.h"
|
#include "edac_core.h"
|
||||||
|
|
||||||
|
@ -68,6 +69,14 @@
|
||||||
I5100_FERR_NF_MEM_M1ERR_MASK)
|
I5100_FERR_NF_MEM_M1ERR_MASK)
|
||||||
#define I5100_NERR_NF_MEM 0xa4 /* MC Next Non-Fatal Errors */
|
#define I5100_NERR_NF_MEM 0xa4 /* MC Next Non-Fatal Errors */
|
||||||
#define I5100_EMASK_MEM 0xa8 /* MC Error Mask Register */
|
#define I5100_EMASK_MEM 0xa8 /* MC Error Mask Register */
|
||||||
|
#define I5100_MEM0EINJMSK0 0x200 /* Injection Mask0 Register Channel 0 */
|
||||||
|
#define I5100_MEM1EINJMSK0 0x208 /* Injection Mask0 Register Channel 1 */
|
||||||
|
#define I5100_MEMXEINJMSK0_EINJEN (1 << 27)
|
||||||
|
#define I5100_MEM0EINJMSK1 0x204 /* Injection Mask1 Register Channel 0 */
|
||||||
|
#define I5100_MEM1EINJMSK1 0x206 /* Injection Mask1 Register Channel 1 */
|
||||||
|
|
||||||
|
/* Device 19, Function 0 */
|
||||||
|
#define I5100_DINJ0 0x9a
|
||||||
|
|
||||||
/* device 21 and 22, func 0 */
|
/* device 21 and 22, func 0 */
|
||||||
#define I5100_MTR_0 0x154 /* Memory Technology Registers 0-3 */
|
#define I5100_MTR_0 0x154 /* Memory Technology Registers 0-3 */
|
||||||
|
@ -338,13 +347,26 @@ struct i5100_priv {
|
||||||
unsigned ranksperchan; /* number of ranks per channel */
|
unsigned ranksperchan; /* number of ranks per channel */
|
||||||
|
|
||||||
struct pci_dev *mc; /* device 16 func 1 */
|
struct pci_dev *mc; /* device 16 func 1 */
|
||||||
|
struct pci_dev *einj; /* device 19 func 0 */
|
||||||
struct pci_dev *ch0mm; /* device 21 func 0 */
|
struct pci_dev *ch0mm; /* device 21 func 0 */
|
||||||
struct pci_dev *ch1mm; /* device 22 func 0 */
|
struct pci_dev *ch1mm; /* device 22 func 0 */
|
||||||
|
|
||||||
struct delayed_work i5100_scrubbing;
|
struct delayed_work i5100_scrubbing;
|
||||||
int scrub_enable;
|
int scrub_enable;
|
||||||
|
|
||||||
|
/* Error injection */
|
||||||
|
u8 inject_channel;
|
||||||
|
u8 inject_hlinesel;
|
||||||
|
u8 inject_deviceptr1;
|
||||||
|
u8 inject_deviceptr2;
|
||||||
|
u16 inject_eccmask1;
|
||||||
|
u16 inject_eccmask2;
|
||||||
|
|
||||||
|
struct dentry *debugfs;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static struct dentry *i5100_debugfs;
|
||||||
|
|
||||||
/* map a rank/chan to a slot number on the mainboard */
|
/* map a rank/chan to a slot number on the mainboard */
|
||||||
static int i5100_rank_to_slot(const struct mem_ctl_info *mci,
|
static int i5100_rank_to_slot(const struct mem_ctl_info *mci,
|
||||||
int chan, int rank)
|
int chan, int rank)
|
||||||
|
@ -863,13 +885,126 @@ static void i5100_init_csrows(struct mem_ctl_info *mci)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/****************************************************************************
|
||||||
|
* Error injection routines
|
||||||
|
****************************************************************************/
|
||||||
|
|
||||||
|
static void i5100_do_inject(struct mem_ctl_info *mci)
|
||||||
|
{
|
||||||
|
struct i5100_priv *priv = mci->pvt_info;
|
||||||
|
u32 mask0;
|
||||||
|
u16 mask1;
|
||||||
|
|
||||||
|
/* MEM[1:0]EINJMSK0
|
||||||
|
* 31 - ADDRMATCHEN
|
||||||
|
* 29:28 - HLINESEL
|
||||||
|
* 00 Reserved
|
||||||
|
* 01 Lower half of cache line
|
||||||
|
* 10 Upper half of cache line
|
||||||
|
* 11 Both upper and lower parts of cache line
|
||||||
|
* 27 - EINJEN
|
||||||
|
* 25:19 - XORMASK1 for deviceptr1
|
||||||
|
* 9:5 - SEC2RAM for deviceptr2
|
||||||
|
* 4:0 - FIR2RAM for deviceptr1
|
||||||
|
*/
|
||||||
|
mask0 = ((priv->inject_hlinesel & 0x3) << 28) |
|
||||||
|
I5100_MEMXEINJMSK0_EINJEN |
|
||||||
|
((priv->inject_eccmask1 & 0xffff) << 10) |
|
||||||
|
((priv->inject_deviceptr2 & 0x1f) << 5) |
|
||||||
|
(priv->inject_deviceptr1 & 0x1f);
|
||||||
|
|
||||||
|
/* MEM[1:0]EINJMSK1
|
||||||
|
* 15:0 - XORMASK2 for deviceptr2
|
||||||
|
*/
|
||||||
|
mask1 = priv->inject_eccmask2;
|
||||||
|
|
||||||
|
if (priv->inject_channel == 0) {
|
||||||
|
pci_write_config_dword(priv->mc, I5100_MEM0EINJMSK0, mask0);
|
||||||
|
pci_write_config_word(priv->mc, I5100_MEM0EINJMSK1, mask1);
|
||||||
|
} else {
|
||||||
|
pci_write_config_dword(priv->mc, I5100_MEM1EINJMSK0, mask0);
|
||||||
|
pci_write_config_word(priv->mc, I5100_MEM1EINJMSK1, mask1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Error Injection Response Function
|
||||||
|
* Intel 5100 Memory Controller Hub Chipset (318378) datasheet
|
||||||
|
* hints about this register but carry no data about them. All
|
||||||
|
* data regarding device 19 is based on experimentation and the
|
||||||
|
* Intel 7300 Chipset Memory Controller Hub (318082) datasheet
|
||||||
|
* which appears to be accurate for the i5100 in this area.
|
||||||
|
*
|
||||||
|
* The injection code don't work without setting this register.
|
||||||
|
* The register needs to be flipped off then on else the hardware
|
||||||
|
* will only preform the first injection.
|
||||||
|
*
|
||||||
|
* Stop condition bits 7:4
|
||||||
|
* 1010 - Stop after one injection
|
||||||
|
* 1011 - Never stop injecting faults
|
||||||
|
*
|
||||||
|
* Start condition bits 3:0
|
||||||
|
* 1010 - Never start
|
||||||
|
* 1011 - Start immediately
|
||||||
|
*/
|
||||||
|
pci_write_config_byte(priv->einj, I5100_DINJ0, 0xaa);
|
||||||
|
pci_write_config_byte(priv->einj, I5100_DINJ0, 0xab);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Map the struct device embedded in a mem_ctl_info back to its container */
#define to_mci(k) container_of(k, struct mem_ctl_info, dev)

/*
 * debugfs write handler for "inject_enable": any write triggers one error
 * injection with the currently configured parameters.  The written data is
 * not inspected and the whole write is reported as consumed.
 */
static ssize_t inject_enable_write(struct file *file, const char __user *data,
				   size_t count, loff_t *ppos)
{
	struct device *mci_dev = file->private_data;

	i5100_do_inject(to_mci(mci_dev));

	return count;
}
|
||||||
|
|
||||||
|
/* File operations of the write-only "inject_enable" debugfs trigger */
static const struct file_operations i5100_inject_enable_fops = {
	.llseek	= generic_file_llseek,
	.write	= inject_enable_write,
	.open	= simple_open,
};
|
||||||
|
|
||||||
|
/*
 * Create the per-controller debugfs directory, named after mci->bus.name,
 * holding the error injection knobs and the "inject_enable" trigger.
 *
 * Returns 0 on success, -ENODEV when the driver's top-level debugfs
 * directory does not exist and -ENOMEM when the per-controller directory
 * cannot be created.
 */
static int i5100_setup_debugfs(struct mem_ctl_info *mci)
{
	struct i5100_priv *priv = mci->pvt_info;
	struct dentry *dir;

	if (!i5100_debugfs)
		return -ENODEV;

	dir = debugfs_create_dir(mci->bus.name, i5100_debugfs);
	priv->debugfs = dir;
	if (!dir)
		return -ENOMEM;

	/* Injection parameters, readable by all and writable by the owner */
	debugfs_create_x8("inject_channel", S_IRUGO | S_IWUSR, dir,
			  &priv->inject_channel);
	debugfs_create_x8("inject_hlinesel", S_IRUGO | S_IWUSR, dir,
			  &priv->inject_hlinesel);
	debugfs_create_x8("inject_deviceptr1", S_IRUGO | S_IWUSR, dir,
			  &priv->inject_deviceptr1);
	debugfs_create_x8("inject_deviceptr2", S_IRUGO | S_IWUSR, dir,
			  &priv->inject_deviceptr2);
	debugfs_create_x16("inject_eccmask1", S_IRUGO | S_IWUSR, dir,
			   &priv->inject_eccmask1);
	debugfs_create_x16("inject_eccmask2", S_IRUGO | S_IWUSR, dir,
			   &priv->inject_eccmask2);

	/* Write-only trigger; receives &mci->dev as its private data */
	debugfs_create_file("inject_enable", S_IWUSR, dir,
			    &mci->dev, &i5100_inject_enable_fops);

	return 0;
}
|
||||||
|
|
||||||
static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
|
static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc;
|
||||||
struct mem_ctl_info *mci;
|
struct mem_ctl_info *mci;
|
||||||
struct edac_mc_layer layers[2];
|
struct edac_mc_layer layers[2];
|
||||||
struct i5100_priv *priv;
|
struct i5100_priv *priv;
|
||||||
struct pci_dev *ch0mm, *ch1mm;
|
struct pci_dev *ch0mm, *ch1mm, *einj;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
u32 dw;
|
u32 dw;
|
||||||
int ranksperch;
|
int ranksperch;
|
||||||
|
@ -941,6 +1076,22 @@ static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
|
||||||
goto bail_disable_ch1;
|
goto bail_disable_ch1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* device 19, func 0, Error injection */
|
||||||
|
einj = pci_get_device_func(PCI_VENDOR_ID_INTEL,
|
||||||
|
PCI_DEVICE_ID_INTEL_5100_19, 0);
|
||||||
|
if (!einj) {
|
||||||
|
ret = -ENODEV;
|
||||||
|
goto bail_einj;
|
||||||
|
}
|
||||||
|
|
||||||
|
rc = pci_enable_device(einj);
|
||||||
|
if (rc < 0) {
|
||||||
|
ret = rc;
|
||||||
|
goto bail_disable_einj;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
mci->pdev = &pdev->dev;
|
mci->pdev = &pdev->dev;
|
||||||
|
|
||||||
priv = mci->pvt_info;
|
priv = mci->pvt_info;
|
||||||
|
@ -948,6 +1099,7 @@ static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
|
||||||
priv->mc = pdev;
|
priv->mc = pdev;
|
||||||
priv->ch0mm = ch0mm;
|
priv->ch0mm = ch0mm;
|
||||||
priv->ch1mm = ch1mm;
|
priv->ch1mm = ch1mm;
|
||||||
|
priv->einj = einj;
|
||||||
|
|
||||||
INIT_DELAYED_WORK(&(priv->i5100_scrubbing), i5100_refresh_scrubbing);
|
INIT_DELAYED_WORK(&(priv->i5100_scrubbing), i5100_refresh_scrubbing);
|
||||||
|
|
||||||
|
@ -975,6 +1127,13 @@ static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
|
||||||
mci->set_sdram_scrub_rate = i5100_set_scrub_rate;
|
mci->set_sdram_scrub_rate = i5100_set_scrub_rate;
|
||||||
mci->get_sdram_scrub_rate = i5100_get_scrub_rate;
|
mci->get_sdram_scrub_rate = i5100_get_scrub_rate;
|
||||||
|
|
||||||
|
priv->inject_channel = 0;
|
||||||
|
priv->inject_hlinesel = 0;
|
||||||
|
priv->inject_deviceptr1 = 0;
|
||||||
|
priv->inject_deviceptr2 = 0;
|
||||||
|
priv->inject_eccmask1 = 0;
|
||||||
|
priv->inject_eccmask2 = 0;
|
||||||
|
|
||||||
i5100_init_csrows(mci);
|
i5100_init_csrows(mci);
|
||||||
|
|
||||||
/* this strange construction seems to be in every driver, dunno why */
|
/* this strange construction seems to be in every driver, dunno why */
|
||||||
|
@ -992,6 +1151,8 @@ static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
|
||||||
goto bail_scrub;
|
goto bail_scrub;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
i5100_setup_debugfs(mci);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
bail_scrub:
|
bail_scrub:
|
||||||
|
@ -999,6 +1160,12 @@ static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
|
||||||
cancel_delayed_work_sync(&(priv->i5100_scrubbing));
|
cancel_delayed_work_sync(&(priv->i5100_scrubbing));
|
||||||
edac_mc_free(mci);
|
edac_mc_free(mci);
|
||||||
|
|
||||||
|
bail_disable_einj:
|
||||||
|
pci_disable_device(einj);
|
||||||
|
|
||||||
|
bail_einj:
|
||||||
|
pci_dev_put(einj);
|
||||||
|
|
||||||
bail_disable_ch1:
|
bail_disable_ch1:
|
||||||
pci_disable_device(ch1mm);
|
pci_disable_device(ch1mm);
|
||||||
|
|
||||||
|
@ -1030,14 +1197,18 @@ static void i5100_remove_one(struct pci_dev *pdev)
|
||||||
|
|
||||||
priv = mci->pvt_info;
|
priv = mci->pvt_info;
|
||||||
|
|
||||||
|
debugfs_remove_recursive(priv->debugfs);
|
||||||
|
|
||||||
priv->scrub_enable = 0;
|
priv->scrub_enable = 0;
|
||||||
cancel_delayed_work_sync(&(priv->i5100_scrubbing));
|
cancel_delayed_work_sync(&(priv->i5100_scrubbing));
|
||||||
|
|
||||||
pci_disable_device(pdev);
|
pci_disable_device(pdev);
|
||||||
pci_disable_device(priv->ch0mm);
|
pci_disable_device(priv->ch0mm);
|
||||||
pci_disable_device(priv->ch1mm);
|
pci_disable_device(priv->ch1mm);
|
||||||
|
pci_disable_device(priv->einj);
|
||||||
pci_dev_put(priv->ch0mm);
|
pci_dev_put(priv->ch0mm);
|
||||||
pci_dev_put(priv->ch1mm);
|
pci_dev_put(priv->ch1mm);
|
||||||
|
pci_dev_put(priv->einj);
|
||||||
|
|
||||||
edac_mc_free(mci);
|
edac_mc_free(mci);
|
||||||
}
|
}
|
||||||
|
@ -1060,13 +1231,16 @@ static int __init i5100_init(void)
|
||||||
{
|
{
|
||||||
int pci_rc;
|
int pci_rc;
|
||||||
|
|
||||||
pci_rc = pci_register_driver(&i5100_driver);
|
i5100_debugfs = debugfs_create_dir("i5100_edac", NULL);
|
||||||
|
|
||||||
|
pci_rc = pci_register_driver(&i5100_driver);
|
||||||
return (pci_rc < 0) ? pci_rc : 0;
|
return (pci_rc < 0) ? pci_rc : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * Module unload: undo i5100_init() in reverse order.
 *
 * i5100_init() creates the driver-wide debugfs directory first and then
 * registers the PCI driver, so the driver is unregistered first here.
 * Unregistering runs i5100_remove_one() for every bound device, which
 * removes that controller's debugfs sub-directory; only after that is the
 * (now empty) parent directory removed.  Removing the parent first would
 * either fail on a non-empty directory or leave remove_one() operating on
 * entries that are already gone.
 */
static void __exit i5100_exit(void)
{
	pci_unregister_driver(&i5100_driver);

	debugfs_remove(i5100_debugfs);
}
|
||||||
|
|
||||||
|
|
|
@ -420,21 +420,21 @@ static inline int numdimms(u32 dimms)
|
||||||
|
|
||||||
/*
 * Translate a 2-bit rank encoding into a count.
 *
 * Only the two low bits of @rank are looked at: encodings 0, 1 and 2 map
 * to 1, 2 and 4 respectively, while encoding 3 is rejected.
 *
 * Returns the decoded count, or -EINVAL for the rejected encoding.
 */
static inline int numrank(u32 rank)
{
	const u32 code = rank & 0x3;

	return (code == 0x3) ? -EINVAL : 1 << code;
}
|
||||||
|
|
||||||
/*
 * Translate a 2-bit bank encoding into a count.
 *
 * Only the two low bits of @bank are looked at: encodings 0, 1 and 2 map
 * to 4, 8 and 16 respectively, while encoding 3 is rejected.
 *
 * Returns the decoded count, or -EINVAL for the rejected encoding.
 */
static inline int numbank(u32 bank)
{
	const u32 code = bank & 0x3;

	return (code == 0x3) ? -EINVAL : 4 << code;
}
|
||||||
|
|
||||||
static inline int numrow(u32 row)
|
static inline int numrow(u32 row)
|
||||||
{
|
{
|
||||||
static int rows[8] = {
|
static const int rows[] = {
|
||||||
1 << 12, 1 << 13, 1 << 14, 1 << 15,
|
1 << 12, 1 << 13, 1 << 14, 1 << 15,
|
||||||
1 << 16, -EINVAL, -EINVAL, -EINVAL,
|
1 << 16, -EINVAL, -EINVAL, -EINVAL,
|
||||||
};
|
};
|
||||||
|
@ -444,7 +444,7 @@ static inline int numrow(u32 row)
|
||||||
|
|
||||||
static inline int numcol(u32 col)
|
static inline int numcol(u32 col)
|
||||||
{
|
{
|
||||||
static int cols[8] = {
|
static const int cols[] = {
|
||||||
1 << 10, 1 << 11, 1 << 12, -EINVAL,
|
1 << 10, 1 << 11, 1 << 12, -EINVAL,
|
||||||
};
|
};
|
||||||
return cols[col & 0x3];
|
return cols[col & 0x3];
|
||||||
|
|
|
@ -639,7 +639,7 @@ static void get_memory_layout(const struct mem_ctl_info *mci)
|
||||||
tmp_mb = (1 + pvt->tohm) >> 20;
|
tmp_mb = (1 + pvt->tohm) >> 20;
|
||||||
|
|
||||||
mb = div_u64_rem(tmp_mb, 1000, &kb);
|
mb = div_u64_rem(tmp_mb, 1000, &kb);
|
||||||
edac_dbg(0, "TOHM: %u.%03u GB (0x%016Lx)", mb, kb, (u64)pvt->tohm);
|
edac_dbg(0, "TOHM: %u.%03u GB (0x%016Lx)\n", mb, kb, (u64)pvt->tohm);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Step 2) Get SAD range and SAD Interleave list
|
* Step 2) Get SAD range and SAD Interleave list
|
||||||
|
|
72
include/acpi/ghes.h
Normal file
72
include/acpi/ghes.h
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
/*
 * Shared definitions for the APEI Generic Hardware Error Source (GHES)
 * code and the EDAC driver that consumes its reports.
 *
 * The include guard makes the header safe to pull in more than once;
 * without it a second inclusion would redefine struct ghes, the severity
 * enum and the inline stubs below.
 */
#ifndef GHES_H
#define GHES_H

#include <acpi/apei.h>
#include <acpi/hed.h>

/*
 * One struct ghes is created for each generic hardware error source.
 * It provides the context for APEI hardware error timer/IRQ/SCI/NMI
 * handler.
 *
 * estatus: memory buffer for error status block, allocated during
 * HEST parsing.
 */
/* NOTE(review): these look like bit values for ghes->flags - confirm */
#define GHES_TO_CLEAR		0x0001
#define GHES_EXITING		0x0002

struct ghes {
	struct acpi_hest_generic *generic;
	struct acpi_hest_generic_status *estatus;
	u64 buffer_paddr;
	unsigned long flags;
	/*
	 * Only one member is meaningful for a given error source.
	 * NOTE(review): presumably chosen by how the source notifies the
	 * OS (list / polling timer / interrupt) - confirm against the
	 * GHES driver.
	 */
	union {
		struct list_head list;
		struct timer_list timer;
		unsigned int irq;
	};
};

/* Lock-less list node tying a queued error status to its source. */
struct ghes_estatus_node {
	struct llist_node llnode;
	struct acpi_hest_generic *generic;
	struct ghes *ghes;
};

/* Cache entry for an error status block; released through @rcu. */
struct ghes_estatus_cache {
	u32 estatus_len;
	atomic_t count;
	struct acpi_hest_generic *generic;
	unsigned long long time_in;
	struct rcu_head rcu;
};

/* Error severities, in increasing order of seriousness. */
enum {
	GHES_SEV_NO = 0x0,
	GHES_SEV_CORRECTED = 0x1,
	GHES_SEV_RECOVERABLE = 0x2,
	GHES_SEV_PANIC = 0x3,
};

/* From drivers/edac/ghes_edac.c */

#ifdef CONFIG_EDAC_GHES
void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
				struct cper_sec_mem_err *mem_err);

int ghes_edac_register(struct ghes *ghes, struct device *dev);

void ghes_edac_unregister(struct ghes *ghes);

#else
/*
 * Without CONFIG_EDAC_GHES the hooks collapse to no-ops so callers need
 * no #ifdefs of their own; registration then always reports success.
 */
static inline void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
				       struct cper_sec_mem_err *mem_err)
{
}

static inline int ghes_edac_register(struct ghes *ghes, struct device *dev)
{
	return 0;
}

static inline void ghes_edac_unregister(struct ghes *ghes)
{
}
#endif

#endif /* GHES_H */
|
|
@ -14,7 +14,6 @@
|
||||||
|
|
||||||
#include <linux/atomic.h>
|
#include <linux/atomic.h>
|
||||||
#include <linux/device.h>
|
#include <linux/device.h>
|
||||||
#include <linux/kobject.h>
|
|
||||||
#include <linux/completion.h>
|
#include <linux/completion.h>
|
||||||
#include <linux/workqueue.h>
|
#include <linux/workqueue.h>
|
||||||
#include <linux/debugfs.h>
|
#include <linux/debugfs.h>
|
||||||
|
@ -48,8 +47,17 @@ static inline void opstate_init(void)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Max length of a DIMM label */
|
||||||
#define EDAC_MC_LABEL_LEN 31
|
#define EDAC_MC_LABEL_LEN 31
|
||||||
#define MC_PROC_NAME_MAX_LEN 7
|
|
||||||
|
/* Maximum size of the location string */
|
||||||
|
#define LOCATION_SIZE 80
|
||||||
|
|
||||||
|
/* Defines the maximum number of labels that can be reported */
|
||||||
|
#define EDAC_MAX_LABELS 8
|
||||||
|
|
||||||
|
/* String used to join two or more labels */
|
||||||
|
#define OTHER_LABEL " or "
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* enum dev_type - describe the type of memory DRAM chips used at the stick
|
* enum dev_type - describe the type of memory DRAM chips used at the stick
|
||||||
|
@ -101,8 +109,24 @@ enum hw_event_mc_err_type {
|
||||||
HW_EVENT_ERR_CORRECTED,
|
HW_EVENT_ERR_CORRECTED,
|
||||||
HW_EVENT_ERR_UNCORRECTED,
|
HW_EVENT_ERR_UNCORRECTED,
|
||||||
HW_EVENT_ERR_FATAL,
|
HW_EVENT_ERR_FATAL,
|
||||||
|
HW_EVENT_ERR_INFO,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static inline char *mc_event_error_type(const unsigned int err_type)
|
||||||
|
{
|
||||||
|
switch (err_type) {
|
||||||
|
case HW_EVENT_ERR_CORRECTED:
|
||||||
|
return "Corrected";
|
||||||
|
case HW_EVENT_ERR_UNCORRECTED:
|
||||||
|
return "Uncorrected";
|
||||||
|
case HW_EVENT_ERR_FATAL:
|
||||||
|
return "Fatal";
|
||||||
|
default:
|
||||||
|
case HW_EVENT_ERR_INFO:
|
||||||
|
return "Info";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* enum mem_type - memory types. For a more detailed reference, please see
|
* enum mem_type - memory types. For a more detailed reference, please see
|
||||||
* http://en.wikipedia.org/wiki/DRAM
|
* http://en.wikipedia.org/wiki/DRAM
|
||||||
|
@ -376,6 +400,9 @@ enum scrub_type {
|
||||||
* @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel"
|
* @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel"
|
||||||
* @EDAC_MC_LAYER_SLOT: memory layer is named "slot"
|
* @EDAC_MC_LAYER_SLOT: memory layer is named "slot"
|
||||||
* @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select"
|
* @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select"
|
||||||
|
* @EDAC_MC_LAYER_ALL_MEM: memory layout is unknown. All memory is mapped
|
||||||
|
* as a single memory area. This is used when
|
||||||
|
* retrieving errors from a firmware driven driver.
|
||||||
*
|
*
|
||||||
* This enum is used by the drivers to tell edac_mc_sysfs what name should
|
* This enum is used by the drivers to tell edac_mc_sysfs what name should
|
||||||
* be used when describing a memory stick location.
|
* be used when describing a memory stick location.
|
||||||
|
@ -385,6 +412,7 @@ enum edac_mc_layer_type {
|
||||||
EDAC_MC_LAYER_CHANNEL,
|
EDAC_MC_LAYER_CHANNEL,
|
||||||
EDAC_MC_LAYER_SLOT,
|
EDAC_MC_LAYER_SLOT,
|
||||||
EDAC_MC_LAYER_CHIP_SELECT,
|
EDAC_MC_LAYER_CHIP_SELECT,
|
||||||
|
EDAC_MC_LAYER_ALL_MEM,
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -551,6 +579,46 @@ struct errcount_attribute_data {
|
||||||
int layer0, layer1, layer2;
|
int layer0, layer1, layer2;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* edac_raw_error_desc - Raw error report structure
|
||||||
|
* @grain: minimum granularity for an error report, in bytes
|
||||||
|
* @error_count: number of errors of the same type
|
||||||
|
* @top_layer: top layer of the error (layer[0])
|
||||||
|
* @mid_layer: middle layer of the error (layer[1])
|
||||||
|
* @low_layer: low layer of the error (layer[2])
|
||||||
|
* @page_frame_number: page where the error happened
|
||||||
|
* @offset_in_page: page offset
|
||||||
|
* @syndrome: syndrome of the error (or 0 if unknown or if
|
||||||
|
* the syndrome is not applicable)
|
||||||
|
* @msg: error message
|
||||||
|
* @location: location of the error
|
||||||
|
* @label: label of the affected DIMM(s)
|
||||||
|
* @other_detail: other driver-specific detail about the error
|
||||||
|
* @enable_per_layer_report: if false, the error affects all layers
|
||||||
|
* (typically, a memory controller error)
|
||||||
|
*/
|
||||||
|
struct edac_raw_error_desc {
|
||||||
|
/*
|
||||||
|
* NOTE: everything before grain won't be cleaned by
|
||||||
|
* edac_raw_error_desc_clean()
|
||||||
|
*/
|
||||||
|
char location[LOCATION_SIZE];
|
||||||
|
char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS];
|
||||||
|
long grain;
|
||||||
|
|
||||||
|
/* the vars below and grain will be cleaned on every new error report */
|
||||||
|
u16 error_count;
|
||||||
|
int top_layer;
|
||||||
|
int mid_layer;
|
||||||
|
int low_layer;
|
||||||
|
unsigned long page_frame_number;
|
||||||
|
unsigned long offset_in_page;
|
||||||
|
unsigned long syndrome;
|
||||||
|
const char *msg;
|
||||||
|
const char *other_detail;
|
||||||
|
bool enable_per_layer_report;
|
||||||
|
};
|
||||||
|
|
||||||
/* MEMORY controller information structure
|
/* MEMORY controller information structure
|
||||||
*/
|
*/
|
||||||
struct mem_ctl_info {
|
struct mem_ctl_info {
|
||||||
|
@ -630,7 +698,6 @@ struct mem_ctl_info {
|
||||||
const char *mod_ver;
|
const char *mod_ver;
|
||||||
const char *ctl_name;
|
const char *ctl_name;
|
||||||
const char *dev_name;
|
const char *dev_name;
|
||||||
char proc_name[MC_PROC_NAME_MAX_LEN + 1];
|
|
||||||
void *pvt_info;
|
void *pvt_info;
|
||||||
unsigned long start_time; /* mci load start time (in jiffies) */
|
unsigned long start_time; /* mci load start time (in jiffies) */
|
||||||
|
|
||||||
|
@ -659,6 +726,12 @@ struct mem_ctl_info {
|
||||||
/* work struct for this MC */
|
/* work struct for this MC */
|
||||||
struct delayed_work work;
|
struct delayed_work work;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Used to report an error - because it is embedded in this global
|
||||||
|
* struct, its memory is allocated by the EDAC core
|
||||||
|
*/
|
||||||
|
struct edac_raw_error_desc error_desc;
|
||||||
|
|
||||||
/* the internal state of this controller instance */
|
/* the internal state of this controller instance */
|
||||||
int op_state;
|
int op_state;
|
||||||
|
|
||||||
|
|
|
@ -2802,6 +2802,7 @@
|
||||||
#define PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX 0x3ce0
|
#define PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX 0x3ce0
|
||||||
#define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f
|
#define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f
|
||||||
#define PCI_DEVICE_ID_INTEL_5100_16 0x65f0
|
#define PCI_DEVICE_ID_INTEL_5100_16 0x65f0
|
||||||
|
#define PCI_DEVICE_ID_INTEL_5100_19 0x65f3
|
||||||
#define PCI_DEVICE_ID_INTEL_5100_21 0x65f5
|
#define PCI_DEVICE_ID_INTEL_5100_21 0x65f5
|
||||||
#define PCI_DEVICE_ID_INTEL_5100_22 0x65f6
|
#define PCI_DEVICE_ID_INTEL_5100_22 0x65f6
|
||||||
#define PCI_DEVICE_ID_INTEL_5400_ERR 0x4030
|
#define PCI_DEVICE_ID_INTEL_5400_ERR 0x4030
|
||||||
|
|
|
@ -78,9 +78,7 @@ TRACE_EVENT(mc_event,
|
||||||
|
|
||||||
TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)",
|
TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)",
|
||||||
__entry->error_count,
|
__entry->error_count,
|
||||||
(__entry->error_type == HW_EVENT_ERR_CORRECTED) ? "Corrected" :
|
mc_event_error_type(__entry->error_type),
|
||||||
((__entry->error_type == HW_EVENT_ERR_FATAL) ?
|
|
||||||
"Fatal" : "Uncorrected"),
|
|
||||||
__entry->error_count > 1 ? "s" : "",
|
__entry->error_count > 1 ? "s" : "",
|
||||||
((char *)__get_str(msg))[0] ? " " : "",
|
((char *)__get_str(msg))[0] ? " " : "",
|
||||||
__get_str(msg),
|
__get_str(msg),
|
||||||
|
|
Loading…
Reference in a new issue