Skip to content

Commit

Permalink
Merge branch 'linux_next' of git://git.kernel.org/pub/scm/linux/kerne…
Browse files Browse the repository at this point in the history
…l/git/mchehab/linux-edac

Pull EDAC fixes and ghes-edac from Mauro Carvalho Chehab:
 "For:

   - Some fixes at edac drivers (i7core_edac, sb_edac, i3200_edac);
   - error injection support for i5100, when EDAC debug is enabled;
   - fix edac when it is loaded builtin (early init for the subsystem);
   - a "Firmware First" EDAC driver, allowing ghes to report errors via
     EDAC (ghes-edac).

  With regards to ghes-edac, this fixes a longstanding BZ at Red Hat
  that happens with Nehalem and Sandy Bridge CPUs: when both GHES and
  i7core_edac or sb_edac are running, the error reports are
  unpredictable, as both BIOS and OS race to access the registers.  With
  ghes-edac, the EDAC core will refuse to register any other concurrent
  memory error driver.

  This patchset moves the ghes struct definitions to a separate header
  file (include/acpi/ghes.h) and adds 3 hooks at apei/ghes.c to
  register/unregister and to report errors via ghes-edac.  Those changes
  were acked by ghes driver maintainer (Huang)."

* 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac: (30 commits)
  i5100_edac: convert to use simple_open()
  ghes_edac: fix to use list_for_each_entry_safe() when delete list items
  ghes_edac: Fix RAS tracing
  ghes_edac: Make it compliant with UEFI spec 2.3.1
  ghes_edac: Improve driver's printk messages
  ghes_edac: Don't credit the same memory dimm twice
  ghes_edac: do a better job of filling EDAC DIMM info
  ghes_edac: add support for reporting errors via EDAC
  ghes_edac: Register at EDAC core the BIOS report
  ghes: add the needed hooks for EDAC error report
  ghes: move structures/enum to a header file
  edac: add support for error type "Info"
  edac: add support for raw error reports
  edac: reduce stack pressure by using a pre-allocated buffer
  edac: lock module owner to avoid error report conflicts
  edac: remove proc_name from mci structure
  edac: add a new memory layer type
  edac: initialize the core earlier
  edac: better report error conditions in debug mode
  i5100_edac: Remove two checkpatch warnings
  ...
  • Loading branch information
torvalds committed Mar 1, 2013
2 parents 19cc90f + b076989 commit ad6c2c2
Show file tree
Hide file tree
Showing 18 changed files with 1,077 additions and 140 deletions.
7 changes: 7 additions & 0 deletions MAINTAINERS
Original file line number Diff line number Diff line change
Expand Up @@ -2904,6 +2904,13 @@ W: bluesmoke.sourceforge.net
S: Maintained
F: drivers/edac/e7xxx_edac.c

EDAC-GHES
M: Mauro Carvalho Chehab <mchehab@redhat.com>
L: linux-edac@vger.kernel.org
W: bluesmoke.sourceforge.net
S: Maintained
F: drivers/edac/ghes-edac.c

EDAC-I82443BXGX
M: Tim Small <tim@buttersideup.com>
L: linux-edac@vger.kernel.org
Expand Down
71 changes: 21 additions & 50 deletions drivers/acpi/apei/ghes.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@
#include <linux/genalloc.h>
#include <linux/pci.h>
#include <linux/aer.h>
#include <acpi/apei.h>
#include <acpi/hed.h>

#include <acpi/ghes.h>
#include <asm/mce.h>
#include <asm/tlbflush.h>
#include <asm/nmi.h>
Expand Down Expand Up @@ -84,42 +84,6 @@
((struct acpi_hest_generic_status *) \
((struct ghes_estatus_node *)(estatus_node) + 1))

/*
* One struct ghes is created for each generic hardware error source.
* It provides the context for APEI hardware error timer/IRQ/SCI/NMI
* handler.
*
* estatus: memory buffer for error status block, allocated during
* HEST parsing.
*/
#define GHES_TO_CLEAR 0x0001
#define GHES_EXITING 0x0002

struct ghes {
struct acpi_hest_generic *generic;
struct acpi_hest_generic_status *estatus;
u64 buffer_paddr;
unsigned long flags;
union {
struct list_head list;
struct timer_list timer;
unsigned int irq;
};
};

struct ghes_estatus_node {
struct llist_node llnode;
struct acpi_hest_generic *generic;
};

struct ghes_estatus_cache {
u32 estatus_len;
atomic_t count;
struct acpi_hest_generic *generic;
unsigned long long time_in;
struct rcu_head rcu;
};

bool ghes_disable;
module_param_named(disable, ghes_disable, bool, 0);

Expand Down Expand Up @@ -333,13 +297,6 @@ static void ghes_fini(struct ghes *ghes)
apei_unmap_generic_address(&ghes->generic->error_status_address);
}

enum {
GHES_SEV_NO = 0x0,
GHES_SEV_CORRECTED = 0x1,
GHES_SEV_RECOVERABLE = 0x2,
GHES_SEV_PANIC = 0x3,
};

static inline int ghes_severity(int severity)
{
switch (severity) {
Expand Down Expand Up @@ -452,7 +409,8 @@ static void ghes_clear_estatus(struct ghes *ghes)
ghes->flags &= ~GHES_TO_CLEAR;
}

static void ghes_do_proc(const struct acpi_hest_generic_status *estatus)
static void ghes_do_proc(struct ghes *ghes,
const struct acpi_hest_generic_status *estatus)
{
int sev, sec_sev;
struct acpi_hest_generic_data *gdata;
Expand All @@ -464,6 +422,8 @@ static void ghes_do_proc(const struct acpi_hest_generic_status *estatus)
CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err;
mem_err = (struct cper_sec_mem_err *)(gdata+1);
ghes_edac_report_mem_error(ghes, sev, mem_err);

#ifdef CONFIG_X86_MCE
apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
mem_err);
Expand Down Expand Up @@ -682,7 +642,7 @@ static int ghes_proc(struct ghes *ghes)
if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
ghes_estatus_cache_add(ghes->generic, ghes->estatus);
}
ghes_do_proc(ghes->estatus);
ghes_do_proc(ghes, ghes->estatus);
out:
ghes_clear_estatus(ghes);
return 0;
Expand Down Expand Up @@ -775,7 +735,7 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
len = apei_estatus_len(estatus);
node_len = GHES_ESTATUS_NODE_LEN(len);
ghes_do_proc(estatus);
ghes_do_proc(estatus_node->ghes, estatus);
if (!ghes_estatus_cached(estatus)) {
generic = estatus_node->generic;
if (ghes_print_estatus(NULL, generic, estatus))
Expand Down Expand Up @@ -864,6 +824,7 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool,
node_len);
if (estatus_node) {
estatus_node->ghes = ghes;
estatus_node->generic = ghes->generic;
estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
memcpy(estatus, ghes->estatus, len);
Expand Down Expand Up @@ -942,6 +903,11 @@ static int ghes_probe(struct platform_device *ghes_dev)
ghes = NULL;
goto err;
}

rc = ghes_edac_register(ghes, &ghes_dev->dev);
if (rc < 0)
goto err;

switch (generic->notify.type) {
case ACPI_HEST_NOTIFY_POLLED:
ghes->timer.function = ghes_poll_func;
Expand All @@ -954,13 +920,13 @@ static int ghes_probe(struct platform_device *ghes_dev)
if (acpi_gsi_to_irq(generic->notify.vector, &ghes->irq)) {
pr_err(GHES_PFX "Failed to map GSI to IRQ for generic hardware error source: %d\n",
generic->header.source_id);
goto err;
goto err_edac_unreg;
}
if (request_irq(ghes->irq, ghes_irq_func,
0, "GHES IRQ", ghes)) {
pr_err(GHES_PFX "Failed to register IRQ for generic hardware error source: %d\n",
generic->header.source_id);
goto err;
goto err_edac_unreg;
}
break;
case ACPI_HEST_NOTIFY_SCI:
Expand All @@ -986,6 +952,8 @@ static int ghes_probe(struct platform_device *ghes_dev)
platform_set_drvdata(ghes_dev, ghes);

return 0;
err_edac_unreg:
ghes_edac_unregister(ghes);
err:
if (ghes) {
ghes_fini(ghes);
Expand Down Expand Up @@ -1038,6 +1006,9 @@ static int ghes_remove(struct platform_device *ghes_dev)
}

ghes_fini(ghes);

ghes_edac_unregister(ghes);

kfree(ghes);

platform_set_drvdata(ghes_dev, NULL);
Expand Down
23 changes: 23 additions & 0 deletions drivers/edac/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,29 @@ config EDAC_MM_EDAC
occurred so that a particular failing memory module can be
replaced. If unsure, select 'Y'.

config EDAC_GHES
bool "Output ACPI APEI/GHES BIOS detected errors via EDAC"
depends on ACPI_APEI_GHES && (EDAC_MM_EDAC=y)
default y
help
Not all machines support hardware-driven error report. Some of those
provide a BIOS-driven error report mechanism via ACPI, using the
APEI/GHES driver. By enabling this option, the error reports provided
by GHES are sent to userspace via the EDAC API.

When this option is enabled, it will disable the hardware-driven
mechanisms, if a GHES BIOS is detected, entering into the
"Firmware First" mode.

It should be noticed that keeping both GHES and a hardware-driven
error mechanism won't work well, as BIOS will race with OS, while
reading the error registers. So, if you want to not use "Firmware
first" GHES error mechanism, you should disable GHES either at
compilation time or by passing "ghes.disable=1" Kernel parameter
at boot time.

In doubt, say 'Y'.

config EDAC_AMD64
tristate "AMD64 (Opteron, Athlon64) K8, F10h"
depends on EDAC_MM_EDAC && AMD_NB && X86_64 && EDAC_DECODE_MCE
Expand Down
1 change: 1 addition & 0 deletions drivers/edac/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ ifdef CONFIG_PCI
edac_core-y += edac_pci.o edac_pci_sysfs.o
endif

obj-$(CONFIG_EDAC_GHES) += ghes_edac.o
obj-$(CONFIG_EDAC_MCE_INJ) += mce_amd_inj.o

edac_mce_amd-y := mce_amd.o
Expand Down
5 changes: 5 additions & 0 deletions drivers/edac/edac_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,11 @@ extern struct mem_ctl_info *find_mci_by_dev(struct device *dev);
extern struct mem_ctl_info *edac_mc_del_mc(struct device *dev);
extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
unsigned long page);

void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
struct mem_ctl_info *mci,
struct edac_raw_error_desc *e);

void edac_mc_handle_error(const enum hw_event_mc_err_type type,
struct mem_ctl_info *mci,
const u16 error_count,
Expand Down
Loading

0 comments on commit ad6c2c2

Please sign in to comment.