From patchwork Fri Oct 2 12:22:29 2020
From: Shiju Jose
Subject: [RFC PATCH 1/7] RAS/CEC: Replace the macro PFN with ELEM_NO
Date: Fri, 2 Oct 2020 13:22:29 +0100
Message-ID: <20201002122235.1280-2-shiju.jose@huawei.com>
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
X-Mailing-List: linux-edac@vger.kernel.org

Replace the macro PFN() with the more general ELEM_NO(), which takes the
shift as a parameter, so that array elements are no longer tied to page
frame numbers.

Signed-off-by: Shiju Jose
---
 drivers/ras/cec.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 569d9ad2c594..22d11c66c266 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -86,7 +86,7 @@
  * u64: [ 63 ... 12 | DECAY_BITS | COUNT_BITS ]
  */
 
-#define PFN(e)			((e) >> PAGE_SHIFT)
+#define ELEM_NO(e, shift)	((e) >> (shift))
 #define DECAY(e)		(((e) >> COUNT_BITS) & DECAY_MASK)
 #define COUNT(e)		((unsigned int)(e) & COUNT_MASK)
 #define FULL_COUNT(e)		((e) & (PAGE_SIZE - 1))
@@ -113,6 +113,10 @@ static struct ce_array {
 			 * Times we did spring cleaning.
 			 */
 
+	u8 id_shift;		/*
+				 * shift for element id.
+				 */
+
 	union {
 		struct {
 			__u32	disabled : 1,	/* cmdline disabled */
@@ -191,7 +195,7 @@ static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
 	while (min <= max) {
 		int i = (min + max) >> 1;
 
-		this_pfn = PFN(ca->array[i]);
+		this_pfn = ELEM_NO(ca->array[i], ca->id_shift);
 
 		if (this_pfn < pfn)
 			min = i + 1;
@@ -258,7 +262,7 @@ static u64 del_lru_elem_unlocked(struct ce_array *ca)
 
 	del_elem(ca, min_idx);
 
-	return PFN(ca->array[min_idx]);
+	return ELEM_NO(ca->array[min_idx], ca->id_shift);
 }
 
 /*
@@ -287,7 +291,7 @@ static bool sanity_check(struct ce_array *ca)
 	int i;
 
 	for (i = 0; i < ca->n; i++) {
-		u64 this = PFN(ca->array[i]);
+		u64 this = ELEM_NO(ca->array[i], ca->id_shift);
 
 		if (WARN(prev > this, "prev: 0x%016llx <-> this: 0x%016llx\n", prev, this))
 			ret = true;
@@ -300,7 +304,7 @@ static bool sanity_check(struct ce_array *ca)
 
 	pr_info("Sanity check dump:\n{ n: %d\n", ca->n);
 	for (i = 0; i < ca->n; i++) {
-		u64 this = PFN(ca->array[i]);
+		u64 this = ELEM_NO(ca->array[i], ca->id_shift);
 
 		pr_info(" %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));
 	}
@@ -444,7 +448,7 @@ static int array_dump(struct seq_file *m, void *v)
 
 	seq_printf(m, "{ n: %d\n", ca->n);
 	for (i = 0; i < ca->n; i++) {
-		u64 this = PFN(ca->array[i]);
+		u64 this = ELEM_NO(ca->array[i], ca->id_shift);
 
 		seq_printf(m, " %3d: [%016llx|%s|%03llx]\n",
 			   i, this, bins[DECAY(ca->array[i])], COUNT(ca->array[i]));
@@ -569,6 +573,7 @@ static void __init cec_init(void)
 		return;
 	}
 
+	ce_arr.id_shift = PAGE_SHIFT;
 	INIT_DELAYED_WORK(&cec_work, cec_work_fn);
 	schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL);
 
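[Editor's note: the packing that PFN()/ELEM_NO() operate on is easiest to see in a
tiny stand-alone program. The sketch below restates the "[ 63 ... 12 | DECAY_BITS |
COUNT_BITS ]" layout with illustrative widths, namely two decay bits (matching the
four "00".."11" bins printed by the debugfs dump) and a 4K page shift; the DEMO_*
constants and main() are mine, not the kernel's.]

/* Illustrative only: same shape as the CEC element, not the kernel's headers. */
#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SHIFT		12
#define DEMO_DECAY_BITS		2
#define DEMO_COUNT_BITS		(DEMO_PAGE_SHIFT - DEMO_DECAY_BITS)
#define DEMO_COUNT_MASK		((1ULL << DEMO_COUNT_BITS) - 1)
#define DEMO_DECAY_MASK		((1ULL << DEMO_DECAY_BITS) - 1)

/* The patch's ELEM_NO(e, shift): the element ID lives above 'shift'. */
#define ELEM_NO(e, shift)	((e) >> (shift))
#define DECAY(e)		(((e) >> DEMO_COUNT_BITS) & DEMO_DECAY_MASK)
#define COUNT(e)		((unsigned int)(e) & DEMO_COUNT_MASK)

int main(void)
{
	/* Pack PFN 0x12345 with decay 3 and count 7 into one element. */
	uint64_t elem = (0x12345ULL << DEMO_PAGE_SHIFT) |
			(3ULL << DEMO_COUNT_BITS) | 7;

	printf("id=0x%llx decay=%llu count=%u\n",
	       (unsigned long long)ELEM_NO(elem, DEMO_PAGE_SHIFT),
	       (unsigned long long)DECAY(elem), COUNT(elem));
	return 0;
}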
From patchwork Fri Oct 2 12:22:30 2020
From: Shiju Jose
Subject: [RFC PATCH 2/7] RAS/CEC: Replace pfns_poisoned with elems_poisoned
Date: Fri, 2 Oct 2020 13:22:30 +0100
Message-ID: <20201002122235.1280-3-shiju.jose@huawei.com>
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
X-Mailing-List: linux-edac@vger.kernel.org

Rename the variable pfns_poisoned to elems_poisoned, since the array
elements are no longer necessarily PFNs.

Signed-off-by: Shiju Jose
---
 drivers/ras/cec.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 22d11c66c266..f20da1103f27 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -100,8 +100,8 @@ static struct ce_array {
 			 * since the last spring cleaning.
 			 */
 
-	u64 pfns_poisoned;	/*
-				 * number of PFNs which got poisoned.
+	u64 elems_poisoned;	/*
+				 * number of elements which got poisoned.
 				 */
 
 	u64 ces_entered;	/*
@@ -362,7 +362,7 @@ static int cec_add_elem(u64 pfn)
 		/* We have reached max count for this page, soft-offline it. */
 		pr_err("Soft-offlining pfn: 0x%llx\n", pfn);
 		memory_failure_queue(pfn, MF_SOFT_OFFLINE);
-		ca->pfns_poisoned++;
+		ca->elems_poisoned++;
 	}
 
 	del_elem(ca, to);
@@ -457,7 +457,7 @@ static int array_dump(struct seq_file *m, void *v)
 	seq_printf(m, "}\n");
 
 	seq_printf(m, "Stats:\nCEs: %llu\nofflined pages: %llu\n",
-		   ca->ces_entered, ca->pfns_poisoned);
+		   ca->ces_entered, ca->elems_poisoned);
 
 	seq_printf(m, "Flags: 0x%x\n", ca->flags);

From patchwork Fri Oct 2 12:22:31 2020
From: Shiju Jose
Subject: [RFC PATCH 3/7] RAS/CEC: Move X86 MCE specific code under CONFIG_X86_MCE
Date: Fri, 2 Oct 2020 13:22:31 +0100
Message-ID: <20201002122235.1280-4-shiju.jose@huawei.com>
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
X-Mailing-List: linux-edac@vger.kernel.org

The CEC may need to support other architectures such as ARM64. Move the
x86 MCE specific code under CONFIG_X86_MCE so that the collector can be
built for other architectures.
Signed-off-by: Shiju Jose
---
 drivers/ras/cec.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index f20da1103f27..803e641d8e5c 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -8,7 +8,9 @@
 #include
 #include
 
+#if defined(CONFIG_X86_MCE)
 #include
+#endif
 
 #include "debugfs.h"
@@ -511,6 +513,7 @@ static int __init create_debugfs_nodes(void)
 	if (!IS_ENABLED(CONFIG_RAS_CEC_DEBUG))
 		return 0;
 
+#if defined(CONFIG_X86_MCE)
 	pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
 	if (!pfn) {
 		pr_warn("Error creating pfn debugfs node!\n");
@@ -522,6 +525,7 @@ static int __init create_debugfs_nodes(void)
 		pr_warn("Error creating array debugfs node!\n");
 		goto err;
 	}
+#endif
 
 	return 0;
 
@@ -531,6 +535,7 @@ static int __init create_debugfs_nodes(void)
 	return 1;
 }
 
+#if defined(CONFIG_X86_MCE)
 static int cec_notifier(struct notifier_block *nb, unsigned long val,
 			void *data)
 {
@@ -556,28 +561,33 @@ static struct notifier_block cec_nb = {
 	.notifier_call	= cec_notifier,
 	.priority	= MCE_PRIO_CEC,
 };
+#endif
 
 static void __init cec_init(void)
 {
 	if (ce_arr.disabled)
 		return;
 
+#if defined(CONFIG_X86_MCE)
 	ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!ce_arr.array) {
 		pr_err("Error allocating CE array page!\n");
 		return;
 	}
+#endif
 
 	if (create_debugfs_nodes()) {
 		free_page((unsigned long)ce_arr.array);
 		return;
 	}
 
+#if defined(CONFIG_X86_MCE)
 	ce_arr.id_shift = PAGE_SHIFT;
 	INIT_DELAYED_WORK(&cec_work, cec_work_fn);
 	schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL);
 
 	mce_register_decode_chain(&cec_nb);
+#endif
 
 	pr_info("Correctable Errors collector initialized.\n");
 }

From patchwork Fri Oct 2 12:22:32 2020
From: Shiju Jose
Subject: [RFC PATCH 4/7] RAS/CEC: Modify cec_mod_work() for common use
Date: Fri, 2 Oct 2020 13:22:32 +0100
Message-ID: <20201002122235.1280-5-shiju.jose@huawei.com>
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
X-Mailing-List: linux-edac@vger.kernel.org
Modify cec_mod_work() to take the delayed work as a parameter, so that
the same helper can be shared with other error sources.

Signed-off-by: Shiju Jose
---
 drivers/ras/cec.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 803e641d8e5c..f869e7a270b8 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -167,12 +167,12 @@ static void do_spring_cleaning(struct ce_array *ca)
 /*
  * @interval in seconds
  */
-static void cec_mod_work(unsigned long interval)
+static void cec_mod_work(struct delayed_work *dwork, unsigned long interval)
 {
 	unsigned long iv;
 
 	iv = interval * HZ;
-	mod_delayed_work(system_wq, &cec_work, round_jiffies(iv));
+	mod_delayed_work(system_wq, dwork, round_jiffies(iv));
 }
 
 static void cec_work_fn(struct work_struct *work)
@@ -181,7 +181,7 @@ static void cec_work_fn(struct work_struct *work)
 	do_spring_cleaning(&ce_arr);
 	mutex_unlock(&ce_mutex);
 
-	cec_mod_work(decay_interval);
+	cec_mod_work(&cec_work, decay_interval);
 }
 
 /*
@@ -420,7 +420,7 @@ static int decay_interval_set(void *data, u64 val)
 
 	*(u64 *)data   = val;
 	decay_interval = val;
-	cec_mod_work(decay_interval);
+	cec_mod_work(&cec_work, decay_interval);
 
 	return 0;
 }
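[Editor's note: a stand-alone sketch of the pattern this patch enables, one re-arm
helper shared by several delayed works, written as a trivial kernel module. All
demo_* names and the intervals are illustrative; only the workqueue calls
(INIT_DELAYED_WORK, mod_delayed_work, to_delayed_work, cancel_delayed_work_sync)
mirror what the CEC itself uses.]

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/timer.h>

static struct delayed_work demo_page_work;
static struct delayed_work demo_cpu_work;

/* Same shape as the patched cec_mod_work(): the caller picks the work item. */
static void demo_mod_work(struct delayed_work *dwork, unsigned long interval)
{
	unsigned long iv = interval * HZ;

	mod_delayed_work(system_wq, dwork, round_jiffies(iv));
}

static void demo_work_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);

	/* ... per-collector cleaning would go here ... */
	demo_mod_work(dwork, 60);	/* re-arm this specific collector */
}

static int __init demo_init(void)
{
	INIT_DELAYED_WORK(&demo_page_work, demo_work_fn);
	INIT_DELAYED_WORK(&demo_cpu_work, demo_work_fn);
	demo_mod_work(&demo_page_work, 60);	/* long-period collector  */
	demo_mod_work(&demo_cpu_work, 6);	/* short-period collector */
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_delayed_work_sync(&demo_page_work);
	cancel_delayed_work_sync(&demo_cpu_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");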
From patchwork Fri Oct 2 12:22:33 2020
From: Shiju Jose
Subject: [RFC PATCH 5/7] RAS/CEC: Add support for errors count check on short time period
Date: Fri, 2 Oct 2020 13:22:33 +0100
Message-ID: <20201002122235.1280-6-shiju.jose@huawei.com>
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
X-Mailing-List: linux-edac@vger.kernel.org

Some types of elements, for example a CPU core, should be isolated when
corrected errors are reported on them too often. This is used for early
fault prediction and helps to prevent serious faults by taking corrective
action in advance. Modify the CEC to support checking the error count
against a threshold over a short time period. Implementation details are
documented in the source file.

Signed-off-by: Shiju Jose
---
 drivers/ras/cec.c | 125 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 109 insertions(+), 16 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index f869e7a270b8..ca52917d514c 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -119,6 +119,23 @@ static struct ce_array {
 			 * shift for element id.
 			 */
 
+	struct delayed_work work;	/*
+					 * delayed work.
+					 */
+
+	bool short_period;	/* Indicates threshold check for the error count
+				 * over a short time period.
+				 */
+
+	u8 time_slot;		/*
+				 * time slot number within the decay interval.
+				 */
+
+	union {
+		struct mutex mutex;
+		spinlock_t spin_lock;
+	};
+
 	union {
 		struct {
 			__u32	disabled : 1,	/* cmdline disabled */
@@ -128,7 +145,6 @@ static struct ce_array {
 	};
 } ce_arr;
 
-static DEFINE_MUTEX(ce_mutex);
 static u64 dfs_pfn;
 
 /* Amount of errors after which we offline */
@@ -138,9 +154,35 @@ static u64 action_threshold = COUNT_MASK;
 #define CEC_DECAY_DEFAULT_INTERVAL	24 * 60 * 60	/* 24 hrs */
 #define CEC_DECAY_MIN_INTERVAL		 1 * 60 * 60	/* 1h */
 #define CEC_DECAY_MAX_INTERVAL	   30 *	24 * 60 * 60	/* one month */
-static struct delayed_work cec_work;
 static u64 decay_interval = CEC_DECAY_DEFAULT_INTERVAL;
 
+/* Definitions for elements (for example CPU) for which
+ * the error count over a short time period is checked against a threshold.
+ *
+ * An element such as a CPU core may need to be isolated when a large number
+ * of correctable errors is reported on that element too often, i.e. when the
+ * CE count exceeds the threshold value within a short time period.
+ *
+ * The decay interval is divided into a number of time slots. The CE collector
+ * calculates the average error count at the end of each decay interval. Then
+ * the average count is subtracted from the total count in each of the
+ * following time slots. The work function for the decay interval is scheduled
+ * for the reduced time period = decay interval / number of time slots. When a
+ * new CE count for a CPU is added, the element is offlined if the sum of the
+ * most recent CE counts exceeds the CE threshold value.
+ */
+
+/*
+ * u64: [ 63 ELEM ID 23 | ELEM_STATUS_BIT 22 | 21 AVG_COUNT_BITS 12 | 11 DECAY_BITS 10 | 9 COUNT_BITS 0 ]
+ */
+
+/* Number of time slots in the decay interval */
+#define RAS_CEC_NUM_TIME_SLOTS	10
+
+#define AVG_COUNT_SHIFT		(DECAY_BITS + COUNT_BITS)
+#define ELEM_STATUS_BIT		BIT(22)	/* Indicates an element offlined by CEC */
+#define ELEM_ID_SHIFT		(1 + AVG_COUNT_SHIFT + COUNT_BITS)
+
 /*
  * Decrement decay value. We're using DECAY_BITS bits to denote decay of an
  * element in the array. On insertion and any access, it gets reset to max.
@@ -177,11 +219,62 @@ static void cec_mod_work(struct delayed_work *dwork, unsigned long interval)
 
 static void cec_work_fn(struct work_struct *work)
 {
-	mutex_lock(&ce_mutex);
-	do_spring_cleaning(&ce_arr);
-	mutex_unlock(&ce_mutex);
+	struct ce_array *ca;
+	unsigned long flags;
+	u64 avg_count;
+	int i, time_slots = 1;
+	struct delayed_work *d_work = container_of(work, struct delayed_work, work);
+
+	if (!d_work)
+		return;
+
+	ca = container_of(d_work, struct ce_array, work);
+	if (!ca->array || ca->disabled)
+		return;
 
-	cec_mod_work(&cec_work, decay_interval);
+	if (!ca->short_period) {
+		mutex_lock(&ca->mutex);
+		do_spring_cleaning(ca);
+		mutex_unlock(&ca->mutex);
+	} else {
+		time_slots = RAS_CEC_NUM_TIME_SLOTS;
+		spin_lock_irqsave(&ca->spin_lock, flags);
+		ca->time_slot = (ca->time_slot + 1) % RAS_CEC_NUM_TIME_SLOTS;
+
+		for (i = 0; i < ca->n; i++) {
+			if (ca->array[i] & ELEM_STATUS_BIT)
+				continue;
+
+			/* Clear old error counts approximately by subtracting
+			 * the average count from the total error count.
+			 */
+			avg_count = (ca->array[i] >> AVG_COUNT_SHIFT) & COUNT_MASK;
+			ca->array[i] -= avg_count;
+		}
+
+		if (ca->time_slot) {
+			spin_unlock_irqrestore(&ca->spin_lock, flags);
+			goto exit;
+		}
+
+		for (i = 0; i < ca->n; i++) {
+			if (ca->array[i] & ELEM_STATUS_BIT)
+				continue;
+
+			/* Calculate the average error count for the completed time period */
+			avg_count = COUNT(ca->array[i]) / RAS_CEC_NUM_TIME_SLOTS;
+			ca->array[i] -= (COUNT(ca->array[i]) % RAS_CEC_NUM_TIME_SLOTS);
+			/* Store the average error count */
+			ca->array[i] &= ~(COUNT_MASK << AVG_COUNT_SHIFT);
+			ca->array[i] |= (avg_count << AVG_COUNT_SHIFT);
+		}
+
+		do_spring_cleaning(ca);
+		spin_unlock_irqrestore(&ca->spin_lock, flags);
+	}
+
+exit:
+	cec_mod_work(&ca->work, decay_interval / time_slots);
 }
 
 /*
@@ -279,9 +372,9 @@ static u64 __maybe_unused del_lru_elem(void)
 	if (!ca->n)
 		return 0;
 
-	mutex_lock(&ce_mutex);
+	mutex_lock(&ca->mutex);
 	pfn = del_lru_elem_unlocked(ca);
-	mutex_unlock(&ce_mutex);
+	mutex_unlock(&ca->mutex);
 
 	return pfn;
 }
@@ -328,7 +421,7 @@ static int cec_add_elem(u64 pfn)
 	if (!ce_arr.array || ce_arr.disabled)
 		return -ENODEV;
 
-	mutex_lock(&ce_mutex);
+	mutex_lock(&ca->mutex);
 
 	ca->ces_entered++;
 
@@ -386,7 +479,7 @@ static int cec_add_elem(u64 pfn)
 	WARN_ON_ONCE(sanity_check(ca));
 
 unlock:
-	mutex_unlock(&ce_mutex);
+	mutex_unlock(&ca->mutex);
 
 	return ret;
 }
@@ -420,7 +513,7 @@ static int decay_interval_set(void *data, u64 val)
 
 	*(u64 *)data   = val;
 	decay_interval = val;
-	cec_mod_work(&cec_work, decay_interval);
+	cec_mod_work(&ce_arr.work, decay_interval);
 
 	return 0;
 }
@@ -446,7 +539,7 @@ static int array_dump(struct seq_file *m, void *v)
 	struct ce_array *ca = &ce_arr;
 	int i;
 
-	mutex_lock(&ce_mutex);
+	mutex_lock(&ca->mutex);
 
 	seq_printf(m, "{ n: %d\n", ca->n);
 	for (i = 0; i < ca->n; i++) {
@@ -468,7 +561,7 @@ static int array_dump(struct seq_file *m, void *v)
 
 	seq_printf(m, "Action threshold: %lld\n", action_threshold);
 
-	mutex_unlock(&ce_mutex);
+	mutex_unlock(&ca->mutex);
 
 	return 0;
 }
@@ -583,9 +676,9 @@ static void __init cec_init(void)
 
 #if defined(CONFIG_X86_MCE)
 	ce_arr.id_shift = PAGE_SHIFT;
-	INIT_DELAYED_WORK(&cec_work, cec_work_fn);
-	schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL);
-
+	mutex_init(&ce_arr.mutex);
+	INIT_DELAYED_WORK(&ce_arr.work, cec_work_fn);
+	schedule_delayed_work(&ce_arr.work, CEC_DECAY_DEFAULT_INTERVAL);
 	mce_register_decode_chain(&cec_nb);
 #endif
 
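[Editor's note: a user-space walk-through of the short-period bookkeeping described
in the comment added above, assuming RAS_CEC_NUM_TIME_SLOTS = 10. Each slot first
subtracts the per-slot average remembered from the previous decay interval, then
adds the newly reported CEs; at the end of an interval a new per-slot average is
stored. The reported[] numbers and the threshold are invented for illustration.]

#include <stdio.h>

#define SLOTS 10

int main(void)
{
	unsigned int count = 0;		/* running CE count for one element      */
	unsigned int avg = 0;		/* per-slot average of previous interval */
	unsigned int threshold = 30;
	/* CEs reported in each of the 10 slots of one decay interval */
	unsigned int reported[SLOTS] = { 2, 1, 0, 4, 3, 0, 2, 1, 5, 2 };

	for (int slot = 0; slot < SLOTS; slot++) {
		count -= (count >= avg) ? avg : count;	/* age out old errors  */
		count += reported[slot];		/* new CEs in this slot */

		if (count >= threshold)
			printf("slot %d: threshold reached, would offline\n", slot);
	}

	/* End of the interval: remember the per-slot average for the next one */
	avg = count / SLOTS;
	count -= count % SLOTS;
	printf("end of interval: count=%u, new per-slot avg=%u\n", count, avg);
	return 0;
}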
From patchwork Fri Oct 2 12:22:34 2020
From: Shiju Jose
Subject: [RFC PATCH 6/7] RAS/CEC: Add CPU Correctable Error Collector to isolate an erroneous CPU core
Date: Fri, 2 Oct 2020 13:22:34 +0100
Message-ID: <20201002122235.1280-7-shiju.jose@huawei.com>
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
X-Mailing-List: linux-edac@vger.kernel.org

When CPU correctable errors, for example L1/L2 cache errors, are reported
on an ARM64 CPU core too often, that core should be isolated. Add a CPU
correctable error collector to store the CPU correctable error counts.
When the correctable error count for a CPU exceeds the threshold value
within a short time period, the collector tries to isolate the CPU core.

If disabling the entire CPU core is not acceptable, please suggest a
method to disable the L1 and L2 caches on an ARM64 core.

Signed-off-by: Shiju Jose
---
 arch/arm64/ras/Kconfig |  17 +++
 drivers/ras/Kconfig    |   1 +
 drivers/ras/cec.c      | 231 +++++++++++++++++++++++++++++++++++++++--
 include/linux/ras.h    |   9 ++
 4 files changed, 247 insertions(+), 11 deletions(-)
 create mode 100644 arch/arm64/ras/Kconfig

diff --git a/arch/arm64/ras/Kconfig b/arch/arm64/ras/Kconfig
new file mode 100644
index 000000000000..bfa14157cd2e
--- /dev/null
+++ b/arch/arm64/ras/Kconfig
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+config RAS_CEC
+	bool "Correctable Errors Collector"
+	depends on ARM64 && HOTPLUG_CPU && DEBUG_FS
+	help
+	  This is a small cache which collects correctable CPU errors and
+	  counts their repeated occurrence. Once the counter for a CPU
+	  overflows in a short time period, we try to offline that CPU
+	  as we take it to mean that it has reached a relatively high error
+	  count and would probably be best if we don't use it anymore.
+
+	  Presently CPU error collection is enabled for the ARM64 platform only.
+
+config RAS_CEC_DEBUG
+	bool "CEC debugging machinery"
+	default n
+	depends on RAS_CEC
diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig
index c2a236f2e846..d2f877e5f7ad 100644
--- a/drivers/ras/Kconfig
+++ b/drivers/ras/Kconfig
@@ -32,5 +32,6 @@ menuconfig RAS
 if RAS
 
 source "arch/x86/ras/Kconfig"
+source "arch/arm64/ras/Kconfig"
 
 endif
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index ca52917d514c..408bf2ac2461 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -7,6 +7,8 @@
 #include
 #include
 #include
+#include
+#include
 
 #if defined(CONFIG_X86_MCE)
 #include
@@ -143,7 +145,7 @@ static struct ce_array {
 		};
 		__u32	flags;
 	};
-} ce_arr;
+} ce_arr, cpu_ce_arr;
 
 static u64 dfs_pfn;
 
@@ -156,6 +158,8 @@ static u64 action_threshold = COUNT_MASK;
 #define CEC_DECAY_MAX_INTERVAL	   30 *	24 * 60 * 60	/* one month */
 static u64 decay_interval = CEC_DECAY_DEFAULT_INTERVAL;
 
+static const char * const bins[] = { "00", "01", "10", "11" };
+
 /* Definitions for elements (for example CPU) for which
  * the error count over a short time period is checked against a threshold.
  *
@@ -484,6 +488,172 @@ static int cec_add_elem(u64 pfn)
 	return ret;
 }
 
+struct cec_elem_offline {
+	struct work_struct work;
+	struct ce_array *ca;
+	int array_index;
+	int elem_id;
+};
+
+/*
+ * Work function to offline a CPU, because the offlining has to be done
+ * in process context.
+ */
+static void cec_cpu_offline_work_fn(struct work_struct *work)
+{
+	int rc, cpu;
+	struct cec_elem_offline *elem;
+	struct ce_array *ca;
+
+	elem = container_of(work, struct cec_elem_offline, work);
+
+	cpu = elem->elem_id;
+	if (!cpu_online(cpu))
+		return;
+
+	rc = remove_cpu(cpu);
+	if (rc) {
+		pr_warn("Failed to offline CPU%d, error %d\n", cpu, rc);
+	} else {
+		ca = elem->ca;
+		ca->array[elem->array_index] |= ELEM_STATUS_BIT;
+	}
+
+	kfree(elem);
+}
+
+int cec_cpu_add_elem(int cpu, u64 ce_count)
+{
+	struct ce_array *ca = &cpu_ce_arr;
+	unsigned int to = 0;
+	int count, ret = 0;
+	unsigned long flags;
+	struct cec_elem_offline *elem;
+
+	/*
+	 * We can be called very early on the identify_cpu() path where we are
+	 * not initialized yet. We ignore the error for simplicity.
+	 */
+	if (!ca->array || ca->disabled || !cpu_online(cpu))
+		return -ENODEV;
+
+	spin_lock_irqsave(&ca->spin_lock, flags);
+
+	ca->ces_entered++;
+
+	ret = find_elem(ca, cpu, &to);
+	if (ret < 0) {
+		/*
+		 * Shift range [to-end] to make room for one more element.
+		 */
+		memmove((void *)&ca->array[to + 1],
+			(void *)&ca->array[to],
+			(ca->n - to) * sizeof(u64));
+
+		ca->array[to] = cpu << ca->id_shift;
+		ca->n++;
+	}
+
+	/* An error was received for a previously CEC-offlined CPU, which later
+	 * came online again: reset the array element.
+	 */
+	if (ca->array[to] & ELEM_STATUS_BIT) {
+		ca->array[to] &= ~(ELEM_STATUS_BIT);
+		ca->array[to] &= ~(COUNT_MASK);
+	}
+
+	/* Add/refresh element generation and increment count */
+	ca->array[to] |= DECAY_MASK << COUNT_BITS;
+	ca->array[to] += ce_count;
+
+	/* Check action threshold and offline, if reached. */
+	count = COUNT(ca->array[to]);
+	if (count >= action_threshold) {
+		if (!cpu_online(cpu)) {
+			pr_warn("CEC: Invalid cpu: %d\n", cpu);
+		} else {
+			/* We have reached max count for this cpu, offline it. */
+			ca->elems_poisoned++;
+			/* Schedule a work function to offline the cpu */
+			elem = kmalloc(sizeof(*elem), GFP_NOWAIT);
+			if (elem) {
+				pr_info("CEC: offlining cpu: %d\n", cpu);
+				elem->ca = ca;
+				elem->array_index = to;
+				elem->elem_id = cpu;
+				INIT_WORK(&elem->work, cec_cpu_offline_work_fn);
+				schedule_work(&elem->work);
+			} else
+				pr_warn("CEC: offlining cpu: out of memory %d\n", cpu);
+		}
+
+		/*
+		 * Return a >0 value to callers, to denote that we've reached
+		 * the offlining threshold.
+		 */
+		ret = 1;
+
+		goto unlock;
+	}
+
+	ca->decay_count++;
+
+	/* Do we need to call spring cleaning for the modules (e.g. CPU) with
+	 * a small number of elements?
+	 */
+	if (ca->decay_count >= (num_present_cpus() >> DECAY_BITS))
+		do_spring_cleaning(ca);
+
+	WARN_ON_ONCE(sanity_check(ca));
+
+unlock:
+	spin_unlock_irqrestore(&ca->spin_lock, flags);
+
+	return ret;
+}
+
+static int cec_cpu_stats_show(struct seq_file *seq, void *v)
+{
+	struct ce_array *ca = &cpu_ce_arr;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&cpu_ce_arr.spin_lock, flags);
+	seq_puts(seq, "CEC CPU Stats:\n");
+
+	seq_printf(seq, "{ n: %d\n", ca->n);
+	for (i = 0; i < ca->n; i++) {
+		int cpu = ELEM_NO(ca->array[i], ca->id_shift);
+
+		seq_printf(seq, "cpu=%d: %03llx\n",
+			   cpu, ca->array[i]);
+
+		seq_printf(seq, " %3d: [%d|%s|%03lld|%s]\n",
+			   i, cpu, bins[DECAY(ca->array[i])],
+			   COUNT(ca->array[i]),
+			   cpu_online(cpu) ? "online" :
+			   (ca->array[i] & ELEM_STATUS_BIT) ?
+			   "offlined-by-cec" : "offline");
+	}
+
+	seq_printf(seq, "}\n");
+
+	seq_printf(seq, "Stats:\nCEs: %llu\nofflined CPUs: %llu\n",
+		   ca->ces_entered, ca->elems_poisoned);
+
+	seq_printf(seq, "Flags: 0x%x\n", ca->flags);
+
+	seq_printf(seq, "Decay interval: %lld seconds\n", decay_interval);
+	seq_printf(seq, "Decays: %lld\n", ca->decays_done);
+
+	seq_printf(seq, "Action threshold: %lld\n", action_threshold);
+
+	spin_unlock_irqrestore(&cpu_ce_arr.spin_lock, flags);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(cec_cpu_stats);
+
 static int u64_get(void *data, u64 *val)
 {
 	*val = *(u64 *)data;
@@ -514,6 +684,7 @@ static int decay_interval_set(void *data, u64 val)
 	decay_interval = val;
 
 	cec_mod_work(&ce_arr.work, decay_interval);
+	cec_mod_work(&cpu_ce_arr.work, decay_interval / RAS_CEC_NUM_TIME_SLOTS);
 
 	return 0;
 }
@@ -532,8 +703,6 @@ static int action_threshold_set(void *data, u64 val)
 }
 DEFINE_DEBUGFS_ATTRIBUTE(action_threshold_ops, u64_get, action_threshold_set, "%lld\n");
 
-static const char * const bins[] = { "00", "01", "10", "11" };
-
 static int array_dump(struct seq_file *m, void *v)
 {
 	struct ce_array *ca = &ce_arr;
@@ -620,6 +789,14 @@ static int __init create_debugfs_nodes(void)
 	}
 #endif
 
+#if defined(CONFIG_ARM64)
+	array = debugfs_create_file("cpu_stats", 0400, d, NULL, &cec_cpu_stats_fops);
+	if (!array) {
+		pr_warn("Error creating cpu_stats debugfs node!\n");
+		goto err;
+	}
+#endif
+
 	return 0;
 
 err:
@@ -658,21 +835,26 @@ static struct notifier_block cec_nb = {
 
 static void __init cec_init(void)
 {
-	if (ce_arr.disabled)
+	if (ce_arr.disabled && cpu_ce_arr.disabled)
 		return;
 
 #if defined(CONFIG_X86_MCE)
 	ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!ce_arr.array) {
 		pr_err("Error allocating CE array page!\n");
-		return;
+		goto error;
 	}
 #endif
 
-	if (create_debugfs_nodes()) {
-		free_page((unsigned long)ce_arr.array);
-		return;
-	}
+#if defined(CONFIG_ARM64)
+	cpu_ce_arr.array = kcalloc(num_present_cpus(), sizeof(*(cpu_ce_arr.array)),
+				   GFP_KERNEL);
+	if (!cpu_ce_arr.array)
+		goto error;
+#endif
+
+	if (create_debugfs_nodes())
+		goto error;
 
 #if defined(CONFIG_X86_MCE)
 	ce_arr.id_shift = PAGE_SHIFT;
@@ -682,22 +864,49 @@ static void __init cec_init(void)
 	mce_register_decode_chain(&cec_nb);
 #endif
 
+#if defined(CONFIG_ARM64)
+	cpu_ce_arr.short_period = true;
+	cpu_ce_arr.id_shift = ELEM_ID_SHIFT;
+	spin_lock_init(&cpu_ce_arr.spin_lock);
+	INIT_DELAYED_WORK(&cpu_ce_arr.work, cec_work_fn);
+	schedule_delayed_work(&cpu_ce_arr.work, CEC_DECAY_DEFAULT_INTERVAL / RAS_CEC_NUM_TIME_SLOTS);
+#endif
+
 	pr_info("Correctable Errors collector initialized.\n");
+	return;
+
+error:
+#if defined(CONFIG_ARM64)
+	kfree(cpu_ce_arr.array);
+#endif
+	if (ce_arr.array)
+		free_page((unsigned long)ce_arr.array);
 }
 late_initcall(cec_init);
 
 int __init parse_cec_param(char *str)
 {
+	bool match = false;
+
 	if (!str)
 		return 0;
 
 	if (*str == '=')
 		str++;
 
-	if (!strcmp(str, "cec_disable"))
+	if (!strcmp(str, "cec_disable")) {
 		ce_arr.disabled = 1;
+		match = true;
+	}
+
+	if (!strcmp(str, "cec_cpu_disable")) {
+		cpu_ce_arr.disabled = 1;
+		match = true;
+	}
+
+	if (match)
+		return 1;
 	else
 		return 0;
-
-	return 1;
 }
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 1f4048bf2674..43d91298f1e3 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -18,6 +18,15 @@ static inline int ras_add_daemon_trace(void) { return 0; }
 
 #ifdef CONFIG_RAS_CEC
 int __init parse_cec_param(char *str);
+/**
+ * cec_cpu_add_elem - add the count of CPU correctable errors to the
+ *	CEC (correctable errors collector).
+ * @cpu: CPU index.
+ * @ce_count: CPU correctable errors count.
+ */
+int cec_cpu_add_elem(int cpu, u64 ce_count);
+#else
+static inline int cec_cpu_add_elem(int cpu, u64 ce_count) { return -ENODEV; }
 #endif
 
 #ifdef CONFIG_RAS
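[Editor's note: a minimal sketch of how a platform error handler could feed the new
collector; patch 7 does exactly this from the GHES path. The demo_report_cpu_ce()
wrapper is hypothetical, but cec_cpu_add_elem() and its return convention (negative
on error, greater than zero once the action threshold is reached) are taken from the
patch above. The collector can also be disabled on the command line with the new
cec_cpu_disable keyword handled by parse_cec_param().]

#include <linux/types.h>
#include <linux/ras.h>
#include <linux/printk.h>

static void demo_report_cpu_ce(int cpu, u64 ce_count)
{
	int ret = cec_cpu_add_elem(cpu, ce_count);

	if (ret < 0)		/* collector disabled or not yet initialized */
		return;
	if (ret > 0)		/* threshold reached; offline work was queued */
		pr_info("CPU%d reached the CE action threshold\n", cpu);
}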
From patchwork Fri Oct 2 12:22:35 2020
From: Shiju Jose
Subject: [RFC PATCH 7/7] ACPI / APEI: Add reporting ARM64 CPU correctable errors to the CEC
Date: Fri, 2 Oct 2020 13:22:35 +0100
Message-ID: <20201002122235.1280-8-shiju.jose@huawei.com>
In-Reply-To: <20201002122235.1280-1-shiju.jose@huawei.com>
X-Mailing-List: linux-edac@vger.kernel.org

Add reporting of ARM64 CPU correctable errors to the RAS correctable
errors collector (CEC). The ARM processor error types are cache/TLB/bus
errors. Should any of these error types be excluded from error collection
and CPU core isolation?

Signed-off-by: Shiju Jose
---
 drivers/acpi/apei/ghes.c | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 81bf71b10d44..3cecb457d352 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -511,6 +511,38 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 #endif
 }
 
+static void ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata)
+{
+	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
+	struct cper_arm_err_info *err_info;
+	int sec_sev;
+	int cpu, i, ret;
+
+	log_arm_hw_error(err);
+
+	sec_sev = ghes_severity(gdata->error_severity);
+	if (sec_sev != GHES_SEV_CORRECTED)
+		return;
+
+#if defined(CONFIG_ARM64)
+	cpu = get_logical_index(err->mpidr);
+	if (cpu == -EINVAL)
+		return;
+
+	/* The ARM processor error types are cache/TLB/bus errors.
+	 * Should any of these error types be excluded from the error
+	 * collection and CPU core isolation?
+	 */
+	err_info = (struct cper_arm_err_info *)(err + 1);
+	for (i = 0; i < err->err_info_num; i++) {
+		ret = cec_cpu_add_elem(cpu, err_info->multiple_error + 1);
+		if (ret)
+			break;
+		err_info += 1;
+	}
+#endif
+}
+
 static bool ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -543,9 +575,7 @@ static bool ghes_do_proc(struct ghes *ghes,
 			ghes_handle_aer(gdata);
 		} else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
-			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
-
-			log_arm_hw_error(err);
+			ghes_handle_arm_hw_error(gdata);
 		} else {
 			void *err = acpi_hest_get_payload(gdata);
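[Editor's note: to recap how one CPU element is encoded by patches 5 and 6, here is
a small user-space program that packs and unpacks a value by hand. The field
positions come from the "u64: [ 63 ELEM ID 23 | ELEM_STATUS_BIT 22 | 21
AVG_COUNT_BITS 12 | 11 DECAY_BITS 10 | 9 COUNT_BITS 0 ]" comment; the 10-bit count
and 2-bit decay widths assume a 4K PAGE_SHIFT, and the sample values are invented.]

#include <stdio.h>
#include <stdint.h>

#define COUNT_BITS	10
#define DECAY_BITS	2
#define COUNT_MASK	((1ULL << COUNT_BITS) - 1)
#define AVG_COUNT_SHIFT	(DECAY_BITS + COUNT_BITS)		/* 12 */
#define ELEM_STATUS_BIT	(1ULL << 22)
#define ELEM_ID_SHIFT	(1 + AVG_COUNT_SHIFT + COUNT_BITS)	/* 23 */

int main(void)
{
	uint64_t e = ((uint64_t)57 << ELEM_ID_SHIFT) |	/* CPU 57              */
		     (4ULL << AVG_COUNT_SHIFT) |	/* avg 4 CEs per slot  */
		     (3ULL << COUNT_BITS) |		/* decay = max (3)     */
		     25;				/* running count       */

	printf("cpu=%llu avg=%llu count=%llu offlined-by-cec=%d\n",
	       (unsigned long long)(e >> ELEM_ID_SHIFT),
	       (unsigned long long)((e >> AVG_COUNT_SHIFT) & COUNT_MASK),
	       (unsigned long long)(e & COUNT_MASK),
	       !!(e & ELEM_STATUS_BIT));
	return 0;
}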