diff mbox series

[RFC,v2,5/5] cxl: Add mce notifier to emit aliased address for extended linear cache

Message ID 20241112221335.432583-6-dave.jiang@intel.com (mailing list archive)
State RFC, archived
Headers show
Series acpi/hmat / cxl: Add exclusive caching enumeration and RAS support | expand

Commit Message

Dave Jiang Nov. 12, 2024, 10:12 p.m. UTC
Below is a setup with extended linear cache configuration with an example
layout of memory region shown below presented as a single memory region
consists of 256G memory where there's 128G of DRAM and 128G of CXL memory.
The kernel sees a region of total 256G of system memory.

              128G DRAM                          128G CXL memory
|-----------------------------------|-------------------------------------|

Data resides in either DRAM or far memory (FM) with no replication. Hot
data is swapped into DRAM by the hardware behind the scenes. When error is
detected in one location, it is possible that error also resides in the
aliased location. Therefore when a memory location that is flagged by MCE
is part of the special region, the aliased memory location needs to be
offlined as well.

Add an mce notify callback to identify if the MCE address location is part
of an extended linear cache region and handle accordingly.

Added symbol export to set_mce_nospec() in x86 code in order to call
set_mce_nospec() from the CXL MCE notify callback.

Link: https://lore.kernel.org/linux-cxl/668333b17e4b2_5639294fd@dwillia2-xfh.jf.intel.com.notmuch/
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
v2:
- Move mce code to core/mce.c and add arch wrappers for CONFIG_X86_64. (Jonathan)
---
 arch/x86/include/asm/mce.h   |  1 +
 arch/x86/mm/pat/set_memory.c |  1 +
 drivers/cxl/core/Makefile    |  1 +
 drivers/cxl/core/mbox.c      |  3 +++
 drivers/cxl/core/mce.c       | 52 ++++++++++++++++++++++++++++++++++++
 drivers/cxl/core/mce.h       | 14 ++++++++++
 drivers/cxl/core/region.c    | 25 +++++++++++++++++
 drivers/cxl/cxl.h            |  6 +++++
 drivers/cxl/cxlmem.h         |  2 ++
 9 files changed, 105 insertions(+)
 create mode 100644 drivers/cxl/core/mce.c
 create mode 100644 drivers/cxl/core/mce.h

Comments

Borislav Petkov Nov. 13, 2024, 8:11 a.m. UTC | #1
On Tue, Nov 12, 2024 at 03:12:37PM -0700, Dave Jiang wrote:
> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index 3b9970117a0f..a8ad140d5692 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
>  	MCE_PRIO_NFIT,
>  	MCE_PRIO_EXTLOG,
>  	MCE_PRIO_UC,
> +	MCE_PRIO_CXL,
>  	MCE_PRIO_EARLY,
>  	MCE_PRIO_CEC,
>  	MCE_PRIO_HIGHEST = MCE_PRIO_CEC

Why this priority exactly?
Dave Jiang Nov. 13, 2024, 3:27 p.m. UTC | #2
On 11/13/24 1:11 AM, Borislav Petkov wrote:
> On Tue, Nov 12, 2024 at 03:12:37PM -0700, Dave Jiang wrote:
>> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
>> index 3b9970117a0f..a8ad140d5692 100644
>> --- a/arch/x86/include/asm/mce.h
>> +++ b/arch/x86/include/asm/mce.h
>> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
>>  	MCE_PRIO_NFIT,
>>  	MCE_PRIO_EXTLOG,
>>  	MCE_PRIO_UC,
>> +	MCE_PRIO_CXL,
>>  	MCE_PRIO_EARLY,
>>  	MCE_PRIO_CEC,
>>  	MCE_PRIO_HIGHEST = MCE_PRIO_CEC
> 
> Why this priority exactly?

Hi Boris,
I'm actually looking for recommendation on what the proper one is. The handler is expected to offline the aliased address of the reported MCE if there is one.

>
Borislav Petkov Nov. 14, 2024, 9:32 a.m. UTC | #3
On Wed, Nov 13, 2024 at 08:27:35AM -0700, Dave Jiang wrote:
> I'm actually looking for recommendation on what the proper one is. The
> handler is expected to offline the aliased address of the reported MCE if
> there is one.

Well, MCE_PRIO_EARLY will emit a trace record so that if you have error events
consumers like rasdaemon, it'll get that error record for reporting etc.

MCE_PRIO_UC calls memory_failure() on the error and thus offlines the page.
Functionality which you're partly replicating in your notifier.

And since you wanna do the same thing, why are you even adding a new priority
instead of using MCE_PRIO_UC? amdgpu_bad_page_notifier() uses that same prio
because it does a similar thing.
Dave Jiang Nov. 14, 2024, 3:52 p.m. UTC | #4
On 11/14/24 2:32 AM, Borislav Petkov wrote:
> On Wed, Nov 13, 2024 at 08:27:35AM -0700, Dave Jiang wrote:
>> I'm actually looking for recommendation on what the proper one is. The
>> handler is expected to offline the aliased address of the reported MCE if
>> there is one.
> 
> Well, MCE_PRIO_EARLY will emit a trace record so that if you have error events
> consumers like rasdaemon, it'll get that error record for reporting etc.
> 
> MCE_PRIO_UC calls memory_failure() on the error and thus offlines the page.
> Functionality which you're partly replicating in your notifier.
> 
> And since you wanna do the same thing, why are you even adding a new priority
> instead of using MCE_PRIO_UC? amdgpu_bad_page_notifier() uses that same prio
> because it does a similar thing.

Ok thanks for the explanation. I will use MCE_PRIO_UC. 
>
diff mbox series

Patch

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 3b9970117a0f..a8ad140d5692 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -182,6 +182,7 @@  enum mce_notifier_prios {
 	MCE_PRIO_NFIT,
 	MCE_PRIO_EXTLOG,
 	MCE_PRIO_UC,
+	MCE_PRIO_CXL,
 	MCE_PRIO_EARLY,
 	MCE_PRIO_CEC,
 	MCE_PRIO_HIGHEST = MCE_PRIO_CEC
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 44f7b2ea6a07..1f85c29e118e 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2083,6 +2083,7 @@  int set_mce_nospec(unsigned long pfn)
 		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
 	return rc;
 }
+EXPORT_SYMBOL_GPL(set_mce_nospec);
 
 /* Restore full speculative operation to the pfn. */
 int clear_mce_nospec(unsigned long pfn)
diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
index 1a0c9c6ca818..d619fb11febd 100644
--- a/drivers/cxl/core/Makefile
+++ b/drivers/cxl/core/Makefile
@@ -17,3 +17,4 @@  cxl_core-y += cdat.o
 cxl_core-y += acpi.o
 cxl_core-$(CONFIG_TRACING) += trace.o
 cxl_core-$(CONFIG_CXL_REGION) += region.o
+cxl_core-$(CONFIG_X86_MCE) += mce.o
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 3fa9c658a253..2304786a1333 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -11,6 +11,7 @@ 
 
 #include "core.h"
 #include "trace.h"
+#include "mce.h"
 
 static bool cxl_raw_allow_all;
 
@@ -1489,6 +1490,8 @@  struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
 	mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID;
 	mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID;
 
+	cxl_register_mce_notifier(&mds->mce_notifier);
+
 	return mds;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_memdev_state_create, CXL);
diff --git a/drivers/cxl/core/mce.c b/drivers/cxl/core/mce.c
new file mode 100644
index 000000000000..801e4e4ef91a
--- /dev/null
+++ b/drivers/cxl/core/mce.c
@@ -0,0 +1,52 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
+#include <linux/notifier.h>
+#include <linux/set_memory.h>
+#include <asm/mce.h>
+#include <cxlmem.h>
+#include "mce.h"
+
+static int cxl_handle_mce(struct notifier_block *nb, unsigned long val,
+			  void *data)
+{
+	struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state,
+						    mce_notifier);
+	struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
+	struct cxl_port *endpoint = cxlmd->endpoint;
+	struct mce *mce = (struct mce *)data;
+	u64 spa, spa_alias;
+	unsigned long pfn;
+
+	if (!mce || !mce_usable_address(mce))
+		return NOTIFY_DONE;
+
+	spa = mce->addr & MCI_ADDR_PHYSADDR;
+
+	pfn = spa >> PAGE_SHIFT;
+	if (!pfn_valid(pfn))
+		return NOTIFY_DONE;
+
+	spa_alias = cxl_port_get_spa_cache_alias(endpoint, spa);
+	if (!spa_alias)
+		return NOTIFY_DONE;
+
+	pfn = spa_alias >> PAGE_SHIFT;
+
+	/*
+	 * Take down the aliased memory page. The original memory page flagged
+	 * by the MCE will be taken cared of by the standard MCE handler.
+	 */
+	dev_emerg(mds->cxlds.dev, "Offlining aliased SPA address: %#llx\n",
+		  spa_alias);
+	if (!memory_failure(pfn, 0))
+		set_mce_nospec(pfn);
+
+	return NOTIFY_OK;
+}
+
+void cxl_register_mce_notifier(struct notifier_block *mce_notifier)
+{
+	mce_notifier->notifier_call = cxl_handle_mce;
+	mce_notifier->priority = MCE_PRIO_CXL;
+	mce_register_decode_chain(mce_notifier);
+}
diff --git a/drivers/cxl/core/mce.h b/drivers/cxl/core/mce.h
new file mode 100644
index 000000000000..1d747618bbc0
--- /dev/null
+++ b/drivers/cxl/core/mce.h
@@ -0,0 +1,14 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
+#ifndef _CXL_CORE_MCE_H_
+#define _CXL_CORE_MCE_H_
+
+#include <linux/notifier.h>
+
+#ifdef CONFIG_X86_MCE
+void cxl_register_mce_notifier(struct notifier_block *mce_notifer);
+#else
+static inline void cxl_register_mce_notifier(struct notifier_block *mce_notifier) {}
+#endif
+
+#endif
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index a7479b4aad8d..d141ab11f784 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -3440,6 +3440,31 @@  int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, CXL);
 
+u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa)
+{
+	struct cxl_region_ref *iter;
+	unsigned long index;
+
+	guard(rwsem_write)(&cxl_region_rwsem);
+
+	xa_for_each(&endpoint->regions, index, iter) {
+		struct cxl_region_params *p = &iter->region->params;
+
+		if (p->res->start <= spa && spa <= p->res->end) {
+			if (!p->cache_size)
+				return 0;
+
+			if (spa > p->res->start + p->cache_size)
+				return spa - p->cache_size;
+
+			return spa + p->cache_size;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_get_spa_cache_alias, CXL);
+
 static int is_system_ram(struct resource *res, void *arg)
 {
 	struct cxl_region *cxlr = arg;
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 26466807fa7a..6b612a87469d 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -866,6 +866,7 @@  struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev);
 int cxl_add_to_region(struct cxl_port *root,
 		      struct cxl_endpoint_decoder *cxled);
 struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
+u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
 #else
 static inline bool is_cxl_pmem_region(struct device *dev)
 {
@@ -884,6 +885,11 @@  static inline struct cxl_dax_region *to_cxl_dax_region(struct device *dev)
 {
 	return NULL;
 }
+static inline u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint,
+					       u64 spa)
+{
+	return 0;
+}
 #endif
 
 void cxl_endpoint_parse_cdat(struct cxl_port *port);
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 2a25d1957ddb..55752cbf408c 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -477,6 +477,7 @@  static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
  * @poison: poison driver state info
  * @security: security driver state info
  * @fw: firmware upload / activation state
+ * @mce_notifier: MCE notifier
  *
  * See CXL 3.0 8.2.9.8.2 Capacity Configuration and Label Storage for
  * details on capacity parameters.
@@ -503,6 +504,7 @@  struct cxl_memdev_state {
 	struct cxl_poison_state poison;
 	struct cxl_security_state security;
 	struct cxl_fw_state fw;
+	struct notifier_block mce_notifier;
 };
 
 static inline struct cxl_memdev_state *