Message ID | 382a9c35ef43e89db85670637d88371f9197b7a2.1655250669.git.alison.schofield@intel.com |
---|---|
State | New, archived |
Headers | show |
Series | CXL Poison List Retrieval & Tracing | expand |
On Tue, Jun 14, 2022 at 05:10:27PM -0700, Alison Schofield wrote: > From: Alison Schofield <alison.schofield@intel.com> > > CXL devices that support persistent memory maintain a list of locations > that are poisoned or result in poison if the addresses are accessed by > the host. > > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison > list as a set of Media Error Records that include the source of the > error, the starting device physical address and length. The length is > the number of adjacent DPAs in the record and is in units of 64 bytes. > > Retrieve the list and log each Media Error Record as a trace event of > type cxl_poison_list. > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > --- > drivers/cxl/cxlmem.h | 43 +++++++++++++++++++++++ > drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 118 insertions(+) > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 60d10ee1e7fc..29cf0459b44a 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > @@ -174,6 +174,7 @@ struct cxl_endpoint_dvsec_info { > * (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register) > * @lsa_size: Size of Label Storage Area > * (CXL 2.0 8.2.9.5.1.1 Identify Memory Device) > + * @poison_max_mer: maximum Media Error Records tracked in Poison List Does not match the member name below. Ira > * @mbox_mutex: Mutex to synchronize mailbox access. > * @firmware_version: Firmware version for the memory device. > * @enabled_cmds: Hardware commands found enabled in CEL. > @@ -204,6 +205,7 @@ struct cxl_dev_state { > > size_t payload_size; > size_t lsa_size; > + u32 poison_max; > struct mutex mbox_mutex; /* Protects device mailbox and firmware */ > char firmware_version[0x10]; > DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); > @@ -317,6 +319,46 @@ struct cxl_mbox_set_partition_info { > > #define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0) > > +struct cxl_mbox_poison_payload_in { > + __le64 offset; > + __le64 length; > +} __packed; > + > +struct cxl_mbox_poison_payload_out { > + u8 flags; > + u8 rsvd1; > + __le64 overflow_timestamp; > + __le16 count; > + u8 rsvd2[0x14]; > + struct cxl_poison_record { > + __le64 address; > + __le32 length; > + __le32 rsvd; > + } __packed record[]; > +} __packed; > + > +/* CXL 8.2.9.5.4.1 Get Poison List: payload out flags: */ > +#define CXL_POISON_FLAG_MORE BIT(0) > +#define CXL_POISON_FLAG_OVERFLOW BIT(1) > +#define CXL_POISON_FLAG_SCANNING BIT(2) > + > +/* CXL 8.2.9.5.4.1 Get Poison List: Error is encoded in record.address[2:0] */ > +#define CXL_POISON_SOURCE_MASK GENMASK(2, 0) > +#define CXL_POISON_SOURCE_UNKNOWN 0 > +#define CXL_POISON_SOURCE_EXTERNAL 1 > +#define CXL_POISON_SOURCE_INTERNAL 2 > +#define CXL_POISON_SOURCE_INJECTED 3 > +#define CXL_POISON_SOURCE_VENDOR 7 > + > +/* Software define */ > +#define CXL_POISON_SOURCE_INVALID 99 > +#define CXL_POISON_SOURCE_VALID(x) \ > + (((x) == CXL_POISON_SOURCE_UNKNOWN) || \ > + ((x) == CXL_POISON_SOURCE_EXTERNAL) || \ > + ((x) == CXL_POISON_SOURCE_INTERNAL) || \ > + ((x) == CXL_POISON_SOURCE_INJECTED) || \ > + ((x) == CXL_POISON_SOURCE_VENDOR)) > + > /** > * struct cxl_mem_command - Driver representation of a memory device command > * @info: Command information as it exists for the UAPI > @@ -351,6 +393,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds); > struct cxl_dev_state *cxl_dev_state_create(struct device *dev); > void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); > void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); > +int cxl_mem_get_poison_list(struct device *dev); > #ifdef CONFIG_CXL_SUSPEND > void cxl_mem_active_inc(void); > void cxl_mem_active_dec(void); > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index 54f434733b56..c10c7020ebc2 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -9,6 +9,9 @@ > > #include "core.h" > > +#define CREATE_TRACE_POINTS > +#include <trace/events/cxl.h> > + > static bool cxl_raw_allow_all; > > /** > @@ -755,6 +758,7 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) > { > /* See CXL 2.0 Table 175 Identify Memory Device Output Payload */ > struct cxl_mbox_identify id; > + __le32 val = 0; > int rc; > > rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id, > @@ -783,6 +787,9 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) > cxlds->lsa_size = le32_to_cpu(id.lsa_size); > memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision)); > > + memcpy(&val, id.poison_list_max_mer, 3); > + cxlds->poison_max = le32_to_cpu(val); > + > return 0; > } > EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL); > @@ -826,6 +833,74 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds) > } > EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL); > > +int cxl_mem_get_poison_list(struct device *dev) > +{ > + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > + struct cxl_mbox_poison_payload_out *po; > + struct cxl_mbox_poison_payload_in pi; > + int nr_records = 0; > + int rc, i; > + > + if (range_len(&cxlds->pmem_range)) { > + pi.offset = cpu_to_le64(cxlds->pmem_range.start); > + pi.length = cpu_to_le64(range_len(&cxlds->pmem_range)); > + } else { > + return -ENXIO; > + } > + > + po = kvmalloc(cxlds->payload_size, GFP_KERNEL); > + if (!po) > + return -ENOMEM; > + > + do { > + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi, > + sizeof(pi), po, cxlds->payload_size); > + if (rc) > + goto out; > + > + if (po->flags & CXL_POISON_FLAG_OVERFLOW) { > + time64_t o_time = le64_to_cpu(po->overflow_timestamp); > + > + dev_err(dev, "Poison list overflow at %ptTs UTC\n", > + &o_time); > + rc = -ENXIO; > + goto out; I guess the idea is that this return will trigger something else will clear the list, rebuild the list, and perform a scan media request? I'm just wondering if this loop should continue to clear the list and then let something else do the scan media request? Other than that question and the above typo. Looks good! Ira > + } > + > + if (po->flags & CXL_POISON_FLAG_SCANNING) { > + dev_err(dev, "Scan Media in Progress\n"); > + rc = -EBUSY; > + goto out; > + } > + > + for (i = 0; i < le16_to_cpu(po->count); i++) { > + u64 addr = le64_to_cpu(po->record[i].address); > + u32 len = le32_to_cpu(po->record[i].length); > + int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr); > + > + if (!CXL_POISON_SOURCE_VALID(source)) { > + dev_dbg(dev, "Invalid poison source %d", > + source); > + source = CXL_POISON_SOURCE_INVALID; > + } > + > + trace_cxl_poison_list(dev, source, addr, len); > + } > + > + /* Protect against an uncleared _FLAG_MORE */ > + nr_records = nr_records + le16_to_cpu(po->count); > + if (nr_records >= cxlds->poison_max) > + goto out; > + > + } while (po->flags & CXL_POISON_FLAG_MORE); > + > +out: > + kvfree(po); > + return rc; > +} > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL); > + > struct cxl_dev_state *cxl_dev_state_create(struct device *dev) > { > struct cxl_dev_state *cxlds; > -- > 2.31.1 >
On Tue, Jun 14, 2022 at 08:22:41PM -0700, Ira Weiny wrote: Thanks for the review Ira... > On Tue, Jun 14, 2022 at 05:10:27PM -0700, Alison Schofield wrote: > > From: Alison Schofield <alison.schofield@intel.com> > > snip > > > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > > index 60d10ee1e7fc..29cf0459b44a 100644 > > --- a/drivers/cxl/cxlmem.h > > +++ b/drivers/cxl/cxlmem.h > > @@ -174,6 +174,7 @@ struct cxl_endpoint_dvsec_info { > > * (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register) > > * @lsa_size: Size of Label Storage Area > > * (CXL 2.0 8.2.9.5.1.1 Identify Memory Device) > > + * @poison_max_mer: maximum Media Error Records tracked in Poison List > > Does not match the member name below. Got it! I intended to drop the _mer. > > Ira > > > * @mbox_mutex: Mutex to synchronize mailbox access. > > * @firmware_version: Firmware version for the memory device. > > * @enabled_cmds: Hardware commands found enabled in CEL. > > @@ -204,6 +205,7 @@ struct cxl_dev_state { > > > > size_t payload_size; > > size_t lsa_size; > > + u32 poison_max; > > struct mutex mbox_mutex; /* Protects device mailbox and firmware */ > > char firmware_version[0x10]; > > DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); snip > > +int cxl_mem_get_poison_list(struct device *dev) > > +{ > > + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); > > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > > + struct cxl_mbox_poison_payload_out *po; > > + struct cxl_mbox_poison_payload_in pi; > > + int nr_records = 0; > > + int rc, i; > > + > > + if (range_len(&cxlds->pmem_range)) { > > + pi.offset = cpu_to_le64(cxlds->pmem_range.start); > > + pi.length = cpu_to_le64(range_len(&cxlds->pmem_range)); > > + } else { > > + return -ENXIO; > > + } > > + > > + po = kvmalloc(cxlds->payload_size, GFP_KERNEL); > > + if (!po) > > + return -ENOMEM; > > + > > + do { > > + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi, > > + sizeof(pi), po, cxlds->payload_size); > > + if (rc) > > + goto out; > > + > > + if (po->flags & CXL_POISON_FLAG_OVERFLOW) { > > + time64_t o_time = le64_to_cpu(po->overflow_timestamp); > > + > > + dev_err(dev, "Poison list overflow at %ptTs UTC\n", > > + &o_time); > > + rc = -ENXIO; > > + goto out; > > I guess the idea is that this return will trigger something else will clear the list, > rebuild the list, and perform a scan media request? > Per CXL Spec 8.2.9.5.4.1: The poison list may be incomplete when the list has overflowed. User can perform a Scan Media to try to clear and rebuild the list, with no guarantee that the overflow will not recur. So yes to what you are saying. This return value should indicate to user space that a Scan Media should be issued. Issuing the Scan Media to the device does lead the device to rebuild it's list, as you say. Also, when we get the Scan Media results, the device is able to report partial results and tell the host to collect the error records, and then restart the scan, get results again, and on and on until the scan is complete. Perhaps a clarification - there is not a logical pairing of Scan Media followed by Get Poison List. Scan Media followed by Get Scan Media Results is the logical pairing. Get Poison List is getting a snapshot of the poison list at a point in time. The device adds DPAs to the list when the device detects poison, some devices run their own backround scans and add to the poison list, and then there are the user initiated actions (Scan Media and Poison Inject) that can affect the list. > I'm just wondering if this loop should continue to clear the list and then let > something else do the scan media request? It's not like the _MORE status where the device is telling the host to come back and gather more. I think the action of failing, and letting user initiated a Scan Media is correct course here. So, this response got kind of long winded. As you can see, especially if one looks in the spec as I know you are, there are additional commands that need to be implemented to complete the ARS feature set. And, of course, we'll offer user space tooling (NDCTL and libcxl). > > Other than that question and the above typo. Looks good! > > Ira > > > + } > > + > > + if (po->flags & CXL_POISON_FLAG_SCANNING) { > > + dev_err(dev, "Scan Media in Progress\n"); > > + rc = -EBUSY; > > + goto out; > > + } > > + > > + for (i = 0; i < le16_to_cpu(po->count); i++) { > > + u64 addr = le64_to_cpu(po->record[i].address); > > + u32 len = le32_to_cpu(po->record[i].length); > > + int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr); > > + > > + if (!CXL_POISON_SOURCE_VALID(source)) { > > + dev_dbg(dev, "Invalid poison source %d", > > + source); > > + source = CXL_POISON_SOURCE_INVALID; > > + } > > + > > + trace_cxl_poison_list(dev, source, addr, len); > > + } > > + > > + /* Protect against an uncleared _FLAG_MORE */ > > + nr_records = nr_records + le16_to_cpu(po->count); > > + if (nr_records >= cxlds->poison_max) > > + goto out; > > + > > + } while (po->flags & CXL_POISON_FLAG_MORE); > > + > > +out: > > + kvfree(po); > > + return rc; > > +} > > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL); > > + > > struct cxl_dev_state *cxl_dev_state_create(struct device *dev) > > { > > struct cxl_dev_state *cxlds; > > -- > > 2.31.1 > >
On Tue, Jun 14, 2022 at 10:07:52PM -0700, Alison Schofield wrote: > On Tue, Jun 14, 2022 at 08:22:41PM -0700, Ira Weiny wrote: > [snip] > > > + > > > + do { > > > + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi, > > > + sizeof(pi), po, cxlds->payload_size); > > > + if (rc) > > > + goto out; > > > + > > > + if (po->flags & CXL_POISON_FLAG_OVERFLOW) { > > > + time64_t o_time = le64_to_cpu(po->overflow_timestamp); > > > + > > > + dev_err(dev, "Poison list overflow at %ptTs UTC\n", > > > + &o_time); > > > + rc = -ENXIO; > > > + goto out; > > > > I guess the idea is that this return will trigger something else will clear the list, > > rebuild the list, and perform a scan media request? > > > Per CXL Spec 8.2.9.5.4.1: The poison list may be incomplete when the list > has overflowed. User can perform a Scan Media to try to clear and rebuild > the list, with no guarantee that the overflow will not recur. > > So yes to what you are saying. This return value should indicate to > user space that a Scan Media should be issued. Issuing the Scan Media > to the device does lead the device to rebuild it's list, as you say. > Also, when we get the Scan Media results, the device is able to report > partial results and tell the host to collect the error records, and > then restart the scan, get results again, and on and on until the scan > is complete. > > Perhaps a clarification - there is not a logical pairing of Scan Media > followed by Get Poison List. Scan Media followed by Get Scan Media > Results is the logical pairing. Get Poison List is getting a snapshot > of the poison list at a point in time. The device adds DPAs to the list > when the device detects poison, some devices run their own backround > scans and add to the poison list, and then there are the user initiated > actions (Scan Media and Poison Inject) that can affect the list. > > > I'm just wondering if this loop should continue to clear the list and then let > > something else do the scan media request? > > It's not like the _MORE status where the device is telling the host to > come back and gather more. I think the action of failing, and letting > user initiated a Scan Media is correct course here. Fair enough. But I guess I'm still confused by the spec. The way I read it yesterday (and I could be wrong) was that the OS was supposed to read the entries to clear the list? Is that not true? I the device will clear the list internally when Scan Media is run? At this point I'm just trying to understand not necessarily objecting to the patch. Ira > > So, this response got kind of long winded. As you can see, especially > if one looks in the spec as I know you are, there are additional > commands that need to be implemented to complete the ARS feature set. > And, of course, we'll offer user space tooling (NDCTL and libcxl). >
On Wed, Jun 15, 2022 at 08:01:50AM -0700, Ira Weiny wrote: > On Tue, Jun 14, 2022 at 10:07:52PM -0700, Alison Schofield wrote: > > On Tue, Jun 14, 2022 at 08:22:41PM -0700, Ira Weiny wrote: > > > > [snip] > > > > > + > > > > + do { > > > > + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi, > > > > + sizeof(pi), po, cxlds->payload_size); > > > > + if (rc) > > > > + goto out; > > > > + > > > > + if (po->flags & CXL_POISON_FLAG_OVERFLOW) { > > > > + time64_t o_time = le64_to_cpu(po->overflow_timestamp); > > > > + > > > > + dev_err(dev, "Poison list overflow at %ptTs UTC\n", > > > > + &o_time); > > > > + rc = -ENXIO; > > > > + goto out; > > > > > > I guess the idea is that this return will trigger something else will clear the list, > > > rebuild the list, and perform a scan media request? > > > > > Per CXL Spec 8.2.9.5.4.1: The poison list may be incomplete when the list > > has overflowed. User can perform a Scan Media to try to clear and rebuild > > the list, with no guarantee that the overflow will not recur. > > > > So yes to what you are saying. This return value should indicate to > > user space that a Scan Media should be issued. Issuing the Scan Media > > to the device does lead the device to rebuild it's list, as you say. > > Also, when we get the Scan Media results, the device is able to report > > partial results and tell the host to collect the error records, and > > then restart the scan, get results again, and on and on until the scan > > is complete. > > > > Perhaps a clarification - there is not a logical pairing of Scan Media > > followed by Get Poison List. Scan Media followed by Get Scan Media > > Results is the logical pairing. Get Poison List is getting a snapshot > > of the poison list at a point in time. The device adds DPAs to the list > > when the device detects poison, some devices run their own backround > > scans and add to the poison list, and then there are the user initiated > > actions (Scan Media and Poison Inject) that can affect the list. > > > > > I'm just wondering if this loop should continue to clear the list and then let > > > something else do the scan media request? > > > > It's not like the _MORE status where the device is telling the host to > > come back and gather more. I think the action of failing, and letting > > user initiated a Scan Media is correct course here. > > Fair enough. But I guess I'm still confused by the spec. The way I read it > yesterday (and I could be wrong) was that the OS was supposed to read the > entries to clear the list? Is that not true? I think - not true. Get_Poison_List has no effect on the contents of the list itself. Even with its MORE flag, it is not clearing any poison, it's just telling the host that it had more records than could fit in one device payload so they will have to delivered to the host in multiple requests. I'd expect issuing multiple Get Poison List requests would get same results. (unless of course the media was going bad quickly Maybe you are conflating w other cmds: Scan Media & Clear Poison > > I the device will clear the list internally when Scan Media is run? Spec says device 'rebuids' the list. I might guess that's a clear and start anew, but not the hosts business. As a host, we wait for Scan Media to complete before issuing Get Scan Media Results or Get Poison List. > > At this point I'm just trying to understand not necessarily objecting to the > patch. NP. The questions are helpful! > > Ira > > > > > So, this response got kind of long winded. As you can see, especially > > if one looks in the spec as I know you are, there are additional > > commands that need to be implemented to complete the ARS feature set. > > And, of course, we'll offer user space tooling (NDCTL and libcxl). > >
On Tue, 14 Jun 2022, alison.schofield@intel.com wrote: >From: Alison Schofield <alison.schofield@intel.com> > >CXL devices that support persistent memory maintain a list of locations >that are poisoned or result in poison if the addresses are accessed by >the host. > >Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison >list as a set of Media Error Records that include the source of the >error, the starting device physical address and length. The length is >the number of adjacent DPAs in the record and is in units of 64 bytes. > >Retrieve the list and log each Media Error Record as a trace event of >type cxl_poison_list. > >Signed-off-by: Alison Schofield <alison.schofield@intel.com> >--- > drivers/cxl/cxlmem.h | 43 +++++++++++++++++++++++ > drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 118 insertions(+) > >diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h >index 60d10ee1e7fc..29cf0459b44a 100644 >--- a/drivers/cxl/cxlmem.h >+++ b/drivers/cxl/cxlmem.h >@@ -174,6 +174,7 @@ struct cxl_endpoint_dvsec_info { > * (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register) > * @lsa_size: Size of Label Storage Area > * (CXL 2.0 8.2.9.5.1.1 Identify Memory Device) >+ * @poison_max_mer: maximum Media Error Records tracked in Poison List > * @mbox_mutex: Mutex to synchronize mailbox access. > * @firmware_version: Firmware version for the memory device. > * @enabled_cmds: Hardware commands found enabled in CEL. >@@ -204,6 +205,7 @@ struct cxl_dev_state { > > size_t payload_size; > size_t lsa_size; >+ u32 poison_max; > struct mutex mbox_mutex; /* Protects device mailbox and firmware */ > char firmware_version[0x10]; > DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); >@@ -317,6 +319,46 @@ struct cxl_mbox_set_partition_info { > > #define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0) > >+struct cxl_mbox_poison_payload_in { >+ __le64 offset; >+ __le64 length; >+} __packed; >+ >+struct cxl_mbox_poison_payload_out { >+ u8 flags; >+ u8 rsvd1; >+ __le64 overflow_timestamp; >+ __le16 count; >+ u8 rsvd2[0x14]; >+ struct cxl_poison_record { >+ __le64 address; >+ __le32 length; >+ __le32 rsvd; >+ } __packed record[]; >+} __packed; >+ >+/* CXL 8.2.9.5.4.1 Get Poison List: payload out flags: */ >+#define CXL_POISON_FLAG_MORE BIT(0) >+#define CXL_POISON_FLAG_OVERFLOW BIT(1) >+#define CXL_POISON_FLAG_SCANNING BIT(2) >+ >+/* CXL 8.2.9.5.4.1 Get Poison List: Error is encoded in record.address[2:0] */ >+#define CXL_POISON_SOURCE_MASK GENMASK(2, 0) >+#define CXL_POISON_SOURCE_UNKNOWN 0 >+#define CXL_POISON_SOURCE_EXTERNAL 1 >+#define CXL_POISON_SOURCE_INTERNAL 2 >+#define CXL_POISON_SOURCE_INJECTED 3 >+#define CXL_POISON_SOURCE_VENDOR 7 >+ >+/* Software define */ >+#define CXL_POISON_SOURCE_INVALID 99 >+#define CXL_POISON_SOURCE_VALID(x) \ >+ (((x) == CXL_POISON_SOURCE_UNKNOWN) || \ >+ ((x) == CXL_POISON_SOURCE_EXTERNAL) || \ >+ ((x) == CXL_POISON_SOURCE_INTERNAL) || \ >+ ((x) == CXL_POISON_SOURCE_INJECTED) || \ >+ ((x) == CXL_POISON_SOURCE_VENDOR)) >+ > /** > * struct cxl_mem_command - Driver representation of a memory device command > * @info: Command information as it exists for the UAPI >@@ -351,6 +393,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds); > struct cxl_dev_state *cxl_dev_state_create(struct device *dev); > void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); > void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); >+int cxl_mem_get_poison_list(struct device *dev); > #ifdef CONFIG_CXL_SUSPEND > void cxl_mem_active_inc(void); > void cxl_mem_active_dec(void); >diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c >index 54f434733b56..c10c7020ebc2 100644 >--- a/drivers/cxl/core/mbox.c >+++ b/drivers/cxl/core/mbox.c >@@ -9,6 +9,9 @@ > > #include "core.h" > >+#define CREATE_TRACE_POINTS >+#include <trace/events/cxl.h> >+ > static bool cxl_raw_allow_all; > > /** >@@ -755,6 +758,7 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) > { > /* See CXL 2.0 Table 175 Identify Memory Device Output Payload */ > struct cxl_mbox_identify id; >+ __le32 val = 0; > int rc; > > rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id, >@@ -783,6 +787,9 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) > cxlds->lsa_size = le32_to_cpu(id.lsa_size); > memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision)); > >+ memcpy(&val, id.poison_list_max_mer, 3); >+ cxlds->poison_max = le32_to_cpu(val); >+ > return 0; > } > EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL); >@@ -826,6 +833,74 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds) > } > EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL); > >+int cxl_mem_get_poison_list(struct device *dev) >+{ >+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev); >+ struct cxl_dev_state *cxlds = cxlmd->cxlds; >+ struct cxl_mbox_poison_payload_out *po; >+ struct cxl_mbox_poison_payload_in pi; >+ int nr_records = 0; >+ int rc, i; >+ >+ if (range_len(&cxlds->pmem_range)) { >+ pi.offset = cpu_to_le64(cxlds->pmem_range.start); >+ pi.length = cpu_to_le64(range_len(&cxlds->pmem_range)); Do you ever see this changing to not always use the full pmem DPA range but allow arbitrary ones? I also assume this is the reason why you don't check the range vs cxlds->ram_range to prevent any overlaps, no? Thanks, Davidlohr >+ } else { >+ return -ENXIO; >+ } >+ >+ po = kvmalloc(cxlds->payload_size, GFP_KERNEL); >+ if (!po) >+ return -ENOMEM; >+ >+ do { >+ rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi, >+ sizeof(pi), po, cxlds->payload_size); >+ if (rc) >+ goto out; >+ >+ if (po->flags & CXL_POISON_FLAG_OVERFLOW) { >+ time64_t o_time = le64_to_cpu(po->overflow_timestamp); >+ >+ dev_err(dev, "Poison list overflow at %ptTs UTC\n", >+ &o_time); >+ rc = -ENXIO; >+ goto out; >+ } >+ >+ if (po->flags & CXL_POISON_FLAG_SCANNING) { >+ dev_err(dev, "Scan Media in Progress\n"); >+ rc = -EBUSY; >+ goto out; >+ } >+ >+ for (i = 0; i < le16_to_cpu(po->count); i++) { >+ u64 addr = le64_to_cpu(po->record[i].address); >+ u32 len = le32_to_cpu(po->record[i].length); >+ int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr); >+ >+ if (!CXL_POISON_SOURCE_VALID(source)) { >+ dev_dbg(dev, "Invalid poison source %d", >+ source); >+ source = CXL_POISON_SOURCE_INVALID; >+ } >+ >+ trace_cxl_poison_list(dev, source, addr, len); >+ } >+ >+ /* Protect against an uncleared _FLAG_MORE */ >+ nr_records = nr_records + le16_to_cpu(po->count); >+ if (nr_records >= cxlds->poison_max) >+ goto out; >+ >+ } while (po->flags & CXL_POISON_FLAG_MORE); >+ >+out: >+ kvfree(po); >+ return rc; >+} >+EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL); >+ > struct cxl_dev_state *cxl_dev_state_create(struct device *dev) > { > struct cxl_dev_state *cxlds; > >-- >2.31.1 >
On Thu, Jun 16, 2022 at 12:43:34PM -0700, Davidlohr Bueso wrote: > On Tue, 14 Jun 2022, alison.schofield@intel.com wrote: > > >From: Alison Schofield <alison.schofield@intel.com> > > > >CXL devices that support persistent memory maintain a list of locations > >that are poisoned or result in poison if the addresses are accessed by > >the host. > > > >Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison > >list as a set of Media Error Records that include the source of the > >error, the starting device physical address and length. The length is > >the number of adjacent DPAs in the record and is in units of 64 bytes. > > > >Retrieve the list and log each Media Error Record as a trace event of > >type cxl_poison_list. > > > >Signed-off-by: Alison Schofield <alison.schofield@intel.com> > >--- > > drivers/cxl/cxlmem.h | 43 +++++++++++++++++++++++ > > drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++ > > 2 files changed, 118 insertions(+) > > snip > >+int cxl_mem_get_poison_list(struct device *dev) > >+{ > >+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev); > >+ struct cxl_dev_state *cxlds = cxlmd->cxlds; > >+ struct cxl_mbox_poison_payload_out *po; > >+ struct cxl_mbox_poison_payload_in pi; > >+ int nr_records = 0; > >+ int rc, i; > >+ > >+ if (range_len(&cxlds->pmem_range)) { > >+ pi.offset = cpu_to_le64(cxlds->pmem_range.start); > >+ pi.length = cpu_to_le64(range_len(&cxlds->pmem_range)); First off - you stopped at a bug here - that pi.length needs to be in units of 64 bytes. > > Do you ever see this changing to not always use the full pmem DPA range > but allow arbitrary ones? I also assume this is the reason why you don't > check the range vs cxlds->ram_range to prevent any overlaps, no? > > Thanks, > Davidlohr David - Great question! I'm headed in this direction - cxl list --media-errors -m mem1 lists media errors for requested memdev cxl list --media-errors -r region# lists region errors with HPA addresses (So here cxl tool will collect the poison for all the regions memdevs and do the DPA to HPA translation) To answer your question, I wasn't thinking of limiting the range within the memdev, but certainly could. And if we were taking in ranges, those ranges would need to be checked. $cxl list --media-errors -m mem1 --range-start= --range-end|len= Now, if I left the sysfs inteface as is, the driver will read the entire poison list for the memdev and then cxl tool will filter it for the range requested. Or, maybe we should implement in libcxl (not sysfs), with memdev and range options and only collect from the device the range requested. Either one looks the same to the cxl tool user, but limiting the range we send to the device would certainly cut down on unwanted records being logged, retrieved, and examined. I'd like to hear more from you and other community members. Alison > > snip
On Thu, 16 Jun 2022, Alison Schofield wrote: >I'm headed in this direction - I like these interfaces, btw. >cxl list --media-errors -m mem1 > lists media errors for requested memdev But in this patchset you're only listing for persistent configurations. So if there is a volatile partion, or the whole device is volatile, this would not consider that. So unless I'm missing something, we need to consider ram_range as well. >cxl list --media-errors -r region# > lists region errors with HPA addresses > (So here cxl tool will collect the poison for all the regions > memdevs and do the DPA to HPA translation) I was indeed thinking along these lines. But similar to the above, the region driver also has plans to enumarate volatile regions configured by BIOS. > >To answer your question, I wasn't thinking of limiting >the range within the memdev, but certainly could. And if we were >taking in ranges, those ranges would need to be checked. My question was originally considering poisoning only within pmem DPA ranges, but now I'm wondering if all this also applies equally to volatile parts as well... Reading the spec I interpret both, but reading the T3 Memory Device Software Guide '2.13.19' it only mentions persistent capacity. > >$cxl list --media-errors -m mem1 --range-start= --range-end|len= I figure this kind of like the above with regions being very arbitrary and dynamic. >Now, if I left the sysfs interface as is, the driver will read the >entire poison list for the memdev and then cxl tool will filter it >for the range requested. > >Or, maybe we should implement in libcxl (not sysfs), with memdev and >range options and only collect from the device the range requested. I wonder if the latter may be the better option considering that always scanning the entire memdev would cause unnecessary media scan wait times, specially for large capacities. Thanks, Davidlohr
David - you make lots of good points, one quick comments at end... On Thu, Jun 16, 2022 at 02:47:40PM -0700, Davidlohr Bueso wrote: > On Thu, 16 Jun 2022, Alison Schofield wrote: > >I'm headed in this direction - > > I like these interfaces, btw. > > >cxl list --media-errors -m mem1 > > lists media errors for requested memdev > > But in this patchset you're only listing for persistent configurations. > So if there is a volatile partion, or the whole device is volatile, > this would not consider that. > > So unless I'm missing something, we need to consider ram_range as well. > > >cxl list --media-errors -r region# > > lists region errors with HPA addresses > > (So here cxl tool will collect the poison for all the regions > > memdevs and do the DPA to HPA translation) > > I was indeed thinking along these lines. But similar to the above, > the region driver also has plans to enumarate volatile regions > configured by BIOS. > > > > >To answer your question, I wasn't thinking of limiting > >the range within the memdev, but certainly could. And if we were > >taking in ranges, those ranges would need to be checked. > > My question was originally considering poisoning only within pmem DPA > ranges, but now I'm wondering if all this also applies equally to volatile > parts as well... Reading the spec I interpret both, but reading the > T3 Memory Device Software Guide '2.13.19' it only mentions persistent > capacity. > > > > >$cxl list --media-errors -m mem1 --range-start= --range-end|len= > > I figure this kind of like the above with regions being very arbitrary > and dynamic. > > >Now, if I left the sysfs interface as is, the driver will read the > >entire poison list for the memdev and then cxl tool will filter it > >for the range requested. > > > >Or, maybe we should implement in libcxl (not sysfs), with memdev and > >range options and only collect from the device the range requested. > > I wonder if the latter may be the better option considering that always > scanning the entire memdev would cause unnecessary media scan wait times, > specially for large capacities. This is not a Media Scan. This is only reading the existing Poison List. > > Thanks, > Davidlohr
On Thu, 16 Jun 2022, Alison Schofield wrote: >> I wonder if the latter may be the better option considering that always >> scanning the entire memdev would cause unnecessary media scan wait times, >> specially for large capacities. > >This is not a Media Scan. This is only reading the existing Poison List. Right, but I was thinking we'd want a input similar interface passing the desidered range. Thanks, Davidlohr
On Thu, 16 Jun 2022, Alison Schofield wrote: >cxl list --media-errors -m mem1 > lists media errors for requested memdev > >cxl list --media-errors -r region# A quick question on the tooling front: the above goes nicely with cxl-list, but what about the rest of the poisoning cmds? Do you have anything in mind? Do we want something specific for media and poison management instead? Ie: cxl media --list-errors <params> cxl media --inject-errors <params> cxl media --clear-errors <params> Thanks, Davidlohr
On Thu, Jun 16, 2022 at 03:45:25PM -0700, Davidlohr Bueso wrote: > On Thu, 16 Jun 2022, Alison Schofield wrote: > > >cxl list --media-errors -m mem1 > > lists media errors for requested memdev > > > >cxl list --media-errors -r region# > > A quick question on the tooling front: the above goes nicely with > cxl-list, but what about the rest of the poisoning cmds? Do you have > anything in mind? Do we want something specific for media and poison > management instead? Ie: > > cxl media --list-errors <params> Not clear how this one differs. Seems like we can get any piece of the list w cxl list. > cxl media --inject-errors <params> > cxl media --clear-errors <params> For inject/clear I'd probably start w what ndctl does today. ndctl inject−error <namespace> [<options>] where option -d --uninject performs the clear. > > Thanks, > Davidlohr
On Thu, 2022-06-16 at 16:15 -0700, Alison Schofield wrote: > On Thu, Jun 16, 2022 at 03:45:25PM -0700, Davidlohr Bueso wrote: > > On Thu, 16 Jun 2022, Alison Schofield wrote: > > > > > cxl list --media-errors -m mem1 > > >         lists media errors for requested memdev > > > > > > cxl list --media-errors -r region# > > > > A quick question on the tooling front: the above goes nicely with > > cxl-list, but what about the rest of the poisoning cmds? Do you > > have > > anything in mind? Do we want something specific for media and > > poison > > management instead? Ie: > > > > cxl media --list-errors <params> > Not clear how this one differs. Seems like we can get any piece of > the list w cxl list. > > > cxl media --inject-errors <params> > > cxl media --clear-errors <params> > For inject/clear I'd probably start w what ndctl does today. > ndctl inject−error <namespace> [<options>] > where option -d --uninject performs the clear. Yeah agreed with Alison that cxl inject-error <options> sounds good. Generally speaking, we've tried to avoid the 'sub-command of a subcommand' situation - such as 'cxl <media> <sub-sub-command> <options>', instead prefering 'cxl <action-object> <options>'. > > > > > Thanks, > > Davidlohr
On Thu, 16 Jun 2022, Verma, Vishal L wrote:
>Yeah agreed with Alison that cxl inject-error <options> sounds good.
I agree as well, that's a much nicer interface.
Thanks,
Davidlohr
On Tue, 14 Jun 2022 17:10:27 -0700 alison.schofield@intel.com wrote: > From: Alison Schofield <alison.schofield@intel.com> > > CXL devices that support persistent memory maintain a list of locations > that are poisoned or result in poison if the addresses are accessed by > the host. > > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison > list as a set of Media Error Records that include the source of the > error, the starting device physical address and length. The length is > the number of adjacent DPAs in the record and is in units of 64 bytes. > > Retrieve the list and log each Media Error Record as a trace event of > type cxl_poison_list. > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > --- > +int cxl_mem_get_poison_list(struct device *dev) > +{ > + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > + struct cxl_mbox_poison_payload_out *po; > + struct cxl_mbox_poison_payload_in pi; > + int nr_records = 0; > + int rc, i; > + > + if (range_len(&cxlds->pmem_range)) { > + pi.offset = cpu_to_le64(cxlds->pmem_range.start); > + pi.length = cpu_to_le64(range_len(&cxlds->pmem_range)); > + } else { > + return -ENXIO; > + } > + > + po = kvmalloc(cxlds->payload_size, GFP_KERNEL); > + if (!po) > + return -ENOMEM; > + > + do { > + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi, > + sizeof(pi), po, cxlds->payload_size); > + if (rc) > + goto out; > + > + if (po->flags & CXL_POISON_FLAG_OVERFLOW) { > + time64_t o_time = le64_to_cpu(po->overflow_timestamp); > + > + dev_err(dev, "Poison list overflow at %ptTs UTC\n", > + &o_time); > + rc = -ENXIO; > + goto out; > + } > + > + if (po->flags & CXL_POISON_FLAG_SCANNING) { > + dev_err(dev, "Scan Media in Progress\n"); > + rc = -EBUSY; > + goto out; > + } > + > + for (i = 0; i < le16_to_cpu(po->count); i++) { > + u64 addr = le64_to_cpu(po->record[i].address); > + u32 len = le32_to_cpu(po->record[i].length); > + int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr); > + > + if (!CXL_POISON_SOURCE_VALID(source)) { > + dev_dbg(dev, "Invalid poison source %d", > + source); > + source = CXL_POISON_SOURCE_INVALID; > + } > + > + trace_cxl_poison_list(dev, source, addr, len); > + } > + > + /* Protect against an uncleared _FLAG_MORE */ > + nr_records = nr_records + le16_to_cpu(po->count); > + if (nr_records >= cxlds->poison_max) If this happens and _FLAG_MORE is set (it will occur anyway currently if we happen to have poison_max records - I hit this in QEMU because until now default of poison_max == 0) then we should spit out an error message as I think that means the hardware is broken. > + goto out; > + > + } while (po->flags & CXL_POISON_FLAG_MORE); > + > +out: > + kvfree(po); > + return rc; > +} > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL); > + > struct cxl_dev_state *cxl_dev_state_create(struct device *dev) > { > struct cxl_dev_state *cxlds;
On Tue, 14 Jun 2022 17:10:27 -0700 alison.schofield@intel.com wrote: > From: Alison Schofield <alison.schofield@intel.com> > > CXL devices that support persistent memory maintain a list of locations > that are poisoned or result in poison if the addresses are accessed by > the host. > > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison > list as a set of Media Error Records that include the source of the > error, the starting device physical address and length. The length is > the number of adjacent DPAs in the record and is in units of 64 bytes. > > Retrieve the list and log each Media Error Record as a trace event of > type cxl_poison_list. > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> A few more things inline. Otherwise, can confirm it works with some hack QEMU code. I'll tidy that up and post soon. > +int cxl_mem_get_poison_list(struct device *dev) > +{ > + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > + struct cxl_mbox_poison_payload_out *po; > + struct cxl_mbox_poison_payload_in pi; > + int nr_records = 0; > + int rc, i; > + > + if (range_len(&cxlds->pmem_range)) { > + pi.offset = cpu_to_le64(cxlds->pmem_range.start); > + pi.length = cpu_to_le64(range_len(&cxlds->pmem_range)); > + } else { > + return -ENXIO; > + } > + > + po = kvmalloc(cxlds->payload_size, GFP_KERNEL); > + if (!po) > + return -ENOMEM; > + > + do { > + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi, > + sizeof(pi), po, cxlds->payload_size); > + if (rc) > + goto out; > + > + if (po->flags & CXL_POISON_FLAG_OVERFLOW) { > + time64_t o_time = le64_to_cpu(po->overflow_timestamp); > + > + dev_err(dev, "Poison list overflow at %ptTs UTC\n", > + &o_time); > + rc = -ENXIO; > + goto out; > + } > + > + if (po->flags & CXL_POISON_FLAG_SCANNING) { > + dev_err(dev, "Scan Media in Progress\n"); > + rc = -EBUSY; > + goto out; > + } > + > + for (i = 0; i < le16_to_cpu(po->count); i++) { > + u64 addr = le64_to_cpu(po->record[i].address); > + u32 len = le32_to_cpu(po->record[i].length); > + int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr); > + > + if (!CXL_POISON_SOURCE_VALID(source)) { > + dev_dbg(dev, "Invalid poison source %d", > + source); > + source = CXL_POISON_SOURCE_INVALID; > + } > + > + trace_cxl_poison_list(dev, source, addr, len); Need to mask off the lower 6 bits of addr as they contain the source + a few reserved bits. I was confused how you were geting better than 64 byte precision in your example. > + } > + > + /* Protect against an uncleared _FLAG_MORE */ > + nr_records = nr_records + le16_to_cpu(po->count); > + if (nr_records >= cxlds->poison_max) > + goto out; > + > + } while (po->flags & CXL_POISON_FLAG_MORE); So.. A conundrum here. What happens if: 1. We get an error mid way through a set of multiple reads (something intermittent - maybe a software issue) 2. We will drop out of here fine and report the error. 3. We run this function again. It will (I think) currently pick up where we left off, but we have no way of knowing that as there isn't a 'total records' count or any other form of index in the output payload. So, software solutions I think should work (though may warrant a note to be added to the spec). 1. Read whole thing twice. First time is just to ensure we get to the end and flush out any prior half done reads. 2. Issue a read for a different region (perhaps length 0) first and assume everything starts from scratch when we go back to this region. Jonathan > + > +out: > + kvfree(po); > + return rc; > +} > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL); > + > struct cxl_dev_state *cxl_dev_state_create(struct device *dev) > { > struct cxl_dev_state *cxlds;
On Fri, Jun 17, 2022 at 07:05:08AM -0700, Jonathan Cameron wrote: > On Tue, 14 Jun 2022 17:10:27 -0700 > alison.schofield@intel.com wrote: > > > From: Alison Schofield <alison.schofield@intel.com> > > > > CXL devices that support persistent memory maintain a list of locations > > that are poisoned or result in poison if the addresses are accessed by > > the host. > > > > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison > > list as a set of Media Error Records that include the source of the > > error, the starting device physical address and length. The length is > > the number of adjacent DPAs in the record and is in units of 64 bytes. > > > > Retrieve the list and log each Media Error Record as a trace event of > > type cxl_poison_list. > > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > A few more things inline. > > Otherwise, can confirm it works with some hack QEMU code. > I'll tidy that up and post soon. > > > +int cxl_mem_get_poison_list(struct device *dev) > > +{ snip > > + > > + trace_cxl_poison_list(dev, source, addr, len); > > Need to mask off the lower 6 bits of addr as they contain the source > + a few reserved bits. > > I was confused how you were geting better than 64 byte precision in your > example. > Ah...got it. Thanks! > > + } > > + > > + /* Protect against an uncleared _FLAG_MORE */ > > + nr_records = nr_records + le16_to_cpu(po->count); > > + if (nr_records >= cxlds->poison_max) > > + goto out; > > + > > + } while (po->flags & CXL_POISON_FLAG_MORE); > So.. A conundrum here. What happens if: > > 1. We get an error mid way through a set of multiple reads > (something intermittent - maybe a software issue) > 2. We will drop out of here fine and report the error. > 3. We run this function again. > > It will (I think) currently pick up where we left off, but we have > no way of knowing that as there isn't a 'total records' count or > any other form of index in the output payload. Yes. That is sad. I'm assume it's by design and CXL devices never intended to keep any totals. > > So, software solutions I think should work (though may warrant a note > to be added to the spec). > > 1. Read whole thing twice. First time is just to ensure we get > to the end and flush out any prior half done reads. > 2. Issue a read for a different region (perhaps length 0) first > and assume everything starts from scratch when we go back to > this region. Can you tell me more about 2 ? Also, Since posting this I have added protection to this path to ensure only one reader of the poison list for this device. Like this: if (!completion_done(&cxlds->read_poison_complete); return -EBUSY; wait_for_completion_interruptible(&cxlds->read_poison_complete); ...GET ALL THE POISON... complete(&cxlds->read_poison_complete); And will add the error message on that unexpected _FLAG_MORE too. Alison > > Jonathan > > > + > > +out: > > + kvfree(po); > > + return rc; > > +} > > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL); > > + > > struct cxl_dev_state *cxl_dev_state_create(struct device *dev) > > { > > struct cxl_dev_state *cxlds; >
On Fri, 17 Jun 2022, Alison Schofield wrote: >On Fri, Jun 17, 2022 at 07:05:08AM -0700, Jonathan Cameron wrote: >> On Tue, 14 Jun 2022 17:10:27 -0700 >> alison.schofield@intel.com wrote: >> >> > From: Alison Schofield <alison.schofield@intel.com> >> > >> > CXL devices that support persistent memory maintain a list of locations >> > that are poisoned or result in poison if the addresses are accessed by >> > the host. >> > >> > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison >> > list as a set of Media Error Records that include the source of the >> > error, the starting device physical address and length. The length is >> > the number of adjacent DPAs in the record and is in units of 64 bytes. >> > >> > Retrieve the list and log each Media Error Record as a trace event of >> > type cxl_poison_list. >> > >> > Signed-off-by: Alison Schofield <alison.schofield@intel.com> >> >> A few more things inline. >> >> Otherwise, can confirm it works with some hack QEMU code. >> I'll tidy that up and post soon. >> >> > +int cxl_mem_get_poison_list(struct device *dev) >> > +{ >snip >> > + >> > + trace_cxl_poison_list(dev, source, addr, len); >> >> Need to mask off the lower 6 bits of addr as they contain the source >> + a few reserved bits. >> >> I was confused how you were geting better than 64 byte precision in your >> example. >> >Ah...got it. Thanks! > >> > + } >> > + >> > + /* Protect against an uncleared _FLAG_MORE */ >> > + nr_records = nr_records + le16_to_cpu(po->count); >> > + if (nr_records >= cxlds->poison_max) >> > + goto out; >> > + >> > + } while (po->flags & CXL_POISON_FLAG_MORE); >> So.. A conundrum here. What happens if: >> >> 1. We get an error mid way through a set of multiple reads >> (something intermittent - maybe a software issue) >> 2. We will drop out of here fine and report the error. >> 3. We run this function again. >> >> It will (I think) currently pick up where we left off, but we have >> no way of knowing that as there isn't a 'total records' count or >> any other form of index in the output payload. > >Yes. That is sad. I'm assume it's by design and CXL devices never >intended to keep any totals. > >> >> So, software solutions I think should work (though may warrant a note >> to be added to the spec). >> >> 1. Read whole thing twice. First time is just to ensure we get >> to the end and flush out any prior half done reads. >> 2. Issue a read for a different region (perhaps length 0) first >> and assume everything starts from scratch when we go back to >> this region. > >Can you tell me more about 2 ? > >Also, Since posting this I have added protection to this path to ensure >only one reader of the poison list for this device. Like this: I don't think we should prevent multiple list reads. I would expect the scenario Jonathan describes to be the uncommon case. Thanks, Davidlohr > >if (!completion_done(&cxlds->read_poison_complete); > return -EBUSY; >wait_for_completion_interruptible(&cxlds->read_poison_complete); > ...GET ALL THE POISON... >complete(&cxlds->read_poison_complete); > >And will add the error message on that unexpected _FLAG_MORE too. > >Alison >> >> Jonathan >> > > > >> > + >> > +out: >> > + kvfree(po); >> > + return rc; >> > +} >> > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL); >> > + >> > struct cxl_dev_state *cxl_dev_state_create(struct device *dev) >> > { >> > struct cxl_dev_state *cxlds; >>
alison.schofield@ wrote: > From: Alison Schofield <alison.schofield@intel.com> > > CXL devices that support persistent memory maintain a list of locations > that are poisoned or result in poison if the addresses are accessed by > the host. > > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison > list as a set of Media Error Records that include the source of the > error, the starting device physical address and length. The length is > the number of adjacent DPAs in the record and is in units of 64 bytes. > > Retrieve the list and log each Media Error Record as a trace event of > type cxl_poison_list. > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > --- > drivers/cxl/cxlmem.h | 43 +++++++++++++++++++++++ > drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 118 insertions(+) > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 60d10ee1e7fc..29cf0459b44a 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > @@ -174,6 +174,7 @@ struct cxl_endpoint_dvsec_info { > * (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register) > * @lsa_size: Size of Label Storage Area > * (CXL 2.0 8.2.9.5.1.1 Identify Memory Device) > + * @poison_max_mer: maximum Media Error Records tracked in Poison List > * @mbox_mutex: Mutex to synchronize mailbox access. > * @firmware_version: Firmware version for the memory device. > * @enabled_cmds: Hardware commands found enabled in CEL. > @@ -204,6 +205,7 @@ struct cxl_dev_state { > > size_t payload_size; > size_t lsa_size; > + u32 poison_max; > struct mutex mbox_mutex; /* Protects device mailbox and firmware */ > char firmware_version[0x10]; > DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); > @@ -317,6 +319,46 @@ struct cxl_mbox_set_partition_info { > > #define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0) > > +struct cxl_mbox_poison_payload_in { > + __le64 offset; > + __le64 length; > +} __packed; > + > +struct cxl_mbox_poison_payload_out { > + u8 flags; > + u8 rsvd1; > + __le64 overflow_timestamp; > + __le16 count; > + u8 rsvd2[0x14]; > + struct cxl_poison_record { > + __le64 address; > + __le32 length; > + __le32 rsvd; > + } __packed record[]; > +} __packed; > + > +/* CXL 8.2.9.5.4.1 Get Poison List: payload out flags: */ > +#define CXL_POISON_FLAG_MORE BIT(0) > +#define CXL_POISON_FLAG_OVERFLOW BIT(1) > +#define CXL_POISON_FLAG_SCANNING BIT(2) > + > +/* CXL 8.2.9.5.4.1 Get Poison List: Error is encoded in record.address[2:0] */ > +#define CXL_POISON_SOURCE_MASK GENMASK(2, 0) > +#define CXL_POISON_SOURCE_UNKNOWN 0 > +#define CXL_POISON_SOURCE_EXTERNAL 1 > +#define CXL_POISON_SOURCE_INTERNAL 2 > +#define CXL_POISON_SOURCE_INJECTED 3 > +#define CXL_POISON_SOURCE_VENDOR 7 > + > +/* Software define */ > +#define CXL_POISON_SOURCE_INVALID 99 > +#define CXL_POISON_SOURCE_VALID(x) \ > + (((x) == CXL_POISON_SOURCE_UNKNOWN) || \ > + ((x) == CXL_POISON_SOURCE_EXTERNAL) || \ > + ((x) == CXL_POISON_SOURCE_INTERNAL) || \ > + ((x) == CXL_POISON_SOURCE_INJECTED) || \ > + ((x) == CXL_POISON_SOURCE_VENDOR)) > + > /** > * struct cxl_mem_command - Driver representation of a memory device command > * @info: Command information as it exists for the UAPI > @@ -351,6 +393,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds); > struct cxl_dev_state *cxl_dev_state_create(struct device *dev); > void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); > void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); > +int cxl_mem_get_poison_list(struct device *dev); > #ifdef CONFIG_CXL_SUSPEND > void cxl_mem_active_inc(void); > void cxl_mem_active_dec(void); > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c > index 54f434733b56..c10c7020ebc2 100644 > --- a/drivers/cxl/core/mbox.c > +++ b/drivers/cxl/core/mbox.c > @@ -9,6 +9,9 @@ > > #include "core.h" > > +#define CREATE_TRACE_POINTS > +#include <trace/events/cxl.h> > + > static bool cxl_raw_allow_all; > > /** > @@ -755,6 +758,7 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) > { > /* See CXL 2.0 Table 175 Identify Memory Device Output Payload */ > struct cxl_mbox_identify id; > + __le32 val = 0; > int rc; > > rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id, > @@ -783,6 +787,9 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) > cxlds->lsa_size = le32_to_cpu(id.lsa_size); > memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision)); > > + memcpy(&val, id.poison_list_max_mer, 3); > + cxlds->poison_max = le32_to_cpu(val); > + > return 0; > } > EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL); > @@ -826,6 +833,74 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds) > } > EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL); > > +int cxl_mem_get_poison_list(struct device *dev) > +{ > + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > + struct cxl_mbox_poison_payload_out *po; > + struct cxl_mbox_poison_payload_in pi; > + int nr_records = 0; > + int rc, i; > + > + if (range_len(&cxlds->pmem_range)) { > + pi.offset = cpu_to_le64(cxlds->pmem_range.start); > + pi.length = cpu_to_le64(range_len(&cxlds->pmem_range)); > + } else { > + return -ENXIO; > + } > + > + po = kvmalloc(cxlds->payload_size, GFP_KERNEL); > + if (!po) > + return -ENOMEM; > + > + do { > + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi, > + sizeof(pi), po, cxlds->payload_size); > + if (rc) > + goto out; In this case, no need for a 'goto' when 'break' will do. > + > + if (po->flags & CXL_POISON_FLAG_OVERFLOW) { > + time64_t o_time = le64_to_cpu(po->overflow_timestamp); > + > + dev_err(dev, "Poison list overflow at %ptTs UTC\n", > + &o_time); This should be just another event type, not an error message. > + rc = -ENXIO; No need to error out, the media error records are still valid. > + goto out; > + } > + > + if (po->flags & CXL_POISON_FLAG_SCANNING) { > + dev_err(dev, "Scan Media in Progress\n"); This isn't an error condition and it should be something the kernel is mediating in the first instance. For example if userspace did: echo 1 > trace_poison & echo 1 > trace_poison_cached I would expect the second echo to just block awaiting the completion of the scan media request. I.e. just prevent the possibility of these commands colliding outside of CONFIG_CXL_MEM_RAW_COMMANDS=y shenanigans, in which case userspace gets to keep the pieces. > + rc = -EBUSY; > + goto out; > + } > + > + for (i = 0; i < le16_to_cpu(po->count); i++) { > + u64 addr = le64_to_cpu(po->record[i].address); > + u32 len = le32_to_cpu(po->record[i].length); > + int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr); > + > + if (!CXL_POISON_SOURCE_VALID(source)) { > + dev_dbg(dev, "Invalid poison source %d", > + source); Per above, just emit the raw field value and leave parsing values to userspace. > + source = CXL_POISON_SOURCE_INVALID; > + } > + > + trace_cxl_poison_list(dev, source, addr, len); > + } > + > + /* Protect against an uncleared _FLAG_MORE */ > + nr_records = nr_records + le16_to_cpu(po->count); > + if (nr_records >= cxlds->poison_max) > + goto out; This also feels like something that should have a Linux max as well, something like: "cxlds->poison_max = min(identify->poison_max, CXL_LIST_POISON_MAX)" ...where CXL_LIST_POISON_MAX is a "reasonable" maximum for a cache of hardware media poison errors like 1024. > + > + } while (po->flags & CXL_POISON_FLAG_MORE); > + > +out: > + kvfree(po); > + return rc; > +} > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL); > + > struct cxl_dev_state *cxl_dev_state_create(struct device *dev) > { > struct cxl_dev_state *cxlds; > -- > 2.31.1 >
Alison Schofield wrote: > On Thu, Jun 16, 2022 at 12:43:34PM -0700, Davidlohr Bueso wrote: > > On Tue, 14 Jun 2022, alison.schofield@intel.com wrote: > > > > >From: Alison Schofield <alison.schofield@intel.com> > > > > > >CXL devices that support persistent memory maintain a list of locations > > >that are poisoned or result in poison if the addresses are accessed by > > >the host. > > > > > >Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison > > >list as a set of Media Error Records that include the source of the > > >error, the starting device physical address and length. The length is > > >the number of adjacent DPAs in the record and is in units of 64 bytes. > > > > > >Retrieve the list and log each Media Error Record as a trace event of > > >type cxl_poison_list. > > > > > >Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > >--- > > > drivers/cxl/cxlmem.h | 43 +++++++++++++++++++++++ > > > drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++ > > > 2 files changed, 118 insertions(+) > > > > snip > > > >+int cxl_mem_get_poison_list(struct device *dev) > > >+{ > > >+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev); > > >+ struct cxl_dev_state *cxlds = cxlmd->cxlds; > > >+ struct cxl_mbox_poison_payload_out *po; > > >+ struct cxl_mbox_poison_payload_in pi; > > >+ int nr_records = 0; > > >+ int rc, i; > > >+ > > >+ if (range_len(&cxlds->pmem_range)) { > > >+ pi.offset = cpu_to_le64(cxlds->pmem_range.start); > > >+ pi.length = cpu_to_le64(range_len(&cxlds->pmem_range)); > > First off - you stopped at a bug here - that pi.length needs to be > in units of 64 bytes. > > > > Do you ever see this changing to not always use the full pmem DPA range > > but allow arbitrary ones? I also assume this is the reason why you don't > > check the range vs cxlds->ram_range to prevent any overlaps, no? > > > > Thanks, > > Davidlohr > > David - Great question! > > I'm headed in this direction - > > cxl list --media-errors -m mem1 > lists media errors for requested memdev > > cxl list --media-errors -r region# > lists region errors with HPA addresses > (So here cxl tool will collect the poison for all the regions > memdevs and do the DPA to HPA translation) > > To answer your question, I wasn't thinking of limiting > the range within the memdev, but certainly could. And if we were > taking in ranges, those ranges would need to be checked. > > $cxl list --media-errors -m mem1 --range-start= --range-end|len= > > Now, if I left the sysfs inteface as is, the driver will read the > entire poison list for the memdev and then cxl tool will filter it > for the range requested. > > Or, maybe we should implement in libcxl (not sysfs), with memdev and > range options and only collect from the device the range requested. > > Either one looks the same to the cxl tool user, but limiting the > range we send to the device would certainly cut down on unwanted > records being logged, retrieved, and examined. > > I'd like to hear more from you and other community members. There is some history here that is relevant to this design. CXL Get Poison List builds on lessons learned from the ACPI "Address Range Scrub" mechanism that was deployed for ACPI described persistent memory platform (See ACPI 6.4 9.20.7.2 "Address Range Scrubbing (ARS) Overview"). In that case there was no expectation that the device maintained a cached and coherent (with incoming poison writes) copy of the media error list. CXL Get Poison List in comparison is meant to obviate the need to perform Scan Media in most scenarios, and it is lightweight enough that userspace need not have a mechanism to request errors by range, in my opinion. One of the design warts of drivers/acpi/nfit/ that I want to get away from in the case of drivers/cxl/ is snooping the equivalent of ARS command results to populate a kernel list of poison addresses and instead put the onus on userspace to collect DPA events and optionally inform the kernel of the HPA impacts. For example, DAX filesystems will soon have the ability to do something useful with poison notifications [1], but that support is limited to synchronously consumed poison flagged by memory_failure(). When the cxl tool translates the poison list to HPA it needs an ABI to turn around and notify the filesystem about which blocks got clobbered. [1]: https://lore.kernel.org/all/20220616193157.2c2e963f3e7e38dfac554a28@linux-foundation.org/
Jonathan Cameron wrote: > On Tue, 14 Jun 2022 17:10:27 -0700 > alison.schofield@intel.com wrote: > > > From: Alison Schofield <alison.schofield@intel.com> > > > > CXL devices that support persistent memory maintain a list of locations > > that are poisoned or result in poison if the addresses are accessed by > > the host. > > > > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison > > list as a set of Media Error Records that include the source of the > > error, the starting device physical address and length. The length is > > the number of adjacent DPAs in the record and is in units of 64 bytes. > > > > Retrieve the list and log each Media Error Record as a trace event of > > type cxl_poison_list. > > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > A few more things inline. > > Otherwise, can confirm it works with some hack QEMU code. > I'll tidy that up and post soon. > > > +int cxl_mem_get_poison_list(struct device *dev) > > +{ > > + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); > > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > > + struct cxl_mbox_poison_payload_out *po; > > + struct cxl_mbox_poison_payload_in pi; > > + int nr_records = 0; > > + int rc, i; > > + > > + if (range_len(&cxlds->pmem_range)) { > > + pi.offset = cpu_to_le64(cxlds->pmem_range.start); > > + pi.length = cpu_to_le64(range_len(&cxlds->pmem_range)); > > + } else { > > + return -ENXIO; > > + } > > + > > + po = kvmalloc(cxlds->payload_size, GFP_KERNEL); > > + if (!po) > > + return -ENOMEM; > > + > > + do { > > + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi, > > + sizeof(pi), po, cxlds->payload_size); > > + if (rc) > > + goto out; > > + > > + if (po->flags & CXL_POISON_FLAG_OVERFLOW) { > > + time64_t o_time = le64_to_cpu(po->overflow_timestamp); > > + > > + dev_err(dev, "Poison list overflow at %ptTs UTC\n", > > + &o_time); > > + rc = -ENXIO; > > + goto out; > > + } > > + > > + if (po->flags & CXL_POISON_FLAG_SCANNING) { > > + dev_err(dev, "Scan Media in Progress\n"); > > + rc = -EBUSY; > > + goto out; > > + } > > + > > + for (i = 0; i < le16_to_cpu(po->count); i++) { > > + u64 addr = le64_to_cpu(po->record[i].address); > > > + u32 len = le32_to_cpu(po->record[i].length); > > > > + int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr); > > + > > + if (!CXL_POISON_SOURCE_VALID(source)) { > > + dev_dbg(dev, "Invalid poison source %d", > > + source); > > + source = CXL_POISON_SOURCE_INVALID; > > + } > > + > > + trace_cxl_poison_list(dev, source, addr, len); > > Need to mask off the lower 6 bits of addr as they contain the source > + a few reserved bits. > > I was confused how you were geting better than 64 byte precision in your > example. > > > + } > > + > > + /* Protect against an uncleared _FLAG_MORE */ > > + nr_records = nr_records + le16_to_cpu(po->count); > > + if (nr_records >= cxlds->poison_max) > > + goto out; > > + > > + } while (po->flags & CXL_POISON_FLAG_MORE); > So.. A conundrum here. What happens if: > > 1. We get an error mid way through a set of multiple reads > (something intermittent - maybe a software issue) > 2. We will drop out of here fine and report the error. > 3. We run this function again. > > It will (I think) currently pick up where we left off, but we have > no way of knowing that as there isn't a 'total records' count or > any other form of index in the output payload. > > So, software solutions I think should work (though may warrant a note > to be added to the spec). > > 1. Read whole thing twice. First time is just to ensure we get > to the end and flush out any prior half done reads. > 2. Issue a read for a different region (perhaps length 0) first > and assume everything starts from scratch when we go back to > this region. It would be nice if this was codified as *the* way to reset the retrieval, but I worry that neither length==0 or length==1 can be used for this purpose since the "more" bit is attached to the last passed in *range*, not request. I.e. spec seems to allow for overlapping retrievals, although I doubt any implementation gets that fancy. I think it is sufficient to just include the "more" flag in the trace event and if userspace suspects that it is getting "more" results from a previous run it can reissue the scan. This is another reason that the trace event should include the pid of the process that triggered the results so it can delineate re-requests. Otherwise, the poison list cache is opportunistic so I am not sure that missing records in this corner case is fatal.
Alison Schofield wrote: > On Fri, Jun 17, 2022 at 07:05:08AM -0700, Jonathan Cameron wrote: > > On Tue, 14 Jun 2022 17:10:27 -0700 > > alison.schofield@intel.com wrote: > > > > > From: Alison Schofield <alison.schofield@intel.com> > > > > > > CXL devices that support persistent memory maintain a list of locations > > > that are poisoned or result in poison if the addresses are accessed by > > > the host. > > > > > > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison > > > list as a set of Media Error Records that include the source of the > > > error, the starting device physical address and length. The length is > > > the number of adjacent DPAs in the record and is in units of 64 bytes. > > > > > > Retrieve the list and log each Media Error Record as a trace event of > > > type cxl_poison_list. > > > > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > > > A few more things inline. > > > > Otherwise, can confirm it works with some hack QEMU code. > > I'll tidy that up and post soon. > > > > > +int cxl_mem_get_poison_list(struct device *dev) > > > +{ > snip > > > + > > > + trace_cxl_poison_list(dev, source, addr, len); > > > > Need to mask off the lower 6 bits of addr as they contain the source > > + a few reserved bits. > > > > I was confused how you were geting better than 64 byte precision in your > > example. > > > Ah...got it. Thanks! > > > > + } > > > + > > > + /* Protect against an uncleared _FLAG_MORE */ > > > + nr_records = nr_records + le16_to_cpu(po->count); > > > + if (nr_records >= cxlds->poison_max) > > > + goto out; > > > + > > > + } while (po->flags & CXL_POISON_FLAG_MORE); > > So.. A conundrum here. What happens if: > > > > 1. We get an error mid way through a set of multiple reads > > (something intermittent - maybe a software issue) > > 2. We will drop out of here fine and report the error. > > 3. We run this function again. > > > > It will (I think) currently pick up where we left off, but we have > > no way of knowing that as there isn't a 'total records' count or > > any other form of index in the output payload. > > Yes. That is sad. I'm assume it's by design and CXL devices never > intended to keep any totals. > > > > > So, software solutions I think should work (though may warrant a note > > to be added to the spec). > > > > 1. Read whole thing twice. First time is just to ensure we get > > to the end and flush out any prior half done reads. > > 2. Issue a read for a different region (perhaps length 0) first > > and assume everything starts from scratch when we go back to > > this region. > > Can you tell me more about 2 ? > > Also, Since posting this I have added protection to this path to ensure > only one reader of the poison list for this device. Like this: > > if (!completion_done(&cxlds->read_poison_complete); > return -EBUSY; > wait_for_completion_interruptible(&cxlds->read_poison_complete); > ...GET ALL THE POISON... > complete(&cxlds->read_poison_complete); Since this runs in the context of the requester a completion feels out of place. What this probably wants is a mutex() protecting the state machine of the Media Error Record retrieval and the "more" flag. > > And will add the error message on that unexpected _FLAG_MORE too. > > Alison > > > > Jonathan > > > > > > > > + > > > +out: > > > + kvfree(po); > > > + return rc; > > > +} > > > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL); > > > + > > > struct cxl_dev_state *cxl_dev_state_create(struct device *dev) > > > { > > > struct cxl_dev_state *cxlds; > >
On Fri, 17 Jun 2022 12:02:32 -0700 Dan Williams <dan.j.williams@intel.com> wrote: > Alison Schofield wrote: > > On Thu, Jun 16, 2022 at 12:43:34PM -0700, Davidlohr Bueso wrote: > > > On Tue, 14 Jun 2022, alison.schofield@intel.com wrote: > > > > > > >From: Alison Schofield <alison.schofield@intel.com> > > > > > > > >CXL devices that support persistent memory maintain a list of locations > > > >that are poisoned or result in poison if the addresses are accessed by > > > >the host. > > > > > > > >Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison > > > >list as a set of Media Error Records that include the source of the > > > >error, the starting device physical address and length. The length is > > > >the number of adjacent DPAs in the record and is in units of 64 bytes. > > > > > > > >Retrieve the list and log each Media Error Record as a trace event of > > > >type cxl_poison_list. > > > > > > > >Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > > >--- > > > > drivers/cxl/cxlmem.h | 43 +++++++++++++++++++++++ > > > > drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++ > > > > 2 files changed, 118 insertions(+) > > > > > > snip > > > > > >+int cxl_mem_get_poison_list(struct device *dev) > > > >+{ > > > >+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev); > > > >+ struct cxl_dev_state *cxlds = cxlmd->cxlds; > > > >+ struct cxl_mbox_poison_payload_out *po; > > > >+ struct cxl_mbox_poison_payload_in pi; > > > >+ int nr_records = 0; > > > >+ int rc, i; > > > >+ > > > >+ if (range_len(&cxlds->pmem_range)) { > > > >+ pi.offset = cpu_to_le64(cxlds->pmem_range.start); > > > >+ pi.length = cpu_to_le64(range_len(&cxlds->pmem_range)); > > > > First off - you stopped at a bug here - that pi.length needs to be > > in units of 64 bytes. > > > > > > Do you ever see this changing to not always use the full pmem DPA range > > > but allow arbitrary ones? I also assume this is the reason why you don't > > > check the range vs cxlds->ram_range to prevent any overlaps, no? > > > > > > Thanks, > > > Davidlohr > > > > David - Great question! > > > > I'm headed in this direction - > > > > cxl list --media-errors -m mem1 > > lists media errors for requested memdev > > > > cxl list --media-errors -r region# > > lists region errors with HPA addresses > > (So here cxl tool will collect the poison for all the regions > > memdevs and do the DPA to HPA translation) > > > > To answer your question, I wasn't thinking of limiting > > the range within the memdev, but certainly could. And if we were > > taking in ranges, those ranges would need to be checked. > > > > $cxl list --media-errors -m mem1 --range-start= --range-end|len= > > > > Now, if I left the sysfs inteface as is, the driver will read the > > entire poison list for the memdev and then cxl tool will filter it > > for the range requested. > > > > Or, maybe we should implement in libcxl (not sysfs), with memdev and > > range options and only collect from the device the range requested. > > > > Either one looks the same to the cxl tool user, but limiting the > > range we send to the device would certainly cut down on unwanted > > records being logged, retrieved, and examined. > > > > I'd like to hear more from you and other community members. > > There is some history here that is relevant to this design. CXL Get > Poison List builds on lessons learned from the ACPI "Address Range > Scrub" mechanism that was deployed for ACPI described persistent memory > platform (See ACPI 6.4 9.20.7.2 "Address Range Scrubbing (ARS) > Overview"). In that case there was no expectation that the device > maintained a cached and coherent (with incoming poison writes) copy of > the media error list. CXL Get Poison List in comparison is meant to > obviate the need to perform Scan Media in most scenarios, and it is > lightweight enough that userspace need not have a mechanism to request > errors by range, in my opinion. > > One of the design warts of drivers/acpi/nfit/ that I want to get away > from in the case of drivers/cxl/ is snooping the equivalent of ARS > command results to populate a kernel list of poison addresses and > instead put the onus on userspace to collect DPA events and optionally > inform the kernel of the HPA impacts. Can you give more info on why such an in kernel flow doesn't work well for this usecase (particularly for volatile memory)? I don't much like the flow differing from how we do DRAM patrol scrub based handling today. I'm not sure I like the requirement for userspace to be in the loop if we are dealing with volatile failures even if the detection is async. > For example, DAX filesystems will > soon have the ability to do something useful with poison notifications > [1], but that support is limited to synchronously consumed poison > flagged by memory_failure(). When the cxl tool translates the poison > list to HPA it needs an ABI to turn around and notify the filesystem > about which blocks got clobbered. I'm not clear why it makes sense to do the DPA to HPA conversion in userspace. It should be a fairly trivial bit of code to do the reverse look up in kernel and then we just bolt into existing infrastructure. Jonathan > > [1]: https://lore.kernel.org/all/20220616193157.2c2e963f3e7e38dfac554a28@linux-foundation.org/
On Fri, 17 Jun 2022 09:29:35 -0700 Alison Schofield <alison.schofield@intel.com> wrote: > On Fri, Jun 17, 2022 at 07:05:08AM -0700, Jonathan Cameron wrote: > > On Tue, 14 Jun 2022 17:10:27 -0700 > > alison.schofield@intel.com wrote: > > > > > From: Alison Schofield <alison.schofield@intel.com> > > > > > > CXL devices that support persistent memory maintain a list of locations > > > that are poisoned or result in poison if the addresses are accessed by > > > the host. > > > > > > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison > > > list as a set of Media Error Records that include the source of the > > > error, the starting device physical address and length. The length is > > > the number of adjacent DPAs in the record and is in units of 64 bytes. > > > > > > Retrieve the list and log each Media Error Record as a trace event of > > > type cxl_poison_list. > > > > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > > > A few more things inline. > > > > Otherwise, can confirm it works with some hack QEMU code. > > I'll tidy that up and post soon. > > > > > +int cxl_mem_get_poison_list(struct device *dev) > > > +{ > snip > > > + > > > + trace_cxl_poison_list(dev, source, addr, len); > > > > Need to mask off the lower 6 bits of addr as they contain the source > > + a few reserved bits. > > > > I was confused how you were geting better than 64 byte precision in your > > example. > > > Ah...got it. Thanks! > > > > + } > > > + > > > + /* Protect against an uncleared _FLAG_MORE */ > > > + nr_records = nr_records + le16_to_cpu(po->count); > > > + if (nr_records >= cxlds->poison_max) > > > + goto out; > > > + > > > + } while (po->flags & CXL_POISON_FLAG_MORE); > > So.. A conundrum here. What happens if: > > > > 1. We get an error mid way through a set of multiple reads > > (something intermittent - maybe a software issue) > > 2. We will drop out of here fine and report the error. > > 3. We run this function again. > > > > It will (I think) currently pick up where we left off, but we have > > no way of knowing that as there isn't a 'total records' count or > > any other form of index in the output payload. > > Yes. That is sad. I'm assume it's by design and CXL devices never > intended to keep any totals. > > > > > So, software solutions I think should work (though may warrant a note > > to be added to the spec). > > > > 1. Read whole thing twice. First time is just to ensure we get > > to the end and flush out any prior half done reads. > > 2. Issue a read for a different region (perhaps length 0) first > > and assume everything starts from scratch when we go back to > > this region. > > Can you tell me more about 2 ? 2 relies on interpreting what the spec says for an unusual corner case. The concept of 'more available' is something I would assume you'd only get if you issued a repeat of the same request. I don't think the spec actually covers this case - perhaps that's something we need to raise via the appropriate channels. Jonathan > > Also, Since posting this I have added protection to this path to ensure > only one reader of the poison list for this device. Like this: > > if (!completion_done(&cxlds->read_poison_complete); > return -EBUSY; > wait_for_completion_interruptible(&cxlds->read_poison_complete); > ...GET ALL THE POISON... > complete(&cxlds->read_poison_complete); > > And will add the error message on that unexpected _FLAG_MORE too. > > Alison > > > > Jonathan > > > > > > > > + > > > +out: > > > + kvfree(po); > > > + return rc; > > > +} > > > +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL); > > > + > > > struct cxl_dev_state *cxl_dev_state_create(struct device *dev) > > > { > > > struct cxl_dev_state *cxlds; > >
On Fri, 17 Jun 2022 12:27:38 -0700 Dan Williams <dan.j.williams@intel.com> wrote: > Jonathan Cameron wrote: > > On Tue, 14 Jun 2022 17:10:27 -0700 > > alison.schofield@intel.com wrote: > > > > > From: Alison Schofield <alison.schofield@intel.com> > > > > > > CXL devices that support persistent memory maintain a list of locations > > > that are poisoned or result in poison if the addresses are accessed by > > > the host. > > > > > > Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison > > > list as a set of Media Error Records that include the source of the > > > error, the starting device physical address and length. The length is > > > the number of adjacent DPAs in the record and is in units of 64 bytes. > > > > > > Retrieve the list and log each Media Error Record as a trace event of > > > type cxl_poison_list. > > > > > > Signed-off-by: Alison Schofield <alison.schofield@intel.com> > > > > A few more things inline. > > > > Otherwise, can confirm it works with some hack QEMU code. > > I'll tidy that up and post soon. > > > > > +int cxl_mem_get_poison_list(struct device *dev) > > > +{ > > > + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); > > > + struct cxl_dev_state *cxlds = cxlmd->cxlds; > > > + struct cxl_mbox_poison_payload_out *po; > > > + struct cxl_mbox_poison_payload_in pi; > > > + int nr_records = 0; > > > + int rc, i; > > > + > > > + if (range_len(&cxlds->pmem_range)) { > > > + pi.offset = cpu_to_le64(cxlds->pmem_range.start); > > > + pi.length = cpu_to_le64(range_len(&cxlds->pmem_range)); > > > + } else { > > > + return -ENXIO; > > > + } > > > + > > > + po = kvmalloc(cxlds->payload_size, GFP_KERNEL); > > > + if (!po) > > > + return -ENOMEM; > > > + > > > + do { > > > + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi, > > > + sizeof(pi), po, cxlds->payload_size); > > > + if (rc) > > > + goto out; > > > + > > > + if (po->flags & CXL_POISON_FLAG_OVERFLOW) { > > > + time64_t o_time = le64_to_cpu(po->overflow_timestamp); > > > + > > > + dev_err(dev, "Poison list overflow at %ptTs UTC\n", > > > + &o_time); > > > + rc = -ENXIO; > > > + goto out; > > > + } > > > + > > > + if (po->flags & CXL_POISON_FLAG_SCANNING) { > > > + dev_err(dev, "Scan Media in Progress\n"); > > > + rc = -EBUSY; > > > + goto out; > > > + } > > > + > > > + for (i = 0; i < le16_to_cpu(po->count); i++) { > > > + u64 addr = le64_to_cpu(po->record[i].address); > > > > > + u32 len = le32_to_cpu(po->record[i].length); > > > > > > > + int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr); > > > + > > > + if (!CXL_POISON_SOURCE_VALID(source)) { > > > + dev_dbg(dev, "Invalid poison source %d", > > > + source); > > > + source = CXL_POISON_SOURCE_INVALID; > > > + } > > > + > > > + trace_cxl_poison_list(dev, source, addr, len); > > > > Need to mask off the lower 6 bits of addr as they contain the source > > + a few reserved bits. > > > > I was confused how you were geting better than 64 byte precision in your > > example. > > > > > + } > > > + > > > + /* Protect against an uncleared _FLAG_MORE */ > > > + nr_records = nr_records + le16_to_cpu(po->count); > > > + if (nr_records >= cxlds->poison_max) > > > + goto out; > > > + > > > + } while (po->flags & CXL_POISON_FLAG_MORE); > > So.. A conundrum here. What happens if: > > > > 1. We get an error mid way through a set of multiple reads > > (something intermittent - maybe a software issue) > > 2. We will drop out of here fine and report the error. > > 3. We run this function again. > > > > It will (I think) currently pick up where we left off, but we have > > no way of knowing that as there isn't a 'total records' count or > > any other form of index in the output payload. > > > > So, software solutions I think should work (though may warrant a note > > to be added to the spec). > > > > 1. Read whole thing twice. First time is just to ensure we get > > to the end and flush out any prior half done reads. > > 2. Issue a read for a different region (perhaps length 0) first > > and assume everything starts from scratch when we go back to > > this region. > > It would be nice if this was codified as *the* way to reset the > retrieval, but I worry that neither length==0 or length==1 can be used > for this purpose since the "more" bit is attached to the last passed in > *range*, not request. I.e. spec seems to allow for overlapping > retrievals, although I doubt any implementation gets that fancy. > > I think it is sufficient to just include the "more" flag in the trace > event and if userspace suspects that it is getting "more" results from a > previous run it can reissue the scan. Meaning is a bit ugly if attached to an individual trace event, though I guess we could do something nicer like only have one that doesn't have MORE set, thus indicating that one trace event is the last one from a query. i.e. fill in MORE for all the other events in the last GET_POISON_LIST reply. > This is another reason that the > trace event should include the pid of the process that triggered the > results so it can delineate re-requests. Otherwise, the poison list > cache is opportunistic so I am not sure that missing records in this > corner case is fatal. Ok, for now let's document the limitation with an appropriate comment. In parallel I've started a thread in appropriate venue to discuss if we can clarify the spec and potentially do better in future. So that discussion should shift over there. Thanks, Jonathan
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 60d10ee1e7fc..29cf0459b44a 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -174,6 +174,7 @@ struct cxl_endpoint_dvsec_info { * (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register) * @lsa_size: Size of Label Storage Area * (CXL 2.0 8.2.9.5.1.1 Identify Memory Device) + * @poison_max_mer: maximum Media Error Records tracked in Poison List * @mbox_mutex: Mutex to synchronize mailbox access. * @firmware_version: Firmware version for the memory device. * @enabled_cmds: Hardware commands found enabled in CEL. @@ -204,6 +205,7 @@ struct cxl_dev_state { size_t payload_size; size_t lsa_size; + u32 poison_max; struct mutex mbox_mutex; /* Protects device mailbox and firmware */ char firmware_version[0x10]; DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); @@ -317,6 +319,46 @@ struct cxl_mbox_set_partition_info { #define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0) +struct cxl_mbox_poison_payload_in { + __le64 offset; + __le64 length; +} __packed; + +struct cxl_mbox_poison_payload_out { + u8 flags; + u8 rsvd1; + __le64 overflow_timestamp; + __le16 count; + u8 rsvd2[0x14]; + struct cxl_poison_record { + __le64 address; + __le32 length; + __le32 rsvd; + } __packed record[]; +} __packed; + +/* CXL 8.2.9.5.4.1 Get Poison List: payload out flags: */ +#define CXL_POISON_FLAG_MORE BIT(0) +#define CXL_POISON_FLAG_OVERFLOW BIT(1) +#define CXL_POISON_FLAG_SCANNING BIT(2) + +/* CXL 8.2.9.5.4.1 Get Poison List: Error is encoded in record.address[2:0] */ +#define CXL_POISON_SOURCE_MASK GENMASK(2, 0) +#define CXL_POISON_SOURCE_UNKNOWN 0 +#define CXL_POISON_SOURCE_EXTERNAL 1 +#define CXL_POISON_SOURCE_INTERNAL 2 +#define CXL_POISON_SOURCE_INJECTED 3 +#define CXL_POISON_SOURCE_VENDOR 7 + +/* Software define */ +#define CXL_POISON_SOURCE_INVALID 99 +#define CXL_POISON_SOURCE_VALID(x) \ + (((x) == CXL_POISON_SOURCE_UNKNOWN) || \ + ((x) == CXL_POISON_SOURCE_EXTERNAL) || \ + ((x) == CXL_POISON_SOURCE_INTERNAL) || \ + ((x) == CXL_POISON_SOURCE_INJECTED) || \ + ((x) == CXL_POISON_SOURCE_VENDOR)) + /** * struct cxl_mem_command - Driver representation of a memory device command * @info: Command information as it exists for the UAPI @@ -351,6 +393,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds); struct cxl_dev_state *cxl_dev_state_create(struct device *dev); void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); +int cxl_mem_get_poison_list(struct device *dev); #ifdef CONFIG_CXL_SUSPEND void cxl_mem_active_inc(void); void cxl_mem_active_dec(void); diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 54f434733b56..c10c7020ebc2 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -9,6 +9,9 @@ #include "core.h" +#define CREATE_TRACE_POINTS +#include <trace/events/cxl.h> + static bool cxl_raw_allow_all; /** @@ -755,6 +758,7 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) { /* See CXL 2.0 Table 175 Identify Memory Device Output Payload */ struct cxl_mbox_identify id; + __le32 val = 0; int rc; rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id, @@ -783,6 +787,9 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) cxlds->lsa_size = le32_to_cpu(id.lsa_size); memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision)); + memcpy(&val, id.poison_list_max_mer, 3); + cxlds->poison_max = le32_to_cpu(val); + return 0; } EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL); @@ -826,6 +833,74 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds) } EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL); +int cxl_mem_get_poison_list(struct device *dev) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_mbox_poison_payload_out *po; + struct cxl_mbox_poison_payload_in pi; + int nr_records = 0; + int rc, i; + + if (range_len(&cxlds->pmem_range)) { + pi.offset = cpu_to_le64(cxlds->pmem_range.start); + pi.length = cpu_to_le64(range_len(&cxlds->pmem_range)); + } else { + return -ENXIO; + } + + po = kvmalloc(cxlds->payload_size, GFP_KERNEL); + if (!po) + return -ENOMEM; + + do { + rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi, + sizeof(pi), po, cxlds->payload_size); + if (rc) + goto out; + + if (po->flags & CXL_POISON_FLAG_OVERFLOW) { + time64_t o_time = le64_to_cpu(po->overflow_timestamp); + + dev_err(dev, "Poison list overflow at %ptTs UTC\n", + &o_time); + rc = -ENXIO; + goto out; + } + + if (po->flags & CXL_POISON_FLAG_SCANNING) { + dev_err(dev, "Scan Media in Progress\n"); + rc = -EBUSY; + goto out; + } + + for (i = 0; i < le16_to_cpu(po->count); i++) { + u64 addr = le64_to_cpu(po->record[i].address); + u32 len = le32_to_cpu(po->record[i].length); + int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr); + + if (!CXL_POISON_SOURCE_VALID(source)) { + dev_dbg(dev, "Invalid poison source %d", + source); + source = CXL_POISON_SOURCE_INVALID; + } + + trace_cxl_poison_list(dev, source, addr, len); + } + + /* Protect against an uncleared _FLAG_MORE */ + nr_records = nr_records + le16_to_cpu(po->count); + if (nr_records >= cxlds->poison_max) + goto out; + + } while (po->flags & CXL_POISON_FLAG_MORE); + +out: + kvfree(po); + return rc; +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL); + struct cxl_dev_state *cxl_dev_state_create(struct device *dev) { struct cxl_dev_state *cxlds;