Message ID | 20220124202204.1488346-1-vaibhav@linux.ibm.com (mailing list archive) |
---|---|
State | Accepted |
Commit | bbbca72352bb9484bc057c91a408332b35ee8f4c |
Headers | show |
Series | [v4] powerpc/papr_scm: Implement initial support for injecting smart errors | expand |
Vaibhav Jain <vaibhav@linux.ibm.com> writes: > Presently PAPR doesn't support injecting smart errors on an > NVDIMM. This makes testing the NVDIMM health reporting functionality > difficult as simulating NVDIMM health related events needs a hacked-up > qemu version. > > To solve this problem this patch proposes simulating a certain set of > NVDIMM health related events in papr_scm, specifically the 'fatal' health > state and the 'dirty' shutdown state. These errors can be injected via the > user-space 'ndctl-inject-smart(1)' command. With the proposed patch and > corresponding ndctl patches the following command flow is expected: > > $ sudo ndctl list -DH -d nmem0 > ... > "health_state":"ok", > "shutdown_state":"clean", > ... > # inject unsafe shutdown and fatal health error > $ sudo ndctl inject-smart nmem0 -Uf > ... > "health_state":"fatal", > "shutdown_state":"dirty", > ... > # uninject all errors > $ sudo ndctl inject-smart nmem0 -N > ... > "health_state":"ok", > "shutdown_state":"clean", > ... > > The patch adds a new member 'health_bitmap_inject_mask' inside struct > papr_scm_priv which is then bitwise ORed into the health bitmap fetched from the > hypervisor. The value for 'health_bitmap_inject_mask' is accessible from sysfs > at nmemX/papr/health_bitmap_inject. > > A new PDSM named 'SMART_INJECT' is proposed that accepts the newly > introduced 'struct nd_papr_pdsm_smart_inject' as a payload that's > exchanged between libndctl and papr_scm to indicate the requested > smart-error states. > > When processing the PDSM 'SMART_INJECT', papr_pdsm_smart_inject() > constructs a pair of 'inject_mask' and 'clear_mask' bitmaps from the payload > and bit-blts them to the 'health_bitmap_inject_mask'. This ensures that after being > fetched from the hypervisor, the health_bitmap reflects the requested smart-error > states. 
> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> > Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com> > Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com> > --- > Changelog: > > Since v3: > * Renamed the sysfs entry from 'health_bitmap_override' to > 'health_bitmap_inject'. > * Simplified the variable names and removed the 'health_bitmap_{mask,override}' > members. Instead replaced them with a single 'health_bitmap_inject_mask' > member. [Aneesh] > * Updated the sysfs documentations and commit description. > * Used READ/WRITE_ONCE macros at places where 'health_bitmap_inject_mask' may be > accessed concurrently. > > Since v2: > * Rebased the patch to ppc-next > * Added documentation for newly introduced sysfs attribute 'health_bitmap_override' > > Since v1: > * Updated the patch description. > * Removed dependency of a header movement patch. > * Removed '__packed' attribute for 'struct nd_papr_pdsm_smart_inject' [Aneesh] > --- > Documentation/ABI/testing/sysfs-bus-papr-pmem | 12 +++ > arch/powerpc/include/uapi/asm/papr_pdsm.h | 18 ++++ > arch/powerpc/platforms/pseries/papr_scm.c | 90 ++++++++++++++++++- > 3 files changed, 117 insertions(+), 3 deletions(-) > > diff --git a/Documentation/ABI/testing/sysfs-bus-papr-pmem b/Documentation/ABI/testing/sysfs-bus-papr-pmem > index 95254cec92bf..4ac0673901e7 100644 > --- a/Documentation/ABI/testing/sysfs-bus-papr-pmem > +++ b/Documentation/ABI/testing/sysfs-bus-papr-pmem > @@ -61,3 +61,15 @@ Description: > * "CchRHCnt" : Cache Read Hit Count > * "CchWHCnt" : Cache Write Hit Count > * "FastWCnt" : Fast Write Count > + > +What: /sys/bus/nd/devices/nmemX/papr/health_bitmap_inject > +Date: Jan, 2022 > +KernelVersion: v5.17 > +Contact: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>, nvdimm@lists.linux.dev, > +Description: > + (RO) Reports the health bitmap inject bitmap that is applied to > + bitmap received from PowerVM via the H_SCM_HEALTH. This is used > + to forcibly set specific bits returned from Hcall. 
These is then > + used to simulate various health or shutdown states for an nvdimm > + and are set by user-space tools like ndctl by issuing a PAPR DSM. > + > diff --git a/arch/powerpc/include/uapi/asm/papr_pdsm.h b/arch/powerpc/include/uapi/asm/papr_pdsm.h > index 82488b1e7276..17439925045c 100644 > --- a/arch/powerpc/include/uapi/asm/papr_pdsm.h > +++ b/arch/powerpc/include/uapi/asm/papr_pdsm.h > @@ -116,6 +116,22 @@ struct nd_papr_pdsm_health { > }; > }; > > +/* Flags for injecting specific smart errors */ > +#define PDSM_SMART_INJECT_HEALTH_FATAL (1 << 0) > +#define PDSM_SMART_INJECT_BAD_SHUTDOWN (1 << 1) > + > +struct nd_papr_pdsm_smart_inject { > + union { > + struct { > + /* One or more of PDSM_SMART_INJECT_ */ > + __u32 flags; > + __u8 fatal_enable; > + __u8 unsafe_shutdown_enable; > + }; > + __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; > + }; > +}; > + > /* > * Methods to be embedded in ND_CMD_CALL request. These are sent to the kernel > * via 'nd_cmd_pkg.nd_command' member of the ioctl struct > @@ -123,12 +139,14 @@ struct nd_papr_pdsm_health { > enum papr_pdsm { > PAPR_PDSM_MIN = 0x0, > PAPR_PDSM_HEALTH, > + PAPR_PDSM_SMART_INJECT, > PAPR_PDSM_MAX, > }; > > /* Maximal union that can hold all possible payload types */ > union nd_pdsm_payload { > struct nd_papr_pdsm_health health; > + struct nd_papr_pdsm_smart_inject smart_inject; > __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; > } __packed; > > diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c > index f48e87ac89c9..20aafd387840 100644 > --- a/arch/powerpc/platforms/pseries/papr_scm.c > +++ b/arch/powerpc/platforms/pseries/papr_scm.c > @@ -120,6 +120,10 @@ struct papr_scm_priv { > > /* length of the stat buffer as expected by phyp */ > size_t stat_buffer_len; > + > + /* The bits which needs to be overridden */ > + u64 health_bitmap_inject_mask; > + > }; > > static int papr_scm_pmem_flush(struct nd_region *nd_region, > @@ -347,19 +351,29 @@ static ssize_t 
drc_pmem_query_stats(struct papr_scm_priv *p, > static int __drc_pmem_query_health(struct papr_scm_priv *p) > { > unsigned long ret[PLPAR_HCALL_BUFSIZE]; > + u64 bitmap = 0; > long rc; > > /* issue the hcall */ > rc = plpar_hcall(H_SCM_HEALTH, ret, p->drc_index); > - if (rc != H_SUCCESS) { > + if (rc == H_SUCCESS) > + bitmap = ret[0] & ret[1]; > + else if (rc == H_FUNCTION) > + dev_info_once(&p->pdev->dev, > + "Hcall H_SCM_HEALTH not implemented, assuming empty health bitmap"); > + else { > + > dev_err(&p->pdev->dev, > "Failed to query health information, Err:%ld\n", rc); > return -ENXIO; > } > > p->lasthealth_jiffies = jiffies; > - p->health_bitmap = ret[0] & ret[1]; > - > + /* Allow injecting specific health bits via inject mask. */ > + if (p->health_bitmap_inject_mask) > + bitmap = (bitmap & ~p->health_bitmap_inject_mask) | > + p->health_bitmap_inject_mask; > + WRITE_ONCE(p->health_bitmap, bitmap); > dev_dbg(&p->pdev->dev, > "Queried dimm health info. Bitmap:0x%016lx Mask:0x%016lx\n", > ret[0], ret[1]); > @@ -669,6 +683,56 @@ static int papr_pdsm_health(struct papr_scm_priv *p, > return rc; > } > > +/* Inject a smart error Add the dirty-shutdown-counter value to the pdsm */ > +static int papr_pdsm_smart_inject(struct papr_scm_priv *p, > + union nd_pdsm_payload *payload) > +{ > + int rc; > + u32 supported_flags = 0; > + u64 inject_mask = 0, clear_mask = 0; > + u64 mask; > + > + /* Check for individual smart error flags and update inject/clear masks */ > + if (payload->smart_inject.flags & PDSM_SMART_INJECT_HEALTH_FATAL) { > + supported_flags |= PDSM_SMART_INJECT_HEALTH_FATAL; > + if (payload->smart_inject.fatal_enable) > + inject_mask |= PAPR_PMEM_HEALTH_FATAL; > + else > + clear_mask |= PAPR_PMEM_HEALTH_FATAL; > + } > + > + if (payload->smart_inject.flags & PDSM_SMART_INJECT_BAD_SHUTDOWN) { > + supported_flags |= PDSM_SMART_INJECT_BAD_SHUTDOWN; > + if (payload->smart_inject.unsafe_shutdown_enable) > + inject_mask |= PAPR_PMEM_SHUTDOWN_DIRTY; > + else > + 
clear_mask |= PAPR_PMEM_SHUTDOWN_DIRTY; > + } > + > + dev_dbg(&p->pdev->dev, "[Smart-inject] inject_mask=%#llx clear_mask=%#llx\n", > + inject_mask, clear_mask); > + > + /* Prevent concurrent access to dimm health bitmap related members */ > + rc = mutex_lock_interruptible(&p->health_mutex); > + if (rc) > + return rc; > + > + /* Use inject/clear masks to set health_bitmap_inject_mask */ > + mask = READ_ONCE(p->health_bitmap_inject_mask); > + mask = (mask & ~clear_mask) | inject_mask; > + WRITE_ONCE(p->health_bitmap_inject_mask, mask); > + > + /* Invalidate cached health bitmap */ > + p->lasthealth_jiffies = 0; > + > + mutex_unlock(&p->health_mutex); > + > + /* Return the supported flags back to userspace */ > + payload->smart_inject.flags = supported_flags; > + > + return sizeof(struct nd_papr_pdsm_health); > +} > + > /* > * 'struct pdsm_cmd_desc' > * Identifies supported PDSMs' expected length of in/out payloads > @@ -702,6 +766,12 @@ static const struct pdsm_cmd_desc __pdsm_cmd_descriptors[] = { > .size_out = sizeof(struct nd_papr_pdsm_health), > .service = papr_pdsm_health, > }, > + > + [PAPR_PDSM_SMART_INJECT] = { > + .size_in = sizeof(struct nd_papr_pdsm_smart_inject), > + .size_out = sizeof(struct nd_papr_pdsm_smart_inject), > + .service = papr_pdsm_smart_inject, > + }, > /* Empty */ > [PAPR_PDSM_MAX] = { > .size_in = 0, > @@ -838,6 +908,19 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, > return 0; > } > > +static ssize_t health_bitmap_inject_show(struct device *dev, > + struct device_attribute *attr, > + char *buf) > +{ > + struct nvdimm *dimm = to_nvdimm(dev); > + struct papr_scm_priv *p = nvdimm_provider_data(dimm); > + > + return sprintf(buf, "%#llx\n", > + READ_ONCE(p->health_bitmap_inject_mask)); > +} > + > +static DEVICE_ATTR_ADMIN_RO(health_bitmap_inject); > + > static ssize_t perf_stats_show(struct device *dev, > struct device_attribute *attr, char *buf) > { > @@ -952,6 +1035,7 @@ static struct attribute *papr_nd_attributes[] = { 
> &dev_attr_flags.attr, > &dev_attr_perf_stats.attr, > &dev_attr_dirty_shutdown.attr, > + &dev_attr_health_bitmap_inject.attr, > NULL, > }; > > -- > 2.34.1
On Tue, 25 Jan 2022 01:52:04 +0530, Vaibhav Jain wrote: > Presently PAPR doesn't support injecting smart errors on an > NVDIMM. This makes testing the NVDIMM health reporting functionality > difficult as simulating NVDIMM health related events needs a hacked-up > qemu version. > > To solve this problem this patch proposes simulating a certain set of > NVDIMM health related events in papr_scm, specifically the 'fatal' health > state and the 'dirty' shutdown state. These errors can be injected via the > user-space 'ndctl-inject-smart(1)' command. With the proposed patch and > corresponding ndctl patches the following command flow is expected: > > [...] Applied to powerpc/next. [1/1] powerpc/papr_scm: Implement initial support for injecting smart errors https://git.kernel.org/powerpc/c/bbbca72352bb9484bc057c91a408332b35ee8f4c cheers
diff --git a/Documentation/ABI/testing/sysfs-bus-papr-pmem b/Documentation/ABI/testing/sysfs-bus-papr-pmem index 95254cec92bf..4ac0673901e7 100644 --- a/Documentation/ABI/testing/sysfs-bus-papr-pmem +++ b/Documentation/ABI/testing/sysfs-bus-papr-pmem @@ -61,3 +61,15 @@ Description: * "CchRHCnt" : Cache Read Hit Count * "CchWHCnt" : Cache Write Hit Count * "FastWCnt" : Fast Write Count + +What: /sys/bus/nd/devices/nmemX/papr/health_bitmap_inject +Date: Jan, 2022 +KernelVersion: v5.17 +Contact: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>, nvdimm@lists.linux.dev, +Description: + (RO) Reports the health bitmap inject bitmap that is applied to + bitmap received from PowerVM via the H_SCM_HEALTH. This is used + to forcibly set specific bits returned from Hcall. These is then + used to simulate various health or shutdown states for an nvdimm + and are set by user-space tools like ndctl by issuing a PAPR DSM. + diff --git a/arch/powerpc/include/uapi/asm/papr_pdsm.h b/arch/powerpc/include/uapi/asm/papr_pdsm.h index 82488b1e7276..17439925045c 100644 --- a/arch/powerpc/include/uapi/asm/papr_pdsm.h +++ b/arch/powerpc/include/uapi/asm/papr_pdsm.h @@ -116,6 +116,22 @@ struct nd_papr_pdsm_health { }; }; +/* Flags for injecting specific smart errors */ +#define PDSM_SMART_INJECT_HEALTH_FATAL (1 << 0) +#define PDSM_SMART_INJECT_BAD_SHUTDOWN (1 << 1) + +struct nd_papr_pdsm_smart_inject { + union { + struct { + /* One or more of PDSM_SMART_INJECT_ */ + __u32 flags; + __u8 fatal_enable; + __u8 unsafe_shutdown_enable; + }; + __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; + }; +}; + /* * Methods to be embedded in ND_CMD_CALL request. 
These are sent to the kernel * via 'nd_cmd_pkg.nd_command' member of the ioctl struct @@ -123,12 +139,14 @@ struct nd_papr_pdsm_health { enum papr_pdsm { PAPR_PDSM_MIN = 0x0, PAPR_PDSM_HEALTH, + PAPR_PDSM_SMART_INJECT, PAPR_PDSM_MAX, }; /* Maximal union that can hold all possible payload types */ union nd_pdsm_payload { struct nd_papr_pdsm_health health; + struct nd_papr_pdsm_smart_inject smart_inject; __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE]; } __packed; diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index f48e87ac89c9..20aafd387840 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -120,6 +120,10 @@ struct papr_scm_priv { /* length of the stat buffer as expected by phyp */ size_t stat_buffer_len; + + /* The bits which needs to be overridden */ + u64 health_bitmap_inject_mask; + }; static int papr_scm_pmem_flush(struct nd_region *nd_region, @@ -347,19 +351,29 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p, static int __drc_pmem_query_health(struct papr_scm_priv *p) { unsigned long ret[PLPAR_HCALL_BUFSIZE]; + u64 bitmap = 0; long rc; /* issue the hcall */ rc = plpar_hcall(H_SCM_HEALTH, ret, p->drc_index); - if (rc != H_SUCCESS) { + if (rc == H_SUCCESS) + bitmap = ret[0] & ret[1]; + else if (rc == H_FUNCTION) + dev_info_once(&p->pdev->dev, + "Hcall H_SCM_HEALTH not implemented, assuming empty health bitmap"); + else { + dev_err(&p->pdev->dev, "Failed to query health information, Err:%ld\n", rc); return -ENXIO; } p->lasthealth_jiffies = jiffies; - p->health_bitmap = ret[0] & ret[1]; - + /* Allow injecting specific health bits via inject mask. */ + if (p->health_bitmap_inject_mask) + bitmap = (bitmap & ~p->health_bitmap_inject_mask) | + p->health_bitmap_inject_mask; + WRITE_ONCE(p->health_bitmap, bitmap); dev_dbg(&p->pdev->dev, "Queried dimm health info. 
Bitmap:0x%016lx Mask:0x%016lx\n", ret[0], ret[1]); @@ -669,6 +683,56 @@ static int papr_pdsm_health(struct papr_scm_priv *p, return rc; } +/* Inject a smart error Add the dirty-shutdown-counter value to the pdsm */ +static int papr_pdsm_smart_inject(struct papr_scm_priv *p, + union nd_pdsm_payload *payload) +{ + int rc; + u32 supported_flags = 0; + u64 inject_mask = 0, clear_mask = 0; + u64 mask; + + /* Check for individual smart error flags and update inject/clear masks */ + if (payload->smart_inject.flags & PDSM_SMART_INJECT_HEALTH_FATAL) { + supported_flags |= PDSM_SMART_INJECT_HEALTH_FATAL; + if (payload->smart_inject.fatal_enable) + inject_mask |= PAPR_PMEM_HEALTH_FATAL; + else + clear_mask |= PAPR_PMEM_HEALTH_FATAL; + } + + if (payload->smart_inject.flags & PDSM_SMART_INJECT_BAD_SHUTDOWN) { + supported_flags |= PDSM_SMART_INJECT_BAD_SHUTDOWN; + if (payload->smart_inject.unsafe_shutdown_enable) + inject_mask |= PAPR_PMEM_SHUTDOWN_DIRTY; + else + clear_mask |= PAPR_PMEM_SHUTDOWN_DIRTY; + } + + dev_dbg(&p->pdev->dev, "[Smart-inject] inject_mask=%#llx clear_mask=%#llx\n", + inject_mask, clear_mask); + + /* Prevent concurrent access to dimm health bitmap related members */ + rc = mutex_lock_interruptible(&p->health_mutex); + if (rc) + return rc; + + /* Use inject/clear masks to set health_bitmap_inject_mask */ + mask = READ_ONCE(p->health_bitmap_inject_mask); + mask = (mask & ~clear_mask) | inject_mask; + WRITE_ONCE(p->health_bitmap_inject_mask, mask); + + /* Invalidate cached health bitmap */ + p->lasthealth_jiffies = 0; + + mutex_unlock(&p->health_mutex); + + /* Return the supported flags back to userspace */ + payload->smart_inject.flags = supported_flags; + + return sizeof(struct nd_papr_pdsm_health); +} + /* * 'struct pdsm_cmd_desc' * Identifies supported PDSMs' expected length of in/out payloads @@ -702,6 +766,12 @@ static const struct pdsm_cmd_desc __pdsm_cmd_descriptors[] = { .size_out = sizeof(struct nd_papr_pdsm_health), .service = papr_pdsm_health, }, + 
+ [PAPR_PDSM_SMART_INJECT] = { + .size_in = sizeof(struct nd_papr_pdsm_smart_inject), + .size_out = sizeof(struct nd_papr_pdsm_smart_inject), + .service = papr_pdsm_smart_inject, + }, /* Empty */ [PAPR_PDSM_MAX] = { .size_in = 0, @@ -838,6 +908,19 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, return 0; } +static ssize_t health_bitmap_inject_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvdimm *dimm = to_nvdimm(dev); + struct papr_scm_priv *p = nvdimm_provider_data(dimm); + + return sprintf(buf, "%#llx\n", + READ_ONCE(p->health_bitmap_inject_mask)); +} + +static DEVICE_ATTR_ADMIN_RO(health_bitmap_inject); + static ssize_t perf_stats_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -952,6 +1035,7 @@ static struct attribute *papr_nd_attributes[] = { &dev_attr_flags.attr, &dev_attr_perf_stats.attr, &dev_attr_dirty_shutdown.attr, + &dev_attr_health_bitmap_inject.attr, NULL, };