diff mbox series

hw/block/nvme: add smart_critical_warning property

Message ID 20210111075003.151764-1-pizhenwei@bytedance.com (mailing list archive)
State New, archived
Headers show
Series hw/block/nvme: add smart_critical_warning property | expand

Commit Message

zhenwei pi Jan. 11, 2021, 7:50 a.m. UTC
Physical NVMe disks very rarely hit a hardware critical warning
condition, which makes it hard to write and test a monitoring
agent service.

For debugging purposes, add a new 'smart_critical_warning' property
to emulate this situation.

Test with this patch:
1. Append 'smart_critical_warning=16' to the nvme device parameters.
2. Run smartctl in the guest:
 #smartctl -H -l error /dev/nvme0n1

  === START OF SMART DATA SECTION ===
  SMART overall-health self-assessment test result: FAILED!
  - volatile memory backup device has failed

Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
---
 hw/block/nvme.c | 4 ++++
 hw/block/nvme.h | 1 +
 2 files changed, 5 insertions(+)

Comments

Philippe Mathieu-Daudé Jan. 11, 2021, 9:14 a.m. UTC | #1
On 1/11/21 8:50 AM, zhenwei pi wrote:
> There is a very low probability that hitting physical NVMe disk
> hardware critical warning case, it's hard to write & test a monitor
> agent service.
> 
> For debugging purposes, add a new 'smart_critical_warning' property
> to emulate this situation.
> 
> Test with this patch:
> 1, append 'smart_critical_warning=16' for nvme parameters.
> 2, run smartctl in guest
>  #smartctl -H -l error /dev/nvme0n1
> 
>   === START OF SMART DATA SECTION ===
>   SMART overall-health self-assessment test result: FAILED!
>   - volatile memory backup device has failed
> 
> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> ---
>  hw/block/nvme.c | 4 ++++
>  hw/block/nvme.h | 1 +
>  2 files changed, 5 insertions(+)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 27d2c72716..2f0bcac91c 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1215,6 +1215,8 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
>  
>      trans_len = MIN(sizeof(smart) - off, buf_len);
>  
> +    smart.critical_warning = n->params.smart_critical_warning;
> +
>      smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
>                                                          1000));
>      smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
> @@ -2824,6 +2826,8 @@ static Property nvme_props[] = {
>      DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
>      DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
>      DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
> +    DEFINE_PROP_UINT8("smart_critical_warning", NvmeCtrl,
> +                      params.smart_critical_warning, 0),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index e080a2318a..76684f5ac0 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -16,6 +16,7 @@ typedef struct NvmeParams {
>      uint32_t aer_max_queued;
>      uint8_t  mdts;
>      bool     use_intel_id;
> +    uint8_t  smart_critical_warning;
>  } NvmeParams;
>  
>  typedef struct NvmeAsyncEvent {
> 

This is an easy way to achieve your goal.

However, a better way is to add a QMP command to
change NvmeCtrl->temperature.

See for example tmp105_initfn() in hw/misc/tmp105.c
and qmp_tmp105_set_temperature() in tests/qtest/tmp105-test.c.

Regards,

Phil.
Klaus Jensen Jan. 11, 2021, 9:21 a.m. UTC | #2
On Jan 11 10:14, Philippe Mathieu-Daudé wrote:
> On 1/11/21 8:50 AM, zhenwei pi wrote:
> > There is a very low probability that hitting physical NVMe disk
> > hardware critical warning case, it's hard to write & test a monitor
> > agent service.
> > 
> > For debugging purposes, add a new 'smart_critical_warning' property
> > to emulate this situation.
> > 
> > Test with this patch:
> > 1, append 'smart_critical_warning=16' for nvme parameters.
> > 2, run smartctl in guest
> >  #smartctl -H -l error /dev/nvme0n1
> > 
> >   === START OF SMART DATA SECTION ===
> >   SMART overall-health self-assessment test result: FAILED!
> >   - volatile memory backup device has failed
> > 
> > Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> > ---
> >  hw/block/nvme.c | 4 ++++
> >  hw/block/nvme.h | 1 +
> >  2 files changed, 5 insertions(+)
> > 
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 27d2c72716..2f0bcac91c 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -1215,6 +1215,8 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
> >  
> >      trans_len = MIN(sizeof(smart) - off, buf_len);
> >  
> > +    smart.critical_warning = n->params.smart_critical_warning;
> > +
> >      smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
> >                                                          1000));
> >      smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
> > @@ -2824,6 +2826,8 @@ static Property nvme_props[] = {
> >      DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
> >      DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
> >      DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
> > +    DEFINE_PROP_UINT8("smart_critical_warning", NvmeCtrl,
> > +                      params.smart_critical_warning, 0),
> >      DEFINE_PROP_END_OF_LIST(),
> >  };
> >  
> > diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> > index e080a2318a..76684f5ac0 100644
> > --- a/hw/block/nvme.h
> > +++ b/hw/block/nvme.h
> > @@ -16,6 +16,7 @@ typedef struct NvmeParams {
> >      uint32_t aer_max_queued;
> >      uint8_t  mdts;
> >      bool     use_intel_id;
> > +    uint8_t  smart_critical_warning;
> >  } NvmeParams;
> >  
> >  typedef struct NvmeAsyncEvent {
> > 
> 
> This is an easy way to achieve your goal.
> 
> However a better way is to add a QMP command to
> change NvmeCtrl->temperature.
> 
> See for example tmp105_initfn() in hw/misc/tmp105.c
> and qmp_tmp105_set_temperature() in tests/qtest/tmp105-test.c.
> 

Nice.

+1 for this approach.
zhenwei pi Jan. 11, 2021, 9:49 a.m. UTC | #3
On 1/11/21 5:21 PM, Klaus Jensen wrote:
> On Jan 11 10:14, Philippe Mathieu-Daudé wrote:
>> On 1/11/21 8:50 AM, zhenwei pi wrote:
>>> There is a very low probability that hitting physical NVMe disk
>>> hardware critical warning case, it's hard to write & test a monitor
>>> agent service.
>>>
>>> For debugging purposes, add a new 'smart_critical_warning' property
>>> to emulate this situation.
>>>
>>> Test with this patch:
>>> 1, append 'smart_critical_warning=16' for nvme parameters.
>>> 2, run smartctl in guest
>>>   #smartctl -H -l error /dev/nvme0n1
>>>
>>>    === START OF SMART DATA SECTION ===
>>>    SMART overall-health self-assessment test result: FAILED!
>>>    - volatile memory backup device has failed
>>>
>>> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
>>> ---
>>>   hw/block/nvme.c | 4 ++++
>>>   hw/block/nvme.h | 1 +
>>>   2 files changed, 5 insertions(+)
>>>
>>> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
>>> index 27d2c72716..2f0bcac91c 100644
>>> --- a/hw/block/nvme.c
>>> +++ b/hw/block/nvme.c
>>> @@ -1215,6 +1215,8 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
>>>   
>>>       trans_len = MIN(sizeof(smart) - off, buf_len);
>>>   
>>> +    smart.critical_warning = n->params.smart_critical_warning;
>>> +
>>>       smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
>>>                                                           1000));
>>>       smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
>>> @@ -2824,6 +2826,8 @@ static Property nvme_props[] = {
>>>       DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
>>>       DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
>>>       DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
>>> +    DEFINE_PROP_UINT8("smart_critical_warning", NvmeCtrl,
>>> +                      params.smart_critical_warning, 0),
>>>       DEFINE_PROP_END_OF_LIST(),
>>>   };
>>>   
>>> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
>>> index e080a2318a..76684f5ac0 100644
>>> --- a/hw/block/nvme.h
>>> +++ b/hw/block/nvme.h
>>> @@ -16,6 +16,7 @@ typedef struct NvmeParams {
>>>       uint32_t aer_max_queued;
>>>       uint8_t  mdts;
>>>       bool     use_intel_id;
>>> +    uint8_t  smart_critical_warning;
>>>   } NvmeParams;
>>>   
>>>   typedef struct NvmeAsyncEvent {
>>>
>>
>> This is an easy way to achieve your goal.
>>
>> However a better way is to add a QMP command to
>> change NvmeCtrl->temperature.
>>
>> See for example tmp105_initfn() in hw/misc/tmp105.c
>> and qmp_tmp105_set_temperature() in tests/qtest/tmp105-test.c.
>>
> 
> Nice.
> 
> +1 for this approach.
> 

Using a QMP command to change NvmeCtrl->temperature only triggers the
NVME_SMART_TEMPERATURE warning. That is enough to test the workflow of
upper-layer software, but not enough to test each individual warning.

From NVMe version 1.3 to 1.4, a new bit definition was added (bit 5:
Persistent Memory Region has become read-only or unreliable). Before
we actually hit this warning on a physical disk, we can use QEMU to test
this feature (and possibly other new features in the future).

I don't disagree with the "add a QMP command" solution, but I think QEMU
should be able to emulate all of the warnings (not only temperature).
Klaus Jensen Jan. 11, 2021, 11:15 a.m. UTC | #4
On Jan 11 17:49, zhenwei pi wrote:
> On 1/11/21 5:21 PM, Klaus Jensen wrote:
> > On Jan 11 10:14, Philippe Mathieu-Daudé wrote:
> > > On 1/11/21 8:50 AM, zhenwei pi wrote:
> > > > There is a very low probability that hitting physical NVMe disk
> > > > hardware critical warning case, it's hard to write & test a monitor
> > > > agent service.
> > > > 
> > > > For debugging purposes, add a new 'smart_critical_warning' property
> > > > to emulate this situation.
> > > > 
> > > > Test with this patch:
> > > > 1, append 'smart_critical_warning=16' for nvme parameters.
> > > > 2, run smartctl in guest
> > > >   #smartctl -H -l error /dev/nvme0n1
> > > > 
> > > >    === START OF SMART DATA SECTION ===
> > > >    SMART overall-health self-assessment test result: FAILED!
> > > >    - volatile memory backup device has failed
> > > > 
> > > > Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> > > > ---
> > > >   hw/block/nvme.c | 4 ++++
> > > >   hw/block/nvme.h | 1 +
> > > >   2 files changed, 5 insertions(+)
> > > > 
> > > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > > index 27d2c72716..2f0bcac91c 100644
> > > > --- a/hw/block/nvme.c
> > > > +++ b/hw/block/nvme.c
> > > > @@ -1215,6 +1215,8 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
> > > >       trans_len = MIN(sizeof(smart) - off, buf_len);
> > > > +    smart.critical_warning = n->params.smart_critical_warning;
> > > > +
> > > >       smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
> > > >                                                           1000));
> > > >       smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
> > > > @@ -2824,6 +2826,8 @@ static Property nvme_props[] = {
> > > >       DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
> > > >       DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
> > > >       DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
> > > > +    DEFINE_PROP_UINT8("smart_critical_warning", NvmeCtrl,
> > > > +                      params.smart_critical_warning, 0),
> > > >       DEFINE_PROP_END_OF_LIST(),
> > > >   };
> > > > diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> > > > index e080a2318a..76684f5ac0 100644
> > > > --- a/hw/block/nvme.h
> > > > +++ b/hw/block/nvme.h
> > > > @@ -16,6 +16,7 @@ typedef struct NvmeParams {
> > > >       uint32_t aer_max_queued;
> > > >       uint8_t  mdts;
> > > >       bool     use_intel_id;
> > > > +    uint8_t  smart_critical_warning;
> > > >   } NvmeParams;
> > > >   typedef struct NvmeAsyncEvent {
> > > > 
> > > 
> > > This is an easy way to achieve your goal.
> > > 
> > > However a better way is to add a QMP command to
> > > change NvmeCtrl->temperature.
> > > 
> > > See for example tmp105_initfn() in hw/misc/tmp105.c
> > > and qmp_tmp105_set_temperature() in tests/qtest/tmp105-test.c.
> > > 
> > 
> > Nice.
> > 
> > +1 for this approach.
> > 
> 
> Using QMP command to change NvmeCtrl->temperature only triggers
> NVME_SMART_TEMPERATURE warning, it's OK to test the work flow of uplayer
> software, but it's not enough to test all the cases of each warning.
> 
> From NVMe version 1.3 to 1.4, a new bit definition has been added(bit 5,
> Persistent Memory Region has become read-only or unreliable). Before we
> really hit this warning on a physical disk, we can use QEMU to test this
> feature(maybe another new feature in the future).
> 
> I don't disagree "add a QMP command" solution, but I think QEMU should be
> able to emulate all of the warnings(not only temperature).
> 

I think Philippe was just giving an example. It can be a QMP command that
sets the critical warning field.
diff mbox series

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 27d2c72716..2f0bcac91c 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1215,6 +1215,8 @@  static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
 
     trans_len = MIN(sizeof(smart) - off, buf_len);
 
+    smart.critical_warning = n->params.smart_critical_warning;
+
     smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
                                                         1000));
     smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
@@ -2824,6 +2826,8 @@  static Property nvme_props[] = {
     DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
     DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
+    DEFINE_PROP_UINT8("smart_critical_warning", NvmeCtrl,
+                      params.smart_critical_warning, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a..76684f5ac0 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -16,6 +16,7 @@  typedef struct NvmeParams {
     uint32_t aer_max_queued;
     uint8_t  mdts;
     bool     use_intel_id;
+    uint8_t  smart_critical_warning;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {