diff mbox series

[v3,10/11] PCI/AER: Add optional logging callback for correctable error

Message ID 166879134199.674819.15564186577122699358.stgit@djiang5-desk3.ch.intel.com (mailing list archive)
State Superseded
Headers show
Series cxl/pci: Add fundamental error handling | expand

Commit Message

Dave Jiang Nov. 18, 2022, 5:09 p.m. UTC
Some new devices such as CXL devices may want to record additional error
information on a corrected error. Add a callback to allow the PCI device
driver to do additional logging and/or error handling.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/pci/pcie/aer.c |    8 +++++++-
 include/linux/pci.h    |    3 +++
 2 files changed, 10 insertions(+), 1 deletion(-)

Comments

Sathyanarayanan Kuppuswamy Nov. 19, 2022, 1:08 a.m. UTC | #1
Hi,

On 11/18/22 9:09 AM, Dave Jiang wrote:
> Some new devices such as CXL devices may want to record additional error
> information on a corrected error. Add a callback to allow the PCI device
> driver to do additional logging and/or error handling.

Change looks good. But I am not sure whether this needs to be documented
in Documentation/PCI/pci-error-recovery.rst with usage example. I will let
Bjorn make a call on it.

Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>

> 
> Cc: Bjorn Helgaas <bhelgaas@google.com>
> Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> ---
>  drivers/pci/pcie/aer.c |    8 +++++++-
>  include/linux/pci.h    |    3 +++
>  2 files changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index e2d8a74f83c3..af1b5eecbb11 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -961,8 +961,14 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
>  		if (aer)
>  			pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
>  					info->status);
> -		if (pcie_aer_is_native(dev))
> +		if (pcie_aer_is_native(dev)) {
> +			struct pci_driver *pdrv = dev->driver;
> +
> +			if (pdrv && pdrv->err_handler &&
> +			    pdrv->err_handler->cor_error_log)
> +				pdrv->err_handler->cor_error_log(dev);
>  			pcie_clear_device_status(dev);
> +		}
>  	} else if (info->severity == AER_NONFATAL)
>  		pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset);
>  	else if (info->severity == AER_FATAL)
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 575849a100a3..54939b3426a9 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -844,6 +844,9 @@ struct pci_error_handlers {
>  
>  	/* Device driver may resume normal operations */
>  	void (*resume)(struct pci_dev *dev);
> +
> +	/* Allow device driver to record more details of a correctable error */
> +	void (*cor_error_log)(struct pci_dev *dev);
>  };
>  
>  
> 
>
Jonathan Cameron Nov. 21, 2022, 12:05 p.m. UTC | #2
On Fri, 18 Nov 2022 10:09:02 -0700
Dave Jiang <dave.jiang@intel.com> wrote:

> Some new devices such as CXL devices may want to record additional error
> information on a corrected error. Add a callback to allow the PCI device
> driver to do additional logging and/or error handling.

Probably want to be a little careful about talking about error handling for
corrected errors.  It does make sense if you are doing stats based offlining
of flaky parts of devices (we do this on some of our crypto and similar
accelerators), but that isn't really 'error handling'.

Agreed with other review that it might warrant some documentation but as
said their, Bjorn's call to make!

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

> 
> Cc: Bjorn Helgaas <bhelgaas@google.com>
> Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> ---
>  drivers/pci/pcie/aer.c |    8 +++++++-
>  include/linux/pci.h    |    3 +++
>  2 files changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index e2d8a74f83c3..af1b5eecbb11 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -961,8 +961,14 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
>  		if (aer)
>  			pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
>  					info->status);
> -		if (pcie_aer_is_native(dev))
> +		if (pcie_aer_is_native(dev)) {
> +			struct pci_driver *pdrv = dev->driver;
> +
> +			if (pdrv && pdrv->err_handler &&
> +			    pdrv->err_handler->cor_error_log)
> +				pdrv->err_handler->cor_error_log(dev);
>  			pcie_clear_device_status(dev);
> +		}
>  	} else if (info->severity == AER_NONFATAL)
>  		pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset);
>  	else if (info->severity == AER_FATAL)
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 575849a100a3..54939b3426a9 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -844,6 +844,9 @@ struct pci_error_handlers {
>  
>  	/* Device driver may resume normal operations */
>  	void (*resume)(struct pci_dev *dev);
> +
> +	/* Allow device driver to record more details of a correctable error */
> +	void (*cor_error_log)(struct pci_dev *dev);
>  };
>  
>  
> 
>
Jonathan Cameron Nov. 21, 2022, 12:17 p.m. UTC | #3
On Mon, 21 Nov 2022 12:05:27 +0000
Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:

> On Fri, 18 Nov 2022 10:09:02 -0700
> Dave Jiang <dave.jiang@intel.com> wrote:
> 
> > Some new devices such as CXL devices may want to record additional error
> > information on a corrected error. Add a callback to allow the PCI device
> > driver to do additional logging and/or error handling.  
> 
> Probably want to be a little careful about talking about error handling for
> corrected errors.  It does make sense if you are doing stats based offlining
> of flaky parts of devices (we do this on some of our crypto and similar
> accelerators), but that isn't really 'error handling'.

Ah I'd also forgotten the mess of 'correctable' in PCIE (6.2.2.1 in PCIe r6.0 base)
Reality is that complex text means that the hardware can correct it without
intervention (though it may be other hardware, and it may not correct it, in which
case the source of the AER error should then issue an uncorrectable error message.)

Basic point stands but language is tricky around this.

Jonathan


> 
> Agreed with other review that it might warrant some documentation but as
> said their, Bjorn's call to make!
> 
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> > 
> > Cc: Bjorn Helgaas <bhelgaas@google.com>
> > Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > ---
> >  drivers/pci/pcie/aer.c |    8 +++++++-
> >  include/linux/pci.h    |    3 +++
> >  2 files changed, 10 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > index e2d8a74f83c3..af1b5eecbb11 100644
> > --- a/drivers/pci/pcie/aer.c
> > +++ b/drivers/pci/pcie/aer.c
> > @@ -961,8 +961,14 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
> >  		if (aer)
> >  			pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
> >  					info->status);
> > -		if (pcie_aer_is_native(dev))
> > +		if (pcie_aer_is_native(dev)) {
> > +			struct pci_driver *pdrv = dev->driver;
> > +
> > +			if (pdrv && pdrv->err_handler &&
> > +			    pdrv->err_handler->cor_error_log)
> > +				pdrv->err_handler->cor_error_log(dev);
> >  			pcie_clear_device_status(dev);
> > +		}
> >  	} else if (info->severity == AER_NONFATAL)
> >  		pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset);
> >  	else if (info->severity == AER_FATAL)
> > diff --git a/include/linux/pci.h b/include/linux/pci.h
> > index 575849a100a3..54939b3426a9 100644
> > --- a/include/linux/pci.h
> > +++ b/include/linux/pci.h
> > @@ -844,6 +844,9 @@ struct pci_error_handlers {
> >  
> >  	/* Device driver may resume normal operations */
> >  	void (*resume)(struct pci_dev *dev);
> > +
> > +	/* Allow device driver to record more details of a correctable error */
> > +	void (*cor_error_log)(struct pci_dev *dev);
> >  };
> >  
> >  
> > 
> >   
>
Dave Jiang Nov. 28, 2022, 6:19 p.m. UTC | #4
On 11/18/2022 6:08 PM, Sathyanarayanan Kuppuswamy wrote:
> Hi,
> 
> On 11/18/22 9:09 AM, Dave Jiang wrote:
>> Some new devices such as CXL devices may want to record additional error
>> information on a corrected error. Add a callback to allow the PCI device
>> driver to do additional logging and/or error handling.
> 
> Change looks good. But I am not sure whether this needs to be documented
> in Documentation/PCI/pci-error-recovery.rst with usage example. I will let
> Bjorn make a call on it.

Ok I'll add documentation. Thanks for the review!

> 
> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> 
>>
>> Cc: Bjorn Helgaas <bhelgaas@google.com>
>> Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>> ---
>>   drivers/pci/pcie/aer.c |    8 +++++++-
>>   include/linux/pci.h    |    3 +++
>>   2 files changed, 10 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index e2d8a74f83c3..af1b5eecbb11 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -961,8 +961,14 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
>>   		if (aer)
>>   			pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
>>   					info->status);
>> -		if (pcie_aer_is_native(dev))
>> +		if (pcie_aer_is_native(dev)) {
>> +			struct pci_driver *pdrv = dev->driver;
>> +
>> +			if (pdrv && pdrv->err_handler &&
>> +			    pdrv->err_handler->cor_error_log)
>> +				pdrv->err_handler->cor_error_log(dev);
>>   			pcie_clear_device_status(dev);
>> +		}
>>   	} else if (info->severity == AER_NONFATAL)
>>   		pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset);
>>   	else if (info->severity == AER_FATAL)
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 575849a100a3..54939b3426a9 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -844,6 +844,9 @@ struct pci_error_handlers {
>>   
>>   	/* Device driver may resume normal operations */
>>   	void (*resume)(struct pci_dev *dev);
>> +
>> +	/* Allow device driver to record more details of a correctable error */
>> +	void (*cor_error_log)(struct pci_dev *dev);
>>   };
>>   
>>   
>>
>>
>
Dave Jiang Nov. 28, 2022, 9:01 p.m. UTC | #5
On 11/21/2022 5:05 AM, Jonathan Cameron wrote:
> On Fri, 18 Nov 2022 10:09:02 -0700
> Dave Jiang <dave.jiang@intel.com> wrote:
> 
>> Some new devices such as CXL devices may want to record additional error
>> information on a corrected error. Add a callback to allow the PCI device
>> driver to do additional logging and/or error handling.
> 
> Probably want to be a little careful about talking about error handling for
> corrected errors.  It does make sense if you are doing stats based offlining
> of flaky parts of devices (we do this on some of our crypto and similar
> accelerators), but that isn't really 'error handling'.

I'll remove the text of error handling and make it to be for optional 
additional logging only.

> 
> Agreed with other review that it might warrant some documentation but as
> said their, Bjorn's call to make!
> 
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
>>
>> Cc: Bjorn Helgaas <bhelgaas@google.com>
>> Suggested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
>> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>> ---
>>   drivers/pci/pcie/aer.c |    8 +++++++-
>>   include/linux/pci.h    |    3 +++
>>   2 files changed, 10 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index e2d8a74f83c3..af1b5eecbb11 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -961,8 +961,14 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
>>   		if (aer)
>>   			pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
>>   					info->status);
>> -		if (pcie_aer_is_native(dev))
>> +		if (pcie_aer_is_native(dev)) {
>> +			struct pci_driver *pdrv = dev->driver;
>> +
>> +			if (pdrv && pdrv->err_handler &&
>> +			    pdrv->err_handler->cor_error_log)
>> +				pdrv->err_handler->cor_error_log(dev);
>>   			pcie_clear_device_status(dev);
>> +		}
>>   	} else if (info->severity == AER_NONFATAL)
>>   		pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset);
>>   	else if (info->severity == AER_FATAL)
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 575849a100a3..54939b3426a9 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -844,6 +844,9 @@ struct pci_error_handlers {
>>   
>>   	/* Device driver may resume normal operations */
>>   	void (*resume)(struct pci_dev *dev);
>> +
>> +	/* Allow device driver to record more details of a correctable error */
>> +	void (*cor_error_log)(struct pci_dev *dev);
>>   };
>>   
>>   
>>
>>
>
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index e2d8a74f83c3..af1b5eecbb11 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -961,8 +961,14 @@  static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
 		if (aer)
 			pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
 					info->status);
-		if (pcie_aer_is_native(dev))
+		if (pcie_aer_is_native(dev)) {
+			struct pci_driver *pdrv = dev->driver;
+
+			if (pdrv && pdrv->err_handler &&
+			    pdrv->err_handler->cor_error_log)
+				pdrv->err_handler->cor_error_log(dev);
 			pcie_clear_device_status(dev);
+		}
 	} else if (info->severity == AER_NONFATAL)
 		pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset);
 	else if (info->severity == AER_FATAL)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 575849a100a3..54939b3426a9 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -844,6 +844,9 @@  struct pci_error_handlers {
 
 	/* Device driver may resume normal operations */
 	void (*resume)(struct pci_dev *dev);
+
+	/* Allow device driver to record more details of a correctable error */
+	void (*cor_error_log)(struct pci_dev *dev);
 };