diff mbox

[v1,1/2] PCI/AER: Decode Error Source Requester ID

Message ID 152770285586.80701.6545710900591672975.stgit@bhelgaas-glaptop.roam.corp.google.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Bjorn Helgaas May 30, 2018, 5:54 p.m. UTC
From: Bjorn Helgaas <bhelgaas@google.com>

Decode the Requester ID from the AER Error Source Register into domain/
bus/device/function format to match other logging.  In cases where the ID
matches the device used for pci_err(), drop the extra ID completely so we
don't print it twice.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pcie/aer/aerdrv_errprint.c |   18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

Comments

Lukas Wunner May 30, 2018, 6:32 p.m. UTC | #1
On Wed, May 30, 2018 at 12:54:15PM -0500, Bjorn Helgaas wrote:
>  void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
>  {
> -	pci_info(dev, "AER: %s%s error received: id=%04x\n",
> +	u8 bus = info->id >> 8;
> +	u8 devfn = info->id & 0xff;
> +
> +	pci_info(dev, "AER: %s%s error received: %04x:%02x:%02x.%d\n",
>  		info->multi_error_valid ? "Multiple " : "",
> -		aer_error_severity_string[info->severity], info->id);
> +		aer_error_severity_string[info->severity],
> +		pci_domain_nr(dev->bus), bus, devfn >> 3, devfn & 0x7);

I think PCI_SLOT(devfn), PCI_FUNC(devfn) is a bit more readable.
Rajat Jain May 30, 2018, 6:41 p.m. UTC | #2
On Wed, May 30, 2018 at 10:54 AM Bjorn Helgaas <helgaas@kernel.org> wrote:

> From: Bjorn Helgaas <bhelgaas@google.com>

> Decode the Requester ID from the AER Error Source Register into domain/
> bus/device/function format to match other logging.  In cases where the ID
> matches the device used for pci_err(), drop the extra ID completely so we
> don't print it twice.

> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> ---
>   drivers/pci/pcie/aer/aerdrv_errprint.c |   18 +++++++++++-------
>   1 file changed, 11 insertions(+), 7 deletions(-)

> diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c
b/drivers/pci/pcie/aer/aerdrv_errprint.c
> index 21ca5e1b0ded..d7fde8368d81 100644
> --- a/drivers/pci/pcie/aer/aerdrv_errprint.c
> +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
> @@ -163,17 +163,17 @@ void aer_print_error(struct pci_dev *dev, struct
aer_err_info *info)
>          int id = ((dev->bus->number << 8) | dev->devfn);

>          if (!info->status) {
> -               pci_err(dev, "PCIe Bus Error: severity=%s,
type=Unaccessible, id=%04x(Unregistered Agent ID)\n",
> -                       aer_error_severity_string[info->severity], id);
> +               pci_err(dev, "PCIe Bus Error: severity=%s,
type=Inaccessible, (Unregistered Agent ID)\n",
> +                       aer_error_severity_string[info->severity]);

Does this code path indicate that a requester id was decoded to a device
that is not registered with the kernel? If so, shouldn't we log the bad
requester ID for better debugging, specifically since there is not going to
be any subsequent print about this ID (since we return from this function
in this case)?

>                  goto out;
>          }

>          layer = AER_GET_LAYER_ERROR(info->severity, info->status);
>          agent = AER_GET_AGENT(info->severity, info->status);

> -       pci_err(dev, "PCIe Bus Error: severity=%s, type=%s,
id=%04x(%s)\n",
> +       pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
>                  aer_error_severity_string[info->severity],
> -               aer_error_layer[layer], id, aer_agent_string[agent]);
> +               aer_error_layer[layer], aer_agent_string[agent]);

>          pci_err(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
>                  dev->vendor, dev->device,
> @@ -186,7 +186,7 @@ void aer_print_error(struct pci_dev *dev, struct
aer_err_info *info)

>   out:
>          if (info->id && info->error_dev_num > 1 && info->id == id)
> -               pci_err(dev, "  Error of this Agent(%04x) is reported
first\n", id);
> +               pci_err(dev, "  Error of this Agent is reported first\n");

>          trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
>                          info->severity, info->tlp_header_valid,
&info->tlp);
> @@ -194,9 +194,13 @@ void aer_print_error(struct pci_dev *dev, struct
aer_err_info *info)

>   void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
>   {
> -       pci_info(dev, "AER: %s%s error received: id=%04x\n",
> +       u8 bus = info->id >> 8;
> +       u8 devfn = info->id & 0xff;
> +
> +       pci_info(dev, "AER: %s%s error received: %04x:%02x:%02x.%d\n",
>                  info->multi_error_valid ? "Multiple " : "",
> -               aer_error_severity_string[info->severity], info->id);
> +               aer_error_severity_string[info->severity],
> +               pci_domain_nr(dev->bus), bus, devfn >> 3, devfn & 0x7);
>   }

>   #ifdef CONFIG_ACPI_APEI_PCIEAER
Bjorn Helgaas May 31, 2018, 4:42 a.m. UTC | #3
On Wed, May 30, 2018 at 11:41:23AM -0700, Rajat Jain wrote:
> On Wed, May 30, 2018 at 10:54 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> 
> > From: Bjorn Helgaas <bhelgaas@google.com>
> 
> > Decode the Requester ID from the AER Error Source Register into domain/
> > bus/device/function format to match other logging.  In cases where the ID
> > matches the device used for pci_err(), drop the extra ID completely so we
> > don't print it twice.
> 
> > Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> > ---
> >   drivers/pci/pcie/aer/aerdrv_errprint.c |   18 +++++++++++-------
> >   1 file changed, 11 insertions(+), 7 deletions(-)
> 
> > diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c
> b/drivers/pci/pcie/aer/aerdrv_errprint.c
> > index 21ca5e1b0ded..d7fde8368d81 100644
> > --- a/drivers/pci/pcie/aer/aerdrv_errprint.c
> > +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
> > @@ -163,17 +163,17 @@ void aer_print_error(struct pci_dev *dev, struct
> aer_err_info *info)
> >          int id = ((dev->bus->number << 8) | dev->devfn);

> >          if (!info->status) {
> > -               pci_err(dev, "PCIe Bus Error: severity=%s,
> type=Unaccessible, id=%04x(Unregistered Agent ID)\n",
> > -                       aer_error_severity_string[info->severity], id);
> > +               pci_err(dev, "PCIe Bus Error: severity=%s,
> type=Inaccessible, (Unregistered Agent ID)\n",
> > +                       aer_error_severity_string[info->severity]);
> 
> Does this code path indicate that a requester id was decoded to a device
> that is not registered with the kernel? If so, shouldn't we log the bad
> requester ID for better debugging, specifically since there is not going to
> be any subsequent print about this ID (since we return from this function
> in this case)?

Previously we printed "id", which was constructed above from "dev":

  id = ((dev->bus->number << 8) | dev->devfn);

so even if we print "id=%04x", it contains exactly the same
information as the bus/device/function printed using "dev".

So no, I don't think "Unregistered Agent ID" means a device not registered
with the kernel.  At any rate, we do have a pci_dev for it.

I *think* "info->status == 0" means PCI_ERR_COR_STATUS (or
PCI_ERR_UNCOR_STATUS) was zero, i.e., we didn't find any error status
bits set for this device.  I don't think "Unregistered Agent ID" is a
very good description of this situation.

Bjorn
Bjorn Helgaas May 31, 2018, 4:54 a.m. UTC | #4
On Wed, May 30, 2018 at 08:32:41PM +0200, Lukas Wunner wrote:
> On Wed, May 30, 2018 at 12:54:15PM -0500, Bjorn Helgaas wrote:
> >  void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
> >  {
> > -	pci_info(dev, "AER: %s%s error received: id=%04x\n",
> > +	u8 bus = info->id >> 8;
> > +	u8 devfn = info->id & 0xff;
> > +
> > +	pci_info(dev, "AER: %s%s error received: %04x:%02x:%02x.%d\n",
> >  		info->multi_error_valid ? "Multiple " : "",
> > -		aer_error_severity_string[info->severity], info->id);
> > +		aer_error_severity_string[info->severity],
> > +		pci_domain_nr(dev->bus), bus, devfn >> 3, devfn & 0x7);
> 
> I think PCI_SLOT(devfn), PCI_FUNC(devfn) is a bit more readable.

I used those originally, but of course those definitions predate PCIe
so they aren't clearly related to a Requester ID.

I searched the PCIe spec for the specifics of the Requester ID
composition.  It was surprisingly hard to find a clear statement.  The
best I found was PCIe r4.0, sec 6.13, which says

    Routing IDs, Requester IDs, and Completer IDs are 16-bit
    identifiers traditionally composed of three fields: an 8-bit Bus
    Number, a 5-bit Device Number, and a 3-bit Function Number.

Even that isn't specific about where the fields are, 

But it's probably not worth obsessing over this and PCI_SLOT() and
PCI_FUNC() are definitely more readable, so I changed them.
Rajat Jain May 31, 2018, 5:35 p.m. UTC | #5
On Wed, May 30, 2018 at 9:42 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Wed, May 30, 2018 at 11:41:23AM -0700, Rajat Jain wrote:
> > On Wed, May 30, 2018 at 10:54 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> >
> > > From: Bjorn Helgaas <bhelgaas@google.com>
> >
> > > Decode the Requester ID from the AER Error Source Register into domain/
> > > bus/device/function format to match other logging.  In cases where the ID
> > > matches the device used for pci_err(), drop the extra ID completely so we
> > > don't print it twice.
> >
> > > Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> > > ---
> > >   drivers/pci/pcie/aer/aerdrv_errprint.c |   18 +++++++++++-------
> > >   1 file changed, 11 insertions(+), 7 deletions(-)
> >
> > > diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c
> > b/drivers/pci/pcie/aer/aerdrv_errprint.c
> > > index 21ca5e1b0ded..d7fde8368d81 100644
> > > --- a/drivers/pci/pcie/aer/aerdrv_errprint.c
> > > +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
> > > @@ -163,17 +163,17 @@ void aer_print_error(struct pci_dev *dev, struct
> > aer_err_info *info)
> > >          int id = ((dev->bus->number << 8) | dev->devfn);
>
> > >          if (!info->status) {
> > > -               pci_err(dev, "PCIe Bus Error: severity=%s,
> > type=Unaccessible, id=%04x(Unregistered Agent ID)\n",
> > > -                       aer_error_severity_string[info->severity], id);
> > > +               pci_err(dev, "PCIe Bus Error: severity=%s,
> > type=Inaccessible, (Unregistered Agent ID)\n",
> > > +                       aer_error_severity_string[info->severity]);
> >
> > Does this code path indicate that a requester id was decoded to a device
> > that is not registered with the kernel? If so, shouldn't we log the bad
> > requester ID for better debugging, specifically since there is not going to
> > be any subsequent print about this ID (since we return from this function
> > in this case)?
>
> Previously we printed "id", which was constructed above from "dev":
>
>   id = ((dev->bus->number << 8) | dev->devfn);
>
> so even if we print "id=%04x", it contains exactly the same
> information as the bus/device/function printed using "dev".

Sorry, my bad, I missed it, despite it being right there in my face :-).

>
> So no, I don't think "Unregistered Agent ID" means a device not registered
> with the kernel.  At any rate, we do have a pci_dev for it.
>
> I *think* "info->status == 0" means PCI_ERR_COR_STATUS (or
> PCI_ERR_UNCOR_STATUS) was zero, i.e., we didn't find any error status
> bits set for this device.  I don't think "Unregistered Agent ID" is a
> very good description of this situation.

Agree, may be something along the lines of "Unknown Error Status"
might be better.

Thanks,

Rajat

>
> Bjorn
diff mbox

Patch

diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
index 21ca5e1b0ded..d7fde8368d81 100644
--- a/drivers/pci/pcie/aer/aerdrv_errprint.c
+++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
@@ -163,17 +163,17 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 	int id = ((dev->bus->number << 8) | dev->devfn);
 
 	if (!info->status) {
-		pci_err(dev, "PCIe Bus Error: severity=%s, type=Unaccessible, id=%04x(Unregistered Agent ID)\n",
-			aer_error_severity_string[info->severity], id);
+		pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
+			aer_error_severity_string[info->severity]);
 		goto out;
 	}
 
 	layer = AER_GET_LAYER_ERROR(info->severity, info->status);
 	agent = AER_GET_AGENT(info->severity, info->status);
 
-	pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, id=%04x(%s)\n",
+	pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
 		aer_error_severity_string[info->severity],
-		aer_error_layer[layer], id, aer_agent_string[agent]);
+		aer_error_layer[layer], aer_agent_string[agent]);
 
 	pci_err(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
 		dev->vendor, dev->device,
@@ -186,7 +186,7 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 
 out:
 	if (info->id && info->error_dev_num > 1 && info->id == id)
-		pci_err(dev, "  Error of this Agent(%04x) is reported first\n", id);
+		pci_err(dev, "  Error of this Agent is reported first\n");
 
 	trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
 			info->severity, info->tlp_header_valid, &info->tlp);
@@ -194,9 +194,13 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 
 void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
 {
-	pci_info(dev, "AER: %s%s error received: id=%04x\n",
+	u8 bus = info->id >> 8;
+	u8 devfn = info->id & 0xff;
+
+	pci_info(dev, "AER: %s%s error received: %04x:%02x:%02x.%d\n",
 		info->multi_error_valid ? "Multiple " : "",
-		aer_error_severity_string[info->severity], info->id);
+		aer_error_severity_string[info->severity],
+		pci_domain_nr(dev->bus), bus, devfn >> 3, devfn & 0x7);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER