
[RFC,v2,5/5] drm/xe/RAS: send multicast event on occurrence of an error

Message ID 20231020155835.1295524-6-aravind.iddamsetty@linux.intel.com (mailing list archive)
State New, archived
Series: Proposal to use netlink for RAS and Telemetry across drm subsystem

Commit Message

Aravind Iddamsetty Oct. 20, 2023, 3:58 p.m. UTC
Whenever a correctable or an uncorrectable error occurs, an event is sent
to the listeners of the corresponding multicast group.

v2: Rebase

Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty@linux.intel.com>
---
 drivers/gpu/drm/xe/xe_hw_error.c | 33 ++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
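
For context, a userspace listener for these events joins one of the generic netlink
multicast groups registered by this series and then waits for notifications. Below is a
minimal sketch using libnl-genl; the "drm" family name and "uncorr_err" group name are
placeholders, since the actual names are defined by earlier patches in the series and
are not visible in this one.

/* Minimal userspace listener sketch (libnl-genl). The family and group
 * names passed to genl_ctrl_resolve_grp() are assumptions, not taken
 * from this patch. */
#include <stdio.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

static int on_event(struct nl_msg *msg, void *arg)
{
	/* DRM_RAS_CMD_ERROR_EVENT carries no attributes in this patch, so
	 * receiving a notification on the group is the whole event. */
	printf("RAS error event received\n");
	return NL_OK;
}

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	int grp;

	if (!sk)
		return 1;

	/* Multicast notifications are unsolicited, so drop sequence checks. */
	nl_socket_disable_seq_check(sk);
	nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM, on_event, NULL);

	if (genl_connect(sk))
		return 1;

	grp = genl_ctrl_resolve_grp(sk, "drm", "uncorr_err");
	if (grp < 0)
		return 1;

	nl_socket_add_membership(sk, grp);

	for (;;)
		nl_recvmsgs_default(sk);

	return 0;
}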

Comments

Michael J. Ruhl Oct. 20, 2023, 8:40 p.m. UTC | #1
>
>Whenever a correctable or an uncorrectable error happens an event is sent
>to the corresponding listeners of these groups.
>
>v2: Rebase

Hi Aravind,

This looks reasonable to me.

Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>

M

Tomer Tayar Nov. 10, 2023, 12:27 p.m. UTC | #2
On 20/10/2023 18:58, Aravind Iddamsetty wrote:
> Whenever a correctable or an uncorrectable error happens an event is sent
> to the corresponding listeners of these groups.
>
> v2: Rebase
>
> Signed-off-by: Aravind Iddamsetty<aravind.iddamsetty@linux.intel.com>
> ---
>   drivers/gpu/drm/xe/xe_hw_error.c | 33 ++++++++++++++++++++++++++++++++
>   1 file changed, 33 insertions(+)
>
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
> index bab6d4cf0b69..b0befb5e01cb 100644
> --- a/drivers/gpu/drm/xe/xe_hw_error.c
> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
> @@ -786,6 +786,37 @@ xe_soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>   				(HARDWARE_ERROR_MAX << 1) + 1);
>   }
>   
> +static void
> +generate_netlink_event(struct xe_device *xe, const enum hardware_error hw_err)
> +{
> +	struct sk_buff *msg;
> +	void *hdr;
> +
> +	if (!xe->drm.drm_genl_family.module)
> +		return;
> +
> +	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
> +	if (!msg) {
> +		drm_dbg_driver(&xe->drm, "couldn't allocate memory for error multicast event\n");
> +		return;
> +	}
> +
> +	hdr = genlmsg_put(msg, 0, 0, &xe->drm.drm_genl_family, 0, DRM_RAS_CMD_ERROR_EVENT);
> +	if (!hdr) {
> +	drm_dbg_driver(&xe->drm, "multicast msg buffer is too small\n");
> +		nlmsg_free(msg);
> +		return;
> +	}
> +
> +	genlmsg_end(msg, hdr);
> +
> +	genlmsg_multicast(&xe->drm.drm_genl_family, msg, 0,
> +			  hw_err ?
> +			  DRM_GENL_MCAST_UNCORR_ERR
> +			  : DRM_GENL_MCAST_CORR_ERR,
> +			  GFP_ATOMIC);

I agree that hiding/wrapping every netlink/genetlink API/macro with a DRM 
helper would sometimes be redundant,
and that in some cases the specific DRM driver would have to "dirty its 
hands" and deal with netlink directly (e.g. fill_error_details() in patch #3).
However, maybe a DRM helper would have been useful here, so we won't see 
a copy of this sequence in other DRM drivers?

Thanks,
Tomer
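
A minimal sketch of what such a shared helper could look like, assuming a hypothetical
drm_genl_send_error_event() exported by the DRM core (the name, placement and signature
are illustrative only, not part of this series):

/* Hypothetical DRM core helper; name and signature are illustrative. */
void drm_genl_send_error_event(struct drm_device *dev, unsigned int mcast_group)
{
	struct sk_buff *msg;
	void *hdr;

	if (!dev->drm_genl_family.module)
		return;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
	if (!msg)
		return;

	hdr = genlmsg_put(msg, 0, 0, &dev->drm_genl_family, 0,
			  DRM_RAS_CMD_ERROR_EVENT);
	if (!hdr) {
		nlmsg_free(msg);
		return;
	}

	genlmsg_end(msg, hdr);
	genlmsg_multicast(&dev->drm_genl_family, msg, 0, mcast_group, GFP_ATOMIC);
}

/* The xe function above would then reduce to selecting the group: */
static void
generate_netlink_event(struct xe_device *xe, const enum hardware_error hw_err)
{
	drm_genl_send_error_event(&xe->drm,
				  hw_err ? DRM_GENL_MCAST_UNCORR_ERR
					 : DRM_GENL_MCAST_CORR_ERR);
}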

> +}
> +
>   static void
>   xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>   {
> @@ -849,6 +880,8 @@ xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_er
>   	}
>   
>   	xe_mmio_write32(gt, DEV_ERR_STAT_REG(hw_err), errsrc);
> +
> +	generate_netlink_event(tile_to_xe(tile), hw_err);
>   unlock:
>   	spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
>   }
Tomer Tayar Nov. 12, 2023, 3:28 p.m. UTC | #3
On 10/11/2023 14:27, Tomer Tayar wrote:
> On 20/10/2023 18:58, Aravind Iddamsetty wrote:
>> Whenever a correctable or an uncorrectable error happens an event is sent
>> to the corresponding listeners of these groups.
>>
>> v2: Rebase
>>
>> Signed-off-by: Aravind Iddamsetty<aravind.iddamsetty@linux.intel.com>
>> ---
>>    drivers/gpu/drm/xe/xe_hw_error.c | 33 ++++++++++++++++++++++++++++++++
>>    1 file changed, 33 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
>> index bab6d4cf0b69..b0befb5e01cb 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_error.c
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>> @@ -786,6 +786,37 @@ xe_soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>>    				(HARDWARE_ERROR_MAX << 1) + 1);
>>    }
>>    
>> +static void
>> +generate_netlink_event(struct xe_device *xe, const enum hardware_error hw_err)
>> +{
>> +	struct sk_buff *msg;
>> +	void *hdr;
>> +
>> +	if (!xe->drm.drm_genl_family.module)
>> +		return;
>> +
>> +	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
>> +	if (!msg) {
>> +		drm_dbg_driver(&xe->drm, "couldn't allocate memory for error multicast event\n");
>> +		return;
>> +	}
>> +
>> +	hdr = genlmsg_put(msg, 0, 0, &xe->drm.drm_genl_family, 0, DRM_RAS_CMD_ERROR_EVENT);
>> +	if (!hdr) {
>> +	drm_dbg_driver(&xe->drm, "multicast msg buffer is too small\n");
>> +		nlmsg_free(msg);
>> +		return;
>> +	}
>> +
>> +	genlmsg_end(msg, hdr);
>> +
>> +	genlmsg_multicast(&xe->drm.drm_genl_family, msg, 0,
>> +			  hw_err ?
>> +			  DRM_GENL_MCAST_UNCORR_ERR
>> +			  : DRM_GENL_MCAST_CORR_ERR,
>> +			  GFP_ATOMIC);
> I agree that hiding/wrapping any netlink/genetlink API/macro with a DRM
> helper would be sometimes redundant,
> and that in some cases the specific DRM driver would have to "dirt its
> hands" and deal with netlink (e.g. fill_error_details() in patch #3).
> However maybe here a DRM helper would have been useful, so we won't see
> a copy of this sequence in other DRM drivers?
>
> Thanks,
> Tomer

After rethinking, it is possible that different DRM drivers will need 
some flexibility when it comes to calling genlmsg_put(), as they might 
want to follow it with further calls in order to attach some data related to 
the error indication.
In that case, adding a DRM function that wraps it may be redundant.
What do you think?
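
For illustration, a driver that wants to attach a payload would put its own attributes
between genlmsg_put() and genlmsg_end(), which a fixed core wrapper could not easily
cover. A sketch of such a per-driver variant follows; the DRM_RAS_ATTR_* attributes are
hypothetical and not defined by this series:

/* Per-driver variant sketch; the DRM_RAS_ATTR_* attributes are hypothetical. */
static void
xe_send_error_event_with_payload(struct xe_device *xe, struct xe_tile *tile,
				 const enum hardware_error hw_err)
{
	struct sk_buff *msg;
	void *hdr;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
	if (!msg)
		return;

	hdr = genlmsg_put(msg, 0, 0, &xe->drm.drm_genl_family, 0,
			  DRM_RAS_CMD_ERROR_EVENT);
	if (!hdr)
		goto err_free;

	/* Driver-specific data goes between genlmsg_put() and genlmsg_end(). */
	if (nla_put_u32(msg, DRM_RAS_ATTR_ERROR_TYPE, hw_err) ||
	    nla_put_u8(msg, DRM_RAS_ATTR_TILE_ID, tile->id))
		goto err_free;

	genlmsg_end(msg, hdr);
	genlmsg_multicast(&xe->drm.drm_genl_family, msg, 0,
			  hw_err ? DRM_GENL_MCAST_UNCORR_ERR
				 : DRM_GENL_MCAST_CORR_ERR,
			  GFP_ATOMIC);
	return;

err_free:
	nlmsg_free(msg);
}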

>> +}
>> +
>>    static void
>>    xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>>    {
>> @@ -849,6 +880,8 @@ xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_er
>>    	}
>>    
>>    	xe_mmio_write32(gt, DEV_ERR_STAT_REG(hw_err), errsrc);
>> +
>> +	generate_netlink_event(tile_to_xe(tile), hw_err);
>>    unlock:
>>    	spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
>>    }
>
Aravind Iddamsetty Nov. 22, 2023, 2:34 p.m. UTC | #4
On 11/12/23 20:58, Tomer Tayar wrote:
> On 10/11/2023 14:27, Tomer Tayar wrote:
>> On 20/10/2023 18:58, Aravind Iddamsetty wrote:
>>> Whenever a correctable or an uncorrectable error happens an event is sent
>>> to the corresponding listeners of these groups.
>>>
>>> v2: Rebase
>>>
>>> Signed-off-by: Aravind Iddamsetty<aravind.iddamsetty@linux.intel.com>
>>> ---
>>>    drivers/gpu/drm/xe/xe_hw_error.c | 33 ++++++++++++++++++++++++++++++++
>>>    1 file changed, 33 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
>>> index bab6d4cf0b69..b0befb5e01cb 100644
>>> --- a/drivers/gpu/drm/xe/xe_hw_error.c
>>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>>> @@ -786,6 +786,37 @@ xe_soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>>>    				(HARDWARE_ERROR_MAX << 1) + 1);
>>>    }
>>>    
>>> +static void
>>> +generate_netlink_event(struct xe_device *xe, const enum hardware_error hw_err)
>>> +{
>>> +	struct sk_buff *msg;
>>> +	void *hdr;
>>> +
>>> +	if (!xe->drm.drm_genl_family.module)
>>> +		return;
>>> +
>>> +	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
>>> +	if (!msg) {
>>> +		drm_dbg_driver(&xe->drm, "couldn't allocate memory for error multicast event\n");
>>> +		return;
>>> +	}
>>> +
>>> +	hdr = genlmsg_put(msg, 0, 0, &xe->drm.drm_genl_family, 0, DRM_RAS_CMD_ERROR_EVENT);
>>> +	if (!hdr) {
>>> +	drm_dbg_driver(&xe->drm, "multicast msg buffer is too small\n");
>>> +		nlmsg_free(msg);
>>> +		return;
>>> +	}
>>> +
>>> +	genlmsg_end(msg, hdr);
>>> +
>>> +	genlmsg_multicast(&xe->drm.drm_genl_family, msg, 0,
>>> +			  hw_err ?
>>> +			  DRM_GENL_MCAST_UNCORR_ERR
>>> +			  : DRM_GENL_MCAST_CORR_ERR,
>>> +			  GFP_ATOMIC);
>> I agree that hiding/wrapping any netlink/genetlink API/macro with a DRM
>> helper would be sometimes redundant,
>> and that in some cases the specific DRM driver would have to "dirt its
>> hands" and deal with netlink (e.g. fill_error_details() in patch #3).
>> However maybe here a DRM helper would have been useful, so we won't see
>> a copy of this sequence in other DRM drivers?
>>
>> Thanks,
>> Tomer
> After rethinking, it is possible that different DRM drivers will need 
> some flexibility when it comes to calling genlmsg_put(), as they might 
> want to have more of this call in order to attach some data related to 
> the error indication.
> In that case, adding a DRM function that wraps it may me redundant.
> What do you think?
I think we can expose this base-level call to every drm driver, and if a driver wants
to add any custom msg it can define its own helper; that should be OK, I believe.


Thanks,
Aravind.
>
>>> +}
>>> +
>>>    static void
>>>    xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>>>    {
>>> @@ -849,6 +880,8 @@ xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_er
>>>    	}
>>>    
>>>    	xe_mmio_write32(gt, DEV_ERR_STAT_REG(hw_err), errsrc);
>>> +
>>> +	generate_netlink_event(tile_to_xe(tile), hw_err);
>>>    unlock:
>>>    	spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
>>>    }

Patch

diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index bab6d4cf0b69..b0befb5e01cb 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -786,6 +786,37 @@  xe_soc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 				(HARDWARE_ERROR_MAX << 1) + 1);
 }
 
+static void
+generate_netlink_event(struct xe_device *xe, const enum hardware_error hw_err)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	if (!xe->drm.drm_genl_family.module)
+		return;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!msg) {
+		drm_dbg_driver(&xe->drm, "couldn't allocate memory for error multicast event\n");
+		return;
+	}
+
+	hdr = genlmsg_put(msg, 0, 0, &xe->drm.drm_genl_family, 0, DRM_RAS_CMD_ERROR_EVENT);
+	if (!hdr) {
+		drm_dbg_driver(&xe->drm, "multicast msg buffer is too small\n");
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_end(msg, hdr);
+
+	genlmsg_multicast(&xe->drm.drm_genl_family, msg, 0,
+			  hw_err ?
+			  DRM_GENL_MCAST_UNCORR_ERR
+			  : DRM_GENL_MCAST_CORR_ERR,
+			  GFP_ATOMIC);
+}
+
 static void
 xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 {
@@ -849,6 +880,8 @@  xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_er
 	}
 
 	xe_mmio_write32(gt, DEV_ERR_STAT_REG(hw_err), errsrc);
+
+	generate_netlink_event(tile_to_xe(tile), hw_err);
 unlock:
 	spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
 }