diff mbox series

[v2] drm/amdgpu: add ring timeout information in devcoredump

Message ID 20240305115843.3119708-1-sunil.khatri@amd.com (mailing list archive)
State New, archived
Headers show
Series [v2] drm/amdgpu: add ring timeout information in devcoredump | expand

Commit Message

Sunil Khatri March 5, 2024, 11:58 a.m. UTC
Add ring timeout related information in the amdgpu
devcoredump file for debugging purposes.

During the gpu recovery process the registered call
is triggered and add the debug information in data
file created by devcoredump framework under the
directory /sys/class/devcoredump/devcdx/

Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 +++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  2 ++
 2 files changed, 17 insertions(+)

Comments

Christian König March 5, 2024, 1:10 p.m. UTC | #1
Am 05.03.24 um 12:58 schrieb Sunil Khatri:
> Add ring timeout related information in the amdgpu
> devcoredump file for debugging purposes.
>
> During the gpu recovery process the registered call
> is triggered and add the debug information in data
> file created by devcoredump framework under the
> directory /sys/class/devcoredump/devcdx/
>
> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 +++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  2 ++
>   2 files changed, 17 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index a59364e9b6ed..aa7fed59a0d5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -196,6 +196,13 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
>   			   coredump->reset_task_info.process_name,
>   			   coredump->reset_task_info.pid);
>   
> +	if (coredump->ring_timeout) {
> +		drm_printf(&p, "\nRing timed out details\n");
> +		drm_printf(&p, "IP Type: %d Ring Name: %s \n",
> +				coredump->ring->funcs->type,
> +				coredump->ring->name);
> +	}
> +
>   	if (coredump->reset_vram_lost)
>   		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
>   	if (coredump->adev->reset_info.num_regs) {
> @@ -220,6 +227,8 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>   {
>   	struct amdgpu_coredump_info *coredump;
>   	struct drm_device *dev = adev_to_drm(adev);
> +	struct amdgpu_job *job = reset_context->job;
> +	struct drm_sched_job *s_job;
>   
>   	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
>   
> @@ -228,6 +237,12 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>   		return;
>   	}
>   
> +	if (job) {
> +		s_job = &job->base;
> +		coredump->ring = to_amdgpu_ring(s_job->sched);
> +		coredump->ring_timeout = TRUE;
> +	}
> +
>   	coredump->reset_vram_lost = vram_lost;
>   
>   	if (reset_context->job && reset_context->job->vm) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 19899f6b9b2b..6d67001a1057 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -97,6 +97,8 @@ struct amdgpu_coredump_info {
>   	struct amdgpu_task_info         reset_task_info;
>   	struct timespec64               reset_time;
>   	bool                            reset_vram_lost;
> +	struct amdgpu_ring          *ring;
> +	bool                            ring_timeout;

I think you can drop ring_timeout, just having ring as optional 
information should be enough.

Apart from that looks pretty good I think.

Regards,
Christian.

>   };
>   #endif
>
Khatri, Sunil March 5, 2024, 1:17 p.m. UTC | #2
On 3/5/2024 6:40 PM, Christian König wrote:
> Am 05.03.24 um 12:58 schrieb Sunil Khatri:
>> Add ring timeout related information in the amdgpu
>> devcoredump file for debugging purposes.
>>
>> During the gpu recovery process the registered call
>> is triggered and add the debug information in data
>> file created by devcoredump framework under the
>> directory /sys/class/devcoredump/devcdx/
>>
>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 +++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  2 ++
>>   2 files changed, 17 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> index a59364e9b6ed..aa7fed59a0d5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> @@ -196,6 +196,13 @@ amdgpu_devcoredump_read(char *buffer, loff_t 
>> offset, size_t count,
>>                  coredump->reset_task_info.process_name,
>>                  coredump->reset_task_info.pid);
>>   +    if (coredump->ring_timeout) {
>> +        drm_printf(&p, "\nRing timed out details\n");
>> +        drm_printf(&p, "IP Type: %d Ring Name: %s \n",
>> +                coredump->ring->funcs->type,
>> +                coredump->ring->name);
>> +    }
>> +
>>       if (coredump->reset_vram_lost)
>>           drm_printf(&p, "VRAM is lost due to GPU reset!\n");
>>       if (coredump->adev->reset_info.num_regs) {
>> @@ -220,6 +227,8 @@ void amdgpu_coredump(struct amdgpu_device *adev, 
>> bool vram_lost,
>>   {
>>       struct amdgpu_coredump_info *coredump;
>>       struct drm_device *dev = adev_to_drm(adev);
>> +    struct amdgpu_job *job = reset_context->job;
>> +    struct drm_sched_job *s_job;
>>         coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
>>   @@ -228,6 +237,12 @@ void amdgpu_coredump(struct amdgpu_device 
>> *adev, bool vram_lost,
>>           return;
>>       }
>>   +    if (job) {
>> +        s_job = &job->base;
>> +        coredump->ring = to_amdgpu_ring(s_job->sched);
>> +        coredump->ring_timeout = TRUE;
>> +    }
>> +
>>       coredump->reset_vram_lost = vram_lost;
>>         if (reset_context->job && reset_context->job->vm) {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> index 19899f6b9b2b..6d67001a1057 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> @@ -97,6 +97,8 @@ struct amdgpu_coredump_info {
>>       struct amdgpu_task_info         reset_task_info;
>>       struct timespec64               reset_time;
>>       bool                            reset_vram_lost;
>> +    struct amdgpu_ring          *ring;
>> +    bool                            ring_timeout;
>
> I think you can drop ring_timeout, just having ring as optional 
> information should be enough.
>
> Apart from that looks pretty good I think.
>
- GPU reset could happen due to two possibilities atleast: 1. via sysfs 
cat /sys/kernel/debug/dri/0/amdgpu_gpu_recover there is no timeout or 
page fault here. In this case we need information if ringtimeout 
happened or not else it will try to print empty information in 
devcoredump. Same goes for pagefault also in that case also we need to 
see if recovery ran due to pagefault and then only add that information.

So to cover all use cases i added this parameter.


Thanks
Sunil

> Regards,
> Christian.
>
>>   };
>>   #endif
>
Christian König March 5, 2024, 1:20 p.m. UTC | #3
Am 05.03.24 um 14:17 schrieb Khatri, Sunil:
>
> On 3/5/2024 6:40 PM, Christian König wrote:
>> Am 05.03.24 um 12:58 schrieb Sunil Khatri:
>>> Add ring timeout related information in the amdgpu
>>> devcoredump file for debugging purposes.
>>>
>>> During the gpu recovery process the registered call
>>> is triggered and add the debug information in data
>>> file created by devcoredump framework under the
>>> directory /sys/class/devcoredump/devcdx/
>>>
>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 +++++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  2 ++
>>>   2 files changed, 17 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> index a59364e9b6ed..aa7fed59a0d5 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> @@ -196,6 +196,13 @@ amdgpu_devcoredump_read(char *buffer, loff_t 
>>> offset, size_t count,
>>>                  coredump->reset_task_info.process_name,
>>>                  coredump->reset_task_info.pid);
>>>   +    if (coredump->ring_timeout) {
>>> +        drm_printf(&p, "\nRing timed out details\n");
>>> +        drm_printf(&p, "IP Type: %d Ring Name: %s \n",
>>> +                coredump->ring->funcs->type,
>>> +                coredump->ring->name);
>>> +    }
>>> +
>>>       if (coredump->reset_vram_lost)
>>>           drm_printf(&p, "VRAM is lost due to GPU reset!\n");
>>>       if (coredump->adev->reset_info.num_regs) {
>>> @@ -220,6 +227,8 @@ void amdgpu_coredump(struct amdgpu_device *adev, 
>>> bool vram_lost,
>>>   {
>>>       struct amdgpu_coredump_info *coredump;
>>>       struct drm_device *dev = adev_to_drm(adev);
>>> +    struct amdgpu_job *job = reset_context->job;
>>> +    struct drm_sched_job *s_job;
>>>         coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
>>>   @@ -228,6 +237,12 @@ void amdgpu_coredump(struct amdgpu_device 
>>> *adev, bool vram_lost,
>>>           return;
>>>       }
>>>   +    if (job) {
>>> +        s_job = &job->base;
>>> +        coredump->ring = to_amdgpu_ring(s_job->sched);
>>> +        coredump->ring_timeout = TRUE;
>>> +    }
>>> +
>>>       coredump->reset_vram_lost = vram_lost;
>>>         if (reset_context->job && reset_context->job->vm) {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>> index 19899f6b9b2b..6d67001a1057 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>> @@ -97,6 +97,8 @@ struct amdgpu_coredump_info {
>>>       struct amdgpu_task_info         reset_task_info;
>>>       struct timespec64               reset_time;
>>>       bool                            reset_vram_lost;
>>> +    struct amdgpu_ring          *ring;
>>> +    bool                            ring_timeout;
>>
>> I think you can drop ring_timeout, just having ring as optional 
>> information should be enough.
>>
>> Apart from that looks pretty good I think.
>>
> - GPU reset could happen due to two possibilities atleast: 1. via 
> sysfs cat /sys/kernel/debug/dri/0/amdgpu_gpu_recover there is no 
> timeout or page fault here. In this case we need information if 
> ringtimeout happened or not else it will try to print empty 
> information in devcoredump. Same goes for pagefault also in that case 
> also we need to see if recovery ran due to pagefault and then only add 
> that information.
>
> So to cover all use cases i added this parameter.

Yeah, but you don't need that parameter.

If a GPU reset is triggered by a ring timeout we note the ring causing 
this, otherwise the ring is simply NULL.

There is no need for a separate variable to note that a timeout happened.

Regards,
Christian.

>
>
> Thanks
> Sunil
>
>> Regards,
>> Christian.
>>
>>>   };
>>>   #endif
>>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index a59364e9b6ed..aa7fed59a0d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -196,6 +196,13 @@  amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
 			   coredump->reset_task_info.process_name,
 			   coredump->reset_task_info.pid);
 
+	if (coredump->ring_timeout) {
+		drm_printf(&p, "\nRing timed out details\n");
+		drm_printf(&p, "IP Type: %d Ring Name: %s \n",
+				coredump->ring->funcs->type,
+				coredump->ring->name);
+	}
+
 	if (coredump->reset_vram_lost)
 		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
 	if (coredump->adev->reset_info.num_regs) {
@@ -220,6 +227,8 @@  void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
 {
 	struct amdgpu_coredump_info *coredump;
 	struct drm_device *dev = adev_to_drm(adev);
+	struct amdgpu_job *job = reset_context->job;
+	struct drm_sched_job *s_job;
 
 	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
 
@@ -228,6 +237,12 @@  void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
 		return;
 	}
 
+	if (job) {
+		s_job = &job->base;
+		coredump->ring = to_amdgpu_ring(s_job->sched);
+		coredump->ring_timeout = TRUE;
+	}
+
 	coredump->reset_vram_lost = vram_lost;
 
 	if (reset_context->job && reset_context->job->vm) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 19899f6b9b2b..6d67001a1057 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -97,6 +97,8 @@  struct amdgpu_coredump_info {
 	struct amdgpu_task_info         reset_task_info;
 	struct timespec64               reset_time;
 	bool                            reset_vram_lost;
+	struct amdgpu_ring          *ring;
+	bool                            ring_timeout;
 };
 #endif