diff mbox series

[v4,2/4] drm/amdgpu: Rework coredump to use memory dynamically

Message ID 20230815195100.294458-3-andrealmeid@igalia.com (mailing list archive)
State New, archived
Headers show
Series drm/amdgpu: Rework coredump memory allocation | expand

Commit Message

André Almeida Aug. 15, 2023, 7:50 p.m. UTC
Instead of storing coredump information inside amdgpu_device struct,
move if to a proper separated struct and allocate it dynamically. This
will make it easier to further expand the logged information.

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
v4: change kmalloc to kzalloc
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 14 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 ++++++++++++++--------
 2 files changed, 49 insertions(+), 28 deletions(-)

Comments

Christian König Aug. 16, 2023, 9:48 a.m. UTC | #1
Am 15.08.23 um 21:50 schrieb André Almeida:
> Instead of storing coredump information inside amdgpu_device struct,
> move if to a proper separated struct and allocate it dynamically. This
> will make it easier to further expand the logged information.
>
> Signed-off-by: André Almeida <andrealmeid@igalia.com>

Shashank can you take a look at this one?

Thanks,
Christian.

> ---
> v4: change kmalloc to kzalloc
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 14 +++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 ++++++++++++++--------
>   2 files changed, 49 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 9c6a332261ab..0d560b713948 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1088,11 +1088,6 @@ struct amdgpu_device {
>   	uint32_t                        *reset_dump_reg_list;
>   	uint32_t			*reset_dump_reg_value;
>   	int                             num_regs;
> -#ifdef CONFIG_DEV_COREDUMP
> -	struct amdgpu_task_info         reset_task_info;
> -	bool                            reset_vram_lost;
> -	struct timespec64               reset_time;
> -#endif
>   
>   	bool                            scpm_enabled;
>   	uint32_t                        scpm_status;
> @@ -1105,6 +1100,15 @@ struct amdgpu_device {
>   	uint32_t			aid_mask;
>   };
>   
> +#ifdef CONFIG_DEV_COREDUMP
> +struct amdgpu_coredump_info {
> +	struct amdgpu_device		*adev;
> +	struct amdgpu_task_info         reset_task_info;
> +	struct timespec64               reset_time;
> +	bool                            reset_vram_lost;
> +};
> +#endif
> +
>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
>   {
>   	return container_of(ddev, struct amdgpu_device, ddev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index bf4781551f88..b5b879bcc5c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4799,12 +4799,17 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> -#ifdef CONFIG_DEV_COREDUMP
> +#ifndef CONFIG_DEV_COREDUMP
> +static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> +			    struct amdgpu_reset_context *reset_context)
> +{
> +}
> +#else
>   static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
>   		size_t count, void *data, size_t datalen)
>   {
>   	struct drm_printer p;
> -	struct amdgpu_device *adev = data;
> +	struct amdgpu_coredump_info *coredump = data;
>   	struct drm_print_iterator iter;
>   	int i;
>   
> @@ -4818,21 +4823,21 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
>   	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
>   	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
>   	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
> -	drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
> -	if (adev->reset_task_info.pid)
> +	drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
> +	if (coredump->reset_task_info.pid)
>   		drm_printf(&p, "process_name: %s PID: %d\n",
> -			   adev->reset_task_info.process_name,
> -			   adev->reset_task_info.pid);
> +			   coredump->reset_task_info.process_name,
> +			   coredump->reset_task_info.pid);
>   
> -	if (adev->reset_vram_lost)
> +	if (coredump->reset_vram_lost)
>   		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> -	if (adev->num_regs) {
> +	if (coredump->adev->num_regs) {
>   		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
>   
> -		for (i = 0; i < adev->num_regs; i++)
> +		for (i = 0; i < coredump->adev->num_regs; i++)
>   			drm_printf(&p, "0x%08x: 0x%08x\n",
> -				   adev->reset_dump_reg_list[i],
> -				   adev->reset_dump_reg_value[i]);
> +				   coredump->adev->reset_dump_reg_list[i],
> +				   coredump->adev->reset_dump_reg_value[i]);
>   	}
>   
>   	return count - iter.remain;
> @@ -4840,14 +4845,32 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
>   
>   static void amdgpu_devcoredump_free(void *data)
>   {
> +	kfree(data);
>   }
>   
> -static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
> +static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> +			    struct amdgpu_reset_context *reset_context)
>   {
> +	struct amdgpu_coredump_info *coredump;
>   	struct drm_device *dev = adev_to_drm(adev);
>   
> -	ktime_get_ts64(&adev->reset_time);
> -	dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
> +	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
> +
> +	if (!coredump) {
> +		DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
> +		return;
> +	}
> +
> +	coredump->reset_vram_lost = vram_lost;
> +
> +	if (reset_context->job && reset_context->job->vm)
> +		coredump->reset_task_info = reset_context->job->vm->task_info;
> +
> +	coredump->adev = adev;
> +
> +	ktime_get_ts64(&coredump->reset_time);
> +
> +	dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
>   		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
>   }
>   #endif
> @@ -4955,15 +4978,9 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>   					goto out;
>   
>   				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
> -#ifdef CONFIG_DEV_COREDUMP
> -				tmp_adev->reset_vram_lost = vram_lost;
> -				memset(&tmp_adev->reset_task_info, 0,
> -						sizeof(tmp_adev->reset_task_info));
> -				if (reset_context->job && reset_context->job->vm)
> -					tmp_adev->reset_task_info =
> -						reset_context->job->vm->task_info;
> -				amdgpu_reset_capture_coredumpm(tmp_adev);
> -#endif
> +
> +				amdgpu_coredump(tmp_adev, vram_lost, reset_context);
> +
>   				if (vram_lost) {
>   					DRM_INFO("VRAM is lost due to GPU reset!\n");
>   					amdgpu_inc_vram_lost(tmp_adev);
Sharma, Shashank Aug. 16, 2023, 10:11 a.m. UTC | #2
On 16/08/2023 11:48, Christian König wrote:
> Am 15.08.23 um 21:50 schrieb André Almeida:
>> Instead of storing coredump information inside amdgpu_device struct,
>> move if to a proper separated struct and allocate it dynamically. This
>> will make it easier to further expand the logged information.
>>
>> Signed-off-by: André Almeida <andrealmeid@igalia.com>
>
> Shashank can you take a look at this one?


Noted, will check this out.

- Shashank

>
> Thanks,
> Christian.
>
>> ---
>> v4: change kmalloc to kzalloc
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 14 +++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 ++++++++++++++--------
>>   2 files changed, 49 insertions(+), 28 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 9c6a332261ab..0d560b713948 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1088,11 +1088,6 @@ struct amdgpu_device {
>>       uint32_t                        *reset_dump_reg_list;
>>       uint32_t            *reset_dump_reg_value;
>>       int                             num_regs;
>> -#ifdef CONFIG_DEV_COREDUMP
>> -    struct amdgpu_task_info         reset_task_info;
>> -    bool                            reset_vram_lost;
>> -    struct timespec64               reset_time;
>> -#endif
>>         bool                            scpm_enabled;
>>       uint32_t                        scpm_status;
>> @@ -1105,6 +1100,15 @@ struct amdgpu_device {
>>       uint32_t            aid_mask;
>>   };
>>   +#ifdef CONFIG_DEV_COREDUMP
>> +struct amdgpu_coredump_info {
>> +    struct amdgpu_device        *adev;
>> +    struct amdgpu_task_info         reset_task_info;
>> +    struct timespec64               reset_time;
>> +    bool                            reset_vram_lost;
>> +};
>> +#endif
>> +
>>   static inline struct amdgpu_device *drm_to_adev(struct drm_device 
>> *ddev)
>>   {
>>       return container_of(ddev, struct amdgpu_device, ddev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index bf4781551f88..b5b879bcc5c9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4799,12 +4799,17 @@ static int amdgpu_reset_reg_dumps(struct 
>> amdgpu_device *adev)
>>       return 0;
>>   }
>>   -#ifdef CONFIG_DEV_COREDUMP
>> +#ifndef CONFIG_DEV_COREDUMP
>> +static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>> +                struct amdgpu_reset_context *reset_context)
>> +{
>> +}
>> +#else
>>   static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
>>           size_t count, void *data, size_t datalen)
>>   {
>>       struct drm_printer p;
>> -    struct amdgpu_device *adev = data;
>> +    struct amdgpu_coredump_info *coredump = data;
>>       struct drm_print_iterator iter;
>>       int i;
>>   @@ -4818,21 +4823,21 @@ static ssize_t amdgpu_devcoredump_read(char 
>> *buffer, loff_t offset,
>>       drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
>>       drm_printf(&p, "kernel: " UTS_RELEASE "\n");
>>       drm_printf(&p, "module: " KBUILD_MODNAME "\n");
>> -    drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, 
>> adev->reset_time.tv_nsec);
>> -    if (adev->reset_task_info.pid)
>> +    drm_printf(&p, "time: %lld.%09ld\n", 
>> coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
>> +    if (coredump->reset_task_info.pid)
>>           drm_printf(&p, "process_name: %s PID: %d\n",
>> -               adev->reset_task_info.process_name,
>> -               adev->reset_task_info.pid);
>> +               coredump->reset_task_info.process_name,
>> +               coredump->reset_task_info.pid);
>>   -    if (adev->reset_vram_lost)
>> +    if (coredump->reset_vram_lost)
>>           drm_printf(&p, "VRAM is lost due to GPU reset!\n");
>> -    if (adev->num_regs) {
>> +    if (coredump->adev->num_regs) {
>>           drm_printf(&p, "AMDGPU register dumps:\nOffset:     
>> Value:\n");
>>   -        for (i = 0; i < adev->num_regs; i++)
>> +        for (i = 0; i < coredump->adev->num_regs; i++)
>>               drm_printf(&p, "0x%08x: 0x%08x\n",
>> -                   adev->reset_dump_reg_list[i],
>> -                   adev->reset_dump_reg_value[i]);
>> + coredump->adev->reset_dump_reg_list[i],
>> + coredump->adev->reset_dump_reg_value[i]);
>>       }
>>         return count - iter.remain;
>> @@ -4840,14 +4845,32 @@ static ssize_t amdgpu_devcoredump_read(char 
>> *buffer, loff_t offset,
>>     static void amdgpu_devcoredump_free(void *data)
>>   {
>> +    kfree(data);
>>   }
>>   -static void amdgpu_reset_capture_coredumpm(struct amdgpu_device 
>> *adev)
>> +static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>> +                struct amdgpu_reset_context *reset_context)
>>   {
>> +    struct amdgpu_coredump_info *coredump;
>>       struct drm_device *dev = adev_to_drm(adev);
>>   -    ktime_get_ts64(&adev->reset_time);
>> -    dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
>> +    coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
>> +
>> +    if (!coredump) {
>> +        DRM_ERROR("%s: failed to allocate memory for coredump\n", 
>> __func__);
>> +        return;
>> +    }
>> +
>> +    coredump->reset_vram_lost = vram_lost;
>> +
>> +    if (reset_context->job && reset_context->job->vm)
>> +        coredump->reset_task_info = reset_context->job->vm->task_info;
>> +
>> +    coredump->adev = adev;
>> +
>> +    ktime_get_ts64(&coredump->reset_time);
>> +
>> +    dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
>>                 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
>>   }
>>   #endif
>> @@ -4955,15 +4978,9 @@ int amdgpu_do_asic_reset(struct list_head 
>> *device_list_handle,
>>                       goto out;
>>                     vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
>> -#ifdef CONFIG_DEV_COREDUMP
>> -                tmp_adev->reset_vram_lost = vram_lost;
>> -                memset(&tmp_adev->reset_task_info, 0,
>> -                        sizeof(tmp_adev->reset_task_info));
>> -                if (reset_context->job && reset_context->job->vm)
>> -                    tmp_adev->reset_task_info =
>> - reset_context->job->vm->task_info;
>> -                amdgpu_reset_capture_coredumpm(tmp_adev);
>> -#endif
>> +
>> +                amdgpu_coredump(tmp_adev, vram_lost, reset_context);
>> +
>>                   if (vram_lost) {
>>                       DRM_INFO("VRAM is lost due to GPU reset!\n");
>>                       amdgpu_inc_vram_lost(tmp_adev);
>
Sharma, Shashank Aug. 17, 2023, 6:41 a.m. UTC | #3
Hello Andre,

On 15/08/2023 21:50, André Almeida wrote:
> Instead of storing coredump information inside amdgpu_device struct,
> move if to a proper separated struct and allocate it dynamically. This
> will make it easier to further expand the logged information.
>
> Signed-off-by: André Almeida <andrealmeid@igalia.com>
> ---
> v4: change kmalloc to kzalloc
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 14 +++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 ++++++++++++++--------
>   2 files changed, 49 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 9c6a332261ab..0d560b713948 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1088,11 +1088,6 @@ struct amdgpu_device {
>   	uint32_t                        *reset_dump_reg_list;
>   	uint32_t			*reset_dump_reg_value;
>   	int                             num_regs;
> -#ifdef CONFIG_DEV_COREDUMP
> -	struct amdgpu_task_info         reset_task_info;
> -	bool                            reset_vram_lost;
> -	struct timespec64               reset_time;
> -#endif
>   
>   	bool                            scpm_enabled;
>   	uint32_t                        scpm_status;
> @@ -1105,6 +1100,15 @@ struct amdgpu_device {
>   	uint32_t			aid_mask;
>   };
>   
> +#ifdef CONFIG_DEV_COREDUMP
> +struct amdgpu_coredump_info {
> +	struct amdgpu_device		*adev;
> +	struct amdgpu_task_info         reset_task_info;
> +	struct timespec64               reset_time;
> +	bool                            reset_vram_lost;
> +};

The patch looks good to me in general, but I would recommend slightly 
different arrangement and segregation of GPU reset information.

Please consider a higher level structure adev->gpu_reset_info, and move 
everything related to reset dump info into that, including this new 
coredump_info structure, something like this:

struct amdgpu_reset_info {

     uint32_t *reset_dump_reg_list;

     uint32_t *reset_dump_reg_value;

     int num_regs;

#ifdef CONFIG_DEV_COREDUMP

    struct amdgpu_coredump_info *coredump_info;/* keep this dynamic 
allocation */

#endif

}


This will make sure that all the relevant information is at the same place.

- Shashank

> +#endif
> +
>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
>   {
>   	return container_of(ddev, struct amdgpu_device, ddev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index bf4781551f88..b5b879bcc5c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4799,12 +4799,17 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> -#ifdef CONFIG_DEV_COREDUMP
> +#ifndef CONFIG_DEV_COREDUMP
> +static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> +			    struct amdgpu_reset_context *reset_context)
> +{
> +}
> +#else
>   static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
>   		size_t count, void *data, size_t datalen)
>   {
>   	struct drm_printer p;
> -	struct amdgpu_device *adev = data;
> +	struct amdgpu_coredump_info *coredump = data;
>   	struct drm_print_iterator iter;
>   	int i;
>   
> @@ -4818,21 +4823,21 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
>   	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
>   	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
>   	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
> -	drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
> -	if (adev->reset_task_info.pid)
> +	drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
> +	if (coredump->reset_task_info.pid)
>   		drm_printf(&p, "process_name: %s PID: %d\n",
> -			   adev->reset_task_info.process_name,
> -			   adev->reset_task_info.pid);
> +			   coredump->reset_task_info.process_name,
> +			   coredump->reset_task_info.pid);
>   
> -	if (adev->reset_vram_lost)
> +	if (coredump->reset_vram_lost)
>   		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> -	if (adev->num_regs) {
> +	if (coredump->adev->num_regs) {
>   		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
>   
> -		for (i = 0; i < adev->num_regs; i++)
> +		for (i = 0; i < coredump->adev->num_regs; i++)
>   			drm_printf(&p, "0x%08x: 0x%08x\n",
> -				   adev->reset_dump_reg_list[i],
> -				   adev->reset_dump_reg_value[i]);
> +				   coredump->adev->reset_dump_reg_list[i],
> +				   coredump->adev->reset_dump_reg_value[i]);
>   	}
>   
>   	return count - iter.remain;
> @@ -4840,14 +4845,32 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
>   
>   static void amdgpu_devcoredump_free(void *data)
>   {
> +	kfree(data);
>   }
>   
> -static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
> +static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> +			    struct amdgpu_reset_context *reset_context)
>   {
> +	struct amdgpu_coredump_info *coredump;
>   	struct drm_device *dev = adev_to_drm(adev);
>   
> -	ktime_get_ts64(&adev->reset_time);
> -	dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
> +	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
> +
> +	if (!coredump) {
> +		DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
> +		return;
> +	}
> +
> +	coredump->reset_vram_lost = vram_lost;
> +
> +	if (reset_context->job && reset_context->job->vm)
> +		coredump->reset_task_info = reset_context->job->vm->task_info;
> +
> +	coredump->adev = adev;
> +
> +	ktime_get_ts64(&coredump->reset_time);
> +
> +	dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
>   		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
>   }
>   #endif
> @@ -4955,15 +4978,9 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>   					goto out;
>   
>   				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
> -#ifdef CONFIG_DEV_COREDUMP
> -				tmp_adev->reset_vram_lost = vram_lost;
> -				memset(&tmp_adev->reset_task_info, 0,
> -						sizeof(tmp_adev->reset_task_info));
> -				if (reset_context->job && reset_context->job->vm)
> -					tmp_adev->reset_task_info =
> -						reset_context->job->vm->task_info;
> -				amdgpu_reset_capture_coredumpm(tmp_adev);
> -#endif
> +
> +				amdgpu_coredump(tmp_adev, vram_lost, reset_context);
> +
>   				if (vram_lost) {
>   					DRM_INFO("VRAM is lost due to GPU reset!\n");
>   					amdgpu_inc_vram_lost(tmp_adev);
André Almeida Aug. 17, 2023, 1:45 p.m. UTC | #4
Hi Shashank,

Em 17/08/2023 03:41, Shashank Sharma escreveu:
> Hello Andre,
> 
> On 15/08/2023 21:50, André Almeida wrote:
>> Instead of storing coredump information inside amdgpu_device struct,
>> move if to a proper separated struct and allocate it dynamically. This
>> will make it easier to further expand the logged information.
>>
>> Signed-off-by: André Almeida <andrealmeid@igalia.com>
>> ---
>> v4: change kmalloc to kzalloc
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 14 +++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 ++++++++++++++--------
>>   2 files changed, 49 insertions(+), 28 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 9c6a332261ab..0d560b713948 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1088,11 +1088,6 @@ struct amdgpu_device {
>>       uint32_t                        *reset_dump_reg_list;
>>       uint32_t            *reset_dump_reg_value;
>>       int                             num_regs;
>> -#ifdef CONFIG_DEV_COREDUMP
>> -    struct amdgpu_task_info         reset_task_info;
>> -    bool                            reset_vram_lost;
>> -    struct timespec64               reset_time;
>> -#endif
>>       bool                            scpm_enabled;
>>       uint32_t                        scpm_status;
>> @@ -1105,6 +1100,15 @@ struct amdgpu_device {
>>       uint32_t            aid_mask;
>>   };
>> +#ifdef CONFIG_DEV_COREDUMP
>> +struct amdgpu_coredump_info {
>> +    struct amdgpu_device        *adev;
>> +    struct amdgpu_task_info         reset_task_info;
>> +    struct timespec64               reset_time;
>> +    bool                            reset_vram_lost;
>> +};
> 
> The patch looks good to me in general, but I would recommend slightly 
> different arrangement and segregation of GPU reset information.
> 
> Please consider a higher level structure adev->gpu_reset_info, and move 
> everything related to reset dump info into that, including this new 
> coredump_info structure, something like this:
> 
> struct amdgpu_reset_info {
> 
>      uint32_t *reset_dump_reg_list;
> 
>      uint32_t *reset_dump_reg_value;
> 
>      int num_regs;
> 

Right, I can encapsulate there reset_dump members,

> #ifdef CONFIG_DEV_COREDUMP
> 
>     struct amdgpu_coredump_info *coredump_info;/* keep this dynamic 
> allocation */

but we don't need a pointer for amdgpu_coredump_info inside 
amdgpu_device or inside of amdgpu_device->gpu_reset_info, right?

> 
> #endif
> 
> }
> 
> 
> This will make sure that all the relevant information is at the same place.
> 
> - Shashank
> 
        amdgpu_inc_vram_lost(tmp_adev);
Sharma, Shashank Aug. 17, 2023, 3:04 p.m. UTC | #5
On 17/08/2023 15:45, André Almeida wrote:
> Hi Shashank,
>
> Em 17/08/2023 03:41, Shashank Sharma escreveu:
>> Hello Andre,
>>
>> On 15/08/2023 21:50, André Almeida wrote:
>>> Instead of storing coredump information inside amdgpu_device struct,
>>> move if to a proper separated struct and allocate it dynamically. This
>>> will make it easier to further expand the logged information.
>>>
>>> Signed-off-by: André Almeida <andrealmeid@igalia.com>
>>> ---
>>> v4: change kmalloc to kzalloc
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 14 +++--
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 
>>> ++++++++++++++--------
>>>   2 files changed, 49 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 9c6a332261ab..0d560b713948 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1088,11 +1088,6 @@ struct amdgpu_device {
>>>       uint32_t                        *reset_dump_reg_list;
>>>       uint32_t            *reset_dump_reg_value;
>>>       int                             num_regs;
>>> -#ifdef CONFIG_DEV_COREDUMP
>>> -    struct amdgpu_task_info         reset_task_info;
>>> -    bool                            reset_vram_lost;
>>> -    struct timespec64               reset_time;
>>> -#endif
>>>       bool                            scpm_enabled;
>>>       uint32_t                        scpm_status;
>>> @@ -1105,6 +1100,15 @@ struct amdgpu_device {
>>>       uint32_t            aid_mask;
>>>   };
>>> +#ifdef CONFIG_DEV_COREDUMP
>>> +struct amdgpu_coredump_info {
>>> +    struct amdgpu_device        *adev;
>>> +    struct amdgpu_task_info         reset_task_info;
>>> +    struct timespec64               reset_time;
>>> +    bool                            reset_vram_lost;
>>> +};
>>
>> The patch looks good to me in general, but I would recommend slightly 
>> different arrangement and segregation of GPU reset information.
>>
>> Please consider a higher level structure adev->gpu_reset_info, and 
>> move everything related to reset dump info into that, including this 
>> new coredump_info structure, something like this:
>>
>> struct amdgpu_reset_info {
>>
>>      uint32_t *reset_dump_reg_list;
>>
>>      uint32_t *reset_dump_reg_value;
>>
>>      int num_regs;
>>
>
> Right, I can encapsulate there reset_dump members,
>
>> #ifdef CONFIG_DEV_COREDUMP
>>
>>     struct amdgpu_coredump_info *coredump_info;/* keep this dynamic 
>> allocation */
>
> but we don't need a pointer for amdgpu_coredump_info inside 
> amdgpu_device or inside of amdgpu_device->gpu_reset_info, right?

I think it would be better if we keep all of the GPU reset related data 
in the same structure, so adev->gpu_reset_info->coredump_info sounds 
about right to me.

- Shashank

>
>>
>> #endif
>>
>> }
>>
>>
>> This will make sure that all the relevant information is at the same 
>> place.
>>
>> - Shashank
>>
>        amdgpu_inc_vram_lost(tmp_adev);
André Almeida Aug. 17, 2023, 3:17 p.m. UTC | #6
Em 17/08/2023 12:04, Shashank Sharma escreveu:
> 
> On 17/08/2023 15:45, André Almeida wrote:
>> Hi Shashank,
>>
>> Em 17/08/2023 03:41, Shashank Sharma escreveu:
>>> Hello Andre,
>>>
>>> On 15/08/2023 21:50, André Almeida wrote:
>>>> Instead of storing coredump information inside amdgpu_device struct,
>>>> move if to a proper separated struct and allocate it dynamically. This
>>>> will make it easier to further expand the logged information.
>>>>
>>>> Signed-off-by: André Almeida <andrealmeid@igalia.com>
>>>> ---
>>>> v4: change kmalloc to kzalloc
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 14 +++--
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 
>>>> ++++++++++++++--------
>>>>   2 files changed, 49 insertions(+), 28 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index 9c6a332261ab..0d560b713948 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -1088,11 +1088,6 @@ struct amdgpu_device {
>>>>       uint32_t                        *reset_dump_reg_list;
>>>>       uint32_t            *reset_dump_reg_value;
>>>>       int                             num_regs;
>>>> -#ifdef CONFIG_DEV_COREDUMP
>>>> -    struct amdgpu_task_info         reset_task_info;
>>>> -    bool                            reset_vram_lost;
>>>> -    struct timespec64               reset_time;
>>>> -#endif
>>>>       bool                            scpm_enabled;
>>>>       uint32_t                        scpm_status;
>>>> @@ -1105,6 +1100,15 @@ struct amdgpu_device {
>>>>       uint32_t            aid_mask;
>>>>   };
>>>> +#ifdef CONFIG_DEV_COREDUMP
>>>> +struct amdgpu_coredump_info {
>>>> +    struct amdgpu_device        *adev;
>>>> +    struct amdgpu_task_info         reset_task_info;
>>>> +    struct timespec64               reset_time;
>>>> +    bool                            reset_vram_lost;
>>>> +};
>>>
>>> The patch looks good to me in general, but I would recommend slightly 
>>> different arrangement and segregation of GPU reset information.
>>>
>>> Please consider a higher level structure adev->gpu_reset_info, and 
>>> move everything related to reset dump info into that, including this 
>>> new coredump_info structure, something like this:
>>>
>>> struct amdgpu_reset_info {
>>>
>>>      uint32_t *reset_dump_reg_list;
>>>
>>>      uint32_t *reset_dump_reg_value;
>>>
>>>      int num_regs;
>>>
>>
>> Right, I can encapsulate there reset_dump members,
>>
>>> #ifdef CONFIG_DEV_COREDUMP
>>>
>>>     struct amdgpu_coredump_info *coredump_info;/* keep this dynamic 
>>> allocation */
>>
>> but we don't need a pointer for amdgpu_coredump_info inside 
>> amdgpu_device or inside of amdgpu_device->gpu_reset_info, right?
> 
> I think it would be better if we keep all of the GPU reset related data 
> in the same structure, so adev->gpu_reset_info->coredump_info sounds 
> about right to me.
> 

But after patch 2/4, we don't need to store a coredump_info pointer 
inside adev, this is what I meant. What would be the purpose of having 
this pointer? It's freed by amdgpu_devcoredump_free(), so we don't need 
to keep track of it.

> - Shashank
> 
>>
>>>
>>> #endif
>>>
>>> }
>>>
>>>
>>> This will make sure that all the relevant information is at the same 
>>> place.
>>>
>>> - Shashank
>>>
>>        amdgpu_inc_vram_lost(tmp_adev);
Sharma, Shashank Aug. 17, 2023, 3:26 p.m. UTC | #7
On 17/08/2023 17:17, André Almeida wrote:
>
>
> Em 17/08/2023 12:04, Shashank Sharma escreveu:
>>
>> On 17/08/2023 15:45, André Almeida wrote:
>>> Hi Shashank,
>>>
>>> Em 17/08/2023 03:41, Shashank Sharma escreveu:
>>>> Hello Andre,
>>>>
>>>> On 15/08/2023 21:50, André Almeida wrote:
>>>>> Instead of storing coredump information inside amdgpu_device struct,
>>>>> move if to a proper separated struct and allocate it dynamically. 
>>>>> This
>>>>> will make it easier to further expand the logged information.
>>>>>
>>>>> Signed-off-by: André Almeida <andrealmeid@igalia.com>
>>>>> ---
>>>>> v4: change kmalloc to kzalloc
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 14 +++--
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 
>>>>> ++++++++++++++--------
>>>>>   2 files changed, 49 insertions(+), 28 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> index 9c6a332261ab..0d560b713948 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> @@ -1088,11 +1088,6 @@ struct amdgpu_device {
>>>>>       uint32_t *reset_dump_reg_list;
>>>>>       uint32_t            *reset_dump_reg_value;
>>>>>       int                             num_regs;
>>>>> -#ifdef CONFIG_DEV_COREDUMP
>>>>> -    struct amdgpu_task_info         reset_task_info;
>>>>> -    bool                            reset_vram_lost;
>>>>> -    struct timespec64               reset_time;
>>>>> -#endif
>>>>>       bool                            scpm_enabled;
>>>>>       uint32_t                        scpm_status;
>>>>> @@ -1105,6 +1100,15 @@ struct amdgpu_device {
>>>>>       uint32_t            aid_mask;
>>>>>   };
>>>>> +#ifdef CONFIG_DEV_COREDUMP
>>>>> +struct amdgpu_coredump_info {
>>>>> +    struct amdgpu_device        *adev;
>>>>> +    struct amdgpu_task_info         reset_task_info;
>>>>> +    struct timespec64               reset_time;
>>>>> +    bool                            reset_vram_lost;
>>>>> +};
>>>>
>>>> The patch looks good to me in general, but I would recommend 
>>>> slightly different arrangement and segregation of GPU reset 
>>>> information.
>>>>
>>>> Please consider a higher level structure adev->gpu_reset_info, and 
>>>> move everything related to reset dump info into that, including 
>>>> this new coredump_info structure, something like this:
>>>>
>>>> struct amdgpu_reset_info {
>>>>
>>>>      uint32_t *reset_dump_reg_list;
>>>>
>>>>      uint32_t *reset_dump_reg_value;
>>>>
>>>>      int num_regs;
>>>>
>>>
>>> Right, I can encapsulate there reset_dump members,
>>>
>>>> #ifdef CONFIG_DEV_COREDUMP
>>>>
>>>>     struct amdgpu_coredump_info *coredump_info;/* keep this dynamic 
>>>> allocation */
>>>
>>> but we don't need a pointer for amdgpu_coredump_info inside 
>>> amdgpu_device or inside of amdgpu_device->gpu_reset_info, right?
>>
>> I think it would be better if we keep all of the GPU reset related 
>> data in the same structure, so adev->gpu_reset_info->coredump_info 
>> sounds about right to me.
>>
>
> But after patch 2/4, we don't need to store a coredump_info pointer 
> inside adev, this is what I meant. What would be the purpose of having 
> this pointer? It's freed by amdgpu_devcoredump_free(), so we don't 
> need to keep track of it.

Well, actually we are pulling in some 0parallel efforts on enhancing the 
GPU reset information, and we were planning to use the coredump info for 
some additional things. So if I have the coredump_info available (like 
reset_task_info and vram_lost) across a few functions in the driver with 
adev, it would make my job easy there :).

- Shashank

>
>> - Shashank
>>
>>>
>>>>
>>>> #endif
>>>>
>>>> }
>>>>
>>>>
>>>> This will make sure that all the relevant information is at the 
>>>> same place.
>>>>
>>>> - Shashank
>>>>
>>>        amdgpu_inc_vram_lost(tmp_adev);
André Almeida Aug. 17, 2023, 3:38 p.m. UTC | #8
Em 17/08/2023 12:26, Shashank Sharma escreveu:
> 
> On 17/08/2023 17:17, André Almeida wrote:
>>
>>
>> Em 17/08/2023 12:04, Shashank Sharma escreveu:
>>>
>>> On 17/08/2023 15:45, André Almeida wrote:
>>>> Hi Shashank,
>>>>
>>>> Em 17/08/2023 03:41, Shashank Sharma escreveu:
>>>>> Hello Andre,
>>>>>
>>>>> On 15/08/2023 21:50, André Almeida wrote:
>>>>>> Instead of storing coredump information inside amdgpu_device struct,
>>>>>> move if to a proper separated struct and allocate it dynamically. 
>>>>>> This
>>>>>> will make it easier to further expand the logged information.
>>>>>>
>>>>>> Signed-off-by: André Almeida <andrealmeid@igalia.com>
>>>>>> ---
>>>>>> v4: change kmalloc to kzalloc
>>>>>> ---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 14 +++--
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 
>>>>>> ++++++++++++++--------
>>>>>>   2 files changed, 49 insertions(+), 28 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> index 9c6a332261ab..0d560b713948 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>> @@ -1088,11 +1088,6 @@ struct amdgpu_device {
>>>>>>       uint32_t *reset_dump_reg_list;
>>>>>>       uint32_t            *reset_dump_reg_value;
>>>>>>       int                             num_regs;
>>>>>> -#ifdef CONFIG_DEV_COREDUMP
>>>>>> -    struct amdgpu_task_info         reset_task_info;
>>>>>> -    bool                            reset_vram_lost;
>>>>>> -    struct timespec64               reset_time;
>>>>>> -#endif
>>>>>>       bool                            scpm_enabled;
>>>>>>       uint32_t                        scpm_status;
>>>>>> @@ -1105,6 +1100,15 @@ struct amdgpu_device {
>>>>>>       uint32_t            aid_mask;
>>>>>>   };
>>>>>> +#ifdef CONFIG_DEV_COREDUMP
>>>>>> +struct amdgpu_coredump_info {
>>>>>> +    struct amdgpu_device        *adev;
>>>>>> +    struct amdgpu_task_info         reset_task_info;
>>>>>> +    struct timespec64               reset_time;
>>>>>> +    bool                            reset_vram_lost;
>>>>>> +};
>>>>>
>>>>> The patch looks good to me in general, but I would recommend 
>>>>> slightly different arrangement and segregation of GPU reset 
>>>>> information.
>>>>>
>>>>> Please consider a higher level structure adev->gpu_reset_info, and 
>>>>> move everything related to reset dump info into that, including 
>>>>> this new coredump_info structure, something like this:
>>>>>
>>>>> struct amdgpu_reset_info {
>>>>>
>>>>>      uint32_t *reset_dump_reg_list;
>>>>>
>>>>>      uint32_t *reset_dump_reg_value;
>>>>>
>>>>>      int num_regs;
>>>>>
>>>>
>>>> Right, I can encapsulate there reset_dump members,
>>>>
>>>>> #ifdef CONFIG_DEV_COREDUMP
>>>>>
>>>>>     struct amdgpu_coredump_info *coredump_info;/* keep this dynamic 
>>>>> allocation */
>>>>
>>>> but we don't need a pointer for amdgpu_coredump_info inside 
>>>> amdgpu_device or inside of amdgpu_device->gpu_reset_info, right?
>>>
>>> I think it would be better if we keep all of the GPU reset related 
>>> data in the same structure, so adev->gpu_reset_info->coredump_info 
>>> sounds about right to me.
>>>
>>
>> But after patch 2/4, we don't need to store a coredump_info pointer 
>> inside adev, this is what I meant. What would be the purpose of having 
>> this pointer? It's freed by amdgpu_devcoredump_free(), so we don't 
>> need to keep track of it.
> 
> Well, actually we are pulling in some 0parallel efforts on enhancing the 
> GPU reset information, and we were planning to use the coredump info for 
> some additional things. So if I have the coredump_info available (like 
> reset_task_info and vram_lost) across a few functions in the driver with 
> adev, it would make my job easy there :).

It seems dangerous to use an object with this limited lifetime to rely 
to read on. If you want to use it you will need to change 
amdgpu_devcoredump_free() to drop a reference or you will need to use it 
statically, which defeats the purpose of this patch. Anyway, I'll add it 
as you requested.

> 
> - Shashank
> 
>>
>>> - Shashank
>>>
>>>>
>>>>>
>>>>> #endif
>>>>>
>>>>> }
>>>>>
>>>>>
>>>>> This will make sure that all the relevant information is at the 
>>>>> same place.
>>>>>
>>>>> - Shashank
>>>>>
>>>>        amdgpu_inc_vram_lost(tmp_adev);
Sharma, Shashank Aug. 17, 2023, 3:42 p.m. UTC | #9
On 17/08/2023 17:38, André Almeida wrote:
>
>
> Em 17/08/2023 12:26, Shashank Sharma escreveu:
>>
>> On 17/08/2023 17:17, André Almeida wrote:
>>>
>>>
>>> Em 17/08/2023 12:04, Shashank Sharma escreveu:
>>>>
>>>> On 17/08/2023 15:45, André Almeida wrote:
>>>>> Hi Shashank,
>>>>>
>>>>> Em 17/08/2023 03:41, Shashank Sharma escreveu:
>>>>>> Hello Andre,
>>>>>>
>>>>>> On 15/08/2023 21:50, André Almeida wrote:
>>>>>>> Instead of storing coredump information inside amdgpu_device 
>>>>>>> struct,
>>>>>>> move if to a proper separated struct and allocate it 
>>>>>>> dynamically. This
>>>>>>> will make it easier to further expand the logged information.
>>>>>>>
>>>>>>> Signed-off-by: André Almeida <andrealmeid@igalia.com>
>>>>>>> ---
>>>>>>> v4: change kmalloc to kzalloc
>>>>>>> ---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 14 +++--
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 63 
>>>>>>> ++++++++++++++--------
>>>>>>>   2 files changed, 49 insertions(+), 28 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>> index 9c6a332261ab..0d560b713948 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>>>> @@ -1088,11 +1088,6 @@ struct amdgpu_device {
>>>>>>>       uint32_t *reset_dump_reg_list;
>>>>>>>       uint32_t            *reset_dump_reg_value;
>>>>>>>       int                             num_regs;
>>>>>>> -#ifdef CONFIG_DEV_COREDUMP
>>>>>>> -    struct amdgpu_task_info         reset_task_info;
>>>>>>> -    bool                            reset_vram_lost;
>>>>>>> -    struct timespec64               reset_time;
>>>>>>> -#endif
>>>>>>>       bool                            scpm_enabled;
>>>>>>>       uint32_t                        scpm_status;
>>>>>>> @@ -1105,6 +1100,15 @@ struct amdgpu_device {
>>>>>>>       uint32_t            aid_mask;
>>>>>>>   };
>>>>>>> +#ifdef CONFIG_DEV_COREDUMP
>>>>>>> +struct amdgpu_coredump_info {
>>>>>>> +    struct amdgpu_device        *adev;
>>>>>>> +    struct amdgpu_task_info         reset_task_info;
>>>>>>> +    struct timespec64               reset_time;
>>>>>>> +    bool                            reset_vram_lost;
>>>>>>> +};
>>>>>>
>>>>>> The patch looks good to me in general, but I would recommend 
>>>>>> slightly different arrangement and segregation of GPU reset 
>>>>>> information.
>>>>>>
>>>>>> Please consider a higher level structure adev->gpu_reset_info, 
>>>>>> and move everything related to reset dump info into that, 
>>>>>> including this new coredump_info structure, something like this:
>>>>>>
>>>>>> struct amdgpu_reset_info {
>>>>>>
>>>>>>      uint32_t *reset_dump_reg_list;
>>>>>>
>>>>>>      uint32_t *reset_dump_reg_value;
>>>>>>
>>>>>>      int num_regs;
>>>>>>
>>>>>
>>>>> Right, I can encapsulate there reset_dump members,
>>>>>
>>>>>> #ifdef CONFIG_DEV_COREDUMP
>>>>>>
>>>>>>     struct amdgpu_coredump_info *coredump_info;/* keep this 
>>>>>> dynamic allocation */
>>>>>
>>>>> but we don't need a pointer for amdgpu_coredump_info inside 
>>>>> amdgpu_device or inside of amdgpu_device->gpu_reset_info, right?
>>>>
>>>> I think it would be better if we keep all of the GPU reset related 
>>>> data in the same structure, so adev->gpu_reset_info->coredump_info 
>>>> sounds about right to me.
>>>>
>>>
>>> But after patch 2/4, we don't need to store a coredump_info pointer 
>>> inside adev, this is what I meant. What would be the purpose of 
>>> having this pointer? It's freed by amdgpu_devcoredump_free(), so we 
>>> don't need to keep track of it.
>>
>> Well, actually we are pulling in some 0parallel efforts on enhancing 
>> the GPU reset information, and we were planning to use the coredump 
>> info for some additional things. So if I have the coredump_info 
>> available (like reset_task_info and vram_lost) across a few functions 
>> in the driver with adev, it would make my job easy there :).
>
> It seems dangerous to use an object with this limited lifetime to rely 
> to read on. If you want to use it you will need to change 
> amdgpu_devcoredump_free() to drop a reference or you will need to use 
> it statically, which defeats the purpose of this patch. Anyway, I'll 
> add it as you requested.
>
I guess if the coredump_free function can make the

adev->reset_info->coredump_info= NULL, after freeing it, that will 
actually help the case.


While consuming it, I can simply check if 
(adev->reset_info->coredump_info) is available to be read.

- Shashank

>>
>> - Shashank
>>
>>>
>>>> - Shashank
>>>>
>>>>>
>>>>>>
>>>>>> #endif
>>>>>>
>>>>>> }
>>>>>>
>>>>>>
>>>>>> This will make sure that all the relevant information is at the 
>>>>>> same place.
>>>>>>
>>>>>> - Shashank
>>>>>>
>>>>>        amdgpu_inc_vram_lost(tmp_adev);
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9c6a332261ab..0d560b713948 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1088,11 +1088,6 @@  struct amdgpu_device {
 	uint32_t                        *reset_dump_reg_list;
 	uint32_t			*reset_dump_reg_value;
 	int                             num_regs;
-#ifdef CONFIG_DEV_COREDUMP
-	struct amdgpu_task_info         reset_task_info;
-	bool                            reset_vram_lost;
-	struct timespec64               reset_time;
-#endif
 
 	bool                            scpm_enabled;
 	uint32_t                        scpm_status;
@@ -1105,6 +1100,15 @@  struct amdgpu_device {
 	uint32_t			aid_mask;
 };
 
+#ifdef CONFIG_DEV_COREDUMP
+struct amdgpu_coredump_info {
+	struct amdgpu_device		*adev;
+	struct amdgpu_task_info         reset_task_info;
+	struct timespec64               reset_time;
+	bool                            reset_vram_lost;
+};
+#endif
+
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
 {
 	return container_of(ddev, struct amdgpu_device, ddev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bf4781551f88..b5b879bcc5c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4799,12 +4799,17 @@  static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
 	return 0;
 }
 
-#ifdef CONFIG_DEV_COREDUMP
+#ifndef CONFIG_DEV_COREDUMP
+static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
+			    struct amdgpu_reset_context *reset_context)
+{
+}
+#else
 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
 		size_t count, void *data, size_t datalen)
 {
 	struct drm_printer p;
-	struct amdgpu_device *adev = data;
+	struct amdgpu_coredump_info *coredump = data;
 	struct drm_print_iterator iter;
 	int i;
 
@@ -4818,21 +4823,21 @@  static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
 	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
 	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
 	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
-	drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
-	if (adev->reset_task_info.pid)
+	drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
+	if (coredump->reset_task_info.pid)
 		drm_printf(&p, "process_name: %s PID: %d\n",
-			   adev->reset_task_info.process_name,
-			   adev->reset_task_info.pid);
+			   coredump->reset_task_info.process_name,
+			   coredump->reset_task_info.pid);
 
-	if (adev->reset_vram_lost)
+	if (coredump->reset_vram_lost)
 		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
-	if (adev->num_regs) {
+	if (coredump->adev->num_regs) {
 		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
 
-		for (i = 0; i < adev->num_regs; i++)
+		for (i = 0; i < coredump->adev->num_regs; i++)
 			drm_printf(&p, "0x%08x: 0x%08x\n",
-				   adev->reset_dump_reg_list[i],
-				   adev->reset_dump_reg_value[i]);
+				   coredump->adev->reset_dump_reg_list[i],
+				   coredump->adev->reset_dump_reg_value[i]);
 	}
 
 	return count - iter.remain;
@@ -4840,14 +4845,32 @@  static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
 
 static void amdgpu_devcoredump_free(void *data)
 {
+	kfree(data);
 }
 
-static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
+static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
+			    struct amdgpu_reset_context *reset_context)
 {
+	struct amdgpu_coredump_info *coredump;
 	struct drm_device *dev = adev_to_drm(adev);
 
-	ktime_get_ts64(&adev->reset_time);
-	dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
+	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
+
+	if (!coredump) {
+		DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
+		return;
+	}
+
+	coredump->reset_vram_lost = vram_lost;
+
+	if (reset_context->job && reset_context->job->vm)
+		coredump->reset_task_info = reset_context->job->vm->task_info;
+
+	coredump->adev = adev;
+
+	ktime_get_ts64(&coredump->reset_time);
+
+	dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
 		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
 }
 #endif
@@ -4955,15 +4978,9 @@  int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 					goto out;
 
 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
-#ifdef CONFIG_DEV_COREDUMP
-				tmp_adev->reset_vram_lost = vram_lost;
-				memset(&tmp_adev->reset_task_info, 0,
-						sizeof(tmp_adev->reset_task_info));
-				if (reset_context->job && reset_context->job->vm)
-					tmp_adev->reset_task_info =
-						reset_context->job->vm->task_info;
-				amdgpu_reset_capture_coredumpm(tmp_adev);
-#endif
+
+				amdgpu_coredump(tmp_adev, vram_lost, reset_context);
+
 				if (vram_lost) {
 					DRM_INFO("VRAM is lost due to GPU reset!\n");
 					amdgpu_inc_vram_lost(tmp_adev);