diff mbox series

drm/msm: Optimize GPU crashstate capture read path

Message ID 1541061341-29994-1-git-send-email-smasetty@codeaurora.org (mailing list archive)
State Not Applicable, archived
Headers show
Series drm/msm: Optimize GPU crashstate capture read path | expand

Commit Message

Sharat Masetty Nov. 1, 2018, 8:35 a.m. UTC
When the userspace tries to read the crashstate dump, the read side
implementation in the driver currently ascii85 encodes all the binary
buffers and it does this each time the read system call is called.
A userspace tool like cat typically does a page by page read and the
number of read calls depends on the size of the data captured by the
driver. This is certainly not desirable and does not scale well with
large captures.

This patch starts off with moving the encoding part to the capture time,
that is when the GPU tries to recover after a hang. Now we only encode
once per buffer object and not per page read. With this there is an
immediate >10X speed improvement in crashstate save time.

The flip side however is that the GPU recovery time increases and this
negatively impacts the user experience, so this is handled by moving the
encoding part to a worker thread and only register with the dev_coredump
once the encoding of the buffers is complete.

Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
---
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 35 +++----------
 drivers/gpu/drm/msm/msm_gpu.c           | 93 +++++++++++++++++++++++++++++++--
 drivers/gpu/drm/msm/msm_gpu.h           |  1 +
 3 files changed, 98 insertions(+), 31 deletions(-)

Comments

Jordan Crouse Nov. 1, 2018, 3:04 p.m. UTC | #1
On Thu, Nov 01, 2018 at 02:05:41PM +0530, Sharat Masetty wrote:
> When the userspace tries to read the crashstate dump, the read side
> implementation in the driver currently ascii85 encodes all the binary
> buffers and it does this each time the read system call is called.
> A userspace tool like cat typically does a page by page read and the
> number of read calls depends on the size of the data captured by the
> driver. This is certainly not desirable and does not scale well with
> large captures.
> 
> This patch starts off with moving the encoding part to the capture time,
> that is when the GPU tries to recover after a hang. Now we only encode
> once per buffer object and not per page read. With this there is an
> immediate >10X speed improvement in crashstate save time.
> 
> The flip side however is that the GPU recovery time increases and this
> negatively impacts the user experience, so this is handled by moving the
> encoding part to a worker thread and only register with the dev_coredump
> once the encoding of the buffers is complete.

Does your tree have https://patchwork.freedesktop.org/patch/257181/ ? That will
help with a lot of the problems you have described.

> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
> ---
>  drivers/gpu/drm/msm/adreno/adreno_gpu.c | 35 +++----------
>  drivers/gpu/drm/msm/msm_gpu.c           | 93 +++++++++++++++++++++++++++++++--
>  drivers/gpu/drm/msm/msm_gpu.h           |  1 +
>  3 files changed, 98 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> index 141062f..7441571 100644
> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> @@ -406,7 +406,7 @@ int adreno_gpu_state_get(struct msm_gpu *gpu, struct msm_gpu_state *state)
>  				size = j + 1;
>  
>  		if (size) {
> -			state->ring[i].data = kmalloc(size << 2, GFP_KERNEL);
> +			state->ring[i].data = kvmalloc(size << 2, GFP_KERNEL);
>  			if (state->ring[i].data) {
>  				memcpy(state->ring[i].data, gpu->rb[i]->start, size << 2);
>  				state->ring[i].data_size = size << 2;

This is a good fix, but unrelated to this change as detailed above. Should be
separate.

> @@ -445,7 +445,7 @@ void adreno_gpu_state_destroy(struct msm_gpu_state *state)
>  	int i;
>  
>  	for (i = 0; i < ARRAY_SIZE(state->ring); i++)
> -		kfree(state->ring[i].data);
> +		kvfree(state->ring[i].data);

As above.

>  
>  	for (i = 0; state->bos && i < state->nr_bos; i++)
>  		kvfree(state->bos[i].data);
> @@ -475,34 +475,15 @@ int adreno_gpu_state_put(struct msm_gpu_state *state)
>  
>  #if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
>  
> -static void adreno_show_object(struct drm_printer *p, u32 *ptr, int len)
> +static void adreno_show_object(struct drm_printer *p, void *ptr)
>  {
> -	char out[ASCII85_BUFSZ];
> -	long l, datalen, i;
> -
> -	if (!ptr || !len)
> -		return;
> -
> -	/*
> -	 * Only dump the non-zero part of the buffer - rarely will any data
> -	 * completely fill the entire allocated size of the buffer
> -	 */
> -	for (datalen = 0, i = 0; i < len >> 2; i++) {
> -		if (ptr[i])
> -			datalen = (i << 2) + 1;
> -	}
> -
> -	/* Skip printing the object if it is empty */
> -	if (datalen == 0)
> +	if (!ptr)
>  		return;
>  
> -	l = ascii85_encode_len(datalen);
> -
>  	drm_puts(p, "    data: !!ascii85 |\n");
>  	drm_puts(p, "     ");
>  
> -	for (i = 0; i < l; i++)
> -		drm_puts(p, ascii85_encode(ptr[i], out));
> +	drm_puts(p, ptr);
>  
>  	drm_puts(p, "\n");
>  }
> @@ -534,8 +515,7 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
>  		drm_printf(p, "    wptr: %d\n", state->ring[i].wptr);
>  		drm_printf(p, "    size: %d\n", MSM_GPU_RINGBUFFER_SZ);
>  
> -		adreno_show_object(p, state->ring[i].data,
> -			state->ring[i].data_size);
> +		adreno_show_object(p, state->ring[i].data);
>  	}
>  
>  	if (state->bos) {
> @@ -546,8 +526,7 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
>  				state->bos[i].iova);
>  			drm_printf(p, "    size: %zd\n", state->bos[i].size);
>  
> -			adreno_show_object(p, state->bos[i].data,
> -				state->bos[i].size);
> +			adreno_show_object(p, state->bos[i].data);
>  		}
>  	}
>  
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index ff95848..d5533f1 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -21,6 +21,7 @@
>  #include "msm_fence.h"
>  
>  #include <generated/utsrelease.h>
> +#include <linux/ascii85.h>
>  #include <linux/string_helpers.h>
>  #include <linux/pm_opp.h>
>  #include <linux/devfreq.h>
> @@ -375,9 +376,7 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
>  	/* Set the active crash state to be dumped on failure */
>  	gpu->crashstate = state;
>  
> -	/* FIXME: Release the crashstate if this errors out? */
> -	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
> -		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
> +	schedule_work(&gpu->crashstate_work);

I'm super hesitant to ever add anything having to do with scheduling because we
already abuse it so badly but this seems like it might be okay since it isn't
high priority and not related to timing.  I would be curious what others think.

>  }
>  #else
>  static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
> @@ -420,6 +419,93 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
>  
>  static void retire_submits(struct msm_gpu *gpu);
>  
> +static char *msm_gpu_ascii85_encode_buf(u32 *src, size_t len)
> +{
> +	void *buf;
> +	size_t buf_itr = 0, buffer_size;
> +	char out[ASCII85_BUFSZ];
> +	long l;
> +	int i;
> +
> +	if (!src || !len)
> +		return NULL;

I don't think src() will ever legitimately be NULL so you can remove the check.

> +	l = ascii85_encode_len(len);
> +
> +	/*
> +	 * Ascii85 outputs either a 5 byte string or a 1 byte string. So we
> +	 * account for the worst case of 5 bytes per dword plus the 1 for '\0'
> +	 */
> +	buffer_size = (l * 5) + 1;

buffer_size is't needed - you can do the math inline in the kvmalloc() function.

> +
> +	buf = kvmalloc(buffer_size, GFP_KERNEL);
> +	if (!buf)
> +		return NULL;
> +
> +	for (i = 0; i < l; i++)
> +		buf_itr += snprintf(buf + buf_itr, buffer_size - buf_itr, "%s",

This specific algorithm should use scnprintf() here but on a broader note
sprintf() and friends is a bad choice if performance is important because it
jumps through more hoops to eventually do what amounts to a mempcy (this is the
reason why seq_printf() and seq_puts() are different things and checkpatch
insists that you use seq_puts() for constant strings).

Inf you are planning on  outputting into a buffer you should just make a custom
ascii85 function to output directly and avoid copying in the first place. 

If you wanted to get fancy you could add it to include/linux/ascii85.h - 
ascii85_encode_to_buf() or some such.

> +				ascii85_encode(src[i], out));
> +
> +	return buf;
> +}
> +
> +/*
> + * Convert the binary data in the gpu crash state capture(like bos and
> + * ringbuffer data) to ascii85 encoded strings. Note that the encoded
> + * string is NULL terminated.

This is technically not true because you are specifically accounting for a NULL
terminator in the function above.

> + */
> +static void crashstate_worker(struct work_struct *work)
> +{
> +	struct msm_gpu_state *state;
> +	struct msm_gpu *gpu = container_of(work, struct msm_gpu,
> +			crashstate_work);
> +	int i;
> +
> +	state = msm_gpu_crashstate_get(gpu);
> +	if (!state)
> +		return;
> +
> +	for (i = 0; i < MSM_GPU_MAX_RINGS; i++) {
> +		void *buf = NULL;

You don't needed to initialize buf.

> +
> +		if (!state->ring[i].data)
> +			continue;
> +

A comment here describing the variable switch-a-roo you are doing would be
helpful.

> +		buf = state->ring[i].data;
> +
> +		state->ring[i].data = msm_gpu_ascii85_encode_buf(buf,
> +				state->ring[i].data_size);
> +		kvfree(buf);
> +	}
> +
> +	for (i = 0; i < state->nr_bos; i++) {
> +		u32 *buf = NULL;

buf does not need to be initialized.

> +		long datalen, j;
> +
> +		if (!state->bos[i].data)
> +			continue;
> +
> +		buf = state->bos[i].data;
> +
> +		/*
> +		 * Only dump the non-zero part of the buffer - rarely will
> +		 * any data completely fill the entire allocated size of the
> +		 * buffer
> +		 */
> +		for (datalen = 0, j = 0; j < state->bos[i].size >> 2; j++)
> +			if (buf[j])
> +				datalen = ((j + 1) << 2);
> +
> +		state->bos[i].data = msm_gpu_ascii85_encode_buf(buf, datalen);
> +		kvfree(buf);
> +	}
> +
> +	msm_gpu_crashstate_put(gpu);
> +
> +	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
> +		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
> +}
> +
>  static void recover_worker(struct work_struct *work)
>  {
>  	struct msm_gpu *gpu = container_of(work, struct msm_gpu, recover_work);
> @@ -856,6 +942,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>  	INIT_LIST_HEAD(&gpu->active_list);
>  	INIT_WORK(&gpu->retire_work, retire_worker);
>  	INIT_WORK(&gpu->recover_work, recover_worker);
> +	INIT_WORK(&gpu->crashstate_work, crashstate_worker);
>  
>  
>  	timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index f82bac0..8825069 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -137,6 +137,7 @@ struct msm_gpu {
>  	} devfreq;
>  
>  	struct msm_gpu_state *crashstate;
> +	struct work_struct crashstate_work;
>  };
>  
>  /* It turns out that all targets use the same ringbuffer size */
> -- 
> 1.9.1
>
Sharat Masetty Nov. 2, 2018, 9:42 a.m. UTC | #2
Thanks for the comments Jordan -

On 11/1/2018 8:34 PM, Jordan Crouse wrote:
> On Thu, Nov 01, 2018 at 02:05:41PM +0530, Sharat Masetty wrote:
>> When the userspace tries to read the crashstate dump, the read side
>> implementation in the driver currently ascii85 encodes all the binary
>> buffers and it does this each time the read system call is called.
>> A userspace tool like cat typically does a page by page read and the
>> number of read calls depends on the size of the data captured by the
>> driver. This is certainly not desirable and does not scale well with
>> large captures.
>>
>> This patch starts off with moving the encoding part to the capture time,
>> that is when the GPU tries to recover after a hang. Now we only encode
>> once per buffer object and not per page read. With this there is an
>> immediate >10X speed improvement in crashstate save time.
>>
>> The flip side however is that the GPU recovery time increases and this
>> negatively impacts the user experience, so this is handled by moving the
>> encoding part to a worker thread and only register with the dev_coredump
>> once the encoding of the buffers is complete.
> 
> Does your tree have https://patchwork.freedesktop.org/patch/257181/ ? That will
> help with a lot of the problems you have described.
The issue is that even with small sized files like ~5MB or so, the 
capture time is huge and sometime does not survive the dev_coredump 
timeout of 5 minutes. With this change however I am able to dump large 
file sizes like 60MB or so in reasonable amount of time (~50 seconds).

As for the patch that you shared, I am thinking that we should drop it 
and instead go with this or whatever we agree on. The patch loses stuff 
like IB2's which might be important while debugging hang issues.
> 
>> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
>> ---
>>   drivers/gpu/drm/msm/adreno/adreno_gpu.c | 35 +++----------
>>   drivers/gpu/drm/msm/msm_gpu.c           | 93 +++++++++++++++++++++++++++++++--
>>   drivers/gpu/drm/msm/msm_gpu.h           |  1 +
>>   3 files changed, 98 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> index 141062f..7441571 100644
>> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> @@ -406,7 +406,7 @@ int adreno_gpu_state_get(struct msm_gpu *gpu, struct msm_gpu_state *state)
>>   				size = j + 1;
>>   
>>   		if (size) {
>> -			state->ring[i].data = kmalloc(size << 2, GFP_KERNEL);
>> +			state->ring[i].data = kvmalloc(size << 2, GFP_KERNEL);
>>   			if (state->ring[i].data) {
>>   				memcpy(state->ring[i].data, gpu->rb[i]->start, size << 2);
>>   				state->ring[i].data_size = size << 2;
> 
> This is a good fix, but unrelated to this change as detailed above. Should be
> separate.
> 
>> @@ -445,7 +445,7 @@ void adreno_gpu_state_destroy(struct msm_gpu_state *state)
>>   	int i;
>>   
>>   	for (i = 0; i < ARRAY_SIZE(state->ring); i++)
>> -		kfree(state->ring[i].data);
>> +		kvfree(state->ring[i].data);
> 
> As above.
> 
>>   
>>   	for (i = 0; state->bos && i < state->nr_bos; i++)
>>   		kvfree(state->bos[i].data);
>> @@ -475,34 +475,15 @@ int adreno_gpu_state_put(struct msm_gpu_state *state)
>>   
>>   #if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
>>   
>> -static void adreno_show_object(struct drm_printer *p, u32 *ptr, int len)
>> +static void adreno_show_object(struct drm_printer *p, void *ptr)
>>   {
>> -	char out[ASCII85_BUFSZ];
>> -	long l, datalen, i;
>> -
>> -	if (!ptr || !len)
>> -		return;
>> -
>> -	/*
>> -	 * Only dump the non-zero part of the buffer - rarely will any data
>> -	 * completely fill the entire allocated size of the buffer
>> -	 */
>> -	for (datalen = 0, i = 0; i < len >> 2; i++) {
>> -		if (ptr[i])
>> -			datalen = (i << 2) + 1;
>> -	}
>> -
>> -	/* Skip printing the object if it is empty */
>> -	if (datalen == 0)
>> +	if (!ptr)
>>   		return;
>>   
>> -	l = ascii85_encode_len(datalen);
>> -
>>   	drm_puts(p, "    data: !!ascii85 |\n");
>>   	drm_puts(p, "     ");
>>   
>> -	for (i = 0; i < l; i++)
>> -		drm_puts(p, ascii85_encode(ptr[i], out));
>> +	drm_puts(p, ptr);
>>   
>>   	drm_puts(p, "\n");
>>   }
>> @@ -534,8 +515,7 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
>>   		drm_printf(p, "    wptr: %d\n", state->ring[i].wptr);
>>   		drm_printf(p, "    size: %d\n", MSM_GPU_RINGBUFFER_SZ);
>>   
>> -		adreno_show_object(p, state->ring[i].data,
>> -			state->ring[i].data_size);
>> +		adreno_show_object(p, state->ring[i].data);
>>   	}
>>   
>>   	if (state->bos) {
>> @@ -546,8 +526,7 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
>>   				state->bos[i].iova);
>>   			drm_printf(p, "    size: %zd\n", state->bos[i].size);
>>   
>> -			adreno_show_object(p, state->bos[i].data,
>> -				state->bos[i].size);
>> +			adreno_show_object(p, state->bos[i].data);
>>   		}
>>   	}
>>   
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
>> index ff95848..d5533f1 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.c
>> +++ b/drivers/gpu/drm/msm/msm_gpu.c
>> @@ -21,6 +21,7 @@
>>   #include "msm_fence.h"
>>   
>>   #include <generated/utsrelease.h>
>> +#include <linux/ascii85.h>
>>   #include <linux/string_helpers.h>
>>   #include <linux/pm_opp.h>
>>   #include <linux/devfreq.h>
>> @@ -375,9 +376,7 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
>>   	/* Set the active crash state to be dumped on failure */
>>   	gpu->crashstate = state;
>>   
>> -	/* FIXME: Release the crashstate if this errors out? */
>> -	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
>> -		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
>> +	schedule_work(&gpu->crashstate_work);
> 
> I'm super hesitant to ever add anything having to do with scheduling because we
> already abuse it so badly but this seems like it might be okay since it isn't
> high priority and not related to timing.  I would be curious what others think.
> 
>>   }
>>   #else
>>   static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
>> @@ -420,6 +419,93 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
>>   
>>   static void retire_submits(struct msm_gpu *gpu);
>>   
>> +static char *msm_gpu_ascii85_encode_buf(u32 *src, size_t len)
>> +{
>> +	void *buf;
>> +	size_t buf_itr = 0, buffer_size;
>> +	char out[ASCII85_BUFSZ];
>> +	long l;
>> +	int i;
>> +
>> +	if (!src || !len)
>> +		return NULL;
> 
> I don't think src() will ever legitimately be NULL so you can remove the check.
> 
>> +	l = ascii85_encode_len(len);
>> +
>> +	/*
>> +	 * Ascii85 outputs either a 5 byte string or a 1 byte string. So we
>> +	 * account for the worst case of 5 bytes per dword plus the 1 for '\0'
>> +	 */
>> +	buffer_size = (l * 5) + 1;
> 
> buffer_size is't needed - you can do the math inline in the kvmalloc() function.
> 
>> +
>> +	buf = kvmalloc(buffer_size, GFP_KERNEL);
>> +	if (!buf)
>> +		return NULL;
>> +
>> +	for (i = 0; i < l; i++)
>> +		buf_itr += snprintf(buf + buf_itr, buffer_size - buf_itr, "%s",
> 
> This specific algorithm should use scnprintf() here but on a broader note
> sprintf() and friends is a bad choice if performance is important because it
> jumps through more hoops to eventually do what amounts to a mempcy (this is the
> reason why seq_printf() and seq_puts() are different things and checkpatch
> insists that you use seq_puts() for constant strings).
> 
> Inf you are planning on  outputting into a buffer you should just make a custom
> ascii85 function to output directly and avoid copying in the first place.
> 
> If you wanted to get fancy you could add it to include/linux/ascii85.h -
> ascii85_encode_to_buf() or some such.
> 
Ah! This is a good point - I will try out something on these lines.
>> +				ascii85_encode(src[i], out));
>> +
>> +	return buf;
>> +}
>> +
>> +/*
>> + * Convert the binary data in the gpu crash state capture(like bos and
>> + * ringbuffer data) to ascii85 encoded strings. Note that the encoded
>> + * string is NULL terminated.
> 
> This is technically not true because you are specifically accounting for a NULL
> terminator in the function above.
> 
>> + */
>> +static void crashstate_worker(struct work_struct *work)
>> +{
>> +	struct msm_gpu_state *state;
>> +	struct msm_gpu *gpu = container_of(work, struct msm_gpu,
>> +			crashstate_work);
>> +	int i;
>> +
>> +	state = msm_gpu_crashstate_get(gpu);
>> +	if (!state)
>> +		return;
>> +
>> +	for (i = 0; i < MSM_GPU_MAX_RINGS; i++) {
>> +		void *buf = NULL;
> 
> You don't needed to initialize buf.
> 
>> +
>> +		if (!state->ring[i].data)
>> +			continue;
>> +
> 
> A comment here describing the variable switch-a-roo you are doing would be
> helpful.
> 
>> +		buf = state->ring[i].data;
>> +
>> +		state->ring[i].data = msm_gpu_ascii85_encode_buf(buf,
>> +				state->ring[i].data_size);
>> +		kvfree(buf);
>> +	}
>> +
>> +	for (i = 0; i < state->nr_bos; i++) {
>> +		u32 *buf = NULL;
> 
> buf does not need to be initialized.
> 
>> +		long datalen, j;
>> +
>> +		if (!state->bos[i].data)
>> +			continue;
>> +
>> +		buf = state->bos[i].data;
>> +
>> +		/*
>> +		 * Only dump the non-zero part of the buffer - rarely will
>> +		 * any data completely fill the entire allocated size of the
>> +		 * buffer
>> +		 */
>> +		for (datalen = 0, j = 0; j < state->bos[i].size >> 2; j++)
>> +			if (buf[j])
>> +				datalen = ((j + 1) << 2);
>> +
>> +		state->bos[i].data = msm_gpu_ascii85_encode_buf(buf, datalen);
>> +		kvfree(buf);
>> +	}
>> +
>> +	msm_gpu_crashstate_put(gpu);
>> +
>> +	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
>> +		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
>> +}
>> +
>>   static void recover_worker(struct work_struct *work)
>>   {
>>   	struct msm_gpu *gpu = container_of(work, struct msm_gpu, recover_work);
>> @@ -856,6 +942,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>>   	INIT_LIST_HEAD(&gpu->active_list);
>>   	INIT_WORK(&gpu->retire_work, retire_worker);
>>   	INIT_WORK(&gpu->recover_work, recover_worker);
>> +	INIT_WORK(&gpu->crashstate_work, crashstate_worker);
>>   
>>   
>>   	timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
>> index f82bac0..8825069 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.h
>> +++ b/drivers/gpu/drm/msm/msm_gpu.h
>> @@ -137,6 +137,7 @@ struct msm_gpu {
>>   	} devfreq;
>>   
>>   	struct msm_gpu_state *crashstate;
>> +	struct work_struct crashstate_work;
>>   };
>>   
>>   /* It turns out that all targets use the same ringbuffer size */
>> -- 
>> 1.9.1
>>
>
Jordan Crouse Nov. 2, 2018, 2:06 p.m. UTC | #3
On Fri, Nov 02, 2018 at 03:12:51PM +0530, Sharat Masetty wrote:
> Thanks for the comments Jordan -
> 
> On 11/1/2018 8:34 PM, Jordan Crouse wrote:
> >On Thu, Nov 01, 2018 at 02:05:41PM +0530, Sharat Masetty wrote:
> >>When the userspace tries to read the crashstate dump, the read side
> >>implementation in the driver currently ascii85 encodes all the binary
> >>buffers and it does this each time the read system call is called.
> >>A userspace tool like cat typically does a page by page read and the
> >>number of read calls depends on the size of the data captured by the
> >>driver. This is certainly not desirable and does not scale well with
> >>large captures.
> >>
> >>This patch starts off with moving the encoding part to the capture time,
> >>that is when the GPU tries to recover after a hang. Now we only encode
> >>once per buffer object and not per page read. With this there is an
> >>immediate >10X speed improvement in crashstate save time.
> >>
> >>The flip side however is that the GPU recovery time increases and this
> >>negatively impacts the user experience, so this is handled by moving the
> >>encoding part to a worker thread and only register with the dev_coredump
> >>once the encoding of the buffers is complete.
> >
> >Does your tree have https://patchwork.freedesktop.org/patch/257181/ ? That will
> >help with a lot of the problems you have described.
> The issue is that even with small sized files like ~5MB or so, the
> capture time is huge and sometime does not survive the dev_coredump
> timeout of 5 minutes. With this change however I am able to dump
> large file sizes like 60MB or so in reasonable amount of time (~50
> seconds).
> 
> As for the patch that you shared, I am thinking that we should drop
> it and instead go with this or whatever we agree on. The patch loses
> stuff like IB2's which might be important while debugging hang
> issues.

You won't lose IB2s if you mark them as MSM_SUBMIT_CMD_IB_TARGET_BUF and Rob is
working on some new changes for userspace to identify buffers as "dumpable"
which will coincide with the patch I posted.

Thats not to say that your patch isn't appropriate - there is good stuff here
but even with it in place we probably shouldn't be generating 60MB dumps unless
we're going for some sort of replay scenario (which we could enabled with an
appropriate debugfs flag).

Jordan
diff mbox series

Patch

diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index 141062f..7441571 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -406,7 +406,7 @@  int adreno_gpu_state_get(struct msm_gpu *gpu, struct msm_gpu_state *state)
 				size = j + 1;
 
 		if (size) {
-			state->ring[i].data = kmalloc(size << 2, GFP_KERNEL);
+			state->ring[i].data = kvmalloc(size << 2, GFP_KERNEL);
 			if (state->ring[i].data) {
 				memcpy(state->ring[i].data, gpu->rb[i]->start, size << 2);
 				state->ring[i].data_size = size << 2;
@@ -445,7 +445,7 @@  void adreno_gpu_state_destroy(struct msm_gpu_state *state)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(state->ring); i++)
-		kfree(state->ring[i].data);
+		kvfree(state->ring[i].data);
 
 	for (i = 0; state->bos && i < state->nr_bos; i++)
 		kvfree(state->bos[i].data);
@@ -475,34 +475,15 @@  int adreno_gpu_state_put(struct msm_gpu_state *state)
 
 #if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
 
-static void adreno_show_object(struct drm_printer *p, u32 *ptr, int len)
+static void adreno_show_object(struct drm_printer *p, void *ptr)
 {
-	char out[ASCII85_BUFSZ];
-	long l, datalen, i;
-
-	if (!ptr || !len)
-		return;
-
-	/*
-	 * Only dump the non-zero part of the buffer - rarely will any data
-	 * completely fill the entire allocated size of the buffer
-	 */
-	for (datalen = 0, i = 0; i < len >> 2; i++) {
-		if (ptr[i])
-			datalen = (i << 2) + 1;
-	}
-
-	/* Skip printing the object if it is empty */
-	if (datalen == 0)
+	if (!ptr)
 		return;
 
-	l = ascii85_encode_len(datalen);
-
 	drm_puts(p, "    data: !!ascii85 |\n");
 	drm_puts(p, "     ");
 
-	for (i = 0; i < l; i++)
-		drm_puts(p, ascii85_encode(ptr[i], out));
+	drm_puts(p, ptr);
 
 	drm_puts(p, "\n");
 }
@@ -534,8 +515,7 @@  void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
 		drm_printf(p, "    wptr: %d\n", state->ring[i].wptr);
 		drm_printf(p, "    size: %d\n", MSM_GPU_RINGBUFFER_SZ);
 
-		adreno_show_object(p, state->ring[i].data,
-			state->ring[i].data_size);
+		adreno_show_object(p, state->ring[i].data);
 	}
 
 	if (state->bos) {
@@ -546,8 +526,7 @@  void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
 				state->bos[i].iova);
 			drm_printf(p, "    size: %zd\n", state->bos[i].size);
 
-			adreno_show_object(p, state->bos[i].data,
-				state->bos[i].size);
+			adreno_show_object(p, state->bos[i].data);
 		}
 	}
 
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index ff95848..d5533f1 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -21,6 +21,7 @@ 
 #include "msm_fence.h"
 
 #include <generated/utsrelease.h>
+#include <linux/ascii85.h>
 #include <linux/string_helpers.h>
 #include <linux/pm_opp.h>
 #include <linux/devfreq.h>
@@ -375,9 +376,7 @@  static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
 	/* Set the active crash state to be dumped on failure */
 	gpu->crashstate = state;
 
-	/* FIXME: Release the crashstate if this errors out? */
-	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
-		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
+	schedule_work(&gpu->crashstate_work);
 }
 #else
 static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
@@ -420,6 +419,93 @@  static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
 
 static void retire_submits(struct msm_gpu *gpu);
 
+static char *msm_gpu_ascii85_encode_buf(u32 *src, size_t len)
+{
+	void *buf;
+	size_t buf_itr = 0, buffer_size;
+	char out[ASCII85_BUFSZ];
+	long l;
+	int i;
+
+	if (!src || !len)
+		return NULL;
+
+	l = ascii85_encode_len(len);
+
+	/*
+	 * Ascii85 outputs either a 5 byte string or a 1 byte string. So we
+	 * account for the worst case of 5 bytes per dword plus the 1 for '\0'
+	 */
+	buffer_size = (l * 5) + 1;
+
+	buf = kvmalloc(buffer_size, GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	for (i = 0; i < l; i++)
+		buf_itr += snprintf(buf + buf_itr, buffer_size - buf_itr, "%s",
+				ascii85_encode(src[i], out));
+
+	return buf;
+}
+
+/*
+ * Convert the binary data in the gpu crash state capture(like bos and
+ * ringbuffer data) to ascii85 encoded strings. Note that the encoded
+ * string is NULL terminated.
+ */
+static void crashstate_worker(struct work_struct *work)
+{
+	struct msm_gpu_state *state;
+	struct msm_gpu *gpu = container_of(work, struct msm_gpu,
+			crashstate_work);
+	int i;
+
+	state = msm_gpu_crashstate_get(gpu);
+	if (!state)
+		return;
+
+	for (i = 0; i < MSM_GPU_MAX_RINGS; i++) {
+		void *buf = NULL;
+
+		if (!state->ring[i].data)
+			continue;
+
+		buf = state->ring[i].data;
+
+		state->ring[i].data = msm_gpu_ascii85_encode_buf(buf,
+				state->ring[i].data_size);
+		kvfree(buf);
+	}
+
+	for (i = 0; i < state->nr_bos; i++) {
+		u32 *buf = NULL;
+		long datalen, j;
+
+		if (!state->bos[i].data)
+			continue;
+
+		buf = state->bos[i].data;
+
+		/*
+		 * Only dump the non-zero part of the buffer - rarely will
+		 * any data completely fill the entire allocated size of the
+		 * buffer
+		 */
+		for (datalen = 0, j = 0; j < state->bos[i].size >> 2; j++)
+			if (buf[j])
+				datalen = ((j + 1) << 2);
+
+		state->bos[i].data = msm_gpu_ascii85_encode_buf(buf, datalen);
+		kvfree(buf);
+	}
+
+	msm_gpu_crashstate_put(gpu);
+
+	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
+		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
+}
+
 static void recover_worker(struct work_struct *work)
 {
 	struct msm_gpu *gpu = container_of(work, struct msm_gpu, recover_work);
@@ -856,6 +942,7 @@  int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 	INIT_LIST_HEAD(&gpu->active_list);
 	INIT_WORK(&gpu->retire_work, retire_worker);
 	INIT_WORK(&gpu->recover_work, recover_worker);
+	INIT_WORK(&gpu->crashstate_work, crashstate_worker);
 
 
 	timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index f82bac0..8825069 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -137,6 +137,7 @@  struct msm_gpu {
 	} devfreq;
 
 	struct msm_gpu_state *crashstate;
+	struct work_struct crashstate_work;
 };
 
 /* It turns out that all targets use the same ringbuffer size */