drm/etnaviv: don't block scheduler when GPU is still active

Message ID 20230331110012.69844-1-l.stach@pengutronix.de (mailing list archive)
State New, archived
Series drm/etnaviv: don't block scheduler when GPU is still active

Commit Message

Lucas Stach March 31, 2023, 11 a.m. UTC
Since commit 45ecaea73883 ("drm/sched: Partial revert of 'drm/sched: Keep
s_fence->parent pointer'"), still-active jobs aren't put back on the
pending list by drm_sched_start(), as they no longer have an active
parent fence. So if the GPU is still working and the timeout is merely
extended, all currently active jobs will be freed prematurely.

To avoid prematurely freeing jobs that are still active on the GPU,
don't block the scheduler until we are fully committed to actually
resetting the GPU.

Cc: stable@vger.kernel.org #6.0
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
---
The behavior change in the scheduler is unfortunate and at least
deserves some updated documentation. This change aligns etnaviv with
the behavior of other drivers and avoids the issue.
---
 drivers/gpu/drm/etnaviv/etnaviv_sched.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)
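
For readers skimming the thread, the control flow of the timeout handler
after this patch is sketched below in condensed form. This is an
illustration based on the diff further down, not the literal code: locals
and the recovery path are abbreviated, and etnaviv_gpu_fe_progressing() is
a hypothetical helper standing in for the front-end DMA address /
completed-fence progress check.

	/*
	 * Condensed sketch of etnaviv_sched_timedout_job() after this patch.
	 * etnaviv_gpu_fe_progressing() is a hypothetical stand-in for the
	 * front-end progress check; the recovery path is elided.
	 */
	static enum drm_gpu_sched_stat
	etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
	{
		struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
		struct etnaviv_gpu *gpu = submit->gpu;

		/* Spurious timeout: the job's fence already signaled. */
		if (dma_fence_is_signaled(submit->out_fence))
			return DRM_GPU_SCHED_STAT_NOMINAL;

		/* Front-end still progressing: extend the timeout, don't reset. */
		if (etnaviv_gpu_fe_progressing(gpu))
			return DRM_GPU_SCHED_STAT_NOMINAL;

		/* Only now, when a reset is unavoidable, block the scheduler. */
		drm_sched_stop(&gpu->sched, sched_job);

		if (sched_job)
			drm_sched_increase_karma(sched_job);

		/* ... take exclusive ownership of the GPU and recover it ... */

		drm_sched_start(&gpu->sched, true);
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

The important property is that both early-return paths now leave the
scheduler running, so drm_sched_start() is only ever called after a
matching drm_sched_stop() on the committed reset path.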

Comments

Lucas Stach April 19, 2023, 9:35 a.m. UTC | #1
Hi,

does anyone have some bandwidth to review this?

Regards,
Lucas

Patch

diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 1ae87dfd19c4..35d7c2ef7a57 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -38,15 +38,12 @@  static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job
 	u32 dma_addr;
 	int change;
 
-	/* block scheduler */
-	drm_sched_stop(&gpu->sched, sched_job);
-
 	/*
 	 * If the GPU managed to complete this jobs fence, the timout is
 	 * spurious. Bail out.
 	 */
 	if (dma_fence_is_signaled(submit->out_fence))
-		goto out_no_timeout;
+		return DRM_GPU_SCHED_STAT_NOMINAL;
 
 	/*
 	 * If the GPU is still making forward progress on the front-end (which
@@ -59,9 +56,12 @@  static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job
 	    change < 0 || change > 16) {
 		gpu->hangcheck_dma_addr = dma_addr;
 		gpu->hangcheck_fence = gpu->completed_fence;
-		goto out_no_timeout;
+		return DRM_GPU_SCHED_STAT_NOMINAL;
 	}
 
+	/* block scheduler */
+	drm_sched_stop(&gpu->sched, sched_job);
+
 	if(sched_job)
 		drm_sched_increase_karma(sched_job);
 
@@ -73,11 +73,6 @@  static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job
 
 	drm_sched_start(&gpu->sched, true);
 	return DRM_GPU_SCHED_STAT_NOMINAL;
-
-out_no_timeout:
-	/* restart scheduler after GPU is usable again */
-	drm_sched_start(&gpu->sched, true);
-	return DRM_GPU_SCHED_STAT_NOMINAL;
 }
 
 static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)