diff mbox series

[4/5] drm/sched: Partial revert of 'drm/sched: Keep s_fence->parent pointer'

Message ID 20220620220302.86389-5-andrey.grodzovsky@amd.com (mailing list archive)
State New, archived
Headers show
Series Rework amdgpu HW fence refocunt and update scheduler parent fence refcount. | expand

Commit Message

Andrey Grodzovsky June 20, 2022, 10:03 p.m. UTC
Problem:
This patch caused negative refcount as described in [1] because
for that case parent fence did not signal by the time of drm_sched_stop and hence
kept in pending list the assumption was they will not signal and
so fence was put to account for the s_fence->parent refcount but for
amdgpu which has embedded HW fence (always same parent fence)
drm_sched_fence_release_scheduled was always called and would
still drop the count for parent fence once more. For jobs that
never signaled this imbalance was masked by refcount bug in
amdgpu_fence_driver_clear_job_fences that would not drop
refcount on the fences that were removed from fence drive
fences array (against prevois insertion into the array in
get in amdgpu_fence_emit).

Fix:
Revert this patch and by setting s_job->s_fence->parent to NULL
as before prevent the extra refcount drop in amdgpu when
drm_sched_fence_release_scheduled is called on job release.

Also - align behaviour in drm_sched_resubmit_jobs_ext with that of
drm_sched_main when submitting jobs - take a refcount for the
new parent fence pointer and drop refcount for original kref_init
for new HW fence creation (or fake new HW fence in amdgpu - see next patch).

[1] - https://lore.kernel.org/all/731b7ff1-3cc9-e314-df2a-7c51b76d4db0@amd.com/t/#r00c728fcc069b1276642c325bfa9d82bf8fa21a3

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Tested-by: Yiqing Yao <yiqing.yao@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

Comments

Christian König June 21, 2022, 7:26 a.m. UTC | #1
Am 21.06.22 um 00:03 schrieb Andrey Grodzovsky:
> Problem:
> This patch caused negative refcount as described in [1] because
> for that case parent fence did not signal by the time of drm_sched_stop and hence
> kept in pending list the assumption was they will not signal and
> so fence was put to account for the s_fence->parent refcount but for
> amdgpu which has embedded HW fence (always same parent fence)
> drm_sched_fence_release_scheduled was always called and would
> still drop the count for parent fence once more. For jobs that
> never signaled this imbalance was masked by refcount bug in
> amdgpu_fence_driver_clear_job_fences that would not drop
> refcount on the fences that were removed from fence drive
> fences array (against prevois insertion into the array in
> get in amdgpu_fence_emit).
>
> Fix:
> Revert this patch and by setting s_job->s_fence->parent to NULL
> as before prevent the extra refcount drop in amdgpu when
> drm_sched_fence_release_scheduled is called on job release.
>
> Also - align behaviour in drm_sched_resubmit_jobs_ext with that of
> drm_sched_main when submitting jobs - take a refcount for the
> new parent fence pointer and drop refcount for original kref_init
> for new HW fence creation (or fake new HW fence in amdgpu - see next patch).
>
> [1] - https://lore.kernel.org/all/731b7ff1-3cc9-e314-df2a-7c51b76d4db0@amd.com/t/#r00c728fcc069b1276642c325bfa9d82bf8fa21a3
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> Tested-by: Yiqing Yao <yiqing.yao@amd.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++++++---
>   1 file changed, 13 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index b81fceb0b8a2..b38394f5694f 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -419,6 +419,11 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
>   		if (s_job->s_fence->parent &&
>   		    dma_fence_remove_callback(s_job->s_fence->parent,
>   					      &s_job->cb)) {
> +			/* Revert drm/sched: Keep s_fence->parent pointer, no
> +			 * need anymore for amdgpu and creates only troubles
> +			 */

Please no amdgpu specific comments in common code.

Apart from that looks like a step into the right direction to me.

Regards,
Christian.

> +			dma_fence_put(s_job->s_fence->parent);
> +			s_job->s_fence->parent = NULL;
>   			atomic_dec(&sched->hw_rq_count);
>   		} else {
>   			/*
> @@ -548,7 +553,6 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
>   		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
>   			dma_fence_set_error(&s_fence->finished, -ECANCELED);
>   
> -		dma_fence_put(s_job->s_fence->parent);
>   		fence = sched->ops->run_job(s_job);
>   		i++;
>   
> @@ -558,7 +562,11 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
>   
>   			s_job->s_fence->parent = NULL;
>   		} else {
> -			s_job->s_fence->parent = fence;
> +
> +			s_job->s_fence->parent = dma_fence_get(fence);
> +
> +			/* Drop for orignal kref_init */
> +			dma_fence_put(fence);
>   		}
>   	}
>   }
> @@ -952,6 +960,9 @@ static int drm_sched_main(void *param)
>   
>   		if (!IS_ERR_OR_NULL(fence)) {
>   			s_fence->parent = dma_fence_get(fence);
> +			/* Drop for original kref_init of the fence */
> +			dma_fence_put(fence);
> +
>   			r = dma_fence_add_callback(fence, &sched_job->cb,
>   						   drm_sched_job_done_cb);
>   			if (r == -ENOENT)
> @@ -959,7 +970,6 @@ static int drm_sched_main(void *param)
>   			else if (r)
>   				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
>   					  r);
> -			dma_fence_put(fence);
>   		} else {
>   			if (IS_ERR(fence))
>   				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
diff mbox series

Patch

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index b81fceb0b8a2..b38394f5694f 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -419,6 +419,11 @@  void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 		if (s_job->s_fence->parent &&
 		    dma_fence_remove_callback(s_job->s_fence->parent,
 					      &s_job->cb)) {
+			/* Revert drm/sched: Keep s_fence->parent pointer, no
+			 * need anymore for amdgpu and creates only troubles
+			 */
+			dma_fence_put(s_job->s_fence->parent);
+			s_job->s_fence->parent = NULL;
 			atomic_dec(&sched->hw_rq_count);
 		} else {
 			/*
@@ -548,7 +553,6 @@  void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
 			dma_fence_set_error(&s_fence->finished, -ECANCELED);
 
-		dma_fence_put(s_job->s_fence->parent);
 		fence = sched->ops->run_job(s_job);
 		i++;
 
@@ -558,7 +562,11 @@  void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
 
 			s_job->s_fence->parent = NULL;
 		} else {
-			s_job->s_fence->parent = fence;
+
+			s_job->s_fence->parent = dma_fence_get(fence);
+
+			/* Drop for orignal kref_init */
+			dma_fence_put(fence);
 		}
 	}
 }
@@ -952,6 +960,9 @@  static int drm_sched_main(void *param)
 
 		if (!IS_ERR_OR_NULL(fence)) {
 			s_fence->parent = dma_fence_get(fence);
+			/* Drop for original kref_init of the fence */
+			dma_fence_put(fence);
+
 			r = dma_fence_add_callback(fence, &sched_job->cb,
 						   drm_sched_job_done_cb);
 			if (r == -ENOENT)
@@ -959,7 +970,6 @@  static int drm_sched_main(void *param)
 			else if (r)
 				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
 					  r);
-			dma_fence_put(fence);
 		} else {
 			if (IS_ERR(fence))
 				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));