
[2/3] drm/amdgpu: mostly revert "fix force APP kill hang(v4)"

Message ID 20230615115630.164098-2-christian.koenig@amd.com (mailing list archive)
State New, archived
Headers show
Series [1/3] drm/scheduler: implement hw time accounting | expand

Commit Message

Christian König June 15, 2023, 11:56 a.m. UTC
This reverts commit 8ee3a52e3f35e064a3bf82f21dc74ddaf9843648.

The new amdgpu_ctx_mgr_entity_fini() was never called, so it was pure
coincidence that this patch didn't cause a crash. Since the workaround
shouldn't be needed any more, mostly revert the changes to amdgpu.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 59 ++-----------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  2 +-
 3 files changed, 5 insertions(+), 57 deletions(-)
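
The core of the revert is that entity teardown now happens inside the single
context fini path (drm_sched_entity_destroy() is called from
amdgpu_ctx_fini_entity()), so the kref release callback can be amdgpu_ctx_fini
itself and the separate amdgpu_ctx_do_release() wrapper and the
amdgpu_ctx_mgr_entity_fini() pass go away. As a rough illustration of that
kref release-callback pattern (a generic sketch only, not amdgpu code;
my_ctx, my_ctx_release and my_ctx_put are made-up names):

#include <linux/kref.h>
#include <linux/slab.h>

/* Hypothetical refcounted context, for illustration only. */
struct my_ctx {
        struct kref refcount;
        /* ... per-context state, e.g. scheduler entities ... */
};

/*
 * Single release callback: because all teardown lives here (in the amdgpu
 * case that now includes destroying the scheduler entities), every
 * kref_put() caller can pass this function directly and no separate
 * "do_release" wrapper is needed.
 */
static void my_ctx_release(struct kref *ref)
{
        struct my_ctx *ctx = container_of(ref, struct my_ctx, refcount);

        /* tear down everything owned by the context, then free it */
        kfree(ctx);
}

static void my_ctx_put(struct my_ctx *ctx)
{
        kref_put(&ctx->refcount, my_ctx_release);
}

With that in place, both the ioctl free path and the context manager teardown
just drop their reference and let the release callback do the work, which is
what the hunks below restore.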

Comments

Luben Tuikov June 15, 2023, 2:15 p.m. UTC | #1
On 2023-06-15 07:56, Christian König wrote:
> This reverts commit 8ee3a52e3f35e064a3bf82f21dc74ddaf9843648.
> 
> The new amdgpu_ctx_mgr_entity_fini() was never called, so it was pure
> coincidence that this patch didn't cause a crash. Since the workaround
> shouldn't be needed any more, mostly revert the changes to amdgpu.
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>

Add a Fixes tag:
Fixes: 8ee3a52e3f35e0 ("drm/gpu-sched: fix force APP kill hang(v4)")

Acked-by: Luben Tuikov <luben.tuikov@amd.com>

Regards,
Luben


Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index d2139ac12159..1445e030d788 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -267,7 +267,7 @@  static ktime_t amdgpu_ctx_fini_entity(struct amdgpu_ctx_entity *entity)
 		res = ktime_add(res, amdgpu_ctx_fence_time(entity->fences[i]));
 		dma_fence_put(entity->fences[i]);
 	}
-
+	drm_sched_entity_destroy(&entity->entity);
 	kfree(entity);
 	return res;
 }
@@ -476,24 +476,6 @@  static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
 	return r;
 }
 
-static void amdgpu_ctx_do_release(struct kref *ref)
-{
-	struct amdgpu_ctx *ctx;
-	u32 i, j;
-
-	ctx = container_of(ref, struct amdgpu_ctx, refcount);
-	for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
-		for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
-			if (!ctx->entities[i][j])
-				continue;
-
-			drm_sched_entity_destroy(&ctx->entities[i][j]->entity);
-		}
-	}
-
-	amdgpu_ctx_fini(ref);
-}
-
 static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
 {
 	struct amdgpu_ctx_mgr *mgr = &fpriv->ctx_mgr;
@@ -502,7 +484,7 @@  static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
 	mutex_lock(&mgr->lock);
 	ctx = idr_remove(&mgr->ctx_handles, id);
 	if (ctx)
-		kref_put(&ctx->refcount, amdgpu_ctx_do_release);
+		kref_put(&ctx->refcount, amdgpu_ctx_fini);
 	mutex_unlock(&mgr->lock);
 	return ctx ? 0 : -EINVAL;
 }
@@ -712,7 +694,7 @@  int amdgpu_ctx_put(struct amdgpu_ctx *ctx)
 	if (ctx == NULL)
 		return -EINVAL;
 
-	kref_put(&ctx->refcount, amdgpu_ctx_do_release);
+	kref_put(&ctx->refcount, amdgpu_ctx_fini);
 	return 0;
 }
 
@@ -881,45 +863,12 @@  long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout)
 	return timeout;
 }
 
-void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
-{
-	struct amdgpu_ctx *ctx;
-	struct idr *idp;
-	uint32_t id, i, j;
-
-	idp = &mgr->ctx_handles;
-
-	idr_for_each_entry(idp, ctx, id) {
-		if (kref_read(&ctx->refcount) != 1) {
-			DRM_ERROR("ctx %p is still alive\n", ctx);
-			continue;
-		}
-
-		for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
-			for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
-				struct drm_sched_entity *entity;
-
-				if (!ctx->entities[i][j])
-					continue;
-
-				entity = &ctx->entities[i][j]->entity;
-				drm_sched_entity_fini(entity);
-			}
-		}
-	}
-}
-
 void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
 {
 	struct amdgpu_ctx *ctx;
-	struct idr *idp;
 	uint32_t id;
 
-	amdgpu_ctx_mgr_entity_fini(mgr);
-
-	idp = &mgr->ctx_handles;
-
-	idr_for_each_entry(idp, ctx, id) {
+	idr_for_each_entry(&mgr->ctx_handles, ctx, id) {
 		if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
 			DRM_ERROR("ctx %p is still alive\n", ctx);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
index 0fa0e56daf67..729cf479d71d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -91,7 +91,6 @@  int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx,
 
 void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr,
 			 struct amdgpu_device *adev);
-void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
 long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout);
 void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
 void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 0efb38539d70..50c36c95556d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1278,6 +1278,7 @@  void amdgpu_driver_postclose_kms(struct drm_device *dev,
 		return;
 
 	pm_runtime_get_sync(dev->dev);
+	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
 
 	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_UVD) != NULL)
 		amdgpu_uvd_free_handles(adev, file_priv);
@@ -1299,7 +1300,6 @@  void amdgpu_driver_postclose_kms(struct drm_device *dev,
 		amdgpu_bo_unreserve(pd);
 	}
 
-	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
 	amdgpu_vm_fini(adev, &fpriv->vm);
 
 	if (pasid)