diff mbox series

[RFC,v2,2/8] drm/amdgpu: Move scheduler init to after XGMI is ready

Message ID 20211222220506.789133-3-andrey.grodzovsky@amd.com (mailing list archive)
State New, archived
Headers show
Series Define and use reset domain for GPU recovery in amdgpu | expand

Commit Message

Andrey Grodzovsky Dec. 22, 2021, 10:05 p.m. UTC
Before we initialize schedulers we must know which reset
domain are we in - for single device there iis a single
domain per device and so single wq per device. For XGMI
the reset domain spans the entire XGMI hive and so the
reset wq is per hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
 3 files changed, 51 insertions(+), 30 deletions(-)

Comments

Christian König Dec. 23, 2021, 8:39 a.m. UTC | #1
Am 22.12.21 um 23:05 schrieb Andrey Grodzovsky:
> Before we initialize schedulers we must know which reset
> domain are we in - for single device there iis a single
> domain per device and so single wq per device. For XGMI
> the reset domain spans the entire XGMI hive and so the
> reset wq is per hive.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
>   3 files changed, 51 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0f3e6c078f88..7c063fd37389 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2284,6 +2284,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>   	return r;
>   }
>   
> +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
> +{
> +	long timeout;
> +	int r, i;
> +
> +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +		struct amdgpu_ring *ring = adev->rings[i];
> +
> +		/* No need to setup the GPU scheduler for rings that don't need it */
> +		if (!ring || ring->no_scheduler)
> +			continue;
> +
> +		switch (ring->funcs->type) {
> +		case AMDGPU_RING_TYPE_GFX:
> +			timeout = adev->gfx_timeout;
> +			break;
> +		case AMDGPU_RING_TYPE_COMPUTE:
> +			timeout = adev->compute_timeout;
> +			break;
> +		case AMDGPU_RING_TYPE_SDMA:
> +			timeout = adev->sdma_timeout;
> +			break;
> +		default:
> +			timeout = adev->video_timeout;
> +			break;
> +		}
> +
> +		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
> +				   ring->num_hw_submission, amdgpu_job_hang_limit,
> +				   timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
> +		if (r) {
> +			DRM_ERROR("Failed to create scheduler on ring %s.\n",
> +				  ring->name);
> +			return r;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +
>   /**
>    * amdgpu_device_ip_init - run init for hardware IPs
>    *
> @@ -2412,6 +2453,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
>   		}
>   	}
>   
> +	r = amdgpu_device_init_schedulers(adev);
> +	if (r)
> +		goto init_failed;
> +
>   	/* Don't init kfd if whole hive need to be reset during init */
>   	if (!adev->gmc.xgmi.pending_reset)
>   		amdgpu_amdkfd_device_init(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 3b7e86ea7167..5527c68c51de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -456,8 +456,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   				  atomic_t *sched_score)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	long timeout;
> -	int r;
>   
>   	if (!adev)
>   		return -EINVAL;
> @@ -477,36 +475,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   	spin_lock_init(&ring->fence_drv.lock);
>   	ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
>   					 GFP_KERNEL);
> -	if (!ring->fence_drv.fences)
> -		return -ENOMEM;
>   
> -	/* No need to setup the GPU scheduler for rings that don't need it */
> -	if (ring->no_scheduler)
> -		return 0;
> +	ring->num_hw_submission = num_hw_submission;
> +	ring->sched_score = sched_score;
>   
> -	switch (ring->funcs->type) {
> -	case AMDGPU_RING_TYPE_GFX:
> -		timeout = adev->gfx_timeout;
> -		break;
> -	case AMDGPU_RING_TYPE_COMPUTE:
> -		timeout = adev->compute_timeout;
> -		break;
> -	case AMDGPU_RING_TYPE_SDMA:
> -		timeout = adev->sdma_timeout;
> -		break;
> -	default:
> -		timeout = adev->video_timeout;
> -		break;
> -	}
> -
> -	r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
> -			   num_hw_submission, amdgpu_job_hang_limit,
> -			   timeout, NULL, sched_score, ring->name);
> -	if (r) {
> -		DRM_ERROR("Failed to create scheduler on ring %s.\n",
> -			  ring->name);
> -		return r;
> -	}
> +	if (!ring->fence_drv.fences)
> +		return -ENOMEM;
>   
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 4d380e79752c..a4b8279e3011 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -253,6 +253,8 @@ struct amdgpu_ring {
>   	bool			has_compute_vm_bug;
>   	bool			no_scheduler;
>   	int			hw_prio;
> +	unsigned 		num_hw_submission;
> +	atomic_t		*sched_score;
>   };
>   
>   #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0f3e6c078f88..7c063fd37389 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2284,6 +2284,47 @@  static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
 	return r;
 }
 
+static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
+{
+	long timeout;
+	int r, i;
+
+	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+		struct amdgpu_ring *ring = adev->rings[i];
+
+		/* No need to setup the GPU scheduler for rings that don't need it */
+		if (!ring || ring->no_scheduler)
+			continue;
+
+		switch (ring->funcs->type) {
+		case AMDGPU_RING_TYPE_GFX:
+			timeout = adev->gfx_timeout;
+			break;
+		case AMDGPU_RING_TYPE_COMPUTE:
+			timeout = adev->compute_timeout;
+			break;
+		case AMDGPU_RING_TYPE_SDMA:
+			timeout = adev->sdma_timeout;
+			break;
+		default:
+			timeout = adev->video_timeout;
+			break;
+		}
+
+		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+				   ring->num_hw_submission, amdgpu_job_hang_limit,
+				   timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
+		if (r) {
+			DRM_ERROR("Failed to create scheduler on ring %s.\n",
+				  ring->name);
+			return r;
+		}
+	}
+
+	return 0;
+}
+
+
 /**
  * amdgpu_device_ip_init - run init for hardware IPs
  *
@@ -2412,6 +2453,10 @@  static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 		}
 	}
 
+	r = amdgpu_device_init_schedulers(adev);
+	if (r)
+		goto init_failed;
+
 	/* Don't init kfd if whole hive need to be reset during init */
 	if (!adev->gmc.xgmi.pending_reset)
 		amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 3b7e86ea7167..5527c68c51de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -456,8 +456,6 @@  int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 				  atomic_t *sched_score)
 {
 	struct amdgpu_device *adev = ring->adev;
-	long timeout;
-	int r;
 
 	if (!adev)
 		return -EINVAL;
@@ -477,36 +475,12 @@  int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 	spin_lock_init(&ring->fence_drv.lock);
 	ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
 					 GFP_KERNEL);
-	if (!ring->fence_drv.fences)
-		return -ENOMEM;
 
-	/* No need to setup the GPU scheduler for rings that don't need it */
-	if (ring->no_scheduler)
-		return 0;
+	ring->num_hw_submission = num_hw_submission;
+	ring->sched_score = sched_score;
 
-	switch (ring->funcs->type) {
-	case AMDGPU_RING_TYPE_GFX:
-		timeout = adev->gfx_timeout;
-		break;
-	case AMDGPU_RING_TYPE_COMPUTE:
-		timeout = adev->compute_timeout;
-		break;
-	case AMDGPU_RING_TYPE_SDMA:
-		timeout = adev->sdma_timeout;
-		break;
-	default:
-		timeout = adev->video_timeout;
-		break;
-	}
-
-	r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
-			   num_hw_submission, amdgpu_job_hang_limit,
-			   timeout, NULL, sched_score, ring->name);
-	if (r) {
-		DRM_ERROR("Failed to create scheduler on ring %s.\n",
-			  ring->name);
-		return r;
-	}
+	if (!ring->fence_drv.fences)
+		return -ENOMEM;
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 4d380e79752c..a4b8279e3011 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,6 +253,8 @@  struct amdgpu_ring {
 	bool			has_compute_vm_bug;
 	bool			no_scheduler;
 	int			hw_prio;
+	unsigned 		num_hw_submission;
+	atomic_t		*sched_score;
 };
 
 #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))