Message ID | 20211222220506.789133-3-andrey.grodzovsky@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Define and use reset domain for GPU recovery in amdgpu | expand |
Am 22.12.21 um 23:05 schrieb Andrey Grodzovsky: > Before we initialize schedulers we must know which reset > domain are we in - for single device there iis a single > domain per device and so single wq per device. For XGMI > the reset domain spans the entire XGMI hive and so the > reset wq is per hive. > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 34 ++-------------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 + > 3 files changed, 51 insertions(+), 30 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 0f3e6c078f88..7c063fd37389 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2284,6 +2284,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) > return r; > } > > +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) > +{ > + long timeout; > + int r, i; > + > + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > + struct amdgpu_ring *ring = adev->rings[i]; > + > + /* No need to setup the GPU scheduler for rings that don't need it */ > + if (!ring || ring->no_scheduler) > + continue; > + > + switch (ring->funcs->type) { > + case AMDGPU_RING_TYPE_GFX: > + timeout = adev->gfx_timeout; > + break; > + case AMDGPU_RING_TYPE_COMPUTE: > + timeout = adev->compute_timeout; > + break; > + case AMDGPU_RING_TYPE_SDMA: > + timeout = adev->sdma_timeout; > + break; > + default: > + timeout = adev->video_timeout; > + break; > + } > + > + r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, > + ring->num_hw_submission, amdgpu_job_hang_limit, > + timeout, adev->reset_domain.wq, ring->sched_score, ring->name); > + if (r) { > + DRM_ERROR("Failed to create scheduler on ring %s.\n", > + ring->name); > + return r; > + } > + } > + > 
+ return 0; > +} > + > + > /** > * amdgpu_device_ip_init - run init for hardware IPs > * > @@ -2412,6 +2453,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) > } > } > > + r = amdgpu_device_init_schedulers(adev); > + if (r) > + goto init_failed; > + > /* Don't init kfd if whole hive need to be reset during init */ > if (!adev->gmc.xgmi.pending_reset) > amdgpu_amdkfd_device_init(adev); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > index 3b7e86ea7167..5527c68c51de 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > @@ -456,8 +456,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, > atomic_t *sched_score) > { > struct amdgpu_device *adev = ring->adev; > - long timeout; > - int r; > > if (!adev) > return -EINVAL; > @@ -477,36 +475,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, > spin_lock_init(&ring->fence_drv.lock); > ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *), > GFP_KERNEL); > - if (!ring->fence_drv.fences) > - return -ENOMEM; > > - /* No need to setup the GPU scheduler for rings that don't need it */ > - if (ring->no_scheduler) > - return 0; > + ring->num_hw_submission = num_hw_submission; > + ring->sched_score = sched_score; > > - switch (ring->funcs->type) { > - case AMDGPU_RING_TYPE_GFX: > - timeout = adev->gfx_timeout; > - break; > - case AMDGPU_RING_TYPE_COMPUTE: > - timeout = adev->compute_timeout; > - break; > - case AMDGPU_RING_TYPE_SDMA: > - timeout = adev->sdma_timeout; > - break; > - default: > - timeout = adev->video_timeout; > - break; > - } > - > - r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, > - num_hw_submission, amdgpu_job_hang_limit, > - timeout, NULL, sched_score, ring->name); > - if (r) { > - DRM_ERROR("Failed to create scheduler on ring %s.\n", > - ring->name); > - return r; > - } > + if (!ring->fence_drv.fences) > + return -ENOMEM; > > return 0; > } 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > index 4d380e79752c..a4b8279e3011 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > @@ -253,6 +253,8 @@ struct amdgpu_ring { > bool has_compute_vm_bug; > bool no_scheduler; > int hw_prio; > + unsigned num_hw_submission; > + atomic_t *sched_score; > }; > > #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0f3e6c078f88..7c063fd37389 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2284,6 +2284,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) return r; } +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) +{ + long timeout; + int r, i; + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + /* No need to setup the GPU scheduler for rings that don't need it */ + if (!ring || ring->no_scheduler) + continue; + + switch (ring->funcs->type) { + case AMDGPU_RING_TYPE_GFX: + timeout = adev->gfx_timeout; + break; + case AMDGPU_RING_TYPE_COMPUTE: + timeout = adev->compute_timeout; + break; + case AMDGPU_RING_TYPE_SDMA: + timeout = adev->sdma_timeout; + break; + default: + timeout = adev->video_timeout; + break; + } + + r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, + ring->num_hw_submission, amdgpu_job_hang_limit, + timeout, adev->reset_domain.wq, ring->sched_score, ring->name); + if (r) { + DRM_ERROR("Failed to create scheduler on ring %s.\n", + ring->name); + return r; + } + } + + return 0; +} + + /** * amdgpu_device_ip_init - run init for hardware IPs * @@ -2412,6 +2453,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) } } + r = amdgpu_device_init_schedulers(adev); + if (r) + goto init_failed; + /* Don't init kfd if whole hive need to be reset during init */ if (!adev->gmc.xgmi.pending_reset) amdgpu_amdkfd_device_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 3b7e86ea7167..5527c68c51de 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -456,8 +456,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, atomic_t *sched_score) { struct amdgpu_device *adev = ring->adev; - long 
timeout; - int r; if (!adev) return -EINVAL; @@ -477,36 +475,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, spin_lock_init(&ring->fence_drv.lock); ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *), GFP_KERNEL); - if (!ring->fence_drv.fences) - return -ENOMEM; - /* No need to setup the GPU scheduler for rings that don't need it */ - if (ring->no_scheduler) - return 0; + ring->num_hw_submission = num_hw_submission; + ring->sched_score = sched_score; - switch (ring->funcs->type) { - case AMDGPU_RING_TYPE_GFX: - timeout = adev->gfx_timeout; - break; - case AMDGPU_RING_TYPE_COMPUTE: - timeout = adev->compute_timeout; - break; - case AMDGPU_RING_TYPE_SDMA: - timeout = adev->sdma_timeout; - break; - default: - timeout = adev->video_timeout; - break; - } - - r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, - num_hw_submission, amdgpu_job_hang_limit, - timeout, NULL, sched_score, ring->name); - if (r) { - DRM_ERROR("Failed to create scheduler on ring %s.\n", - ring->name); - return r; - } + if (!ring->fence_drv.fences) + return -ENOMEM; return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 4d380e79752c..a4b8279e3011 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -253,6 +253,8 @@ struct amdgpu_ring { bool has_compute_vm_bug; bool no_scheduler; int hw_prio; + unsigned num_hw_submission; + atomic_t *sched_score; }; #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))
Before we initialize schedulers we must know which reset domain we are in - for a single device there is a single domain per device and so a single wq per device. For XGMI the reset domain spans the entire XGMI hive and so the reset wq is per hive. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 34 ++-------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 + 3 files changed, 51 insertions(+), 30 deletions(-)