Message ID | 20210510163625.407105-13-andrey.grodzovsky@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | RFC Support hot device unplug in amdgpu | expand |
Am 10.05.21 um 18:36 schrieb Andrey Grodzovsky: > Return DRM_TASK_STATUS_ENODEV back to the scheduler when device > is not present so they timeout timer will not be rearmed. > > v5: Update to match updated return values in enum drm_gpu_sched_stat > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 19 ++++++++++++++++--- > 1 file changed, 16 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index 759b34799221..d33e6d97cc89 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -25,6 +25,8 @@ > #include <linux/wait.h> > #include <linux/sched.h> > > +#include <drm/drm_drv.h> > + > #include "amdgpu.h" > #include "amdgpu_trace.h" > > @@ -34,6 +36,15 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > struct amdgpu_job *job = to_amdgpu_job(s_job); > struct amdgpu_task_info ti; > struct amdgpu_device *adev = ring->adev; > + int idx; > + > + if (!drm_dev_enter(&adev->ddev, &idx)) { > + DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s", > + __func__, s_job->sched->name); > + > + /* Effectively the job is aborted as the device is gone */ > + return DRM_GPU_SCHED_STAT_ENODEV; > + } > > memset(&ti, 0, sizeof(struct amdgpu_task_info)); > > @@ -41,7 +52,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { > DRM_ERROR("ring %s timeout, but soft recovered\n", > s_job->sched->name); > - return DRM_GPU_SCHED_STAT_NOMINAL; > + goto exit; > } > > amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti); > @@ -53,13 +64,15 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > > if (amdgpu_device_should_recover_gpu(ring->adev)) { > 
amdgpu_device_gpu_recover(ring->adev, job); > - return DRM_GPU_SCHED_STAT_NOMINAL; > } else { > drm_sched_suspend_timeout(&ring->sched); > if (amdgpu_sriov_vf(adev)) > adev->virt.tdr_debug = true; > - return DRM_GPU_SCHED_STAT_NOMINAL; > } > + > +exit: > + drm_dev_exit(idx); > + return DRM_GPU_SCHED_STAT_NOMINAL; > } > > int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 759b34799221..d33e6d97cc89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -25,6 +25,8 @@ #include <linux/wait.h> #include <linux/sched.h> +#include <drm/drm_drv.h> + #include "amdgpu.h" #include "amdgpu_trace.h" @@ -34,6 +36,15 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) struct amdgpu_job *job = to_amdgpu_job(s_job); struct amdgpu_task_info ti; struct amdgpu_device *adev = ring->adev; + int idx; + + if (!drm_dev_enter(&adev->ddev, &idx)) { + DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s", + __func__, s_job->sched->name); + + /* Effectively the job is aborted as the device is gone */ + return DRM_GPU_SCHED_STAT_ENODEV; + } memset(&ti, 0, sizeof(struct amdgpu_task_info)); @@ -41,7 +52,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { DRM_ERROR("ring %s timeout, but soft recovered\n", s_job->sched->name); - return DRM_GPU_SCHED_STAT_NOMINAL; + goto exit; } amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti); @@ -53,13 +64,15 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) if (amdgpu_device_should_recover_gpu(ring->adev)) { amdgpu_device_gpu_recover(ring->adev, job); - return DRM_GPU_SCHED_STAT_NOMINAL; } else { drm_sched_suspend_timeout(&ring->sched); if (amdgpu_sriov_vf(adev)) adev->virt.tdr_debug = true; - return DRM_GPU_SCHED_STAT_NOMINAL; } + +exit: + drm_dev_exit(idx); + return DRM_GPU_SCHED_STAT_NOMINAL; } int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
Return DRM_GPU_SCHED_STAT_ENODEV back to the scheduler when the device is not present so the timeout timer will not be rearmed. v5: Update to match updated return values in enum drm_gpu_sched_stat Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-)