Message ID | 20220209002320.6077-5-andrey.grodzovsky@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Define and use reset domain for GPU recovery in amdgpu | expand |
[AMD Official Use Only] This patch is reviewed by Shaoyun.liu <Shaoyun.liu@amd.com> Since other patches are suggested by other engineer and they may already od some review on them , so I will leave them to continue review the rest patches. Regards Shaoyun.liu -----Original Message----- From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> Sent: Tuesday, February 8, 2022 7:23 PM To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org Cc: Koenig, Christian <Christian.Koenig@amd.com>; daniel@ffwll.ch; Liu, Monk <Monk.Liu@amd.com>; Chen, Horace <Horace.Chen@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>; Chen, JingWen <JingWen.Chen2@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Liu, Shaoyun <Shaoyun.Liu@amd.com> Subject: [RFC v4 04/11] drm/amd/virt: For SRIOV send GPU reset directly to TDR queue. No need to to trigger another work queue inside the work queue. v3: Problem: Extra reset caused by host side FLR notification following guest side triggered reset. Fix: Preven qeuing flr_work from mailbox irq if guest already executing a reset. Suggested-by: Liu Shaoyun <Shaoyun.Liu@amd.com> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> --- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++++++--- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++++++--- drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 56da5ab82987..5869d51d8bee 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -282,7 +282,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) if (amdgpu_device_should_recover_gpu(adev) && (!amdgpu_device_has_job_running(adev) || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) - amdgpu_device_gpu_recover(adev, NULL); + amdgpu_device_gpu_recover_imp(adev, NULL); } static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ -307,8 +307,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, switch (event) { case IDH_FLR_NOTIFICATION: - if (amdgpu_sriov_runtime(adev)) - schedule_work(&adev->virt.flr_work); + if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) + WARN_ONCE(!queue_work(adev->reset_domain.wq, + &adev->virt.flr_work), + "Failed to queue work! at %s", + __func__); break; case IDH_QUERY_ALIVE: xgpu_ai_mailbox_send_ack(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 477d0dde19c5..5728a6401d73 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -309,7 +309,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) - amdgpu_device_gpu_recover(adev, NULL); + amdgpu_device_gpu_recover_imp(adev, NULL); } static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ -337,8 +337,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, switch (event) { case IDH_FLR_NOTIFICATION: - if (amdgpu_sriov_runtime(adev)) - schedule_work(&adev->virt.flr_work); + if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) + WARN_ONCE(!queue_work(adev->reset_domain.wq, + &adev->virt.flr_work), + "Failed to queue work! at %s", + __func__); break; /* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore * it byfar since that polling thread will handle it, diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index aef9d059ae52..02290febfcf4 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) /* Trigger recovery due to world switch failure */ if (amdgpu_device_should_recover_gpu(adev)) - amdgpu_device_gpu_recover(adev, NULL); + amdgpu_device_gpu_recover_imp(adev, NULL); } static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev, r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION); /* only handle FLR_NOTIFY now */ - if (!r) - schedule_work(&adev->virt.flr_work); + if (!r && !amdgpu_in_reset(adev)) + WARN_ONCE(!queue_work(adev->reset_domain.wq, + &adev->virt.flr_work), + "Failed to queue work! at %s", + __func__); } return 0; -- 2.25.1
Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky: > No need to to trigger another work queue inside the work queue. > > v3: > > Problem: > Extra reset caused by host side FLR notification > following guest side triggered reset. > Fix: Preven qeuing flr_work from mailbox irq if guest > already executing a reset. > > Suggested-by: Liu Shaoyun <Shaoyun.Liu@amd.com> > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Feel free to add an Acked-by: Christian König <christian.koenig@amd.com>, but an rb from somebody more familiar with the code would be better. Regards, Christian. > --- > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++++++--- > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++++++--- > drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++++++--- > 3 files changed, 18 insertions(+), 9 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > index 56da5ab82987..5869d51d8bee 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > @@ -282,7 +282,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) > if (amdgpu_device_should_recover_gpu(adev) > && (!amdgpu_device_has_job_running(adev) || > adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) > - amdgpu_device_gpu_recover(adev, NULL); > + amdgpu_device_gpu_recover_imp(adev, NULL); > } > > static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, > @@ -307,8 +307,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, > > switch (event) { > case IDH_FLR_NOTIFICATION: > - if (amdgpu_sriov_runtime(adev)) > - schedule_work(&adev->virt.flr_work); > + if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) > + WARN_ONCE(!queue_work(adev->reset_domain.wq, > + &adev->virt.flr_work), > + "Failed to queue work! at %s", > + __func__); > break; > case IDH_QUERY_ALIVE: > xgpu_ai_mailbox_send_ack(adev); > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > index 477d0dde19c5..5728a6401d73 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > @@ -309,7 +309,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) > adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || > adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || > adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) > - amdgpu_device_gpu_recover(adev, NULL); > + amdgpu_device_gpu_recover_imp(adev, NULL); > } > > static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev, > @@ -337,8 +337,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, > > switch (event) { > case IDH_FLR_NOTIFICATION: > - if (amdgpu_sriov_runtime(adev)) > - schedule_work(&adev->virt.flr_work); > + if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) > + WARN_ONCE(!queue_work(adev->reset_domain.wq, > + &adev->virt.flr_work), > + "Failed to queue work! at %s", > + __func__); > break; > /* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore > * it byfar since that polling thread will handle it, > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c > index aef9d059ae52..02290febfcf4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c > @@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) > > /* Trigger recovery due to world switch failure */ > if (amdgpu_device_should_recover_gpu(adev)) > - amdgpu_device_gpu_recover(adev, NULL); > + amdgpu_device_gpu_recover_imp(adev, NULL); > } > > static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev, > @@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev, > r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION); > > /* only handle FLR_NOTIFY now */ > - if (!r) > - schedule_work(&adev->virt.flr_work); > + if (!r && !amdgpu_in_reset(adev)) > + WARN_ONCE(!queue_work(adev->reset_domain.wq, > + &adev->virt.flr_work), > + "Failed to queue work! at %s", > + __func__); > } > > return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 56da5ab82987..5869d51d8bee 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -282,7 +282,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) if (amdgpu_device_should_recover_gpu(adev) && (!amdgpu_device_has_job_running(adev) || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) - amdgpu_device_gpu_recover(adev, NULL); + amdgpu_device_gpu_recover_imp(adev, NULL); } static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ -307,8 +307,11 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, switch (event) { case IDH_FLR_NOTIFICATION: - if (amdgpu_sriov_runtime(adev)) - schedule_work(&adev->virt.flr_work); + if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) + WARN_ONCE(!queue_work(adev->reset_domain.wq, + &adev->virt.flr_work), + "Failed to queue work! at %s", + __func__); break; case IDH_QUERY_ALIVE: xgpu_ai_mailbox_send_ack(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 477d0dde19c5..5728a6401d73 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -309,7 +309,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) - amdgpu_device_gpu_recover(adev, NULL); + amdgpu_device_gpu_recover_imp(adev, NULL); } static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ -337,8 +337,11 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, switch (event) { case IDH_FLR_NOTIFICATION: - if (amdgpu_sriov_runtime(adev)) - schedule_work(&adev->virt.flr_work); + if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) + WARN_ONCE(!queue_work(adev->reset_domain.wq, + &adev->virt.flr_work), + "Failed to queue work! at %s", + __func__); break; /* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore * it byfar since that polling thread will handle it, diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index aef9d059ae52..02290febfcf4 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) /* Trigger recovery due to world switch failure */ if (amdgpu_device_should_recover_gpu(adev)) - amdgpu_device_gpu_recover(adev, NULL); + amdgpu_device_gpu_recover_imp(adev, NULL); } static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev, @@ -550,8 +550,11 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev, r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION); /* only handle FLR_NOTIFY now */ - if (!r) - schedule_work(&adev->virt.flr_work); + if (!r && !amdgpu_in_reset(adev)) + WARN_ONCE(!queue_work(adev->reset_domain.wq, + &adev->virt.flr_work), + "Failed to queue work! at %s", + __func__); } return 0;
No need to to trigger another work queue inside the work queue. v3: Problem: Extra reset caused by host side FLR notification following guest side triggered reset. Fix: Preven qeuing flr_work from mailbox irq if guest already executing a reset. Suggested-by: Liu Shaoyun <Shaoyun.Liu@amd.com> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> --- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 9 ++++++--- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 9 ++++++--- drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 9 ++++++--- 3 files changed, 18 insertions(+), 9 deletions(-)