Message ID | 20220209002320.6077-10-andrey.grodzovsky@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Define and use reset domain for GPU recovery in amdgpu | expand |
Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky: > We should have a single instance per entrire reset domain. > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> > Suggested-by: Lijo Lazar <lijo.lazar@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 ++----- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +++++++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 4 ++-- > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 4 ++-- > 6 files changed, 15 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index ddfbcc8fd3d3..b89406b01694 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1056,7 +1056,6 @@ struct amdgpu_device { > bool in_s4; > bool in_s0ix; > > - atomic_t in_gpu_reset; > enum pp_mp1_state mp1_state; > struct amdgpu_doorbell_index doorbell_index; > > @@ -1463,8 +1462,6 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) > return adev->gmc.tmz_enabled; > } > > -static inline int amdgpu_in_reset(struct amdgpu_device *adev) > -{ > - return atomic_read(&adev->in_gpu_reset); > -} > +int amdgpu_in_reset(struct amdgpu_device *adev); > + > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index dcbb175d336f..e05d7cbefd2c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3554,7 +3554,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, > mutex_init(&adev->mn_lock); > mutex_init(&adev->virt.vf_errors.lock); > hash_init(adev->mn_hash); > - atomic_set(&adev->in_gpu_reset, 0); > mutex_init(&adev->psp.mutex); > mutex_init(&adev->notifier_lock); > > @@ -4829,7 +4828,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, > static void 
amdgpu_device_lock_adev(struct amdgpu_device *adev, > struct amdgpu_hive_info *hive) > { > - atomic_set(&adev->in_gpu_reset, 1); > + atomic_set(&adev->reset_domain->in_gpu_reset, 1); > > if (hive) { > down_write_nest_lock(&adev->reset_domain->sem, &hive->hive_lock); > @@ -4854,7 +4853,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) > { > amdgpu_vf_error_trans_all(adev); > adev->mp1_state = PP_MP1_STATE_NONE; > - atomic_set(&adev->in_gpu_reset, 0); > + atomic_set(&adev->reset_domain->in_gpu_reset, 0); > up_write(&adev->reset_domain->sem); > } > > @@ -5699,6 +5698,11 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, > amdgpu_asic_invalidate_hdp(adev, ring); > } > > +int amdgpu_in_reset(struct amdgpu_device *adev) > +{ > + return atomic_read(&adev->reset_domain->in_gpu_reset); > + } > + > /** > * amdgpu_device_halt() - bring hardware to some kind of halt state > * > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > index c0988c804459..5ab72c3bfbda 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > @@ -131,6 +131,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d > > } > > + atomic_set(&reset_domain->in_gpu_reset, 0); > init_rwsem(&reset_domain->sem); > > return reset_domain; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > index 80f918e87d4f..ea6fc98ea927 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > @@ -81,6 +81,7 @@ struct amdgpu_reset_domain { > struct workqueue_struct *wq; > enum amdgpu_reset_domain_type type; > struct rw_semaphore sem; > + atomic_t in_gpu_reset; > }; > > > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > index 4e23c29e665c..b81acf59870c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > +++ 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > @@ -259,7 +259,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) > * otherwise the mailbox msg will be ruined/reseted by > * the VF FLR. > */ > - if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) > + if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0) > return; > > down_write(&adev->reset_domain->sem); > @@ -277,7 +277,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) > } while (timeout > 1); > > flr_done: > - atomic_set(&adev->in_gpu_reset, 0); > + atomic_set(&adev->reset_domain->in_gpu_reset, 0); > up_write(&adev->reset_domain->sem); > > /* Trigger recovery for world switch failure if no TDR */ > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > index f715780f7d20..22c10b97ea81 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > @@ -283,7 +283,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) > * otherwise the mailbox msg will be ruined/reseted by > * the VF FLR. > */ > - if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) > + if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0) > return; > > down_write(&adev->reset_domain->sem); > @@ -301,7 +301,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) > } while (timeout > 1); > > flr_done: > - atomic_set(&adev->in_gpu_reset, 0); > + atomic_set(&adev->reset_domain->in_gpu_reset, 0); > up_write(&adev->reset_domain->sem); > > /* Trigger recovery for world switch failure if no TDR */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index ddfbcc8fd3d3..b89406b01694 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1056,7 +1056,6 @@ struct amdgpu_device { bool in_s4; bool in_s0ix; - atomic_t in_gpu_reset; enum pp_mp1_state mp1_state; struct amdgpu_doorbell_index doorbell_index; @@ -1463,8 +1462,6 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) return adev->gmc.tmz_enabled; } -static inline int amdgpu_in_reset(struct amdgpu_device *adev) -{ - return atomic_read(&adev->in_gpu_reset); -} +int amdgpu_in_reset(struct amdgpu_device *adev); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index dcbb175d336f..e05d7cbefd2c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3554,7 +3554,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->mn_lock); mutex_init(&adev->virt.vf_errors.lock); hash_init(adev->mn_hash); - atomic_set(&adev->in_gpu_reset, 0); mutex_init(&adev->psp.mutex); mutex_init(&adev->notifier_lock); @@ -4829,7 +4828,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, static void amdgpu_device_lock_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) { - atomic_set(&adev->in_gpu_reset, 1); + atomic_set(&adev->reset_domain->in_gpu_reset, 1); if (hive) { down_write_nest_lock(&adev->reset_domain->sem, &hive->hive_lock); @@ -4854,7 +4853,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) { amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; - atomic_set(&adev->in_gpu_reset, 0); + atomic_set(&adev->reset_domain->in_gpu_reset, 0); up_write(&adev->reset_domain->sem); } @@ -5699,6 +5698,11 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, amdgpu_asic_invalidate_hdp(adev, ring); } +int amdgpu_in_reset(struct amdgpu_device *adev) +{ + 
return atomic_read(&adev->reset_domain->in_gpu_reset); + } + /** * amdgpu_device_halt() - bring hardware to some kind of halt state * diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index c0988c804459..5ab72c3bfbda 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -131,6 +131,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d } + atomic_set(&reset_domain->in_gpu_reset, 0); init_rwsem(&reset_domain->sem); return reset_domain; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index 80f918e87d4f..ea6fc98ea927 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -81,6 +81,7 @@ struct amdgpu_reset_domain { struct workqueue_struct *wq; enum amdgpu_reset_domain_type type; struct rw_semaphore sem; + atomic_t in_gpu_reset; }; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 4e23c29e665c..b81acf59870c 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -259,7 +259,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) * otherwise the mailbox msg will be ruined/reseted by * the VF FLR. 
*/ - if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) + if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0) return; down_write(&adev->reset_domain->sem); @@ -277,7 +277,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) } while (timeout > 1); flr_done: - atomic_set(&adev->in_gpu_reset, 0); + atomic_set(&adev->reset_domain->in_gpu_reset, 0); up_write(&adev->reset_domain->sem); /* Trigger recovery for world switch failure if no TDR */ diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index f715780f7d20..22c10b97ea81 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -283,7 +283,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) * otherwise the mailbox msg will be ruined/reseted by * the VF FLR. */ - if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) + if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0) return; down_write(&adev->reset_domain->sem); @@ -301,7 +301,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) } while (timeout > 1); flr_done: - atomic_set(&adev->in_gpu_reset, 0); + atomic_set(&adev->reset_domain->in_gpu_reset, 0); up_write(&adev->reset_domain->sem); /* Trigger recovery for world switch failure if no TDR */
We should have a single instance per entire reset domain. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Suggested-by: Lijo Lazar <lijo.lazar@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 ++----- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 4 ++-- 6 files changed, 15 insertions(+), 12 deletions(-)