Message ID | 20220125223752.200211-11-andrey.grodzovsky@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Define and use reset domain for GPU recovery in amdgpu | expand |
On 1/26/2022 4:07 AM, Andrey Grodzovsky wrote: > We should have a single instance per entrire reset domain. > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> > Suggested-by: Lijo Lazar <lijo.lazar@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 ++----- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +++++++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 4 ++-- > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 4 ++-- > 6 files changed, 15 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index f021cd3c9d34..087796e389ab 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1056,7 +1056,6 @@ struct amdgpu_device { > bool in_s4; > bool in_s0ix; > > - atomic_t in_gpu_reset; > enum pp_mp1_state mp1_state; > struct amdgpu_doorbell_index doorbell_index; > > @@ -1461,8 +1460,6 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) > return adev->gmc.tmz_enabled; > } > > -static inline int amdgpu_in_reset(struct amdgpu_device *adev) > -{ > - return atomic_read(&adev->in_gpu_reset); > -} > +int amdgpu_in_reset(struct amdgpu_device *adev); > + > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 6991ab4a8191..aa43af443ebe 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3511,7 +3511,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, > mutex_init(&adev->mn_lock); > mutex_init(&adev->virt.vf_errors.lock); > hash_init(adev->mn_hash); > - atomic_set(&adev->in_gpu_reset, 0); > mutex_init(&adev->psp.mutex); > mutex_init(&adev->notifier_lock); > > @@ -4775,7 +4774,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, > static void amdgpu_device_lock_adev(struct amdgpu_device *adev, > struct 
amdgpu_hive_info *hive) > { > - atomic_set(&adev->in_gpu_reset, 1); > + atomic_set(&adev->reset_domain->in_gpu_reset, 1); > > if (hive) { > down_write_nest_lock(&adev->reset_domain->sem, &hive->hive_lock); > @@ -4800,7 +4799,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) > { > amdgpu_vf_error_trans_all(adev); > adev->mp1_state = PP_MP1_STATE_NONE; > - atomic_set(&adev->in_gpu_reset, 0); > + atomic_set(&adev->reset_domain->in_gpu_reset, 0); > up_write(&adev->reset_domain->sem); > } > > @@ -5643,3 +5642,8 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, > > amdgpu_asic_invalidate_hdp(adev, ring); > } > + > +int amdgpu_in_reset(struct amdgpu_device *adev) > +{ > + return atomic_read(&adev->reset_domain->in_gpu_reset); > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > index 011585e330f6..e9b804a89b34 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c > @@ -127,6 +127,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(char *wq_name) > > } > > + atomic_set(&reset_domain->in_gpu_reset, 0); > init_rwsem(&reset_domain->sem); > > return reset_domain; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > index 7451089b0c06..413982f4e1ce 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > @@ -74,6 +74,7 @@ struct amdgpu_reset_domain { > struct kref refcount; > struct workqueue_struct *wq; > struct rw_semaphore sem; > + atomic_t in_gpu_reset; Maybe 'active' (independent of gpu) just to indicate that a reset is ongoing in the domain? 
Thanks, Lijo > }; > > > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > index 5dab06fce26a..6c79746d18db 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > @@ -258,7 +258,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) > return; > > amdgpu_virt_fini_data_exchange(adev); > - atomic_set(&adev->in_gpu_reset, 1); > + atomic_set(&adev->reset_domain->in_gpu_reset, 1); > > xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0); > > @@ -271,7 +271,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) > } while (timeout > 1); > > flr_done: > - atomic_set(&adev->in_gpu_reset, 0); > + atomic_set(&adev->reset_domain->in_gpu_reset, 0); > up_write(&adev->reset_domain->sem); > > /* Trigger recovery for world switch failure if no TDR */ > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > index 868144fff16a..39f7e1e9ab81 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > @@ -287,7 +287,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) > return; > > amdgpu_virt_fini_data_exchange(adev); > - atomic_set(&adev->in_gpu_reset, 1); > + atomic_set(&adev->reset_domain->in_gpu_reset, 1); > > xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0); > > @@ -300,7 +300,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) > } while (timeout > 1); > > flr_done: > - atomic_set(&adev->in_gpu_reset, 0); > + atomic_set(&adev->reset_domain->in_gpu_reset, 0); > up_write(&adev->reset_domain->sem); > > /* Trigger recovery for world switch failure if no TDR */ >
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index f021cd3c9d34..087796e389ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1056,7 +1056,6 @@ struct amdgpu_device { bool in_s4; bool in_s0ix; - atomic_t in_gpu_reset; enum pp_mp1_state mp1_state; struct amdgpu_doorbell_index doorbell_index; @@ -1461,8 +1460,6 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) return adev->gmc.tmz_enabled; } -static inline int amdgpu_in_reset(struct amdgpu_device *adev) -{ - return atomic_read(&adev->in_gpu_reset); -} +int amdgpu_in_reset(struct amdgpu_device *adev); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6991ab4a8191..aa43af443ebe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3511,7 +3511,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->mn_lock); mutex_init(&adev->virt.vf_errors.lock); hash_init(adev->mn_hash); - atomic_set(&adev->in_gpu_reset, 0); mutex_init(&adev->psp.mutex); mutex_init(&adev->notifier_lock); @@ -4775,7 +4774,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, static void amdgpu_device_lock_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) { - atomic_set(&adev->in_gpu_reset, 1); + atomic_set(&adev->reset_domain->in_gpu_reset, 1); if (hive) { down_write_nest_lock(&adev->reset_domain->sem, &hive->hive_lock); @@ -4800,7 +4799,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) { amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; - atomic_set(&adev->in_gpu_reset, 0); + atomic_set(&adev->reset_domain->in_gpu_reset, 0); up_write(&adev->reset_domain->sem); } @@ -5643,3 +5642,8 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, amdgpu_asic_invalidate_hdp(adev, ring); } + +int amdgpu_in_reset(struct amdgpu_device *adev) +{ + 
return atomic_read(&adev->reset_domain->in_gpu_reset); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 011585e330f6..e9b804a89b34 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -127,6 +127,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(char *wq_name) } + atomic_set(&reset_domain->in_gpu_reset, 0); init_rwsem(&reset_domain->sem); return reset_domain; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index 7451089b0c06..413982f4e1ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -74,6 +74,7 @@ struct amdgpu_reset_domain { struct kref refcount; struct workqueue_struct *wq; struct rw_semaphore sem; + atomic_t in_gpu_reset; }; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 5dab06fce26a..6c79746d18db 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -258,7 +258,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) return; amdgpu_virt_fini_data_exchange(adev); - atomic_set(&adev->in_gpu_reset, 1); + atomic_set(&adev->reset_domain->in_gpu_reset, 1); xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0); @@ -271,7 +271,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) } while (timeout > 1); flr_done: - atomic_set(&adev->in_gpu_reset, 0); + atomic_set(&adev->reset_domain->in_gpu_reset, 0); up_write(&adev->reset_domain->sem); /* Trigger recovery for world switch failure if no TDR */ diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 868144fff16a..39f7e1e9ab81 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -287,7 +287,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) return; 
amdgpu_virt_fini_data_exchange(adev); - atomic_set(&adev->in_gpu_reset, 1); + atomic_set(&adev->reset_domain->in_gpu_reset, 1); xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0); @@ -300,7 +300,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) } while (timeout > 1); flr_done: - atomic_set(&adev->in_gpu_reset, 0); + atomic_set(&adev->reset_domain->in_gpu_reset, 0); up_write(&adev->reset_domain->sem); /* Trigger recovery for world switch failure if no TDR */
We should have a single instance per entire reset domain. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Suggested-by: Lijo Lazar <lijo.lazar@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 ++----- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 4 ++-- 6 files changed, 15 insertions(+), 12 deletions(-)