Message ID | 20220125223752.200211-2-andrey.grodzovsky@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Define and use reset domain for GPU recovery in amdgpu | expand |
Am 25.01.22 um 23:37 schrieb Andrey Grodzovsky: > Defined a reset_domain struct such that > all the entities that go through reset > together will be serialized one against > another. Do it for both single device and > XGMI hive cases. > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> > Suggested-by: Daniel Vetter <daniel.vetter@ffwll.ch> > Suggested-by: Christian König <ckoenig.leichtzumerken@gmail.com> > Reviewed-by: Christian König <christian.koenig@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 9 +++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++ > 4 files changed, 37 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 9f017663ac50..b5ff76aae7e0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -812,6 +812,11 @@ struct amd_powerplay { > > #define AMDGPU_RESET_MAGIC_NUM 64 > #define AMDGPU_MAX_DF_PERFMONS 4 > + > +struct amdgpu_reset_domain { > + struct workqueue_struct *wq; > +}; > + > struct amdgpu_device { > struct device *dev; > struct pci_dev *pdev; > @@ -1096,6 +1101,8 @@ struct amdgpu_device { > > struct amdgpu_reset_control *reset_cntl; > uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE]; > + > + struct amdgpu_reset_domain reset_domain; I'm a bit confused, shouldn't this be a pointer? Regards, Christian. 
> }; > > static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 90d22a376632..0f3e6c078f88 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2391,9 +2391,27 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) > if (r) > goto init_failed; > > - if (adev->gmc.xgmi.num_physical_nodes > 1) > + if (adev->gmc.xgmi.num_physical_nodes > 1) { > + struct amdgpu_hive_info *hive; > + > amdgpu_xgmi_add_device(adev); > > + hive = amdgpu_get_xgmi_hive(adev); > + if (!hive || !hive->reset_domain.wq) { > + DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id); > + r = -EINVAL; > + goto init_failed; > + } > + > + adev->reset_domain.wq = hive->reset_domain.wq; > + } else { > + adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0); > + if (!adev->reset_domain.wq) { > + r = -ENOMEM; > + goto init_failed; > + } > + } > + > /* Don't init kfd if whole hive need to be reset during init */ > if (!adev->gmc.xgmi.pending_reset) > amdgpu_amdkfd_device_init(adev); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > index 567df2db23ac..a858e3457c5c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > @@ -392,6 +392,14 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) > goto pro_end; > } > > + hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0); > + if (!hive->reset_domain.wq) { > + dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n"); > + kfree(hive); > + hive = NULL; > + goto pro_end; > + } > + > hive->hive_id = adev->gmc.xgmi.hive_id; > INIT_LIST_HEAD(&hive->device_list); > INIT_LIST_HEAD(&hive->node); > @@ -401,6 +409,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev) > task_barrier_init(&hive->tb); > hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; > hive->hi_req_gpu = NULL; > + > /* > * hive pstate on boot is high in vega20 so we have to go to low > * pstate on after boot. > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > index d2189bf7d428..6121aaa292cb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > @@ -42,6 +42,8 @@ struct amdgpu_hive_info { > AMDGPU_XGMI_PSTATE_MAX_VEGA20, > AMDGPU_XGMI_PSTATE_UNKNOWN > } pstate; > + > + struct amdgpu_reset_domain reset_domain; > }; > > struct amdgpu_pcs_ras_field {
On 2022-01-26 07:07, Christian König wrote: > Am 25.01.22 um 23:37 schrieb Andrey Grodzovsky: >> Defined a reset_domain struct such that >> all the entities that go through reset >> together will be serialized one against >> another. Do it for both single device and >> XGMI hive cases. >> >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >> Suggested-by: Daniel Vetter <daniel.vetter@ffwll.ch> >> Suggested-by: Christian König <ckoenig.leichtzumerken@gmail.com> >> Reviewed-by: Christian König <christian.koenig@amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++- >> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 9 +++++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++ >> 4 files changed, 37 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index 9f017663ac50..b5ff76aae7e0 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -812,6 +812,11 @@ struct amd_powerplay { >> #define AMDGPU_RESET_MAGIC_NUM 64 >> #define AMDGPU_MAX_DF_PERFMONS 4 >> + >> +struct amdgpu_reset_domain { >> + struct workqueue_struct *wq; >> +}; >> + >> struct amdgpu_device { >> struct device *dev; >> struct pci_dev *pdev; >> @@ -1096,6 +1101,8 @@ struct amdgpu_device { >> struct amdgpu_reset_control *reset_cntl; >> uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE]; >> + >> + struct amdgpu_reset_domain reset_domain; > > I'm a bit confused, shouldn't this be a pointer? > > Regards, > Christian. Yeah, I see you already noticed this in the follow-up patch - I had trouble reworking starting from the first patch (a lot of merge conflicts), so I just added the rework on top of the last patch set. 
Andrey > >> }; >> static inline struct amdgpu_device *drm_to_adev(struct drm_device >> *ddev) >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 90d22a376632..0f3e6c078f88 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -2391,9 +2391,27 @@ static int amdgpu_device_ip_init(struct >> amdgpu_device *adev) >> if (r) >> goto init_failed; >> - if (adev->gmc.xgmi.num_physical_nodes > 1) >> + if (adev->gmc.xgmi.num_physical_nodes > 1) { >> + struct amdgpu_hive_info *hive; >> + >> amdgpu_xgmi_add_device(adev); >> + hive = amdgpu_get_xgmi_hive(adev); >> + if (!hive || !hive->reset_domain.wq) { >> + DRM_ERROR("Failed to obtain reset domain info for XGMI >> hive:%llx", hive->hive_id); >> + r = -EINVAL; >> + goto init_failed; >> + } >> + >> + adev->reset_domain.wq = hive->reset_domain.wq; >> + } else { >> + adev->reset_domain.wq = >> alloc_ordered_workqueue("amdgpu-reset-dev", 0); >> + if (!adev->reset_domain.wq) { >> + r = -ENOMEM; >> + goto init_failed; >> + } >> + } >> + >> /* Don't init kfd if whole hive need to be reset during init */ >> if (!adev->gmc.xgmi.pending_reset) >> amdgpu_amdkfd_device_init(adev); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >> index 567df2db23ac..a858e3457c5c 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >> @@ -392,6 +392,14 @@ struct amdgpu_hive_info >> *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) >> goto pro_end; >> } >> + hive->reset_domain.wq = >> alloc_ordered_workqueue("amdgpu-reset-hive", 0); >> + if (!hive->reset_domain.wq) { >> + dev_err(adev->dev, "XGMI: failed allocating wq for reset >> domain!\n"); >> + kfree(hive); >> + hive = NULL; >> + goto pro_end; >> + } >> + >> hive->hive_id = adev->gmc.xgmi.hive_id; >> INIT_LIST_HEAD(&hive->device_list); >> INIT_LIST_HEAD(&hive->node); >> 
@@ -401,6 +409,7 @@ struct amdgpu_hive_info >> *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) >> task_barrier_init(&hive->tb); >> hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; >> hive->hi_req_gpu = NULL; >> + >> /* >> * hive pstate on boot is high in vega20 so we have to go to low >> * pstate on after boot. >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h >> index d2189bf7d428..6121aaa292cb 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h >> @@ -42,6 +42,8 @@ struct amdgpu_hive_info { >> AMDGPU_XGMI_PSTATE_MAX_VEGA20, >> AMDGPU_XGMI_PSTATE_UNKNOWN >> } pstate; >> + >> + struct amdgpu_reset_domain reset_domain; >> }; >> struct amdgpu_pcs_ras_field { >
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9f017663ac50..b5ff76aae7e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -812,6 +812,11 @@ struct amd_powerplay { #define AMDGPU_RESET_MAGIC_NUM 64 #define AMDGPU_MAX_DF_PERFMONS 4 + +struct amdgpu_reset_domain { + struct workqueue_struct *wq; +}; + struct amdgpu_device { struct device *dev; struct pci_dev *pdev; @@ -1096,6 +1101,8 @@ struct amdgpu_device { struct amdgpu_reset_control *reset_cntl; uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE]; + + struct amdgpu_reset_domain reset_domain; }; static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 90d22a376632..0f3e6c078f88 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2391,9 +2391,27 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (r) goto init_failed; - if (adev->gmc.xgmi.num_physical_nodes > 1) + if (adev->gmc.xgmi.num_physical_nodes > 1) { + struct amdgpu_hive_info *hive; + amdgpu_xgmi_add_device(adev); + hive = amdgpu_get_xgmi_hive(adev); + if (!hive || !hive->reset_domain.wq) { + DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id); + r = -EINVAL; + goto init_failed; + } + + adev->reset_domain.wq = hive->reset_domain.wq; + } else { + adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0); + if (!adev->reset_domain.wq) { + r = -ENOMEM; + goto init_failed; + } + } + /* Don't init kfd if whole hive need to be reset during init */ if (!adev->gmc.xgmi.pending_reset) amdgpu_amdkfd_device_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 567df2db23ac..a858e3457c5c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
@@ -392,6 +392,14 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) goto pro_end; } + hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0); + if (!hive->reset_domain.wq) { + dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n"); + kfree(hive); + hive = NULL; + goto pro_end; + } + hive->hive_id = adev->gmc.xgmi.hive_id; INIT_LIST_HEAD(&hive->device_list); INIT_LIST_HEAD(&hive->node); @@ -401,6 +409,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) task_barrier_init(&hive->tb); hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; hive->hi_req_gpu = NULL; + /* * hive pstate on boot is high in vega20 so we have to go to low * pstate on after boot. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index d2189bf7d428..6121aaa292cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -42,6 +42,8 @@ struct amdgpu_hive_info { AMDGPU_XGMI_PSTATE_MAX_VEGA20, AMDGPU_XGMI_PSTATE_UNKNOWN } pstate; + + struct amdgpu_reset_domain reset_domain; }; struct amdgpu_pcs_ras_field {