Message ID | 1576256049-12838-1-git-send-email-andrey.grodzovsky@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v2,1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d. | expand |
Ping on unreviewed V2s... Andrey On 12/13/19 11:54 AM, Andrey Grodzovsky wrote: > In preparation for doing XGMI reset synchronization using task barrier. > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> > Reviewed-by: Le Ma <Le.Ma@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 - > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +++++------------------------- > 2 files changed, 12 insertions(+), 66 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index a78a363..50bab33 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1001,8 +1001,6 @@ struct amdgpu_device { > > bool pm_sysfs_en; > bool ucode_sysfs_en; > - > - bool in_baco; > }; > > static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 7324a5f..1d19edfa 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) > if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) > adev->asic_reset_res = (adev->in_baco == false) ? 
> amdgpu_device_baco_enter(adev->ddev) : > - amdgpu_device_baco_exit(adev->ddev); > + qamdgpu_device_baco_exit(adev->ddev); > else > adev->asic_reset_res = amdgpu_asic_reset(adev); > > @@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, > return r; > } > > -static int amdgpu_do_asic_reset(struct amdgpu_device *adev, > - struct amdgpu_hive_info *hive, > +static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, > struct list_head *device_list_handle, > bool *need_full_reset_arg) > { > struct amdgpu_device *tmp_adev = NULL; > bool need_full_reset = *need_full_reset_arg, vram_lost = false; > int r = 0; > - int cpu = smp_processor_id(); > - bool use_baco = > - (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? > - true : false; > > /* > * ASIC reset has to be done on all HGMI hive nodes ASAP > @@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev, > */ > if (need_full_reset) { > list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { > - /* > - * For XGMI run all resets in parallel to speed up the > - * process by scheduling the highpri wq on different > - * cpus. For XGMI with baco reset, all nodes must enter > - * baco within close proximity before anyone exit. 
> - */ > + /* For XGMI run all resets in parallel to speed up the process */ > if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { > - if (!queue_work_on(cpu, system_highpri_wq, > - &tmp_adev->xgmi_reset_work)) > + if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) > r = -EALREADY; > - cpu = cpumask_next(cpu, cpu_online_mask); > } else > r = amdgpu_asic_reset(tmp_adev); > - if (r) > - break; > - } > - > - /* For XGMI wait for all work to complete before proceed */ > - if (!r) { > - list_for_each_entry(tmp_adev, device_list_handle, > - gmc.xgmi.head) { > - if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { > - flush_work(&tmp_adev->xgmi_reset_work); > - r = tmp_adev->asic_reset_res; > - if (r) > - break; > - if (use_baco) > - tmp_adev->in_baco = true; > - } > - } > - } > > - /* > - * For XGMI with baco reset, need exit baco phase by scheduling > - * xgmi_reset_work one more time. PSP reset and sGPU skips this > - * phase. Not assume the situation that PSP reset and baco reset > - * coexist within an XGMI hive. 
> - */ > - > - if (!r && use_baco) { > - cpu = smp_processor_id(); > - list_for_each_entry(tmp_adev, device_list_handle, > - gmc.xgmi.head) { > - if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { > - if (!queue_work_on(cpu, > - system_highpri_wq, > - &tmp_adev->xgmi_reset_work)) > - r = -EALREADY; > - if (r) > - break; > - cpu = cpumask_next(cpu, cpu_online_mask); > - } > + if (r) { > + DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", > + r, tmp_adev->ddev->unique); > + break; > } > } > > - if (!r && use_baco) { > + /* For XGMI wait for all PSP resets to complete before proceed */ > + if (!r) { > list_for_each_entry(tmp_adev, device_list_handle, > gmc.xgmi.head) { > if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { > @@ -3878,21 +3833,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev, > r = tmp_adev->asic_reset_res; > if (r) > break; > - tmp_adev->in_baco = false; > } > } > } > - > - if (r) { > - DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", > - r, tmp_adev->ddev->unique); > - goto end; > - } > } > > if (!r && amdgpu_ras_intr_triggered()) > amdgpu_ras_intr_cleared(); > > + > list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { > if (need_full_reset) { > /* post card */ > @@ -4181,8 +4130,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > if (r) > adev->asic_reset_res = r; > } else { > - r = amdgpu_do_asic_reset(adev, hive, device_list_handle, > - &need_full_reset); > + r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); > if (r && r == -EAGAIN) > goto retry; > }
[AMD Official Use Only - Internal Distribution Only] Hi Andrey, Please check the 3 minor comments in this patch. With that addressed, the V2 series is Reviewed-by: Le Ma <Le.Ma@amd.com> Regards, Ma Le -----Original Message----- From: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Sent: Saturday, December 14, 2019 12:54 AM To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> Subject: [PATCH v2 1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d. In preparation for doing XGMI reset synchronization using task barrier. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Le Ma <Le.Ma@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 - drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +++++------------------------- 2 files changed, 12 insertions(+), 66 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a78a363..50bab33 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1001,8 +1001,6 @@ struct amdgpu_device { bool pm_sysfs_en; bool ucode_sysfs_en; - - bool in_baco; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7324a5f..1d19edfa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) adev->asic_reset_res = (adev->in_baco == false) ? 
amdgpu_device_baco_enter(adev->ddev) : - amdgpu_device_baco_exit(adev->ddev); + qamdgpu_device_baco_exit(adev->ddev); [Le] 1/3: Still unnecessary typo here, although it will be removed in patch #4. else adev->asic_reset_res = amdgpu_asic_reset(adev); @@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, return r; } -static int amdgpu_do_asic_reset(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive, +static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, struct list_head *device_list_handle, bool *need_full_reset_arg) { struct amdgpu_device *tmp_adev = NULL; bool need_full_reset = *need_full_reset_arg, vram_lost = false; int r = 0; - int cpu = smp_processor_id(); - bool use_baco = - (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? - true : false; /* * ASIC reset has to be done on all HGMI hive nodes ASAP @@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev, */ if (need_full_reset) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - /* - * For XGMI run all resets in parallel to speed up the - * process by scheduling the highpri wq on different - * cpus. For XGMI with baco reset, all nodes must enter - * baco within close proximity before anyone exit. 
- */ + /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - if (!queue_work_on(cpu, system_highpri_wq, - &tmp_adev->xgmi_reset_work)) + if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) r = -EALREADY; - cpu = cpumask_next(cpu, cpu_online_mask); } else r = amdgpu_asic_reset(tmp_adev); - if (r) - break; - } - - /* For XGMI wait for all work to complete before proceed */ - if (!r) { - list_for_each_entry(tmp_adev, device_list_handle, - gmc.xgmi.head) { - if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - flush_work(&tmp_adev->xgmi_reset_work); - r = tmp_adev->asic_reset_res; - if (r) - break; - if (use_baco) - tmp_adev->in_baco = true; - } - } - } - /* - * For XGMI with baco reset, need exit baco phase by scheduling - * xgmi_reset_work one more time. PSP reset and sGPU skips this - * phase. Not assume the situation that PSP reset and baco reset - * coexist within an XGMI hive. - */ - - if (!r && use_baco) { - cpu = smp_processor_id(); - list_for_each_entry(tmp_adev, device_list_handle, - gmc.xgmi.head) { - if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - if (!queue_work_on(cpu, - system_highpri_wq, - &tmp_adev->xgmi_reset_work)) - r = -EALREADY; - if (r) - break; - cpu = cpumask_next(cpu, cpu_online_mask); - } + if (r) { + DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", + r, tmp_adev->ddev->unique); + break; } } - if (!r && use_baco) { + /* For XGMI wait for all PSP resets to complete before proceed */ [Le] 2/3: We can drop PSP string in this comment. 
+ if (!r) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { @@ -3878,21 +3833,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev, r = tmp_adev->asic_reset_res; if (r) break; - tmp_adev->in_baco = false; } } } - - if (r) { - DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", - r, tmp_adev->ddev->unique); - goto end; - } } if (!r && amdgpu_ras_intr_triggered()) amdgpu_ras_intr_cleared(); + [Le] 3/3: Redundant blank line. list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { if (need_full_reset) { /* post card */ @@ -4181,8 +4130,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(adev, hive, device_list_handle, - &need_full_reset); + r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); if (r && r == -EAGAIN) goto retry; } -- 2.7.4
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a78a363..50bab33 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1001,8 +1001,6 @@ struct amdgpu_device { bool pm_sysfs_en; bool ucode_sysfs_en; - - bool in_baco; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7324a5f..1d19edfa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2667,7 +2667,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) adev->asic_reset_res = (adev->in_baco == false) ? amdgpu_device_baco_enter(adev->ddev) : - amdgpu_device_baco_exit(adev->ddev); + qamdgpu_device_baco_exit(adev->ddev); else adev->asic_reset_res = amdgpu_asic_reset(adev); @@ -3796,18 +3796,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, return r; } -static int amdgpu_do_asic_reset(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive, +static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, struct list_head *device_list_handle, bool *need_full_reset_arg) { struct amdgpu_device *tmp_adev = NULL; bool need_full_reset = *need_full_reset_arg, vram_lost = false; int r = 0; - int cpu = smp_processor_id(); - bool use_baco = - (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? - true : false; /* * ASIC reset has to be done on all HGMI hive nodes ASAP @@ -3815,62 +3810,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev, */ if (need_full_reset) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - /* - * For XGMI run all resets in parallel to speed up the - * process by scheduling the highpri wq on different - * cpus. 
For XGMI with baco reset, all nodes must enter - * baco within close proximity before anyone exit. - */ + /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - if (!queue_work_on(cpu, system_highpri_wq, - &tmp_adev->xgmi_reset_work)) + if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) r = -EALREADY; - cpu = cpumask_next(cpu, cpu_online_mask); } else r = amdgpu_asic_reset(tmp_adev); - if (r) - break; - } - - /* For XGMI wait for all work to complete before proceed */ - if (!r) { - list_for_each_entry(tmp_adev, device_list_handle, - gmc.xgmi.head) { - if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - flush_work(&tmp_adev->xgmi_reset_work); - r = tmp_adev->asic_reset_res; - if (r) - break; - if (use_baco) - tmp_adev->in_baco = true; - } - } - } - /* - * For XGMI with baco reset, need exit baco phase by scheduling - * xgmi_reset_work one more time. PSP reset and sGPU skips this - * phase. Not assume the situation that PSP reset and baco reset - * coexist within an XGMI hive. 
- */ - - if (!r && use_baco) { - cpu = smp_processor_id(); - list_for_each_entry(tmp_adev, device_list_handle, - gmc.xgmi.head) { - if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - if (!queue_work_on(cpu, - system_highpri_wq, - &tmp_adev->xgmi_reset_work)) - r = -EALREADY; - if (r) - break; - cpu = cpumask_next(cpu, cpu_online_mask); - } + if (r) { + DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", + r, tmp_adev->ddev->unique); + break; } } - if (!r && use_baco) { + /* For XGMI wait for all PSP resets to complete before proceed */ + if (!r) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { @@ -3878,21 +3833,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev, r = tmp_adev->asic_reset_res; if (r) break; - tmp_adev->in_baco = false; } } } - - if (r) { - DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", - r, tmp_adev->ddev->unique); - goto end; - } } if (!r && amdgpu_ras_intr_triggered()) amdgpu_ras_intr_cleared(); + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { if (need_full_reset) { /* post card */ @@ -4181,8 +4130,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(adev, hive, device_list_handle, - &need_full_reset); + r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); if (r && r == -EAGAIN) goto retry; }