Message ID | 20240502183813.1612017-5-boris.brezillon@collabora.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/panthor: More reset fixes | expand |
On 02/05/2024 19:38, Boris Brezillon wrote: > We need to undo what was done in panthor_sched_pre_reset() even if the > reset failed. We just flag all previously running groups as terminated > when that happens to unblock things. > > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Seems reasonable, although I hope this case doesn't happen in practice ;) Reviewed-by: Steven Price <steven.price@arm.com> > --- > drivers/gpu/drm/panthor/panthor_device.c | 7 +------ > drivers/gpu/drm/panthor/panthor_sched.c | 19 ++++++++++++++----- > drivers/gpu/drm/panthor/panthor_sched.h | 2 +- > 3 files changed, 16 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c > index 4c5b54e7abb7..4082c8f2951d 100644 > --- a/drivers/gpu/drm/panthor/panthor_device.c > +++ b/drivers/gpu/drm/panthor/panthor_device.c > @@ -129,13 +129,8 @@ static void panthor_device_reset_work(struct work_struct *work) > panthor_gpu_l2_power_on(ptdev); > panthor_mmu_post_reset(ptdev); > ret = panthor_fw_post_reset(ptdev); > - if (ret) > - goto out_dev_exit; > - > atomic_set(&ptdev->reset.pending, 0); > - panthor_sched_post_reset(ptdev); > - > -out_dev_exit: > + panthor_sched_post_reset(ptdev, ret != 0); > drm_dev_exit(cookie); > > if (ret) { > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c > index 6ea094b00cf9..fc43ff62c77d 100644 > --- a/drivers/gpu/drm/panthor/panthor_sched.c > +++ b/drivers/gpu/drm/panthor/panthor_sched.c > @@ -2728,15 +2728,22 @@ void panthor_sched_pre_reset(struct panthor_device *ptdev) > mutex_unlock(&sched->reset.lock); > } > > -void panthor_sched_post_reset(struct panthor_device *ptdev) > +void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed) > { > struct panthor_scheduler *sched = ptdev->scheduler; > struct panthor_group *group, *group_tmp; > > mutex_lock(&sched->reset.lock); > > - list_for_each_entry_safe(group, group_tmp, 
&sched->reset.stopped_groups, run_node) > + list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node) { > + /* Consider all previously running group as terminated if the > + * reset failed. > + */ > + if (reset_failed) > + group->state = PANTHOR_CS_GROUP_TERMINATED; > + > panthor_group_start(group); > + } > > /* We're done resetting the GPU, clear the reset.in_progress bit so we can > * kick the scheduler. > @@ -2744,9 +2751,11 @@ void panthor_sched_post_reset(struct panthor_device *ptdev) > atomic_set(&sched->reset.in_progress, false); > mutex_unlock(&sched->reset.lock); > > - sched_queue_delayed_work(sched, tick, 0); > - > - sched_queue_work(sched, sync_upd); > + /* No need to queue a tick and update syncs if the reset failed. */ > + if (!reset_failed) { > + sched_queue_delayed_work(sched, tick, 0); > + sched_queue_work(sched, sync_upd); > + } > } > > static void group_sync_upd_work(struct work_struct *work) > diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h > index 66438b1f331f..3a30d2328b30 100644 > --- a/drivers/gpu/drm/panthor/panthor_sched.h > +++ b/drivers/gpu/drm/panthor/panthor_sched.h > @@ -40,7 +40,7 @@ void panthor_group_pool_destroy(struct panthor_file *pfile); > int panthor_sched_init(struct panthor_device *ptdev); > void panthor_sched_unplug(struct panthor_device *ptdev); > void panthor_sched_pre_reset(struct panthor_device *ptdev); > -void panthor_sched_post_reset(struct panthor_device *ptdev); > +void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed); > void panthor_sched_suspend(struct panthor_device *ptdev); > void panthor_sched_resume(struct panthor_device *ptdev); >
On Thu, May 02, 2024 at 08:38:12PM +0200, Boris Brezillon wrote: > We need to undo what was done in panthor_sched_pre_reset() even if the > reset failed. We just flag all previously running groups as terminated > when that happens to unblock things. > > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Liviu Dudau <liviu.dudau@arm.com> > --- > drivers/gpu/drm/panthor/panthor_device.c | 7 +------ > drivers/gpu/drm/panthor/panthor_sched.c | 19 ++++++++++++++----- > drivers/gpu/drm/panthor/panthor_sched.h | 2 +- > 3 files changed, 16 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c > index 4c5b54e7abb7..4082c8f2951d 100644 > --- a/drivers/gpu/drm/panthor/panthor_device.c > +++ b/drivers/gpu/drm/panthor/panthor_device.c > @@ -129,13 +129,8 @@ static void panthor_device_reset_work(struct work_struct *work) > panthor_gpu_l2_power_on(ptdev); > panthor_mmu_post_reset(ptdev); > ret = panthor_fw_post_reset(ptdev); > - if (ret) > - goto out_dev_exit; > - > atomic_set(&ptdev->reset.pending, 0); > - panthor_sched_post_reset(ptdev); > - > -out_dev_exit: > + panthor_sched_post_reset(ptdev, ret != 0); > drm_dev_exit(cookie); > > if (ret) { > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c > index 6ea094b00cf9..fc43ff62c77d 100644 > --- a/drivers/gpu/drm/panthor/panthor_sched.c > +++ b/drivers/gpu/drm/panthor/panthor_sched.c > @@ -2728,15 +2728,22 @@ void panthor_sched_pre_reset(struct panthor_device *ptdev) > mutex_unlock(&sched->reset.lock); > } > > -void panthor_sched_post_reset(struct panthor_device *ptdev) > +void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed) > { > struct panthor_scheduler *sched = ptdev->scheduler; > struct panthor_group *group, *group_tmp; > > mutex_lock(&sched->reset.lock); > > - list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node) > + 
list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node) { > + /* Consider all previously running group as terminated if the > + * reset failed. > + */ > + if (reset_failed) > + group->state = PANTHOR_CS_GROUP_TERMINATED; > + > panthor_group_start(group); > + } > > /* We're done resetting the GPU, clear the reset.in_progress bit so we can > * kick the scheduler. > @@ -2744,9 +2751,11 @@ void panthor_sched_post_reset(struct panthor_device *ptdev) > atomic_set(&sched->reset.in_progress, false); > mutex_unlock(&sched->reset.lock); > > - sched_queue_delayed_work(sched, tick, 0); > - > - sched_queue_work(sched, sync_upd); > + /* No need to queue a tick and update syncs if the reset failed. */ > + if (!reset_failed) { > + sched_queue_delayed_work(sched, tick, 0); > + sched_queue_work(sched, sync_upd); > + } > } > > static void group_sync_upd_work(struct work_struct *work) > diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h > index 66438b1f331f..3a30d2328b30 100644 > --- a/drivers/gpu/drm/panthor/panthor_sched.h > +++ b/drivers/gpu/drm/panthor/panthor_sched.h > @@ -40,7 +40,7 @@ void panthor_group_pool_destroy(struct panthor_file *pfile); > int panthor_sched_init(struct panthor_device *ptdev); > void panthor_sched_unplug(struct panthor_device *ptdev); > void panthor_sched_pre_reset(struct panthor_device *ptdev); > -void panthor_sched_post_reset(struct panthor_device *ptdev); > +void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed); > void panthor_sched_suspend(struct panthor_device *ptdev); > void panthor_sched_resume(struct panthor_device *ptdev); > > -- > 2.44.0 >
diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c index 4c5b54e7abb7..4082c8f2951d 100644 --- a/drivers/gpu/drm/panthor/panthor_device.c +++ b/drivers/gpu/drm/panthor/panthor_device.c @@ -129,13 +129,8 @@ static void panthor_device_reset_work(struct work_struct *work) panthor_gpu_l2_power_on(ptdev); panthor_mmu_post_reset(ptdev); ret = panthor_fw_post_reset(ptdev); - if (ret) - goto out_dev_exit; - atomic_set(&ptdev->reset.pending, 0); - panthor_sched_post_reset(ptdev); - -out_dev_exit: + panthor_sched_post_reset(ptdev, ret != 0); drm_dev_exit(cookie); if (ret) { diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c index 6ea094b00cf9..fc43ff62c77d 100644 --- a/drivers/gpu/drm/panthor/panthor_sched.c +++ b/drivers/gpu/drm/panthor/panthor_sched.c @@ -2728,15 +2728,22 @@ void panthor_sched_pre_reset(struct panthor_device *ptdev) mutex_unlock(&sched->reset.lock); } -void panthor_sched_post_reset(struct panthor_device *ptdev) +void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed) { struct panthor_scheduler *sched = ptdev->scheduler; struct panthor_group *group, *group_tmp; mutex_lock(&sched->reset.lock); - list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node) + list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node) { + /* Consider all previously running group as terminated if the + * reset failed. + */ + if (reset_failed) + group->state = PANTHOR_CS_GROUP_TERMINATED; + panthor_group_start(group); + } /* We're done resetting the GPU, clear the reset.in_progress bit so we can * kick the scheduler. @@ -2744,9 +2751,11 @@ void panthor_sched_post_reset(struct panthor_device *ptdev) atomic_set(&sched->reset.in_progress, false); mutex_unlock(&sched->reset.lock); - sched_queue_delayed_work(sched, tick, 0); - - sched_queue_work(sched, sync_upd); + /* No need to queue a tick and update syncs if the reset failed. 
*/ + if (!reset_failed) { + sched_queue_delayed_work(sched, tick, 0); + sched_queue_work(sched, sync_upd); + } } static void group_sync_upd_work(struct work_struct *work) diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h index 66438b1f331f..3a30d2328b30 100644 --- a/drivers/gpu/drm/panthor/panthor_sched.h +++ b/drivers/gpu/drm/panthor/panthor_sched.h @@ -40,7 +40,7 @@ void panthor_group_pool_destroy(struct panthor_file *pfile); int panthor_sched_init(struct panthor_device *ptdev); void panthor_sched_unplug(struct panthor_device *ptdev); void panthor_sched_pre_reset(struct panthor_device *ptdev); -void panthor_sched_post_reset(struct panthor_device *ptdev); +void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed); void panthor_sched_suspend(struct panthor_device *ptdev); void panthor_sched_resume(struct panthor_device *ptdev);
We need to undo what was done in panthor_sched_pre_reset() even if the
reset failed. We just flag all previously running groups as terminated
when that happens to unblock things.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_device.c |  7 +------
 drivers/gpu/drm/panthor/panthor_sched.c  | 19 ++++++++++++++-----
 drivers/gpu/drm/panthor/panthor_sched.h  |  2 +-
 3 files changed, 16 insertions(+), 12 deletions(-)