Message ID | 20240123021155.2775-1-mario.limonciello@amd.com (mailing list archive)
---|---
State | New, archived |
Series | [RFC] Revert "drm/sched: Split free_job into own work item"
On 1/22/2024 20:11, Mario Limonciello wrote:
> commit f7fe64ad0f22 ("drm/sched: Split free_job into own work item")
> causes graphics hangs at GDM or right after logging in on a
> Framework 13 AMD laptop (containing a Phoenix APU).
>
> This reverts commit f7fe64ad0f22ff034f8ebcfbd7299ee9cc9b57d7.
>
> Fixes: f7fe64ad0f22 ("drm/sched: Split free_job into own work item")
> Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
> ---
> This is a regression introduced in 6.8-rc1, bisected from 6.7.
> This revert done on top of 6.8-rc1 fixes the issue.
>
> I'm happy to gather any data to use to properly debug if that is
> preferable to a revert.

BTW - nothing in the kernel log when this happens. I did catch an ftrace
with all the GPU scheduler trace events, though:

https://gist.github.com/superm1/1d3fc752a414f324fbde526052573acd

> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 133 +++++++++----------------
>  include/drm/gpu_scheduler.h            |   4 +-
>  2 files changed, 48 insertions(+), 89 deletions(-)
>
> [full diff snipped; see the patch below]
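For anyone wanting to reproduce the capture: the gpu_scheduler trace events
live under /sys/kernel/tracing/events/gpu_scheduler/. Below is a minimal
sketch (not from the thread) that enables them and streams the trace to
stdout. It assumes root and tracefs mounted at the usual location, and is
equivalent to echoing "1" into the same files from a shell and then reading
trace_pipe.

/*
 * Minimal sketch (assumptions as stated above): enable the
 * gpu_scheduler trace events and stream the trace to stdout.
 */
#include <stdio.h>
#include <stdlib.h>

static void write_file(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                exit(1);
        }
        fputs(val, f);
        fclose(f);
}

int main(void)
{
        FILE *pipe;
        char buf[4096];
        size_t n;

        write_file("/sys/kernel/tracing/events/gpu_scheduler/enable", "1");
        write_file("/sys/kernel/tracing/tracing_on", "1");

        pipe = fopen("/sys/kernel/tracing/trace_pipe", "r");
        if (!pipe) {
                perror("trace_pipe");
                return 1;
        }
        /* Stream until interrupted; Ctrl-C to stop. */
        while ((n = fread(buf, 1, sizeof(buf), pipe)) > 0)
                fwrite(buf, 1, n, stdout);
        fclose(pipe);
        return 0;
}

Redirect stdout to a file and reproduce the hang to get a capture
comparable to the one in the gist.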
On 1/23/24 03:11, Mario Limonciello wrote:
> commit f7fe64ad0f22 ("drm/sched: Split free_job into own work item")
> causes graphics hangs at GDM or right after logging in on a
> Framework 13 AMD laptop (containing a Phoenix APU).
>
> This reverts commit f7fe64ad0f22ff034f8ebcfbd7299ee9cc9b57d7.
>
> Fixes: f7fe64ad0f22 ("drm/sched: Split free_job into own work item")
> Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
> ---
> This is a regression introduced in 6.8-rc1, bisected from 6.7.
> This revert done on top of 6.8-rc1 fixes the issue.

Applying this revert on 6.8-rc1 fixed my issues reported here:
https://lore.kernel.org/all/2faccc1a-7fdd-499b-aa0a-bd54f4068f3e@suse.cz/

Let me know if there's another fix instead of revert so I can test.

Thanks,
Vlastimil

>
> I'm happy to gather any data to use to properly debug if that is
> preferable to a revert.
> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 133 +++++++++----------------
>  include/drm/gpu_scheduler.h            |   4 +-
>  2 files changed, 48 insertions(+), 89 deletions(-)
>
On 1/24/2024 10:26, Vlastimil Babka wrote:
> On 1/23/24 03:11, Mario Limonciello wrote:
>> commit f7fe64ad0f22 ("drm/sched: Split free_job into own work item")
>> causes graphics hangs at GDM or right after logging in on a
>> Framework 13 AMD laptop (containing a Phoenix APU).
>>
>> This reverts commit f7fe64ad0f22ff034f8ebcfbd7299ee9cc9b57d7.
>>
>> Fixes: f7fe64ad0f22 ("drm/sched: Split free_job into own work item")
>> Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
>> ---
>> This is a regression introduced in 6.8-rc1, bisected from 6.7.
>> This revert done on top of 6.8-rc1 fixes the issue.
>
> Applying this revert on 6.8-rc1 fixed my issues reported here:
> https://lore.kernel.org/all/2faccc1a-7fdd-499b-aa0a-bd54f4068f3e@suse.cz/
>
> Let me know if there's another fix instead of revert so I can test.

There's not another fix at the moment, but Matthew has posted a patch to
allow ftrace to capture more data with the gpu_scheduler trace events to
this bug report:

https://gitlab.freedesktop.org/drm/amd/-/issues/3124

I already captured a trace from a local machine and attached it to that
bug report.

> Thanks,
> Vlastimil
>
>>
>> I'm happy to gather any data to use to properly debug if that is
>> preferable to a revert.
>> ---
>>  drivers/gpu/drm/scheduler/sched_main.c | 133 +++++++++----------------
>>  include/drm/gpu_scheduler.h            |   4 +-
>>  2 files changed, 48 insertions(+), 89 deletions(-)
>>
commit f7fe64ad0f22 ("drm/sched: Split free_job into own work item")
causes graphics hangs at GDM or right after logging in on a
Framework 13 AMD laptop (containing a Phoenix APU).

This reverts commit f7fe64ad0f22ff034f8ebcfbd7299ee9cc9b57d7.

Fixes: f7fe64ad0f22 ("drm/sched: Split free_job into own work item")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
This is a regression introduced in 6.8-rc1, bisected from 6.7.
This revert done on top of 6.8-rc1 fixes the issue.

I'm happy to gather any data to use to properly debug if that is
preferable to a revert.
---
 drivers/gpu/drm/scheduler/sched_main.c | 133 +++++++++----------------
 include/drm/gpu_scheduler.h            |   4 +-
 2 files changed, 48 insertions(+), 89 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 550492a7a031..91c96ab53fb5 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -369,32 +369,6 @@ static void drm_sched_run_job_queue(struct drm_gpu_scheduler *sched)
                 queue_work(sched->submit_wq, &sched->work_run_job);
 }
 
-/**
- * __drm_sched_run_free_queue - enqueue free-job work
- * @sched: scheduler instance
- */
-static void __drm_sched_run_free_queue(struct drm_gpu_scheduler *sched)
-{
-        if (!READ_ONCE(sched->pause_submit))
-                queue_work(sched->submit_wq, &sched->work_free_job);
-}
-
-/**
- * drm_sched_run_free_queue - enqueue free-job work if ready
- * @sched: scheduler instance
- */
-static void drm_sched_run_free_queue(struct drm_gpu_scheduler *sched)
-{
-        struct drm_sched_job *job;
-
-        spin_lock(&sched->job_list_lock);
-        job = list_first_entry_or_null(&sched->pending_list,
-                                       struct drm_sched_job, list);
-        if (job && dma_fence_is_signaled(&job->s_fence->finished))
-                __drm_sched_run_free_queue(sched);
-        spin_unlock(&sched->job_list_lock);
-}
-
 /**
  * drm_sched_job_done - complete a job
  * @s_job: pointer to the job which is done
@@ -414,7 +388,7 @@ static void drm_sched_job_done(struct drm_sched_job *s_job, int result)
         dma_fence_get(&s_fence->finished);
         drm_sched_fence_finished(s_fence, result);
         dma_fence_put(&s_fence->finished);
-        __drm_sched_run_free_queue(sched);
+        drm_sched_run_job_queue(sched);
 }
 
 /**
@@ -1092,10 +1066,8 @@ drm_sched_get_finished_job(struct drm_gpu_scheduler *sched)
                                                 typeof(*next), list);
 
                 if (next) {
-                        if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT,
-                                     &next->s_fence->scheduled.flags))
-                                next->s_fence->scheduled.timestamp =
-                                        dma_fence_timestamp(&job->s_fence->finished);
+                        next->s_fence->scheduled.timestamp =
+                                dma_fence_timestamp(&job->s_fence->finished);
                         /* start TO timer for next job */
                         drm_sched_start_timeout(sched);
                 }
@@ -1145,29 +1117,7 @@ drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
 EXPORT_SYMBOL(drm_sched_pick_best);
 
 /**
- * drm_sched_free_job_work - worker to call free_job
- *
- * @w: free job work
- */
-static void drm_sched_free_job_work(struct work_struct *w)
-{
-        struct drm_gpu_scheduler *sched =
-                container_of(w, struct drm_gpu_scheduler, work_free_job);
-        struct drm_sched_job *job;
-
-        if (READ_ONCE(sched->pause_submit))
-                return;
-
-        job = drm_sched_get_finished_job(sched);
-        if (job)
-                sched->ops->free_job(job);
-
-        drm_sched_run_free_queue(sched);
-        drm_sched_run_job_queue(sched);
-}
-
-/**
- * drm_sched_run_job_work - worker to call run_job
+ * drm_sched_run_job_work - main scheduler thread
  *
  * @w: run job work
  */
@@ -1176,50 +1126,64 @@ static void drm_sched_run_job_work(struct work_struct *w)
         struct drm_gpu_scheduler *sched =
                 container_of(w, struct drm_gpu_scheduler, work_run_job);
         struct drm_sched_entity *entity;
-        struct dma_fence *fence;
-        struct drm_sched_fence *s_fence;
-        struct drm_sched_job *sched_job;
+        struct drm_sched_job *cleanup_job;
         int r;
 
         if (READ_ONCE(sched->pause_submit))
                 return;
 
+        cleanup_job = drm_sched_get_finished_job(sched);
         entity = drm_sched_select_entity(sched);
-        if (!entity)
-                return;
 
-        sched_job = drm_sched_entity_pop_job(entity);
-        if (!sched_job) {
-                complete_all(&entity->entity_idle);
+        if (!entity && !cleanup_job)
                 return; /* No more work */
-        }
 
-        s_fence = sched_job->s_fence;
+        if (cleanup_job)
+                sched->ops->free_job(cleanup_job);
 
-        atomic_add(sched_job->credits, &sched->credit_count);
-        drm_sched_job_begin(sched_job);
+        if (entity) {
+                struct dma_fence *fence;
+                struct drm_sched_fence *s_fence;
+                struct drm_sched_job *sched_job;
+
+                sched_job = drm_sched_entity_pop_job(entity);
+                if (!sched_job) {
+                        complete_all(&entity->entity_idle);
+                        if (!cleanup_job)
+                                return; /* No more work */
+                        goto again;
+                }
 
-        trace_drm_run_job(sched_job, entity);
-        fence = sched->ops->run_job(sched_job);
-        complete_all(&entity->entity_idle);
-        drm_sched_fence_scheduled(s_fence, fence);
+                s_fence = sched_job->s_fence;
 
-        if (!IS_ERR_OR_NULL(fence)) {
-                /* Drop for original kref_init of the fence */
-                dma_fence_put(fence);
+                atomic_inc(&sched->credit_count);
+                drm_sched_job_begin(sched_job);
 
-                r = dma_fence_add_callback(fence, &sched_job->cb,
-                                           drm_sched_job_done_cb);
-                if (r == -ENOENT)
-                        drm_sched_job_done(sched_job, fence->error);
-                else if (r)
-                        DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
-        } else {
-                drm_sched_job_done(sched_job, IS_ERR(fence) ?
-                                   PTR_ERR(fence) : 0);
+                trace_drm_run_job(sched_job, entity);
+                fence = sched->ops->run_job(sched_job);
+                complete_all(&entity->entity_idle);
+                drm_sched_fence_scheduled(s_fence, fence);
+
+                if (!IS_ERR_OR_NULL(fence)) {
+                        /* Drop for original kref_init of the fence */
+                        dma_fence_put(fence);
+
+                        r = dma_fence_add_callback(fence, &sched_job->cb,
+                                                   drm_sched_job_done_cb);
+                        if (r == -ENOENT)
+                                drm_sched_job_done(sched_job, fence->error);
+                        else if (r)
+                                DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
+                                              r);
+                } else {
+                        drm_sched_job_done(sched_job, IS_ERR(fence) ?
                                           PTR_ERR(fence) : 0);
+                }
+
+                wake_up(&sched->job_scheduled);
         }
 
-        wake_up(&sched->job_scheduled);
+again:
         drm_sched_run_job_queue(sched);
 }
 
@@ -1304,7 +1268,6 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
         atomic_set(&sched->credit_count, 0);
         INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
         INIT_WORK(&sched->work_run_job, drm_sched_run_job_work);
-        INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
         atomic_set(&sched->_score, 0);
         atomic64_set(&sched->job_id_count, 0);
         sched->pause_submit = false;
@@ -1432,7 +1395,6 @@ void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
 {
         WRITE_ONCE(sched->pause_submit, true);
         cancel_work_sync(&sched->work_run_job);
-        cancel_work_sync(&sched->work_free_job);
 }
 EXPORT_SYMBOL(drm_sched_wqueue_stop);
 
@@ -1445,6 +1407,5 @@ void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched)
 {
         WRITE_ONCE(sched->pause_submit, false);
         queue_work(sched->submit_wq, &sched->work_run_job);
-        queue_work(sched->submit_wq, &sched->work_free_job);
 }
 EXPORT_SYMBOL(drm_sched_wqueue_start);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 5acc64954a88..72f64bd0a6eb 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -495,10 +495,9 @@ struct drm_sched_backend_ops {
  *              waits on this wait queue until all the scheduled jobs are
  *              finished.
  * @job_id_count: used to assign unique id to the each job.
- * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
+ * @submit_wq: workqueue used to queue @work_run_job
  * @timeout_wq: workqueue used to queue @work_tdr
  * @work_run_job: work which calls run_job op of each scheduler.
- * @work_free_job: work which calls free_job op of each scheduler.
  * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
  *            timeout interval is over.
  * @pending_list: the list of jobs which are currently in the job queue.
@@ -528,7 +527,6 @@ struct drm_gpu_scheduler {
         struct workqueue_struct         *submit_wq;
         struct workqueue_struct         *timeout_wq;
         struct work_struct              work_run_job;
-        struct work_struct              work_free_job;
         struct delayed_work             work_tdr;
         struct list_head                pending_list;
         spinlock_t                      job_list_lock;
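Distilled, the behavior the revert restores looks roughly like the sketch
below: a single worker pass frees at most one finished job, runs at most one
new job, and then requeues itself. This is an illustrative sketch only, not
the kernel code; locking, credit accounting and the run_job()/fence plumbing
shown in the diff above are omitted.

/*
 * Illustrative sketch only -- heavily simplified from the diff above;
 * see drm_sched_run_job_work() in the patch for the real code.
 */
static void combined_worker_pass(struct drm_gpu_scheduler *sched)
{
        struct drm_sched_job *cleanup_job;
        struct drm_sched_entity *entity;

        cleanup_job = drm_sched_get_finished_job(sched);  /* may be NULL */
        entity = drm_sched_select_entity(sched);          /* may be NULL */

        if (!entity && !cleanup_job)
                return;                          /* no more work */

        if (cleanup_job)
                sched->ops->free_job(cleanup_job);  /* one cleanup per pass */

        if (entity) {
                /* pop one job from the entity and hand it to run_job() */
        }

        drm_sched_run_job_queue(sched);          /* go around again */
}

The commit being reverted had instead queued free_job on its own work item
(work_free_job), so cleanup and submission ran as independent work items on
the same submit_wq rather than being serialized within one worker pass.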