@@ -1388,31 +1388,46 @@ drm_sched_no_jobs_pending(struct drm_gpu_scheduler *sched)
return empty;
}
+/**
+ * drm_sched_cancel_jobs_and_wait - cancel all pending jobs and wait for them to be freed
+ * @sched: scheduler instance
+ *
+ * Must only be called if &struct drm_sched_backend_ops.kill_fence_context is
+ * implemented.
+ *
+ * Instructs the driver to kill the fence context associated with this scheduler,
+ * thereby signalling all pending fences. This, in turn, will trigger
+ * &struct drm_sched_backend_ops.free_job to be called for all pending jobs.
+ * The function then blocks until all pending jobs have been freed.
+ */
+static inline void
+drm_sched_cancel_jobs_and_wait(struct drm_gpu_scheduler *sched)
+{
+ sched->ops->kill_fence_context(sched);
+ wait_event(sched->pending_list_waitque, drm_sched_no_jobs_pending(sched));
+}
+
/**
* drm_sched_fini - Destroy a gpu scheduler
*
* @sched: scheduler instance
*
- * Tears down and cleans up the scheduler.
- *
- * Note that this function blocks until all the fences returned by
- * &struct drm_sched_backend_ops.run_job have been signalled.
+ * Tears down and cleans up the scheduler. Might leak memory if
+ * &struct drm_sched_backend_ops.kill_fence_context is not implemented.
*/
void drm_sched_fini(struct drm_gpu_scheduler *sched)
{
struct drm_sched_entity *s_entity;
int i;
- /*
- * Jobs that have neither been scheduled or which have timed out are
- * gone by now, but jobs that have been submitted through
- * backend_ops.run_job() and have not yet terminated are still pending.
- *
- * Wait for the pending_list to become empty to avoid leaking those jobs.
- */
- drm_sched_submission_and_timeout_stop(sched);
- wait_event(sched->pending_list_waitque, drm_sched_no_jobs_pending(sched));
- drm_sched_free_stop(sched);
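+	/*
+	 * If the driver can kill its fence context, all pending jobs can be
+	 * cancelled and freed before teardown. Otherwise, fall back to the
+	 * old behavior of merely stopping the work items, accepting that
+	 * still pending jobs might leak.
+	 */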
+ if (sched->ops->kill_fence_context) {
+ drm_sched_submission_and_timeout_stop(sched);
+ drm_sched_cancel_jobs_and_wait(sched);
+ drm_sched_free_stop(sched);
+ } else {
+		/* We're in "legacy free mode" and ignore potential memory leaks */
+ drm_sched_wqueue_stop(sched);
+ }
for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
struct drm_sched_rq *rq = sched->sched_rq[i];
@@ -1500,7 +1515,7 @@ bool drm_sched_wqueue_ready(struct drm_gpu_scheduler *sched)
EXPORT_SYMBOL(drm_sched_wqueue_ready);
/**
- * drm_sched_wqueue_stop - stop scheduler submission
+ * drm_sched_wqueue_stop - stop scheduler submission and freeing
* @sched: scheduler instance
*
* Stops the scheduler from pulling new jobs from entities. It also stops
@@ -1516,7 +1531,7 @@ void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched)
EXPORT_SYMBOL(drm_sched_wqueue_stop);
/**
- * drm_sched_wqueue_start - start scheduler submission
+ * drm_sched_wqueue_start - start scheduler submission and freeing
* @sched: scheduler instance
*
* Restarts the scheduler after drm_sched_wqueue_stop() has stopped it.
@@ -509,6 +509,17 @@ struct drm_sched_backend_ops {
* and it's time to clean it up.
*/
void (*free_job)(struct drm_sched_job *sched_job);
+
+ /**
+ * @kill_fence_context: kill the fence context belonging to this scheduler
+ *
+	 * Needed to cleanly tear the scheduler down in drm_sched_fini(). When
+	 * called, the driver must signal all of this scheduler's still pending
+	 * hardware fences, which ensures that all jobs get freed before teardown.
+ *
+ * This callback is optional, but it is highly recommended to implement it.
+ */
+ void (*kill_fence_context)(struct drm_gpu_scheduler *sched);
};
/**
The waitqueue that ensures that drm_sched_fini() blocks until the
pending_list has become empty could theoretically cause that function
to block for a very long time. That, ultimately, could block userspace
processes and prevent them from being killable through SIGKILL.

When a driver calls drm_sched_fini(), it is safe to assume that all
still pending jobs are not needed anymore anyway. Thus, they can be
cancelled, thereby ensuring that drm_sched_fini() returns relatively
quickly.

Implement a new helper to stop all work items / submission except for
the drm_sched_backend_ops.free_job() callback, which has to keep
running until all pending jobs have been freed.

Implement a driver callback, kill_fence_context(), that instructs the
driver to kill the fence context associated with this scheduler,
thereby causing all pending hardware fences to be signalled.

Call those new routines in drm_sched_fini() and ensure backwards
compatibility if the new callback is not implemented.

Suggested-by: Danilo Krummrich <dakr@redhat.com>
Signed-off-by: Philipp Stanner <phasta@kernel.org>
---
 drivers/gpu/drm/scheduler/sched_main.c | 47 +++++++++++++++++---------
 include/drm/gpu_scheduler.h            | 11 ++++++
 2 files changed, 42 insertions(+), 16 deletions(-)
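For driver authors, here is a minimal sketch of how the new callback
could be implemented and wired up. All foo_* names, the pending_fences
list and the locking scheme are hypothetical and only serve to
illustrate the contract: once kill_fence_context() has been called,
every hardware fence returned from run_job() must (eventually) signal.

/* Hypothetical driver structures, for illustration only. */
struct foo_fence {
	struct dma_fence base;
	struct list_head node;
};

struct foo_sched {
	struct drm_gpu_scheduler base;
	spinlock_t fence_lock;
	/* Fences returned from run_job() that haven't signalled yet. */
	struct list_head pending_fences;
};

static void foo_kill_fence_context(struct drm_gpu_scheduler *sched)
{
	struct foo_sched *fsched = container_of(sched, struct foo_sched, base);
	struct foo_fence *fence;

	/*
	 * Force-signal everything still pending. This triggers free_job()
	 * for each pending job, so the pending_list empties and
	 * drm_sched_fini() can return.
	 */
	spin_lock(&fsched->fence_lock);
	list_for_each_entry(fence, &fsched->pending_fences, node)
		dma_fence_signal(&fence->base);
	spin_unlock(&fsched->fence_lock);
}

static const struct drm_sched_backend_ops foo_sched_ops = {
	.run_job		= foo_run_job,	/* driver's existing callbacks */
	.timedout_job		= foo_timedout_job,
	.free_job		= foo_free_job,
	.kill_fence_context	= foo_kill_fence_context,
};

How the fences actually get signalled is entirely driver-specific (a
driver might instead reset the ring or issue a firmware call); the
scheduler only relies on the fences signalling so that the free work
can retire every job on the pending_list.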