diff mbox series

[6/6] drm/sched: Make use of a "done" thread

Message ID 20201125031708.6433-7-luben.tuikov@amd.com (mailing list archive)
State New, archived
Headers show
Series Allow to extend the timeout without jobs disappearing | expand

Commit Message

Luben Tuikov Nov. 25, 2020, 3:17 a.m. UTC
Add a "done" list to which all completed jobs are added
to be freed. The drm_sched_job_done() callback is the
producer of jobs to this list.

Add a "done" thread which consumes from the done list
and frees up jobs. Now, the main scheduler thread only
pushes jobs to the GPU and the "done" thread frees them
up, on the way out of the GPU when they've completed
execution.

Make use of the status returned by the GPU driver
timeout handler to decide whether to leave the job in
the pending list, or to send it off to the done list.
If a job is done, it is added to the done list and the
done thread woken up. If a job needs more time, it is
left on the pending list and the timeout timer
restarted.

Eliminate the polling mechanism of picking out done
jobs from the pending list, i.e. eliminate
drm_sched_get_cleanup_job(). Now the main scheduler
thread only pushes jobs down to the GPU.

Various other optimizations to the GPU scheduler
and job recovery are possible with this format.

Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 173 +++++++++++++------------
 include/drm/gpu_scheduler.h            |  14 ++
 2 files changed, 101 insertions(+), 86 deletions(-)

Comments

Christian König Nov. 25, 2020, 10:10 a.m. UTC | #1
Am 25.11.20 um 04:17 schrieb Luben Tuikov:
> Add a "done" list to which all completed jobs are added
> to be freed. The drm_sched_job_done() callback is the
> producer of jobs to this list.
>
> Add a "done" thread which consumes from the done list
> and frees up jobs. Now, the main scheduler thread only
> pushes jobs to the GPU and the "done" thread frees them
> up, on the way out of the GPU when they've completed
> execution.

Well there are quite a number of problems in this patch.

 From the design I think we should be getting rid of the linked list and 
not extend its use. And we also don't want to offload the freeing of 
jobs into a different thread because that could potentially mean that 
this is executed on a different CPU.

Then one obvious problem seems to be that you don't take into account 
that we moved the job freeing into the scheduler thread to make sure 
that this is suspended while the scheduler thread is stopped. This 
behavior is now completely gone, e.g. the delete thread keeps running 
while the scheduler thread is stopped.

A few more comments below.

> Make use of the status returned by the GPU driver
> timeout handler to decide whether to leave the job in
> the pending list, or to send it off to the done list.
> If a job is done, it is added to the done list and the
> done thread woken up. If a job needs more time, it is
> left on the pending list and the timeout timer
> restarted.
>
> Eliminate the polling mechanism of picking out done
> jobs from the pending list, i.e. eliminate
> drm_sched_get_cleanup_job(). Now the main scheduler
> thread only pushes jobs down to the GPU.
>
> Various other optimizations to the GPU scheduler
> and job recovery are possible with this format.
>
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 173 +++++++++++++------------
>   include/drm/gpu_scheduler.h            |  14 ++
>   2 files changed, 101 insertions(+), 86 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 3eb7618a627d..289ae68cd97f 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -164,7 +164,8 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq)
>    * drm_sched_job_done - complete a job
>    * @s_job: pointer to the job which is done
>    *
> - * Finish the job's fence and wake up the worker thread.
> + * Finish the job's fence, move it to the done list,
> + * and wake up the done thread.
>    */
>   static void drm_sched_job_done(struct drm_sched_job *s_job)
>   {
> @@ -179,7 +180,12 @@ static void drm_sched_job_done(struct drm_sched_job *s_job)
>   	dma_fence_get(&s_fence->finished);
>   	drm_sched_fence_finished(s_fence);
>   	dma_fence_put(&s_fence->finished);
> -	wake_up_interruptible(&sched->wake_up_worker);
> +
> +	spin_lock(&sched->job_list_lock);
> +	list_move(&s_job->list, &sched->done_list);
> +	spin_unlock(&sched->job_list_lock);
> +
> +	wake_up_interruptible(&sched->done_wait_q);

How is the worker thread then woken up to push new jobs to the hardware?

>   }
>   
>   /**
> @@ -221,11 +227,10 @@ bool drm_sched_dependency_optimized(struct dma_fence* fence,
>   EXPORT_SYMBOL(drm_sched_dependency_optimized);
>   
>   /**
> - * drm_sched_start_timeout - start timeout for reset worker
> - *
> - * @sched: scheduler instance to start the worker for
> + * drm_sched_start_timeout - start a timeout timer
> + * @sched: scheduler instance whose job we're timing
>    *
> - * Start the timeout for the given scheduler.
> + * Start a timeout timer for the given scheduler.
>    */
>   static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
>   {
> @@ -305,8 +310,8 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>   
>   	spin_lock(&sched->job_list_lock);
>   	list_add_tail(&s_job->list, &sched->pending_list);
> -	drm_sched_start_timeout(sched);
>   	spin_unlock(&sched->job_list_lock);
> +	drm_sched_start_timeout(sched);

This looks wrong, the drm_sched_start_timeout() function used to need 
the lock. Why should that have changed?

>   }
>   
>   static void drm_sched_job_timedout(struct work_struct *work)
> @@ -316,37 +321,30 @@ static void drm_sched_job_timedout(struct work_struct *work)
>   
>   	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
>   
> -	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
>   	spin_lock(&sched->job_list_lock);
>   	job = list_first_entry_or_null(&sched->pending_list,
>   				       struct drm_sched_job, list);
> +	spin_unlock(&sched->job_list_lock);
>   
>   	if (job) {
> -		/*
> -		 * Remove the bad job so it cannot be freed by concurrent
> -		 * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
> -		 * is parked at which point it's safe.
> -		 */
> -		list_del_init(&job->list);
> -		spin_unlock(&sched->job_list_lock);
> +		int res;
>   
> -		job->sched->ops->timedout_job(job);
> +		job->job_status |= DRM_JOB_STATUS_TIMEOUT;
> +		res = job->sched->ops->timedout_job(job);
> +		if (res == 0) {
> +			/* The job is out of the device.
> +			 */
> +			spin_lock(&sched->job_list_lock);
> +			list_move(&job->list, &sched->done_list);
> +			spin_unlock(&sched->job_list_lock);
>   
> -		/*
> -		 * Guilty job did complete and hence needs to be manually removed
> -		 * See drm_sched_stop doc.
> -		 */
> -		if (sched->free_guilty) {
> -			job->sched->ops->free_job(job);
> -			sched->free_guilty = false;
> +			wake_up_interruptible(&sched->done_wait_q);
> +		} else {
> +			/* The job needs more time.
> +			 */
> +			drm_sched_start_timeout(sched);
>   		}
> -	} else {
> -		spin_unlock(&sched->job_list_lock);
>   	}
> -
> -	spin_lock(&sched->job_list_lock);
> -	drm_sched_start_timeout(sched);
> -	spin_unlock(&sched->job_list_lock);
>   }
>   
>    /**
> @@ -511,15 +509,13 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>   			else if (r)
>   				DRM_ERROR("fence add callback failed (%d)\n",
>   					  r);
> -		} else
> +		} else {
>   			drm_sched_job_done(s_job);
> +		}
>   	}
>   
> -	if (full_recovery) {
> -		spin_lock(&sched->job_list_lock);
> +	if (full_recovery)
>   		drm_sched_start_timeout(sched);
> -		spin_unlock(&sched->job_list_lock);

Same here.

Regards,
Christian.

> -	}
>   
>   	kthread_unpark(sched->thread);
>   }
> @@ -667,47 +663,6 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>   	return entity;
>   }
>   
> -/**
> - * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed
> - *
> - * @sched: scheduler instance
> - *
> - * Returns the next finished job from the pending list (if there is one)
> - * ready for it to be destroyed.
> - */
> -static struct drm_sched_job *
> -drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
> -{
> -	struct drm_sched_job *job;
> -
> -	/*
> -	 * Don't destroy jobs while the timeout worker is running  OR thread
> -	 * is being parked and hence assumed to not touch pending_list
> -	 */
> -	if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
> -	    !cancel_delayed_work(&sched->work_tdr)) ||
> -	    kthread_should_park())
> -		return NULL;
> -
> -	spin_lock(&sched->job_list_lock);
> -
> -	job = list_first_entry_or_null(&sched->pending_list,
> -				       struct drm_sched_job, list);
> -
> -	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
> -		/* remove job from pending_list */
> -		list_del_init(&job->list);
> -	} else {
> -		job = NULL;
> -		/* queue timeout for next job */
> -		drm_sched_start_timeout(sched);
> -	}
> -
> -	spin_unlock(&sched->job_list_lock);
> -
> -	return job;
> -}
> -
>   /**
>    * drm_sched_pick_best - Get a drm sched from a sched_list with the least load
>    * @sched_list: list of drm_gpu_schedulers
> @@ -761,6 +716,44 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
>   	return false;
>   }
>   
> +/**
> + * drm_sched_done - free done tasks
> + * @param: pointer to a scheduler instance
> + *
> + * Returns 0.
> + */
> +static int drm_sched_done(void *param)
> +{
> +	struct drm_gpu_scheduler *sched = param;
> +
> +	do {
> +		LIST_HEAD(done_q);
> +
> +		wait_event_interruptible(sched->done_wait_q,
> +					 kthread_should_stop() ||
> +					 !list_empty(&sched->done_list));
> +
> +		spin_lock(&sched->job_list_lock);
> +		list_splice_init(&sched->done_list, &done_q);
> +		spin_unlock(&sched->job_list_lock);
> +
> +		if (list_empty(&done_q))
> +			continue;
> +
> +		while (!list_empty(&done_q)) {
> +			struct drm_sched_job *job;
> +
> +			job = list_first_entry(&done_q,
> +					       struct drm_sched_job,
> +					       list);
> +			list_del_init(&job->list);
> +			sched->ops->free_job(job);
> +		}
> +	} while (!kthread_should_stop());
> +
> +	return 0;
> +}
> +
>   /**
>    * drm_sched_main - main scheduler thread
>    *
> @@ -770,7 +763,7 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
>    */
>   static int drm_sched_main(void *param)
>   {
> -	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
> +	struct drm_gpu_scheduler *sched = param;
>   	int r;
>   
>   	sched_set_fifo_low(current);
> @@ -780,20 +773,12 @@ static int drm_sched_main(void *param)
>   		struct drm_sched_fence *s_fence;
>   		struct drm_sched_job *sched_job;
>   		struct dma_fence *fence;
> -		struct drm_sched_job *cleanup_job = NULL;
>   
>   		wait_event_interruptible(sched->wake_up_worker,
> -					 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
>   					 (!drm_sched_blocked(sched) &&
>   					  (entity = drm_sched_select_entity(sched))) ||
>   					 kthread_should_stop());
>   
> -		if (cleanup_job) {
> -			sched->ops->free_job(cleanup_job);
> -			/* queue timeout for next job */
> -			drm_sched_start_timeout(sched);
> -		}
> -
>   		if (!entity)
>   			continue;
>   
> @@ -820,8 +805,7 @@ static int drm_sched_main(void *param)
>   			if (r == -ENOENT)
>   				drm_sched_job_done(sched_job);
>   			else if (r)
> -				DRM_ERROR("fence add callback failed (%d)\n",
> -					  r);
> +				DRM_ERROR("fence add callback failed (%d)\n", r);
>   			dma_fence_put(fence);
>   		} else {
>   			if (IS_ERR(fence))
> @@ -865,7 +849,9 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   
>   	init_waitqueue_head(&sched->wake_up_worker);
>   	init_waitqueue_head(&sched->job_scheduled);
> +	init_waitqueue_head(&sched->done_wait_q);
>   	INIT_LIST_HEAD(&sched->pending_list);
> +	INIT_LIST_HEAD(&sched->done_list);
>   	spin_lock_init(&sched->job_list_lock);
>   	atomic_set(&sched->hw_rq_count, 0);
>   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> @@ -881,6 +867,21 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   		return ret;
>   	}
>   
> +	snprintf(sched->thread_done_name, DRM_THREAD_NAME_LEN, "%s%s",
> +		 sched->name, "-done");
> +	sched->thread_done_name[DRM_THREAD_NAME_LEN - 1] = '\0';
> +	sched->thread_done = kthread_run(drm_sched_done, sched,
> +					 sched->thread_done_name);
> +	if (IS_ERR(sched->thread_done)) {
> +		ret = kthread_stop(sched->thread);
> +		if (!ret) {
> +			/* free_kthread_struct(sched->thread); */
> +			sched->thread = NULL;
> +		}
> +		DRM_ERROR("Failed to start thread %s", sched->thread_done_name);
> +		return ret;
> +	}
> +
>   	sched->ready = true;
>   	return 0;
>   }
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 3a5686c3b5e9..b282d6158b50 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -169,6 +169,12 @@ struct drm_sched_fence {
>   
>   struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>   
> +enum drm_job_status {
> +	DRM_JOB_STATUS_NONE    = 0 << 0,
> +	DRM_JOB_STATUS_DONE    = 1 << 0,
> +	DRM_JOB_STATUS_TIMEOUT = 1 << 1,
> +};
> +
>   /**
>    * struct drm_sched_job - A job to be run by an entity.
>    *
> @@ -198,6 +204,7 @@ struct drm_sched_job {
>   	uint64_t			id;
>   	atomic_t			karma;
>   	enum drm_sched_priority		s_priority;
> +	enum drm_job_status             job_status;
>   	struct drm_sched_entity         *entity;
>   	struct dma_fence_cb		cb;
>   };
> @@ -284,15 +291,22 @@ struct drm_gpu_scheduler {
>   	uint32_t			hw_submission_limit;
>   	long				timeout;
>   	const char			*name;
> +	char                            thread_done_name[DRM_THREAD_NAME_LEN];
> +
>   	struct drm_sched_rq		sched_rq[DRM_SCHED_PRIORITY_COUNT];
>   	wait_queue_head_t		wake_up_worker;
>   	wait_queue_head_t		job_scheduled;
> +	wait_queue_head_t               done_wait_q;
>   	atomic_t			hw_rq_count;
>   	atomic64_t			job_id_count;
>   	struct delayed_work		work_tdr;
>   	struct task_struct		*thread;
> +	struct task_struct		*thread_done;
> +
>   	struct list_head		pending_list;
> +	struct list_head                done_list;
>   	spinlock_t			job_list_lock;
> +
>   	int				hang_limit;
>   	atomic_t                        score;
>   	bool				ready;
Steven Price Nov. 25, 2020, 11:09 a.m. UTC | #2
On 25/11/2020 03:17, Luben Tuikov wrote:
> Add a "done" list to which all completed jobs are added
> to be freed. The drm_sched_job_done() callback is the
> producer of jobs to this list.
> 
> Add a "done" thread which consumes from the done list
> and frees up jobs. Now, the main scheduler thread only
> pushes jobs to the GPU and the "done" thread frees them
> up, on the way out of the GPU when they've completed
> execution.

Generally I'd be in favour of a "done thread" as I think there are some 
murky corners of Panfrost's locking that would be helped by deferring 
the free_job() callback.

But I think you're trying to do too much in one patch here. And as 
Christian has pointed out there's some dodgy looking changes to locking 
which aren't explained.

Steve

> 
> Make use of the status returned by the GPU driver
> timeout handler to decide whether to leave the job in
> the pending list, or to send it off to the done list.
> If a job is done, it is added to the done list and the
> done thread woken up. If a job needs more time, it is
> left on the pending list and the timeout timer
> restarted.
> 
> Eliminate the polling mechanism of picking out done
> jobs from the pending list, i.e. eliminate
> drm_sched_get_cleanup_job(). Now the main scheduler
> thread only pushes jobs down to the GPU.
> 
> Various other optimizations to the GPU scheduler
> and job recovery are possible with this format.
> 
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 173 +++++++++++++------------
>   include/drm/gpu_scheduler.h            |  14 ++
>   2 files changed, 101 insertions(+), 86 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 3eb7618a627d..289ae68cd97f 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -164,7 +164,8 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq)
>    * drm_sched_job_done - complete a job
>    * @s_job: pointer to the job which is done
>    *
> - * Finish the job's fence and wake up the worker thread.
> + * Finish the job's fence, move it to the done list,
> + * and wake up the done thread.
>    */
>   static void drm_sched_job_done(struct drm_sched_job *s_job)
>   {
> @@ -179,7 +180,12 @@ static void drm_sched_job_done(struct drm_sched_job *s_job)
>   	dma_fence_get(&s_fence->finished);
>   	drm_sched_fence_finished(s_fence);
>   	dma_fence_put(&s_fence->finished);
> -	wake_up_interruptible(&sched->wake_up_worker);
> +
> +	spin_lock(&sched->job_list_lock);
> +	list_move(&s_job->list, &sched->done_list);
> +	spin_unlock(&sched->job_list_lock);
> +
> +	wake_up_interruptible(&sched->done_wait_q);
>   }
>   
>   /**
> @@ -221,11 +227,10 @@ bool drm_sched_dependency_optimized(struct dma_fence* fence,
>   EXPORT_SYMBOL(drm_sched_dependency_optimized);
>   
>   /**
> - * drm_sched_start_timeout - start timeout for reset worker
> - *
> - * @sched: scheduler instance to start the worker for
> + * drm_sched_start_timeout - start a timeout timer
> + * @sched: scheduler instance whose job we're timing
>    *
> - * Start the timeout for the given scheduler.
> + * Start a timeout timer for the given scheduler.
>    */
>   static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
>   {
> @@ -305,8 +310,8 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>   
>   	spin_lock(&sched->job_list_lock);
>   	list_add_tail(&s_job->list, &sched->pending_list);
> -	drm_sched_start_timeout(sched);
>   	spin_unlock(&sched->job_list_lock);
> +	drm_sched_start_timeout(sched);
>   }
>   
>   static void drm_sched_job_timedout(struct work_struct *work)
> @@ -316,37 +321,30 @@ static void drm_sched_job_timedout(struct work_struct *work)
>   
>   	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
>   
> -	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
>   	spin_lock(&sched->job_list_lock);
>   	job = list_first_entry_or_null(&sched->pending_list,
>   				       struct drm_sched_job, list);
> +	spin_unlock(&sched->job_list_lock);
>   
>   	if (job) {
> -		/*
> -		 * Remove the bad job so it cannot be freed by concurrent
> -		 * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
> -		 * is parked at which point it's safe.
> -		 */
> -		list_del_init(&job->list);
> -		spin_unlock(&sched->job_list_lock);
> +		int res;
>   
> -		job->sched->ops->timedout_job(job);
> +		job->job_status |= DRM_JOB_STATUS_TIMEOUT;
> +		res = job->sched->ops->timedout_job(job);
> +		if (res == 0) {
> +			/* The job is out of the device.
> +			 */
> +			spin_lock(&sched->job_list_lock);
> +			list_move(&job->list, &sched->done_list);
> +			spin_unlock(&sched->job_list_lock);
>   
> -		/*
> -		 * Guilty job did complete and hence needs to be manually removed
> -		 * See drm_sched_stop doc.
> -		 */
> -		if (sched->free_guilty) {
> -			job->sched->ops->free_job(job);
> -			sched->free_guilty = false;
> +			wake_up_interruptible(&sched->done_wait_q);
> +		} else {
> +			/* The job needs more time.
> +			 */
> +			drm_sched_start_timeout(sched);
>   		}
> -	} else {
> -		spin_unlock(&sched->job_list_lock);
>   	}
> -
> -	spin_lock(&sched->job_list_lock);
> -	drm_sched_start_timeout(sched);
> -	spin_unlock(&sched->job_list_lock);
>   }
>   
>    /**
> @@ -511,15 +509,13 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>   			else if (r)
>   				DRM_ERROR("fence add callback failed (%d)\n",
>   					  r);
> -		} else
> +		} else {
>   			drm_sched_job_done(s_job);
> +		}
>   	}
>   
> -	if (full_recovery) {
> -		spin_lock(&sched->job_list_lock);
> +	if (full_recovery)
>   		drm_sched_start_timeout(sched);
> -		spin_unlock(&sched->job_list_lock);
> -	}
>   
>   	kthread_unpark(sched->thread);
>   }
> @@ -667,47 +663,6 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>   	return entity;
>   }
>   
> -/**
> - * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed
> - *
> - * @sched: scheduler instance
> - *
> - * Returns the next finished job from the pending list (if there is one)
> - * ready for it to be destroyed.
> - */
> -static struct drm_sched_job *
> -drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
> -{
> -	struct drm_sched_job *job;
> -
> -	/*
> -	 * Don't destroy jobs while the timeout worker is running  OR thread
> -	 * is being parked and hence assumed to not touch pending_list
> -	 */
> -	if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
> -	    !cancel_delayed_work(&sched->work_tdr)) ||
> -	    kthread_should_park())
> -		return NULL;
> -
> -	spin_lock(&sched->job_list_lock);
> -
> -	job = list_first_entry_or_null(&sched->pending_list,
> -				       struct drm_sched_job, list);
> -
> -	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
> -		/* remove job from pending_list */
> -		list_del_init(&job->list);
> -	} else {
> -		job = NULL;
> -		/* queue timeout for next job */
> -		drm_sched_start_timeout(sched);
> -	}
> -
> -	spin_unlock(&sched->job_list_lock);
> -
> -	return job;
> -}
> -
>   /**
>    * drm_sched_pick_best - Get a drm sched from a sched_list with the least load
>    * @sched_list: list of drm_gpu_schedulers
> @@ -761,6 +716,44 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
>   	return false;
>   }
>   
> +/**
> + * drm_sched_done - free done tasks
> + * @param: pointer to a scheduler instance
> + *
> + * Returns 0.
> + */
> +static int drm_sched_done(void *param)
> +{
> +	struct drm_gpu_scheduler *sched = param;
> +
> +	do {
> +		LIST_HEAD(done_q);
> +
> +		wait_event_interruptible(sched->done_wait_q,
> +					 kthread_should_stop() ||
> +					 !list_empty(&sched->done_list));
> +
> +		spin_lock(&sched->job_list_lock);
> +		list_splice_init(&sched->done_list, &done_q);
> +		spin_unlock(&sched->job_list_lock);
> +
> +		if (list_empty(&done_q))
> +			continue;
> +
> +		while (!list_empty(&done_q)) {
> +			struct drm_sched_job *job;
> +
> +			job = list_first_entry(&done_q,
> +					       struct drm_sched_job,
> +					       list);
> +			list_del_init(&job->list);
> +			sched->ops->free_job(job);
> +		}
> +	} while (!kthread_should_stop());
> +
> +	return 0;
> +}
> +
>   /**
>    * drm_sched_main - main scheduler thread
>    *
> @@ -770,7 +763,7 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
>    */
>   static int drm_sched_main(void *param)
>   {
> -	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
> +	struct drm_gpu_scheduler *sched = param;
>   	int r;
>   
>   	sched_set_fifo_low(current);
> @@ -780,20 +773,12 @@ static int drm_sched_main(void *param)
>   		struct drm_sched_fence *s_fence;
>   		struct drm_sched_job *sched_job;
>   		struct dma_fence *fence;
> -		struct drm_sched_job *cleanup_job = NULL;
>   
>   		wait_event_interruptible(sched->wake_up_worker,
> -					 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
>   					 (!drm_sched_blocked(sched) &&
>   					  (entity = drm_sched_select_entity(sched))) ||
>   					 kthread_should_stop());
>   
> -		if (cleanup_job) {
> -			sched->ops->free_job(cleanup_job);
> -			/* queue timeout for next job */
> -			drm_sched_start_timeout(sched);
> -		}
> -
>   		if (!entity)
>   			continue;
>   
> @@ -820,8 +805,7 @@ static int drm_sched_main(void *param)
>   			if (r == -ENOENT)
>   				drm_sched_job_done(sched_job);
>   			else if (r)
> -				DRM_ERROR("fence add callback failed (%d)\n",
> -					  r);
> +				DRM_ERROR("fence add callback failed (%d)\n", r);
>   			dma_fence_put(fence);
>   		} else {
>   			if (IS_ERR(fence))
> @@ -865,7 +849,9 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   
>   	init_waitqueue_head(&sched->wake_up_worker);
>   	init_waitqueue_head(&sched->job_scheduled);
> +	init_waitqueue_head(&sched->done_wait_q);
>   	INIT_LIST_HEAD(&sched->pending_list);
> +	INIT_LIST_HEAD(&sched->done_list);
>   	spin_lock_init(&sched->job_list_lock);
>   	atomic_set(&sched->hw_rq_count, 0);
>   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> @@ -881,6 +867,21 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   		return ret;
>   	}
>   
> +	snprintf(sched->thread_done_name, DRM_THREAD_NAME_LEN, "%s%s",
> +		 sched->name, "-done");
> +	sched->thread_done_name[DRM_THREAD_NAME_LEN - 1] = '\0';
> +	sched->thread_done = kthread_run(drm_sched_done, sched,
> +					 sched->thread_done_name);
> +	if (IS_ERR(sched->thread_done)) {
> +		ret = kthread_stop(sched->thread);
> +		if (!ret) {
> +			/* free_kthread_struct(sched->thread); */
> +			sched->thread = NULL;
> +		}
> +		DRM_ERROR("Failed to start thread %s", sched->thread_done_name);
> +		return ret;
> +	}
> +
>   	sched->ready = true;
>   	return 0;
>   }
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 3a5686c3b5e9..b282d6158b50 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -169,6 +169,12 @@ struct drm_sched_fence {
>   
>   struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>   
> +enum drm_job_status {
> +	DRM_JOB_STATUS_NONE    = 0 << 0,
> +	DRM_JOB_STATUS_DONE    = 1 << 0,
> +	DRM_JOB_STATUS_TIMEOUT = 1 << 1,
> +};
> +
>   /**
>    * struct drm_sched_job - A job to be run by an entity.
>    *
> @@ -198,6 +204,7 @@ struct drm_sched_job {
>   	uint64_t			id;
>   	atomic_t			karma;
>   	enum drm_sched_priority		s_priority;
> +	enum drm_job_status             job_status;
>   	struct drm_sched_entity         *entity;
>   	struct dma_fence_cb		cb;
>   };
> @@ -284,15 +291,22 @@ struct drm_gpu_scheduler {
>   	uint32_t			hw_submission_limit;
>   	long				timeout;
>   	const char			*name;
> +	char                            thread_done_name[DRM_THREAD_NAME_LEN];
> +
>   	struct drm_sched_rq		sched_rq[DRM_SCHED_PRIORITY_COUNT];
>   	wait_queue_head_t		wake_up_worker;
>   	wait_queue_head_t		job_scheduled;
> +	wait_queue_head_t               done_wait_q;
>   	atomic_t			hw_rq_count;
>   	atomic64_t			job_id_count;
>   	struct delayed_work		work_tdr;
>   	struct task_struct		*thread;
> +	struct task_struct		*thread_done;
> +
>   	struct list_head		pending_list;
> +	struct list_head                done_list;
>   	spinlock_t			job_list_lock;
> +
>   	int				hang_limit;
>   	atomic_t                        score;
>   	bool				ready;
>
Luben Tuikov Nov. 26, 2020, 12:24 a.m. UTC | #3
On 2020-11-25 05:10, Christian König wrote:
> Am 25.11.20 um 04:17 schrieb Luben Tuikov:
>> Add a "done" list to which all completed jobs are added
>> to be freed. The drm_sched_job_done() callback is the
>> producer of jobs to this list.
>>
>> Add a "done" thread which consumes from the done list
>> and frees up jobs. Now, the main scheduler thread only
>> pushes jobs to the GPU and the "done" thread frees them
>> up, on the way out of the GPU when they've completed
>> execution.
> 
> Well there are quite a number of problems in this patch.
> 
>  From the design I think we should be getting rid of the linked list and

Sure, we can do this in a separate future patch. I'd imagine it'll
touch a lot of places and I didn't want this patch and this series
of patches to get out of hand, by changing too many things.

Here in this patch I wanted to change as little as possible.

> not extend its use. And we also don't want to offload the freeing of 
> jobs into a different thread because that could potentially mean that 
> this is executed on a different CPU.

Yes, of course it could.

From my experience working with hardware, I always envision work
being done by small units, in a pipeline, concurrently, while all
of them working concurrently, all the time.

It's hard to go back to unitary processing. :-)

> 
> Then one obvious problem seems to be that you don't take into account 
> that we moved the job freeing into the scheduler thread to make sure 
> that this is suspended while the scheduler thread is stopped. 

I don't understand what "this" refers to in "that this is suspended
while the scheduler thread is stopped."

> This 
> behavior is now completely gone, e.g. the delete thread keeps running 
> while the scheduler thread is stopped.

Yes, indeed, that is the case and intentional.

There seems to be no requirement to have to stop the main
scheduler thread, which pushes tasks down to the GPU,
so that we can free jobs. In other words, both
threads can work concurrently, one pushing jobs down
to the GPU, while the other freeing done jobs coming
out of the GPU.

If this concurrency is something you don't like,
then no problem, we can keep them interlocked in one
thread as before.

> 
> A few more comments below.
> 
>> Make use of the status returned by the GPU driver
>> timeout handler to decide whether to leave the job in
>> the pending list, or to send it off to the done list.
>> If a job is done, it is added to the done list and the
>> done thread woken up. If a job needs more time, it is
>> left on the pending list and the timeout timer
>> restarted.
>>
>> Eliminate the polling mechanism of picking out done
>> jobs from the pending list, i.e. eliminate
>> drm_sched_get_cleanup_job(). Now the main scheduler
>> thread only pushes jobs down to the GPU.
>>
>> Various other optimizations to the GPU scheduler
>> and job recovery are possible with this format.
>>
>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>> ---
>>   drivers/gpu/drm/scheduler/sched_main.c | 173 +++++++++++++------------
>>   include/drm/gpu_scheduler.h            |  14 ++
>>   2 files changed, 101 insertions(+), 86 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>> index 3eb7618a627d..289ae68cd97f 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -164,7 +164,8 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq)
>>    * drm_sched_job_done - complete a job
>>    * @s_job: pointer to the job which is done
>>    *
>> - * Finish the job's fence and wake up the worker thread.
>> + * Finish the job's fence, move it to the done list,
>> + * and wake up the done thread.
>>    */
>>   static void drm_sched_job_done(struct drm_sched_job *s_job)
>>   {
>> @@ -179,7 +180,12 @@ static void drm_sched_job_done(struct drm_sched_job *s_job)
>>   	dma_fence_get(&s_fence->finished);
>>   	drm_sched_fence_finished(s_fence);
>>   	dma_fence_put(&s_fence->finished);
>> -	wake_up_interruptible(&sched->wake_up_worker);
>> +
>> +	spin_lock(&sched->job_list_lock);
>> +	list_move(&s_job->list, &sched->done_list);
>> +	spin_unlock(&sched->job_list_lock);
>> +
>> +	wake_up_interruptible(&sched->done_wait_q);
> 
> How is the worker thread then woken up to push new jobs to the hardware?

A-ha! Thank you Christian for bringing this up--perhaps that is what
the problem is I was seeing on my test machine, which I described
in the cover letter 0/6, that X/GDM just sleeping in wait.

So, I'd imagined that whoever pushed jobs down to DRM, i.e.
the producer of jobs, also did an "up"/"wake-up" of the main
scheduler thread, so that the main scheduler thread would
then wake up and "schedule" tasks down into the GPU. It seems
I've only "imagined" :-) such concurrency and the main scheduler
thread needs to be woken up to poll? I'll try this next.
Thanks for the tip Christian!

> 
>>   }
>>   
>>   /**
>> @@ -221,11 +227,10 @@ bool drm_sched_dependency_optimized(struct dma_fence* fence,
>>   EXPORT_SYMBOL(drm_sched_dependency_optimized);
>>   
>>   /**
>> - * drm_sched_start_timeout - start timeout for reset worker
>> - *
>> - * @sched: scheduler instance to start the worker for
>> + * drm_sched_start_timeout - start a timeout timer
>> + * @sched: scheduler instance whose job we're timing
>>    *
>> - * Start the timeout for the given scheduler.
>> + * Start a timeout timer for the given scheduler.
>>    */
>>   static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
>>   {
>> @@ -305,8 +310,8 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>>   
>>   	spin_lock(&sched->job_list_lock);
>>   	list_add_tail(&s_job->list, &sched->pending_list);
>> -	drm_sched_start_timeout(sched);
>>   	spin_unlock(&sched->job_list_lock);
>> +	drm_sched_start_timeout(sched);
> 
> This looks wrong, the drm_sched_start_timeout() function used to need 
> the lock. Why should that have changed?

I'd originally removed the check in drm_sched_start_timeout()
of whether the "pending_list" is empty, because the use
of that function became more _deterministic_ with this patch.
By this I mean that the timeout timer is now started
_only_ when we push down new jobs.

But then I noticed that "full recovery" business
in drm_sched_start(), which calls drm_sched_start_timeout(),
and put that !list_empty() check back in, and I seem to have
forgotten to move this back inside the lock.

I'll move it back in, no problem, thanks for catching this.

Regards,
Luben

> 
>>   }
>>   
>>   static void drm_sched_job_timedout(struct work_struct *work)
>> @@ -316,37 +321,30 @@ static void drm_sched_job_timedout(struct work_struct *work)
>>   
>>   	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
>>   
>> -	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
>>   	spin_lock(&sched->job_list_lock);
>>   	job = list_first_entry_or_null(&sched->pending_list,
>>   				       struct drm_sched_job, list);
>> +	spin_unlock(&sched->job_list_lock);
>>   
>>   	if (job) {
>> -		/*
>> -		 * Remove the bad job so it cannot be freed by concurrent
>> -		 * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
>> -		 * is parked at which point it's safe.
>> -		 */
>> -		list_del_init(&job->list);
>> -		spin_unlock(&sched->job_list_lock);
>> +		int res;
>>   
>> -		job->sched->ops->timedout_job(job);
>> +		job->job_status |= DRM_JOB_STATUS_TIMEOUT;
>> +		res = job->sched->ops->timedout_job(job);
>> +		if (res == 0) {
>> +			/* The job is out of the device.
>> +			 */
>> +			spin_lock(&sched->job_list_lock);
>> +			list_move(&job->list, &sched->done_list);
>> +			spin_unlock(&sched->job_list_lock);
>>   
>> -		/*
>> -		 * Guilty job did complete and hence needs to be manually removed
>> -		 * See drm_sched_stop doc.
>> -		 */
>> -		if (sched->free_guilty) {
>> -			job->sched->ops->free_job(job);
>> -			sched->free_guilty = false;
>> +			wake_up_interruptible(&sched->done_wait_q);
>> +		} else {
>> +			/* The job needs more time.
>> +			 */
>> +			drm_sched_start_timeout(sched);
>>   		}
>> -	} else {
>> -		spin_unlock(&sched->job_list_lock);
>>   	}
>> -
>> -	spin_lock(&sched->job_list_lock);
>> -	drm_sched_start_timeout(sched);
>> -	spin_unlock(&sched->job_list_lock);
>>   }
>>   
>>    /**
>> @@ -511,15 +509,13 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>   			else if (r)
>>   				DRM_ERROR("fence add callback failed (%d)\n",
>>   					  r);
>> -		} else
>> +		} else {
>>   			drm_sched_job_done(s_job);
>> +		}
>>   	}
>>   
>> -	if (full_recovery) {
>> -		spin_lock(&sched->job_list_lock);
>> +	if (full_recovery)
>>   		drm_sched_start_timeout(sched);
>> -		spin_unlock(&sched->job_list_lock);
> 
> Same here.
> 
> Regards,
> Christian.
> 
>> -	}
>>   
>>   	kthread_unpark(sched->thread);
>>   }
>> @@ -667,47 +663,6 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>   	return entity;
>>   }
>>   
>> -/**
>> - * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed
>> - *
>> - * @sched: scheduler instance
>> - *
>> - * Returns the next finished job from the pending list (if there is one)
>> - * ready for it to be destroyed.
>> - */
>> -static struct drm_sched_job *
>> -drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>> -{
>> -	struct drm_sched_job *job;
>> -
>> -	/*
>> -	 * Don't destroy jobs while the timeout worker is running  OR thread
>> -	 * is being parked and hence assumed to not touch pending_list
>> -	 */
>> -	if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>> -	    !cancel_delayed_work(&sched->work_tdr)) ||
>> -	    kthread_should_park())
>> -		return NULL;
>> -
>> -	spin_lock(&sched->job_list_lock);
>> -
>> -	job = list_first_entry_or_null(&sched->pending_list,
>> -				       struct drm_sched_job, list);
>> -
>> -	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>> -		/* remove job from pending_list */
>> -		list_del_init(&job->list);
>> -	} else {
>> -		job = NULL;
>> -		/* queue timeout for next job */
>> -		drm_sched_start_timeout(sched);
>> -	}
>> -
>> -	spin_unlock(&sched->job_list_lock);
>> -
>> -	return job;
>> -}
>> -
>>   /**
>>    * drm_sched_pick_best - Get a drm sched from a sched_list with the least load
>>    * @sched_list: list of drm_gpu_schedulers
>> @@ -761,6 +716,44 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
>>   	return false;
>>   }
>>   
>> +/**
>> + * drm_sched_done - free done tasks
>> + * @param: pointer to a scheduler instance
>> + *
>> + * Returns 0.
>> + */
>> +static int drm_sched_done(void *param)
>> +{
>> +	struct drm_gpu_scheduler *sched = param;
>> +
>> +	do {
>> +		LIST_HEAD(done_q);
>> +
>> +		wait_event_interruptible(sched->done_wait_q,
>> +					 kthread_should_stop() ||
>> +					 !list_empty(&sched->done_list));
>> +
>> +		spin_lock(&sched->job_list_lock);
>> +		list_splice_init(&sched->done_list, &done_q);
>> +		spin_unlock(&sched->job_list_lock);
>> +
>> +		if (list_empty(&done_q))
>> +			continue;
>> +
>> +		while (!list_empty(&done_q)) {
>> +			struct drm_sched_job *job;
>> +
>> +			job = list_first_entry(&done_q,
>> +					       struct drm_sched_job,
>> +					       list);
>> +			list_del_init(&job->list);
>> +			sched->ops->free_job(job);
>> +		}
>> +	} while (!kthread_should_stop());
>> +
>> +	return 0;
>> +}
>> +
>>   /**
>>    * drm_sched_main - main scheduler thread
>>    *
>> @@ -770,7 +763,7 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
>>    */
>>   static int drm_sched_main(void *param)
>>   {
>> -	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
>> +	struct drm_gpu_scheduler *sched = param;
>>   	int r;
>>   
>>   	sched_set_fifo_low(current);
>> @@ -780,20 +773,12 @@ static int drm_sched_main(void *param)
>>   		struct drm_sched_fence *s_fence;
>>   		struct drm_sched_job *sched_job;
>>   		struct dma_fence *fence;
>> -		struct drm_sched_job *cleanup_job = NULL;
>>   
>>   		wait_event_interruptible(sched->wake_up_worker,
>> -					 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
>>   					 (!drm_sched_blocked(sched) &&
>>   					  (entity = drm_sched_select_entity(sched))) ||
>>   					 kthread_should_stop());
>>   
>> -		if (cleanup_job) {
>> -			sched->ops->free_job(cleanup_job);
>> -			/* queue timeout for next job */
>> -			drm_sched_start_timeout(sched);
>> -		}
>> -
>>   		if (!entity)
>>   			continue;
>>   
>> @@ -820,8 +805,7 @@ static int drm_sched_main(void *param)
>>   			if (r == -ENOENT)
>>   				drm_sched_job_done(sched_job);
>>   			else if (r)
>> -				DRM_ERROR("fence add callback failed (%d)\n",
>> -					  r);
>> +				DRM_ERROR("fence add callback failed (%d)\n", r);
>>   			dma_fence_put(fence);
>>   		} else {
>>   			if (IS_ERR(fence))
>> @@ -865,7 +849,9 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>   
>>   	init_waitqueue_head(&sched->wake_up_worker);
>>   	init_waitqueue_head(&sched->job_scheduled);
>> +	init_waitqueue_head(&sched->done_wait_q);
>>   	INIT_LIST_HEAD(&sched->pending_list);
>> +	INIT_LIST_HEAD(&sched->done_list);
>>   	spin_lock_init(&sched->job_list_lock);
>>   	atomic_set(&sched->hw_rq_count, 0);
>>   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>> @@ -881,6 +867,21 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>   		return ret;
>>   	}
>>   
>> +	snprintf(sched->thread_done_name, DRM_THREAD_NAME_LEN, "%s%s",
>> +		 sched->name, "-done");
>> +	sched->thread_done_name[DRM_THREAD_NAME_LEN - 1] = '\0';
>> +	sched->thread_done = kthread_run(drm_sched_done, sched,
>> +					 sched->thread_done_name);
>> +	if (IS_ERR(sched->thread_done)) {
>> +		ret = kthread_stop(sched->thread);
>> +		if (!ret) {
>> +			/* free_kthread_struct(sched->thread); */
>> +			sched->thread = NULL;
>> +		}
>> +		DRM_ERROR("Failed to start thread %s", sched->thread_done_name);
>> +		return ret;
>> +	}
>> +
>>   	sched->ready = true;
>>   	return 0;
>>   }
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index 3a5686c3b5e9..b282d6158b50 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -169,6 +169,12 @@ struct drm_sched_fence {
>>   
>>   struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>>   
>> +enum drm_job_status {
>> +	DRM_JOB_STATUS_NONE    = 0 << 0,
>> +	DRM_JOB_STATUS_DONE    = 1 << 0,
>> +	DRM_JOB_STATUS_TIMEOUT = 1 << 1,
>> +};
>> +
>>   /**
>>    * struct drm_sched_job - A job to be run by an entity.
>>    *
>> @@ -198,6 +204,7 @@ struct drm_sched_job {
>>   	uint64_t			id;
>>   	atomic_t			karma;
>>   	enum drm_sched_priority		s_priority;
>> +	enum drm_job_status             job_status;
>>   	struct drm_sched_entity         *entity;
>>   	struct dma_fence_cb		cb;
>>   };
>> @@ -284,15 +291,22 @@ struct drm_gpu_scheduler {
>>   	uint32_t			hw_submission_limit;
>>   	long				timeout;
>>   	const char			*name;
>> +	char                            thread_done_name[DRM_THREAD_NAME_LEN];
>> +
>>   	struct drm_sched_rq		sched_rq[DRM_SCHED_PRIORITY_COUNT];
>>   	wait_queue_head_t		wake_up_worker;
>>   	wait_queue_head_t		job_scheduled;
>> +	wait_queue_head_t               done_wait_q;
>>   	atomic_t			hw_rq_count;
>>   	atomic64_t			job_id_count;
>>   	struct delayed_work		work_tdr;
>>   	struct task_struct		*thread;
>> +	struct task_struct		*thread_done;
>> +
>>   	struct list_head		pending_list;
>> +	struct list_head                done_list;
>>   	spinlock_t			job_list_lock;
>> +
>>   	int				hang_limit;
>>   	atomic_t                        score;
>>   	bool				ready;
>
Luben Tuikov Nov. 26, 2020, 12:30 a.m. UTC | #4
On 2020-11-25 06:09, Steven Price wrote:
> On 25/11/2020 03:17, Luben Tuikov wrote:
>> Add a "done" list to which all completed jobs are added
>> to be freed. The drm_sched_job_done() callback is the
>> producer of jobs to this list.
>>
>> Add a "done" thread which consumes from the done list
>> and frees up jobs. Now, the main scheduler thread only
>> pushes jobs to the GPU and the "done" thread frees them
>> up, on the way out of the GPU when they've completed
>> execution.
> 
> Generally I'd be in favour of a "done thread" as I think there are some 
> murky corners of Panfrost's locking that would be helped by deferring 
> the free_job() callback.

Check my response to his email.

It seems you're okay with a separate thread, when both threads
could be working concurrently, and Christian wants
a single thread doing all this. You should probably address
this in a follow-up to his email, so this can be hashed out.

> 
> But I think you're trying to do too much in one patch here. And as 
> Christian has pointed out there's some dodgy looking changes to locking 
> which aren't explained.

I've addressed this in my response to his email, check it out.

So, if you're in favour of a separate thread working concurrently,
please follow up to his email, so this can be hashed out.

Thanks and Regards,
Luben

> 
> Steve
> 
>>
>> Make use of the status returned by the GPU driver
>> timeout handler to decide whether to leave the job in
>> the pending list, or to send it off to the done list.
>> If a job is done, it is added to the done list and the
>> done thread woken up. If a job needs more time, it is
>> left on the pending list and the timeout timer
>> restarted.
>>
>> Eliminate the polling mechanism of picking out done
>> jobs from the pending list, i.e. eliminate
>> drm_sched_get_cleanup_job(). Now the main scheduler
>> thread only pushes jobs down to the GPU.
>>
>> Various other optimizations to the GPU scheduler
>> and job recovery are possible with this format.
>>
>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>> ---
>>   drivers/gpu/drm/scheduler/sched_main.c | 173 +++++++++++++------------
>>   include/drm/gpu_scheduler.h            |  14 ++
>>   2 files changed, 101 insertions(+), 86 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>> index 3eb7618a627d..289ae68cd97f 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -164,7 +164,8 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq)
>>    * drm_sched_job_done - complete a job
>>    * @s_job: pointer to the job which is done
>>    *
>> - * Finish the job's fence and wake up the worker thread.
>> + * Finish the job's fence, move it to the done list,
>> + * and wake up the done thread.
>>    */
>>   static void drm_sched_job_done(struct drm_sched_job *s_job)
>>   {
>> @@ -179,7 +180,12 @@ static void drm_sched_job_done(struct drm_sched_job *s_job)
>>   	dma_fence_get(&s_fence->finished);
>>   	drm_sched_fence_finished(s_fence);
>>   	dma_fence_put(&s_fence->finished);
>> -	wake_up_interruptible(&sched->wake_up_worker);
>> +
>> +	spin_lock(&sched->job_list_lock);
>> +	list_move(&s_job->list, &sched->done_list);
>> +	spin_unlock(&sched->job_list_lock);
>> +
>> +	wake_up_interruptible(&sched->done_wait_q);
>>   }
>>   
>>   /**
>> @@ -221,11 +227,10 @@ bool drm_sched_dependency_optimized(struct dma_fence* fence,
>>   EXPORT_SYMBOL(drm_sched_dependency_optimized);
>>   
>>   /**
>> - * drm_sched_start_timeout - start timeout for reset worker
>> - *
>> - * @sched: scheduler instance to start the worker for
>> + * drm_sched_start_timeout - start a timeout timer
>> + * @sched: scheduler instance whose job we're timing
>>    *
>> - * Start the timeout for the given scheduler.
>> + * Start a timeout timer for the given scheduler.
>>    */
>>   static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
>>   {
>> @@ -305,8 +310,8 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>>   
>>   	spin_lock(&sched->job_list_lock);
>>   	list_add_tail(&s_job->list, &sched->pending_list);
>> -	drm_sched_start_timeout(sched);
>>   	spin_unlock(&sched->job_list_lock);
>> +	drm_sched_start_timeout(sched);
>>   }
>>   
>>   static void drm_sched_job_timedout(struct work_struct *work)
>> @@ -316,37 +321,30 @@ static void drm_sched_job_timedout(struct work_struct *work)
>>   
>>   	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
>>   
>> -	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
>>   	spin_lock(&sched->job_list_lock);
>>   	job = list_first_entry_or_null(&sched->pending_list,
>>   				       struct drm_sched_job, list);
>> +	spin_unlock(&sched->job_list_lock);
>>   
>>   	if (job) {
>> -		/*
>> -		 * Remove the bad job so it cannot be freed by concurrent
>> -		 * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
>> -		 * is parked at which point it's safe.
>> -		 */
>> -		list_del_init(&job->list);
>> -		spin_unlock(&sched->job_list_lock);
>> +		int res;
>>   
>> -		job->sched->ops->timedout_job(job);
>> +		job->job_status |= DRM_JOB_STATUS_TIMEOUT;
>> +		res = job->sched->ops->timedout_job(job);
>> +		if (res == 0) {
>> +			/* The job is out of the device.
>> +			 */
>> +			spin_lock(&sched->job_list_lock);
>> +			list_move(&job->list, &sched->done_list);
>> +			spin_unlock(&sched->job_list_lock);
>>   
>> -		/*
>> -		 * Guilty job did complete and hence needs to be manually removed
>> -		 * See drm_sched_stop doc.
>> -		 */
>> -		if (sched->free_guilty) {
>> -			job->sched->ops->free_job(job);
>> -			sched->free_guilty = false;
>> +			wake_up_interruptible(&sched->done_wait_q);
>> +		} else {
>> +			/* The job needs more time.
>> +			 */
>> +			drm_sched_start_timeout(sched);
>>   		}
>> -	} else {
>> -		spin_unlock(&sched->job_list_lock);
>>   	}
>> -
>> -	spin_lock(&sched->job_list_lock);
>> -	drm_sched_start_timeout(sched);
>> -	spin_unlock(&sched->job_list_lock);
>>   }
>>   
>>    /**
>> @@ -511,15 +509,13 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>   			else if (r)
>>   				DRM_ERROR("fence add callback failed (%d)\n",
>>   					  r);
>> -		} else
>> +		} else {
>>   			drm_sched_job_done(s_job);
>> +		}
>>   	}
>>   
>> -	if (full_recovery) {
>> -		spin_lock(&sched->job_list_lock);
>> +	if (full_recovery)
>>   		drm_sched_start_timeout(sched);
>> -		spin_unlock(&sched->job_list_lock);
>> -	}
>>   
>>   	kthread_unpark(sched->thread);
>>   }
>> @@ -667,47 +663,6 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>   	return entity;
>>   }
>>   
>> -/**
>> - * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed
>> - *
>> - * @sched: scheduler instance
>> - *
>> - * Returns the next finished job from the pending list (if there is one)
>> - * ready for it to be destroyed.
>> - */
>> -static struct drm_sched_job *
>> -drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>> -{
>> -	struct drm_sched_job *job;
>> -
>> -	/*
>> -	 * Don't destroy jobs while the timeout worker is running  OR thread
>> -	 * is being parked and hence assumed to not touch pending_list
>> -	 */
>> -	if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>> -	    !cancel_delayed_work(&sched->work_tdr)) ||
>> -	    kthread_should_park())
>> -		return NULL;
>> -
>> -	spin_lock(&sched->job_list_lock);
>> -
>> -	job = list_first_entry_or_null(&sched->pending_list,
>> -				       struct drm_sched_job, list);
>> -
>> -	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>> -		/* remove job from pending_list */
>> -		list_del_init(&job->list);
>> -	} else {
>> -		job = NULL;
>> -		/* queue timeout for next job */
>> -		drm_sched_start_timeout(sched);
>> -	}
>> -
>> -	spin_unlock(&sched->job_list_lock);
>> -
>> -	return job;
>> -}
>> -
>>   /**
>>    * drm_sched_pick_best - Get a drm sched from a sched_list with the least load
>>    * @sched_list: list of drm_gpu_schedulers
>> @@ -761,6 +716,44 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
>>   	return false;
>>   }
>>   
>> +/**
>> + * drm_sched_done - free done tasks
>> + * @param: pointer to a scheduler instance
>> + *
>> + * Returns 0.
>> + */
>> +static int drm_sched_done(void *param)
>> +{
>> +	struct drm_gpu_scheduler *sched = param;
>> +
>> +	do {
>> +		LIST_HEAD(done_q);
>> +
>> +		wait_event_interruptible(sched->done_wait_q,
>> +					 kthread_should_stop() ||
>> +					 !list_empty(&sched->done_list));
>> +
>> +		spin_lock(&sched->job_list_lock);
>> +		list_splice_init(&sched->done_list, &done_q);
>> +		spin_unlock(&sched->job_list_lock);
>> +
>> +		if (list_empty(&done_q))
>> +			continue;
>> +
>> +		while (!list_empty(&done_q)) {
>> +			struct drm_sched_job *job;
>> +
>> +			job = list_first_entry(&done_q,
>> +					       struct drm_sched_job,
>> +					       list);
>> +			list_del_init(&job->list);
>> +			sched->ops->free_job(job);
>> +		}
>> +	} while (!kthread_should_stop());
>> +
>> +	return 0;
>> +}
>> +
>>   /**
>>    * drm_sched_main - main scheduler thread
>>    *
>> @@ -770,7 +763,7 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
>>    */
>>   static int drm_sched_main(void *param)
>>   {
>> -	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
>> +	struct drm_gpu_scheduler *sched = param;
>>   	int r;
>>   
>>   	sched_set_fifo_low(current);
>> @@ -780,20 +773,12 @@ static int drm_sched_main(void *param)
>>   		struct drm_sched_fence *s_fence;
>>   		struct drm_sched_job *sched_job;
>>   		struct dma_fence *fence;
>> -		struct drm_sched_job *cleanup_job = NULL;
>>   
>>   		wait_event_interruptible(sched->wake_up_worker,
>> -					 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
>>   					 (!drm_sched_blocked(sched) &&
>>   					  (entity = drm_sched_select_entity(sched))) ||
>>   					 kthread_should_stop());
>>   
>> -		if (cleanup_job) {
>> -			sched->ops->free_job(cleanup_job);
>> -			/* queue timeout for next job */
>> -			drm_sched_start_timeout(sched);
>> -		}
>> -
>>   		if (!entity)
>>   			continue;
>>   
>> @@ -820,8 +805,7 @@ static int drm_sched_main(void *param)
>>   			if (r == -ENOENT)
>>   				drm_sched_job_done(sched_job);
>>   			else if (r)
>> -				DRM_ERROR("fence add callback failed (%d)\n",
>> -					  r);
>> +				DRM_ERROR("fence add callback failed (%d)\n", r);
>>   			dma_fence_put(fence);
>>   		} else {
>>   			if (IS_ERR(fence))
>> @@ -865,7 +849,9 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>   
>>   	init_waitqueue_head(&sched->wake_up_worker);
>>   	init_waitqueue_head(&sched->job_scheduled);
>> +	init_waitqueue_head(&sched->done_wait_q);
>>   	INIT_LIST_HEAD(&sched->pending_list);
>> +	INIT_LIST_HEAD(&sched->done_list);
>>   	spin_lock_init(&sched->job_list_lock);
>>   	atomic_set(&sched->hw_rq_count, 0);
>>   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>> @@ -881,6 +867,21 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>   		return ret;
>>   	}
>>   
>> +	snprintf(sched->thread_done_name, DRM_THREAD_NAME_LEN, "%s%s",
>> +		 sched->name, "-done");
>> +	sched->thread_done_name[DRM_THREAD_NAME_LEN - 1] = '\0';
>> +	sched->thread_done = kthread_run(drm_sched_done, sched,
>> +					 sched->thread_done_name);
>> +	if (IS_ERR(sched->thread_done)) {
>> +		ret = kthread_stop(sched->thread);
>> +		if (!ret) {
>> +			/* free_kthread_struct(sched->thread); */
>> +			sched->thread = NULL;
>> +		}
>> +		DRM_ERROR("Failed to start thread %s", sched->thread_done_name);
>> +		return ret;
>> +	}
>> +
>>   	sched->ready = true;
>>   	return 0;
>>   }
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index 3a5686c3b5e9..b282d6158b50 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -169,6 +169,12 @@ struct drm_sched_fence {
>>   
>>   struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>>   
>> +enum drm_job_status {
>> +	DRM_JOB_STATUS_NONE    = 0 << 0,
>> +	DRM_JOB_STATUS_DONE    = 1 << 0,
>> +	DRM_JOB_STATUS_TIMEOUT = 1 << 1,
>> +};
>> +
>>   /**
>>    * struct drm_sched_job - A job to be run by an entity.
>>    *
>> @@ -198,6 +204,7 @@ struct drm_sched_job {
>>   	uint64_t			id;
>>   	atomic_t			karma;
>>   	enum drm_sched_priority		s_priority;
>> +	enum drm_job_status             job_status;
>>   	struct drm_sched_entity         *entity;
>>   	struct dma_fence_cb		cb;
>>   };
>> @@ -284,15 +291,22 @@ struct drm_gpu_scheduler {
>>   	uint32_t			hw_submission_limit;
>>   	long				timeout;
>>   	const char			*name;
>> +	char                            thread_done_name[DRM_THREAD_NAME_LEN];
>> +
>>   	struct drm_sched_rq		sched_rq[DRM_SCHED_PRIORITY_COUNT];
>>   	wait_queue_head_t		wake_up_worker;
>>   	wait_queue_head_t		job_scheduled;
>> +	wait_queue_head_t               done_wait_q;
>>   	atomic_t			hw_rq_count;
>>   	atomic64_t			job_id_count;
>>   	struct delayed_work		work_tdr;
>>   	struct task_struct		*thread;
>> +	struct task_struct		*thread_done;
>> +
>>   	struct list_head		pending_list;
>> +	struct list_head                done_list;
>>   	spinlock_t			job_list_lock;
>> +
>>   	int				hang_limit;
>>   	atomic_t                        score;
>>   	bool				ready;
>>
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 3eb7618a627d..289ae68cd97f 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -164,7 +164,8 @@  drm_sched_rq_select_entity(struct drm_sched_rq *rq)
  * drm_sched_job_done - complete a job
  * @s_job: pointer to the job which is done
  *
- * Finish the job's fence and wake up the worker thread.
+ * Finish the job's fence, move it to the done list,
+ * and wake up the done thread.
  */
 static void drm_sched_job_done(struct drm_sched_job *s_job)
 {
@@ -179,7 +180,12 @@  static void drm_sched_job_done(struct drm_sched_job *s_job)
 	dma_fence_get(&s_fence->finished);
 	drm_sched_fence_finished(s_fence);
 	dma_fence_put(&s_fence->finished);
-	wake_up_interruptible(&sched->wake_up_worker);
+
+	spin_lock(&sched->job_list_lock);
+	list_move(&s_job->list, &sched->done_list);
+	spin_unlock(&sched->job_list_lock);
+
+	wake_up_interruptible(&sched->done_wait_q);
 }
 
 /**
@@ -221,11 +227,10 @@  bool drm_sched_dependency_optimized(struct dma_fence* fence,
 EXPORT_SYMBOL(drm_sched_dependency_optimized);
 
 /**
- * drm_sched_start_timeout - start timeout for reset worker
- *
- * @sched: scheduler instance to start the worker for
+ * drm_sched_start_timeout - start a timeout timer
+ * @sched: scheduler instance whose job we're timing
  *
- * Start the timeout for the given scheduler.
+ * Start a timeout timer for the given scheduler.
  */
 static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
 {
@@ -305,8 +310,8 @@  static void drm_sched_job_begin(struct drm_sched_job *s_job)
 
 	spin_lock(&sched->job_list_lock);
 	list_add_tail(&s_job->list, &sched->pending_list);
-	drm_sched_start_timeout(sched);
 	spin_unlock(&sched->job_list_lock);
+	drm_sched_start_timeout(sched);
 }
 
 static void drm_sched_job_timedout(struct work_struct *work)
@@ -316,37 +321,30 @@  static void drm_sched_job_timedout(struct work_struct *work)
 
 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
 
-	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
 	spin_lock(&sched->job_list_lock);
 	job = list_first_entry_or_null(&sched->pending_list,
 				       struct drm_sched_job, list);
+	spin_unlock(&sched->job_list_lock);
 
 	if (job) {
-		/*
-		 * Remove the bad job so it cannot be freed by concurrent
-		 * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
-		 * is parked at which point it's safe.
-		 */
-		list_del_init(&job->list);
-		spin_unlock(&sched->job_list_lock);
+		int res;
 
-		job->sched->ops->timedout_job(job);
+		job->job_status |= DRM_JOB_STATUS_TIMEOUT;
+		res = job->sched->ops->timedout_job(job);
+		if (res == 0) {
+			/* The job is out of the device.
+			 */
+			spin_lock(&sched->job_list_lock);
+			list_move(&job->list, &sched->done_list);
+			spin_unlock(&sched->job_list_lock);
 
-		/*
-		 * Guilty job did complete and hence needs to be manually removed
-		 * See drm_sched_stop doc.
-		 */
-		if (sched->free_guilty) {
-			job->sched->ops->free_job(job);
-			sched->free_guilty = false;
+			wake_up_interruptible(&sched->done_wait_q);
+		} else {
+			/* The job needs more time.
+			 */
+			drm_sched_start_timeout(sched);
 		}
-	} else {
-		spin_unlock(&sched->job_list_lock);
 	}
-
-	spin_lock(&sched->job_list_lock);
-	drm_sched_start_timeout(sched);
-	spin_unlock(&sched->job_list_lock);
 }
 
  /**
@@ -511,15 +509,13 @@  void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 			else if (r)
 				DRM_ERROR("fence add callback failed (%d)\n",
 					  r);
-		} else
+		} else {
 			drm_sched_job_done(s_job);
+		}
 	}
 
-	if (full_recovery) {
-		spin_lock(&sched->job_list_lock);
+	if (full_recovery)
 		drm_sched_start_timeout(sched);
-		spin_unlock(&sched->job_list_lock);
-	}
 
 	kthread_unpark(sched->thread);
 }
@@ -667,47 +663,6 @@  drm_sched_select_entity(struct drm_gpu_scheduler *sched)
 	return entity;
 }
 
-/**
- * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed
- *
- * @sched: scheduler instance
- *
- * Returns the next finished job from the pending list (if there is one)
- * ready for it to be destroyed.
- */
-static struct drm_sched_job *
-drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
-{
-	struct drm_sched_job *job;
-
-	/*
-	 * Don't destroy jobs while the timeout worker is running  OR thread
-	 * is being parked and hence assumed to not touch pending_list
-	 */
-	if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-	    !cancel_delayed_work(&sched->work_tdr)) ||
-	    kthread_should_park())
-		return NULL;
-
-	spin_lock(&sched->job_list_lock);
-
-	job = list_first_entry_or_null(&sched->pending_list,
-				       struct drm_sched_job, list);
-
-	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
-		/* remove job from pending_list */
-		list_del_init(&job->list);
-	} else {
-		job = NULL;
-		/* queue timeout for next job */
-		drm_sched_start_timeout(sched);
-	}
-
-	spin_unlock(&sched->job_list_lock);
-
-	return job;
-}
-
 /**
  * drm_sched_pick_best - Get a drm sched from a sched_list with the least load
  * @sched_list: list of drm_gpu_schedulers
@@ -761,6 +716,44 @@  static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
 	return false;
 }
 
+/**
+ * drm_sched_done - free done tasks
+ * @param: pointer to a scheduler instance
+ *
+ * Returns 0.
+ */
+static int drm_sched_done(void *param)
+{
+	struct drm_gpu_scheduler *sched = param;
+
+	do {
+		LIST_HEAD(done_q);
+
+		wait_event_interruptible(sched->done_wait_q,
+					 kthread_should_stop() ||
+					 !list_empty(&sched->done_list));
+
+		spin_lock(&sched->job_list_lock);
+		list_splice_init(&sched->done_list, &done_q);
+		spin_unlock(&sched->job_list_lock);
+
+		if (list_empty(&done_q))
+			continue;
+
+		while (!list_empty(&done_q)) {
+			struct drm_sched_job *job;
+
+			job = list_first_entry(&done_q,
+					       struct drm_sched_job,
+					       list);
+			list_del_init(&job->list);
+			sched->ops->free_job(job);
+		}
+	} while (!kthread_should_stop());
+
+	return 0;
+}
+
 /**
  * drm_sched_main - main scheduler thread
  *
@@ -770,7 +763,7 @@  static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
  */
 static int drm_sched_main(void *param)
 {
-	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
+	struct drm_gpu_scheduler *sched = param;
 	int r;
 
 	sched_set_fifo_low(current);
@@ -780,20 +773,12 @@  static int drm_sched_main(void *param)
 		struct drm_sched_fence *s_fence;
 		struct drm_sched_job *sched_job;
 		struct dma_fence *fence;
-		struct drm_sched_job *cleanup_job = NULL;
 
 		wait_event_interruptible(sched->wake_up_worker,
-					 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
 					 (!drm_sched_blocked(sched) &&
 					  (entity = drm_sched_select_entity(sched))) ||
 					 kthread_should_stop());
 
-		if (cleanup_job) {
-			sched->ops->free_job(cleanup_job);
-			/* queue timeout for next job */
-			drm_sched_start_timeout(sched);
-		}
-
 		if (!entity)
 			continue;
 
@@ -820,8 +805,7 @@  static int drm_sched_main(void *param)
 			if (r == -ENOENT)
 				drm_sched_job_done(sched_job);
 			else if (r)
-				DRM_ERROR("fence add callback failed (%d)\n",
-					  r);
+				DRM_ERROR("fence add callback failed (%d)\n", r);
 			dma_fence_put(fence);
 		} else {
 			if (IS_ERR(fence))
@@ -865,7 +849,9 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 
 	init_waitqueue_head(&sched->wake_up_worker);
 	init_waitqueue_head(&sched->job_scheduled);
+	init_waitqueue_head(&sched->done_wait_q);
 	INIT_LIST_HEAD(&sched->pending_list);
+	INIT_LIST_HEAD(&sched->done_list);
 	spin_lock_init(&sched->job_list_lock);
 	atomic_set(&sched->hw_rq_count, 0);
 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
@@ -881,6 +867,21 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 		return ret;
 	}
 
+	snprintf(sched->thread_done_name, DRM_THREAD_NAME_LEN, "%s%s",
+		 sched->name, "-done");
+	sched->thread_done_name[DRM_THREAD_NAME_LEN - 1] = '\0';
+	sched->thread_done = kthread_run(drm_sched_done, sched,
+					 sched->thread_done_name);
+	if (IS_ERR(sched->thread_done)) {
+		ret = kthread_stop(sched->thread);
+		if (!ret) {
+			/* free_kthread_struct(sched->thread); */
+			sched->thread = NULL;
+		}
+		DRM_ERROR("Failed to start thread %s", sched->thread_done_name);
+		return ret;
+	}
+
 	sched->ready = true;
 	return 0;
 }
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 3a5686c3b5e9..b282d6158b50 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -169,6 +169,12 @@  struct drm_sched_fence {
 
 struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
 
+enum drm_job_status {
+	DRM_JOB_STATUS_NONE    = 0 << 0,
+	DRM_JOB_STATUS_DONE    = 1 << 0,
+	DRM_JOB_STATUS_TIMEOUT = 1 << 1,
+};
+
 /**
  * struct drm_sched_job - A job to be run by an entity.
  *
@@ -198,6 +204,7 @@  struct drm_sched_job {
 	uint64_t			id;
 	atomic_t			karma;
 	enum drm_sched_priority		s_priority;
+	enum drm_job_status             job_status;
 	struct drm_sched_entity         *entity;
 	struct dma_fence_cb		cb;
 };
@@ -284,15 +291,22 @@  struct drm_gpu_scheduler {
 	uint32_t			hw_submission_limit;
 	long				timeout;
 	const char			*name;
+	char                            thread_done_name[DRM_THREAD_NAME_LEN];
+
 	struct drm_sched_rq		sched_rq[DRM_SCHED_PRIORITY_COUNT];
 	wait_queue_head_t		wake_up_worker;
 	wait_queue_head_t		job_scheduled;
+	wait_queue_head_t               done_wait_q;
 	atomic_t			hw_rq_count;
 	atomic64_t			job_id_count;
 	struct delayed_work		work_tdr;
 	struct task_struct		*thread;
+	struct task_struct		*thread_done;
+
 	struct list_head		pending_list;
+	struct list_head                done_list;
 	spinlock_t			job_list_lock;
+
 	int				hang_limit;
 	atomic_t                        score;
 	bool				ready;