diff mbox series

drm/scheduler: Add drm_sched_suspend/resume timeout functions

Message ID 1541684554-17115-1-git-send-email-smasetty@codeaurora.org (mailing list archive)
State Not Applicable, archived
Delegated to: Andy Gross
Headers show
Series drm/scheduler: Add drm_sched_suspend/resume timeout functions | expand

Commit Message

Sharat Masetty Nov. 8, 2018, 1:42 p.m. UTC
Hi Christian,

Can you please review this patch? It is a continuation of the discussion at [1].
At first I was thinking of using a cancel for suspend instead of a mod(to an
arbitrarily large value), but I couldn't get it to fit in as I have an additional
constraint of being able to call these functions from an IRQ context.

These new functions race with other contexts, primarily finish_job(),
timedout_job() and recovery(), but I did go through the possible races between
these(I think). Please let me know what you think of this? I have not tested
this yet and if this is something in the right direction, I will put this
through my testing drill and polish it.

IMO I think I prefer the callback approach as it appears to be simple, less
error prone for both the scheduler and the drivers.

[1]  https://patchwork.freedesktop.org/patch/259914/

Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
---
 drivers/gpu/drm/scheduler/sched_main.c | 81 +++++++++++++++++++++++++++++++++-
 include/drm/gpu_scheduler.h            |  5 +++
 2 files changed, 85 insertions(+), 1 deletion(-)

--
1.9.1

Comments

Christian König Nov. 8, 2018, 2:41 p.m. UTC | #1
Am 08.11.18 um 14:42 schrieb Sharat Masetty:
> Hi Christian,
>
> Can you please review this patch? It is a continuation of the discussion at [1].
> At first I was thinking of using a cancel for suspend instead of a mod(to an
> arbitrarily large value), but I couldn't get it to fit in as I have an additional
> constraint of being able to call these functions from an IRQ context.
>
> These new functions race with other contexts, primarily finish_job(),
> timedout_job() and recovery(), but I did go through the possible races between
> these(I think). Please let me know what you think of this? I have not tested
> this yet and if this is something in the right direction, I will put this
> through my testing drill and polish it.
>
> IMO I think I prefer the callback approach as it appears to be simple, less
> error prone for both the scheduler and the drivers.

Well I agree that this is way too complicated and looks racy to me as 
well. But this is because you moved away from my initial suggestion.

So here is once more how to do it without any additional locks or races:

/**
 * drm_sched_suspend_timeout - suspend timeout for reset worker
 *
 * @sched: scheduler instance for which to suspend the timeout
 *
 * Suspend the delayed work timeout for the scheduler. Note that
 * this function can be called from an IRQ context. It returns the
 * timeout remaining.
 */
unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
{
	/*
	 * Use "now" rather than "current": in kernel code "current" is the
	 * macro for the running task's task_struct, so naming a local that
	 * way does not compile.
	 */
	unsigned long timeout, now = jiffies;

	timeout = sched->work_tdr.timer.expires;

	/*
	 * Set timeout to an arbitrarily large value, this also prevents the
	 * timer from being started when new submissions arrive.
	 *
	 * mod_delayed_work() returns true if the work was pending, i.e. the
	 * timer was actually armed; only then is "expires" meaningful and a
	 * remaining delta can be computed.
	 */
	if (mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout * 10) &&
	    time_after(timeout, now))
		return timeout - now;
	else
		return sched->timeout;
}

/**
 * drm_sched_resume_timeout - resume timeout for reset worker
 *
 * @sched: scheduler instance for which to resume the timeout
 * @remaining: remaining timeout
 *
 * Resume the delayed work timeout for the scheduler. Note that
 * this function can be called from an IRQ context.
 */
void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched, unsigned long remaining)
{
	/* A job is still in flight: re-arm the watchdog with what was left. */
	if (!list_empty(&sched->ring_mirror_list)) {
		mod_delayed_work(system_wq, &sched->work_tdr, remaining);
		return;
	}

	/* Nothing pending — no timeout needed at all. */
	cancel_delayed_work(&sched->work_tdr);
}


Regards,
Christian.

>
> [1]  https://patchwork.freedesktop.org/patch/259914/
>
> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 81 +++++++++++++++++++++++++++++++++-
>   include/drm/gpu_scheduler.h            |  5 +++
>   2 files changed, 85 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index c993d10..9645789 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -191,11 +191,84 @@ bool drm_sched_dependency_optimized(struct dma_fence* fence,
>    */
>   static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
>   {
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
> +
> +	sched->timeout_remaining = sched->timeout;
> +
>   	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
> -	    !list_empty(&sched->ring_mirror_list))
> +	    !list_empty(&sched->ring_mirror_list) && !sched->work_tdr_suspended)
>   		schedule_delayed_work(&sched->work_tdr, sched->timeout);
> +
> +	spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>   }
>
> +/**
> + * drm_sched_suspend_timeout - suspend timeout for reset worker
> + *
> + * @sched: scheduler instance for which to suspend the timeout
> + *
> + * Suspend the delayed work timeout for the scheduler. Note that
> + * this function can be called from an IRQ context.
> + */
> +void drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
> +{
> +	unsigned long flags, timeout;
> +
> +	spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
> +
> +	if (sched->work_tdr_suspended ||
> +			sched->timeout == MAX_SCHEDULE_TIMEOUT ||
> +			list_empty(&sched->ring_mirror_list))
> +		goto done;
> +
> +	timeout = sched->work_tdr.timer.expires;
> +
> +	/*
> +	 * Reset timeout to an arbitrarily large value
> +	 */
> +	mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout * 10);
> +
> +	timeout -= jiffies;
> +
> +	/* FIXME: Can jiffies be after timeout? */
> +	sched->timeout_remaining = time_after(jiffies, timeout)? 0: timeout;
> +	sched->work_tdr_suspended = true;
> +
> +done:
> +	spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
> +}
> +EXPORT_SYMBOL(drm_sched_suspend_timeout);
> +
> +/**
> + * drm_sched_resume_timeout - resume timeout for reset worker
> + *
> + * @sched: scheduler instance for which to resume the timeout
> + *
> + * Resume the delayed work timeout for the scheduler. Note that
> + * this function can be called from an IRQ context.
> + */
> +void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
> +
> +	if (!sched->work_tdr_suspended ||
> +			sched->timeout == MAX_SCHEDULE_TIMEOUT) {
> +		spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
> +		return;
> +	}
> +
> +	mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout_remaining);
> +
> +	sched->work_tdr_suspended = false;
> +
> +	spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
> +}
> +EXPORT_SYMBOL(drm_sched_resume_timeout);
> +
>   /* job_finish is called after hw fence signaled
>    */
>   static void drm_sched_job_finish(struct work_struct *work)
> @@ -348,6 +421,11 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>   	struct drm_sched_job *s_job, *tmp;
>   	bool found_guilty = false;
>   	int r;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
> +	sched->work_tdr_suspended = false;
> +	spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>
>   	spin_lock(&sched->job_list_lock);
>   	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
> @@ -607,6 +685,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   	init_waitqueue_head(&sched->job_scheduled);
>   	INIT_LIST_HEAD(&sched->ring_mirror_list);
>   	spin_lock_init(&sched->job_list_lock);
> +	spin_lock_init(&sched->tdr_suspend_lock);
>   	atomic_set(&sched->hw_rq_count, 0);
>   	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>   	atomic_set(&sched->num_jobs, 0);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index d87b268..5d39572 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -278,6 +278,9 @@ struct drm_gpu_scheduler {
>   	atomic_t			hw_rq_count;
>   	atomic64_t			job_id_count;
>   	struct delayed_work		work_tdr;
> +	unsigned long			timeout_remaining;
> +	bool				work_tdr_suspended;
> +	spinlock_t			tdr_suspend_lock;
>   	struct task_struct		*thread;
>   	struct list_head		ring_mirror_list;
>   	spinlock_t			job_list_lock;
> @@ -300,6 +303,8 @@ void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched,
>   bool drm_sched_dependency_optimized(struct dma_fence* fence,
>   				    struct drm_sched_entity *entity);
>   void drm_sched_job_kickout(struct drm_sched_job *s_job);
> +void drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
> +void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched);
>
>   void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
>   			     struct drm_sched_entity *entity);
> --
> 1.9.1
>
Sharat Masetty Nov. 14, 2018, 5:29 p.m. UTC | #2
On 11/8/2018 8:11 PM, Koenig, Christian wrote:
> Am 08.11.18 um 14:42 schrieb Sharat Masetty:
>> Hi Christian,
>>
>> Can you please review this patch? It is a continuation of the discussion at [1].
>> At first I was thinking of using a cancel for suspend instead of a mod(to an
>> arbitrarily large value), but I couldn't get it to fit in as I have an additional
>> constraint of being able to call these functions from an IRQ context.
>>
>> These new functions race with other contexts, primarily finish_job(),
>> timedout_job() and recovery(), but I did go through the possible races between
>> these(I think). Please let me know what you think of this? I have not tested
>> this yet and if this is something in the right direction, I will put this
>> through my testing drill and polish it.
>>
>> IMO I think I prefer the callback approach as it appears to be simple, less
>> error prone for both the scheduler and the drivers.
> 
> Well I agree that this is way to complicated and looks racy to me as
> well. But this is because you moved away from my initial suggestion.
> 
> So here is once more how to do it without any additional locks or races:
> 
> /**
>    * drm_sched_suspend_timeout - suspend timeout for reset worker
>    *
>    * @sched: scheduler instance for which to suspend the timeout
>    *
>    * Suspend the delayed work timeout for the scheduler. Note that
>    * this function can be called from an IRQ context. It returns the
> timeout remaining.
>    */
> unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
> {
> 
> 	unsigned long timeout, current = jiffies
> 
> 	timeout = sched->work_tdr.timer.expires;
> 
> 	/*
> 	 * Set timeout to an arbitrarily large value, this also prevents the timer to be
> 	 * started when new submissions arrive.
> 	 */
> 	if (mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout * 10) &&
> 	    time_after(timeout, current))
> 		return timeout - current;
> 	else
> 		return sched->timeout;
> }
> 
> /**
>    * drm_sched_resume_timeout - resume timeout for reset worker
>    *
>    * @sched: scheduler instance for which to resume the timeout
>    * @remaining: remaining timeout
>    *
>    * Resume the delayed work timeout for the scheduler. Note that
>    * this function can be called from an IRQ context.
>    */
> void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched, unsigned long remaining)
> {
> 	if (list_empty(&sched->ring_mirror_list))
> 		cancel_delayed_work(&sched->work_tdr);
> 	else
> 		mod_delayed_work(system_wq, &sched->work_tdr, remaining);
> }
Hi Christian,

Thank you for the suggestions - I was able to try this out this week. It 
works for the most part, but there are a couple of races which need 
further considerations.

1) The drm_sched_resume_timeout() can race with both the 
drm_sched_job_finish() and also new job submissions. In the driver the 
job submission which triggered the preemption can be complete as soon as 
the switch happens and it is quite possible that I get the preemption 
complete and the job done interrupt at the same time. So this means that 
drm_sched_resume_timeout() in IRQ context can race with 
drm_sched_job_finish() in worker thread context on another CPU. Also in 
parallel new jobs can be submitted, which will also update the ring 
mirror list . These races can be addressed however with locking the 
job_list_lock inside the drm_sched_resume_timeout().

2) This one is a little more tricky - In the driver I start off with all 
the timeouts suspended(except the one for the default ring), then on 
every preemption interrupt I suspend the outgoing ring and resume the 
incoming ring. I can only resume if it was previously suspended. This is 
how it is set up. The problem that becomes apparent with this approach 
is that for the suspended rings this arbitrarily large timeout can fire 
at some point(because of no work) and just before 
drm_sched_timedout_job() runs a new job can be inserted into the mirror 
list. So in this case we get an incorrect timeout.

What are your thoughts on using cancel_delayed_work() instead of mod in 
suspend_timeout. Yes we will lose the benefits of mod, but we should be 
able to synchronize drm_sched_suspend_timeout() and 
drm_sched_start_timeout() with some basic state. I have not thought this 
completely through so I may be missing something here.

Please share your thoughts on this

Thank you

Sharat
> 
> 
> Regards,
> Christian.
> 
>>
>> [1]  https://patchwork.freedesktop.org/patch/259914/
>>
>> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
>> ---
>>    drivers/gpu/drm/scheduler/sched_main.c | 81 +++++++++++++++++++++++++++++++++-
>>    include/drm/gpu_scheduler.h            |  5 +++
>>    2 files changed, 85 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>> index c993d10..9645789 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -191,11 +191,84 @@ bool drm_sched_dependency_optimized(struct dma_fence* fence,
>>     */
>>    static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
>>    {
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
>> +
>> +	sched->timeout_remaining = sched->timeout;
>> +
>>    	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>> -	    !list_empty(&sched->ring_mirror_list))
>> +	    !list_empty(&sched->ring_mirror_list) && !sched->work_tdr_suspended)
>>    		schedule_delayed_work(&sched->work_tdr, sched->timeout);
>> +
>> +	spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>>    }
>>
>> +/**
>> + * drm_sched_suspend_timeout - suspend timeout for reset worker
>> + *
>> + * @sched: scheduler instance for which to suspend the timeout
>> + *
>> + * Suspend the delayed work timeout for the scheduler. Note that
>> + * this function can be called from an IRQ context.
>> + */
>> +void drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
>> +{
>> +	unsigned long flags, timeout;
>> +
>> +	spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
>> +
>> +	if (sched->work_tdr_suspended ||
>> +			sched->timeout == MAX_SCHEDULE_TIMEOUT ||
>> +			list_empty(&sched->ring_mirror_list))
>> +		goto done;
>> +
>> +	timeout = sched->work_tdr.timer.expires;
>> +
>> +	/*
>> +	 * Reset timeout to an arbitrarily large value
>> +	 */
>> +	mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout * 10);
>> +
>> +	timeout -= jiffies;
>> +
>> +	/* FIXME: Can jiffies be after timeout? */
>> +	sched->timeout_remaining = time_after(jiffies, timeout)? 0: timeout;
>> +	sched->work_tdr_suspended = true;
>> +
>> +done:
>> +	spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>> +}
>> +EXPORT_SYMBOL(drm_sched_suspend_timeout);
>> +
>> +/**
>> + * drm_sched_resume_timeout - resume timeout for reset worker
>> + *
>> + * @sched: scheduler instance for which to resume the timeout
>> + *
>> + * Resume the delayed work timeout for the scheduler. Note that
>> + * this function can be called from an IRQ context.
>> + */
>> +void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched)
>> +{
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
>> +
>> +	if (!sched->work_tdr_suspended ||
>> +			sched->timeout == MAX_SCHEDULE_TIMEOUT) {
>> +		spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>> +		return;
>> +	}
>> +
>> +	mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout_remaining);
>> +
>> +	sched->work_tdr_suspended = false;
>> +
>> +	spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>> +}
>> +EXPORT_SYMBOL(drm_sched_resume_timeout);
>> +
>>    /* job_finish is called after hw fence signaled
>>     */
>>    static void drm_sched_job_finish(struct work_struct *work)
>> @@ -348,6 +421,11 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>>    	struct drm_sched_job *s_job, *tmp;
>>    	bool found_guilty = false;
>>    	int r;
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
>> +	sched->work_tdr_suspended = false;
>> +	spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>>
>>    	spin_lock(&sched->job_list_lock);
>>    	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>> @@ -607,6 +685,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>    	init_waitqueue_head(&sched->job_scheduled);
>>    	INIT_LIST_HEAD(&sched->ring_mirror_list);
>>    	spin_lock_init(&sched->job_list_lock);
>> +	spin_lock_init(&sched->tdr_suspend_lock);
>>    	atomic_set(&sched->hw_rq_count, 0);
>>    	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>    	atomic_set(&sched->num_jobs, 0);
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index d87b268..5d39572 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -278,6 +278,9 @@ struct drm_gpu_scheduler {
>>    	atomic_t			hw_rq_count;
>>    	atomic64_t			job_id_count;
>>    	struct delayed_work		work_tdr;
>> +	unsigned long			timeout_remaining;
>> +	bool				work_tdr_suspended;
>> +	spinlock_t			tdr_suspend_lock;
>>    	struct task_struct		*thread;
>>    	struct list_head		ring_mirror_list;
>>    	spinlock_t			job_list_lock;
>> @@ -300,6 +303,8 @@ void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched,
>>    bool drm_sched_dependency_optimized(struct dma_fence* fence,
>>    				    struct drm_sched_entity *entity);
>>    void drm_sched_job_kickout(struct drm_sched_job *s_job);
>> +void drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
>> +void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched);
>>
>>    void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
>>    			     struct drm_sched_entity *entity);
>> --
>> 1.9.1
>>
> 
> _______________________________________________
> Freedreno mailing list
> Freedreno@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/freedreno
>
Christian König Nov. 14, 2018, 7:03 p.m. UTC | #3
Am 14.11.18 um 18:29 schrieb Sharat Masetty:
>
>
> On 11/8/2018 8:11 PM, Koenig, Christian wrote:
>> Am 08.11.18 um 14:42 schrieb Sharat Masetty:
>>> Hi Christian,
>>>
>>> Can you please review this patch? It is a continuation of the 
>>> discussion at [1].
>>> At first I was thinking of using a cancel for suspend instead of a 
>>> mod(to an
>>> arbitrarily large value), but I couldn't get it to fit in as I have 
>>> an additional
>>> constraint of being able to call these functions from an IRQ context.
>>>
>>> These new functions race with other contexts, primarily finish_job(),
>>> timedout_job() and recovery(), but I did go through the possible 
>>> races between
>>> these(I think). Please let me know what you think of this? I have 
>>> not tested
>>> this yet and if this is something in the right direction, I will put 
>>> this
>>> through my testing drill and polish it.
>>>
>>> IMO I think I prefer the callback approach as it appears to be 
>>> simple, less
>>> error prone for both the scheduler and the drivers.
>>
>> Well I agree that this is way to complicated and looks racy to me as
>> well. But this is because you moved away from my initial suggestion.
>>
>> So here is once more how to do it without any additional locks or races:
>>
>> /**
>>    * drm_sched_suspend_timeout - suspend timeout for reset worker
>>    *
>>    * @sched: scheduler instance for which to suspend the timeout
>>    *
>>    * Suspend the delayed work timeout for the scheduler. Note that
>>    * this function can be called from an IRQ context. It returns the
>> timeout remaining.
>>    */
>> unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
>> {
>>
>>     unsigned long timeout, current = jiffies
>>
>>     timeout = sched->work_tdr.timer.expires;
>>
>>     /*
>>      * Set timeout to an arbitrarily large value, this also prevents 
>> the timer to be
>>      * started when new submissions arrive.
>>      */
>>     if (mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout 
>> * 10) &&
>>         time_after(timeout, current))
>>         return timeout - current;
>>     else
>>         return sched->timeout;
>> }
>>
>> /**
>>    * drm_sched_resume_timeout - resume timeout for reset worker
>>    *
>>    * @sched: scheduler instance for which to resume the timeout
>>    * @remaining: remaining timeout
>>    *
>>    * Resume the delayed work timeout for the scheduler. Note that
>>    * this function can be called from an IRQ context.
>>    */
>> void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched, 
>> unsigned long remaining)
>> {
>>     if (list_empty(&sched->ring_mirror_list))
>>         cancel_delayed_work(&sched->work_tdr);
>>     else
>>         mod_delayed_work(system_wq, &sched->work_tdr, remaining);
>> }
> Hi Christian,
>
> Thank you for the suggestions - I was able to try this out this week. 
> It works for the most part, but there are a couple of races which need 
> further considerations.
>
> 1) The drm_sched_resume_timeout() can race with both the 
> drm_sched_job_finish() and also new job submissions. In the driver the 
> job submission which triggered the preemption can be complete as soon 
> as the switch happens and it is quite possible that I get the 
> preemption complete and the job done interrupt at the same time. So 
> this means that drm_sched_resume_timeout() in IRQ context can race 
> with drm_sched_job_finish() in worker thread context on another CPU. 
> Also in parallel new jobs can be submitted, which will also update the 
> ring mirror list . These races can be addressed however with locking 
> the job_list_lock inside the drm_sched_resume_timeout().

Yeah I know, but I considered this race harmless. Ok, thinking more 
about that I realized that this probably means that we could time out a 
job way too early.

How about canceling the timer first and then using mod_delayed_work to 
set it to the remaining jiffies if there is a job running?

Otherwise as you noted as well the alternative is really to make the 
job_list_lock irq safe.

>
> 2) This one is a little more tricky - In the driver I start off with 
> all the timeouts suspended(except the one for the default ring), then 
> on every preemption interrupt I suspend the outgoing ring and resume 
> the incoming ring. I can only resume if it was previously suspended. 
> This is how it is set up. The problem that becomes apparent with this 
> approach is that for the suspended rings this arbitrarily large 
> timeout can fire at some point(because of no work) and just before 
> drm_sched_timedout_job() runs a new job can be inserted into the 
> mirror list. So in this case we get an incorrect timeout.
>
> What are your thoughts on using cancel_delayed_work() instead of mod 
> in suspend_timeout. Yes we will lose the benefits of mod, but we 
> should be able to synchronize drm_sched_suspend_timeout() and 
> drm_sched_start_timeout() with some basic state. I have not thought 
> this completely through so I may be missing something here.

Sounds simply like your arbitrarily large timeout is not large enough — or 
am I missing the problem here?

I just suggested regular timeout*10 because at least on our hardware we 
still want to have some timeout even if the ring is preempted, but you 
could also use MAX_SCHEDULE_TIMEOUT as well.

Christian.

>
> Please share your thoughts on this
>
> Thank you
>
> Sharat
>>
>>
>> Regards,
>> Christian.
>>
>>>
>>> [1]  https://patchwork.freedesktop.org/patch/259914/
>>>
>>> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
>>> ---
>>>    drivers/gpu/drm/scheduler/sched_main.c | 81 
>>> +++++++++++++++++++++++++++++++++-
>>>    include/drm/gpu_scheduler.h            |  5 +++
>>>    2 files changed, 85 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>> index c993d10..9645789 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -191,11 +191,84 @@ bool drm_sched_dependency_optimized(struct 
>>> dma_fence* fence,
>>>     */
>>>    static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
>>>    {
>>> +    unsigned long flags;
>>> +
>>> +    spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
>>> +
>>> +    sched->timeout_remaining = sched->timeout;
>>> +
>>>        if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>> -        !list_empty(&sched->ring_mirror_list))
>>> +        !list_empty(&sched->ring_mirror_list) && 
>>> !sched->work_tdr_suspended)
>>>            schedule_delayed_work(&sched->work_tdr, sched->timeout);
>>> +
>>> +    spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>>>    }
>>>
>>> +/**
>>> + * drm_sched_suspend_timeout - suspend timeout for reset worker
>>> + *
>>> + * @sched: scheduler instance for which to suspend the timeout
>>> + *
>>> + * Suspend the delayed work timeout for the scheduler. Note that
>>> + * this function can be called from an IRQ context.
>>> + */
>>> +void drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
>>> +{
>>> +    unsigned long flags, timeout;
>>> +
>>> +    spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
>>> +
>>> +    if (sched->work_tdr_suspended ||
>>> +            sched->timeout == MAX_SCHEDULE_TIMEOUT ||
>>> +            list_empty(&sched->ring_mirror_list))
>>> +        goto done;
>>> +
>>> +    timeout = sched->work_tdr.timer.expires;
>>> +
>>> +    /*
>>> +     * Reset timeout to an arbitrarily large value
>>> +     */
>>> +    mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout * 
>>> 10);
>>> +
>>> +    timeout -= jiffies;
>>> +
>>> +    /* FIXME: Can jiffies be after timeout? */
>>> +    sched->timeout_remaining = time_after(jiffies, timeout)? 0: 
>>> timeout;
>>> +    sched->work_tdr_suspended = true;
>>> +
>>> +done:
>>> +    spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>>> +}
>>> +EXPORT_SYMBOL(drm_sched_suspend_timeout);
>>> +
>>> +/**
>>> + * drm_sched_resume_timeout - resume timeout for reset worker
>>> + *
>>> + * @sched: scheduler instance for which to resume the timeout
>>> + *
>>> + * Resume the delayed work timeout for the scheduler. Note that
>>> + * this function can be called from an IRQ context.
>>> + */
>>> +void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched)
>>> +{
>>> +    unsigned long flags;
>>> +
>>> +    spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
>>> +
>>> +    if (!sched->work_tdr_suspended ||
>>> +            sched->timeout == MAX_SCHEDULE_TIMEOUT) {
>>> + spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>>> +        return;
>>> +    }
>>> +
>>> +    mod_delayed_work(system_wq, &sched->work_tdr, 
>>> sched->timeout_remaining);
>>> +
>>> +    sched->work_tdr_suspended = false;
>>> +
>>> +    spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>>> +}
>>> +EXPORT_SYMBOL(drm_sched_resume_timeout);
>>> +
>>>    /* job_finish is called after hw fence signaled
>>>     */
>>>    static void drm_sched_job_finish(struct work_struct *work)
>>> @@ -348,6 +421,11 @@ void drm_sched_job_recovery(struct 
>>> drm_gpu_scheduler *sched)
>>>        struct drm_sched_job *s_job, *tmp;
>>>        bool found_guilty = false;
>>>        int r;
>>> +    unsigned long flags;
>>> +
>>> +    spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
>>> +    sched->work_tdr_suspended = false;
>>> +    spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>>>
>>>        spin_lock(&sched->job_list_lock);
>>>        list_for_each_entry_safe(s_job, tmp, 
>>> &sched->ring_mirror_list, node) {
>>> @@ -607,6 +685,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>        init_waitqueue_head(&sched->job_scheduled);
>>>        INIT_LIST_HEAD(&sched->ring_mirror_list);
>>>        spin_lock_init(&sched->job_list_lock);
>>> +    spin_lock_init(&sched->tdr_suspend_lock);
>>>        atomic_set(&sched->hw_rq_count, 0);
>>>        INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>>        atomic_set(&sched->num_jobs, 0);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index d87b268..5d39572 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -278,6 +278,9 @@ struct drm_gpu_scheduler {
>>>        atomic_t            hw_rq_count;
>>>        atomic64_t            job_id_count;
>>>        struct delayed_work        work_tdr;
>>> +    unsigned long            timeout_remaining;
>>> +    bool                work_tdr_suspended;
>>> +    spinlock_t            tdr_suspend_lock;
>>>        struct task_struct        *thread;
>>>        struct list_head        ring_mirror_list;
>>>        spinlock_t            job_list_lock;
>>> @@ -300,6 +303,8 @@ void drm_sched_hw_job_reset(struct 
>>> drm_gpu_scheduler *sched,
>>>    bool drm_sched_dependency_optimized(struct dma_fence* fence,
>>>                        struct drm_sched_entity *entity);
>>>    void drm_sched_job_kickout(struct drm_sched_job *s_job);
>>> +void drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
>>> +void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched);
>>>
>>>    void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
>>>                     struct drm_sched_entity *entity);
>>> -- 
>>> 1.9.1
>>>
>>
>> _______________________________________________
>> Freedreno mailing list
>> Freedreno@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/freedreno
>>
>
Sharat Masetty Nov. 15, 2018, 9:36 a.m. UTC | #4
On 11/15/2018 12:33 AM, Koenig, Christian wrote:
> Am 14.11.18 um 18:29 schrieb Sharat Masetty:
>>
>>
>> On 11/8/2018 8:11 PM, Koenig, Christian wrote:
>>> Am 08.11.18 um 14:42 schrieb Sharat Masetty:
>>>> Hi Christian,
>>>>
>>>> Can you please review this patch? It is a continuation of the
>>>> discussion at [1].
>>>> At first I was thinking of using a cancel for suspend instead of a
>>>> mod(to an
>>>> arbitrarily large value), but I couldn't get it to fit in as I have
>>>> an additional
>>>> constraint of being able to call these functions from an IRQ context.
>>>>
>>>> These new functions race with other contexts, primarily finish_job(),
>>>> timedout_job() and recovery(), but I did go through the possible
>>>> races between
>>>> these(I think). Please let me know what you think of this? I have
>>>> not tested
>>>> this yet and if this is something in the right direction, I will put
>>>> this
>>>> through my testing drill and polish it.
>>>>
>>>> IMO I think I prefer the callback approach as it appears to be
>>>> simple, less
>>>> error prone for both the scheduler and the drivers.
>>>
>>> Well I agree that this is way to complicated and looks racy to me as
>>> well. But this is because you moved away from my initial suggestion.
>>>
>>> So here is once more how to do it without any additional locks or races:
>>>
>>> /**
>>>     * drm_sched_suspend_timeout - suspend timeout for reset worker
>>>     *
>>>     * @sched: scheduler instance for which to suspend the timeout
>>>     *
>>>     * Suspend the delayed work timeout for the scheduler. Note that
>>>     * this function can be called from an IRQ context. It returns the
>>> timeout remaining.
>>>     */
>>> unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
>>> {
>>>
>>>      unsigned long timeout, current = jiffies
>>>
>>>      timeout = sched->work_tdr.timer.expires;
>>>
>>>      /*
>>>       * Set timeout to an arbitrarily large value, this also prevents
>>> the timer to be
>>>       * started when new submissions arrive.
>>>       */
>>>      if (mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout
>>> * 10) &&
>>>          time_after(timeout, current))
>>>          return timeout - current;
>>>      else
>>>          return sched->timeout;
>>> }
>>>
>>> /**
>>>     * drm_sched_resume_timeout - resume timeout for reset worker
>>>     *
>>>     * @sched: scheduler instance for which to resume the timeout
>>>     * @remaining: remaining timeout
>>>     *
>>>     * Resume the delayed work timeout for the scheduler. Note that
>>>     * this function can be called from an IRQ context.
>>>     */
>>> void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
>>> unsigned long remaining)
>>> {
>>>      if (list_empty(&sched->ring_mirror_list))
>>>          cancel_delayed_work(&sched->work_tdr);
>>>      else
>>>          mod_delayed_work(system_wq, &sched->work_tdr, remaining);
>>> }
>> Hi Christian,
>>
>> Thank you for the suggestions - I was able to try this out this week.
>> It works for the most part, but there are a couple of races which need
>> further considerations.
>>
>> 1) The drm_sched_resume_timeout() can race with both the
>> drm_sched_job_finish() and also new job submissions. In the driver the
>> job submission which triggered the preemption can be complete as soon
>> as the switch happens and it is quite possible that I get the
>> preemption complete and the job done interrupt at the same time. So
>> this means that drm_sched_resume_timeout() in IRQ context can race
>> with drm_sched_job_finish() in worker thread context on another CPU.
>> Also in parallel new jobs can be submitted, which will also update the
>> ring mirror list . These races can be addressed however with locking
>> the job_list_lock inside the drm_sched_resume_timeout().
> 
> Yeah I know, but I considered this race harmless. Ok, thinking more
> about that I realized that this probably means that we could timeout a
> job way to early.
> 
> How about canceling the timer first and then using mod_delayed_work to
> set it to the remaining jiffies if there is a job running?
Do you mean something like this?

void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
                 unsigned long remaining)
{
         cancel_delayed_work(&sched->work_tdr);

         if (!list_empty(&sched->ring_mirror_list))
                 mod_delayed_work(system_wq, &sched->work_tdr, remaining);
}

I think that the above can still be racy and I'd prefer locking with the 
job_list_lock...
> 
> Otherwise as you noted as well the alternative is really to make the
> job_list_lock irq safe.
> 
>>
>> 2) This one is a little more tricky - In the driver I start off with
>> all the timeouts suspended(except the one for the default ring), then
>> on every preemption interrupt I suspend the outgoing ring and resume
>> the incoming ring. I can only resume if it was previously suspended.
>> This is how it is set up. The problem that becomes apparent with this
>> approach is that for the suspended rings this arbitrarily large
>> timeout can fire at some point(because of no work) and just before
>> drm_sched_timedout_job() runs a new job can be inserted into the
>> mirror list. So in this case we get an incorrect timeout.
>>
>> What are your thoughts on using cancel_delayed_work() instead of mod
>> in suspend_timeout. Yes we will lose the benefits of mod, but we
>> should be able to synchronize drm_sched_suspend_timeout() and
>> drm_sched_start_timeout() with some basic state. I have not thought
>> this completely through so I may be missing something here.
> 
> Sounds simply like your arbitrarily large timeout is not large enough or
> do I miss the problem here?
> 
> I just suggested regular timeout*10 because at least on our hardware we
> still want to have some timeout even if the ring is preempted, but you
> could also use MAX_SCHEDULE_TIMEOUT as well.
Okay, yeah using MAX_SCHEDULE_TIMEOUT is a good idea.

I think I will converge on this by adding the job_list_lock to resume 
and using MAX_SCHEDULE_TIMEOUT for mod_delayed_work() in 
drm_sched_suspend_timeout(). I will try this out.

Sharat
> 
> Christian.
> 
>>
>> Please share your thoughts on this
>>
>> Thank you
>>
>> Sharat
>>>
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> [1]  https://patchwork.freedesktop.org/patch/259914/
>>>>
>>>> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
>>>> ---
>>>>     drivers/gpu/drm/scheduler/sched_main.c | 81
>>>> +++++++++++++++++++++++++++++++++-
>>>>     include/drm/gpu_scheduler.h            |  5 +++
>>>>     2 files changed, 85 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>> index c993d10..9645789 100644
>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>> @@ -191,11 +191,84 @@ bool drm_sched_dependency_optimized(struct
>>>> dma_fence* fence,
>>>>      */
>>>>     static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
>>>>     {
>>>> +    unsigned long flags;
>>>> +
>>>> +    spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
>>>> +
>>>> +    sched->timeout_remaining = sched->timeout;
>>>> +
>>>>         if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>>>> -        !list_empty(&sched->ring_mirror_list))
>>>> +        !list_empty(&sched->ring_mirror_list) &&
>>>> !sched->work_tdr_suspended)
>>>>             schedule_delayed_work(&sched->work_tdr, sched->timeout);
>>>> +
>>>> +    spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>>>>     }
>>>>
>>>> +/**
>>>> + * drm_sched_suspend_timeout - suspend timeout for reset worker
>>>> + *
>>>> + * @sched: scheduler instance for which to suspend the timeout
>>>> + *
>>>> + * Suspend the delayed work timeout for the scheduler. Note that
>>>> + * this function can be called from an IRQ context.
>>>> + */
>>>> +void drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
>>>> +{
>>>> +    unsigned long flags, timeout;
>>>> +
>>>> +    spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
>>>> +
>>>> +    if (sched->work_tdr_suspended ||
>>>> +            sched->timeout == MAX_SCHEDULE_TIMEOUT ||
>>>> +            list_empty(&sched->ring_mirror_list))
>>>> +        goto done;
>>>> +
>>>> +    timeout = sched->work_tdr.timer.expires;
>>>> +
>>>> +    /*
>>>> +     * Reset timeout to an arbitrarily large value
>>>> +     */
>>>> +    mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout *
>>>> 10);
>>>> +
>>>> +    timeout -= jiffies;
>>>> +
>>>> +    /* FIXME: Can jiffies be after timeout? */
>>>> +    sched->timeout_remaining = time_after(jiffies, timeout)? 0:
>>>> timeout;
>>>> +    sched->work_tdr_suspended = true;
>>>> +
>>>> +done:
>>>> +    spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>>>> +}
>>>> +EXPORT_SYMBOL(drm_sched_suspend_timeout);
>>>> +
>>>> +/**
>>>> + * drm_sched_resume_timeout - resume timeout for reset worker
>>>> + *
>>>> + * @sched: scheduler instance for which to resume the timeout
>>>> + *
>>>> + * Resume the delayed work timeout for the scheduler. Note that
>>>> + * this function can be called from an IRQ context.
>>>> + */
>>>> +void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched)
>>>> +{
>>>> +    unsigned long flags;
>>>> +
>>>> +    spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
>>>> +
>>>> +    if (!sched->work_tdr_suspended ||
>>>> +            sched->timeout == MAX_SCHEDULE_TIMEOUT) {
>>>> + spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    mod_delayed_work(system_wq, &sched->work_tdr,
>>>> sched->timeout_remaining);
>>>> +
>>>> +    sched->work_tdr_suspended = false;
>>>> +
>>>> +    spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>>>> +}
>>>> +EXPORT_SYMBOL(drm_sched_resume_timeout);
>>>> +
>>>>     /* job_finish is called after hw fence signaled
>>>>      */
>>>>     static void drm_sched_job_finish(struct work_struct *work)
>>>> @@ -348,6 +421,11 @@ void drm_sched_job_recovery(struct
>>>> drm_gpu_scheduler *sched)
>>>>         struct drm_sched_job *s_job, *tmp;
>>>>         bool found_guilty = false;
>>>>         int r;
>>>> +    unsigned long flags;
>>>> +
>>>> +    spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
>>>> +    sched->work_tdr_suspended = false;
>>>> +    spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
>>>>
>>>>         spin_lock(&sched->job_list_lock);
>>>>         list_for_each_entry_safe(s_job, tmp,
>>>> &sched->ring_mirror_list, node) {
>>>> @@ -607,6 +685,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>>         init_waitqueue_head(&sched->job_scheduled);
>>>>         INIT_LIST_HEAD(&sched->ring_mirror_list);
>>>>         spin_lock_init(&sched->job_list_lock);
>>>> +    spin_lock_init(&sched->tdr_suspend_lock);
>>>>         atomic_set(&sched->hw_rq_count, 0);
>>>>         INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>>>>         atomic_set(&sched->num_jobs, 0);
>>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>>> index d87b268..5d39572 100644
>>>> --- a/include/drm/gpu_scheduler.h
>>>> +++ b/include/drm/gpu_scheduler.h
>>>> @@ -278,6 +278,9 @@ struct drm_gpu_scheduler {
>>>>         atomic_t            hw_rq_count;
>>>>         atomic64_t            job_id_count;
>>>>         struct delayed_work        work_tdr;
>>>> +    unsigned long            timeout_remaining;
>>>> +    bool                work_tdr_suspended;
>>>> +    spinlock_t            tdr_suspend_lock;
>>>>         struct task_struct        *thread;
>>>>         struct list_head        ring_mirror_list;
>>>>         spinlock_t            job_list_lock;
>>>> @@ -300,6 +303,8 @@ void drm_sched_hw_job_reset(struct
>>>> drm_gpu_scheduler *sched,
>>>>     bool drm_sched_dependency_optimized(struct dma_fence* fence,
>>>>                         struct drm_sched_entity *entity);
>>>>     void drm_sched_job_kickout(struct drm_sched_job *s_job);
>>>> +void drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
>>>> +void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched);
>>>>
>>>>     void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
>>>>                      struct drm_sched_entity *entity);
>>>> -- 
>>>> 1.9.1
>>>>
>>>
>>> _______________________________________________
>>> Freedreno mailing list
>>> Freedreno@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/freedreno
>>>
>>
> 
> _______________________________________________
> Freedreno mailing list
> Freedreno@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/freedreno
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index c993d10..9645789 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -191,11 +191,84 @@  bool drm_sched_dependency_optimized(struct dma_fence* fence,
  */
 static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
+
+	sched->timeout_remaining = sched->timeout;
+
 	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-	    !list_empty(&sched->ring_mirror_list))
+	    !list_empty(&sched->ring_mirror_list) && !sched->work_tdr_suspended)
 		schedule_delayed_work(&sched->work_tdr, sched->timeout);
+
+	spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
 }

+/**
+ * drm_sched_suspend_timeout - suspend timeout for reset worker
+ *
+ * @sched: scheduler instance for which to suspend the timeout
+ *
+ * Suspend the delayed work timeout for the scheduler. Note that
+ * this function can be called from an IRQ context.
+ */
+void drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
+{
+	unsigned long flags, timeout;
+
+	spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
+
+	if (sched->work_tdr_suspended ||
+			sched->timeout == MAX_SCHEDULE_TIMEOUT ||
+			list_empty(&sched->ring_mirror_list))
+		goto done;
+
+	timeout = sched->work_tdr.timer.expires;
+
+	/*
+	 * Reset timeout to an arbitrarily large value
+	 */
+	mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout * 10);
+
+	timeout -= jiffies;
+
+	/* FIXME: Can jiffies be after timeout? */
+	sched->timeout_remaining = time_after(jiffies, timeout)? 0: timeout;
+	sched->work_tdr_suspended = true;
+
+done:
+	spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
+}
+EXPORT_SYMBOL(drm_sched_suspend_timeout);
+
+/**
+ * drm_sched_resume_timeout - resume timeout for reset worker
+ *
+ * @sched: scheduler instance for which to resume the timeout
+ *
+ * Resume the delayed work timeout for the scheduler. Note that
+ * this function can be called from an IRQ context.
+ */
+void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
+
+	if (!sched->work_tdr_suspended ||
+			sched->timeout == MAX_SCHEDULE_TIMEOUT) {
+		spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
+		return;
+	}
+
+	mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout_remaining);
+
+	sched->work_tdr_suspended = false;
+
+	spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);
+}
+EXPORT_SYMBOL(drm_sched_resume_timeout);
+
 /* job_finish is called after hw fence signaled
  */
 static void drm_sched_job_finish(struct work_struct *work)
@@ -348,6 +421,11 @@  void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
 	struct drm_sched_job *s_job, *tmp;
 	bool found_guilty = false;
 	int r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sched->tdr_suspend_lock, flags);
+	sched->work_tdr_suspended = false;
+	spin_unlock_irqrestore(&sched->tdr_suspend_lock, flags);

 	spin_lock(&sched->job_list_lock);
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
@@ -607,6 +685,7 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 	init_waitqueue_head(&sched->job_scheduled);
 	INIT_LIST_HEAD(&sched->ring_mirror_list);
 	spin_lock_init(&sched->job_list_lock);
+	spin_lock_init(&sched->tdr_suspend_lock);
 	atomic_set(&sched->hw_rq_count, 0);
 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
 	atomic_set(&sched->num_jobs, 0);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index d87b268..5d39572 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -278,6 +278,9 @@  struct drm_gpu_scheduler {
 	atomic_t			hw_rq_count;
 	atomic64_t			job_id_count;
 	struct delayed_work		work_tdr;
+	unsigned long			timeout_remaining;
+	bool				work_tdr_suspended;
+	spinlock_t			tdr_suspend_lock;
 	struct task_struct		*thread;
 	struct list_head		ring_mirror_list;
 	spinlock_t			job_list_lock;
@@ -300,6 +303,8 @@  void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched,
 bool drm_sched_dependency_optimized(struct dma_fence* fence,
 				    struct drm_sched_entity *entity);
 void drm_sched_job_kickout(struct drm_sched_job *s_job);
+void drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
+void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched);

 void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
 			     struct drm_sched_entity *entity);