
drm/panfrost: Fix a race in the job timeout handling (again)

Message ID 20201026153206.97037-1-boris.brezillon@collabora.com (mailing list archive)
State New, archived
Series drm/panfrost: Fix a race in the job timeout handling (again)

Commit Message

Boris Brezillon Oct. 26, 2020, 3:32 p.m. UTC
In our last attempt to fix races in the panfrost_job_timedout() path we
overlooked the case where a re-submitted job immediately triggers a
fault. This led to a situation where we try to stop a scheduler that's
not resumed yet and lose the 'timedout' event without restarting the
timeout, thus blocking the whole queue.

Let's fix that by tracking timeouts occurring between the
drm_sched_resubmit_jobs() and drm_sched_start() calls.

Fixes: 1a11a88cfd9a ("drm/panfrost: Fix job timeout handling")
Cc: <stable@vger.kernel.org>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panfrost/panfrost_job.c | 42 ++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 8 deletions(-)

Comments

Steven Price Oct. 26, 2020, 4:16 p.m. UTC | #1
On 26/10/2020 15:32, Boris Brezillon wrote:
> In our last attempt to fix races in the panfrost_job_timedout() path we
> overlooked the case where a re-submitted job immediately triggers a
> fault. This led to a situation where we try to stop a scheduler that's
> not resumed yet and lose the 'timedout' event without restarting the
> timeout, thus blocking the whole queue.
> 
> Let's fix that by tracking timeouts occurring between the
> drm_sched_resubmit_jobs() and drm_sched_start() calls.
> 
> Fixes: 1a11a88cfd9a ("drm/panfrost: Fix job timeout handling")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
>   drivers/gpu/drm/panfrost/panfrost_job.c | 42 ++++++++++++++++++++-----
>   1 file changed, 34 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
> index d0469e944143..96c2c21a4205 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_job.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
> @@ -26,6 +26,7 @@
>   struct panfrost_queue_state {
>   	struct drm_gpu_scheduler sched;
>   	bool stopped;
> +	bool timedout;
>   	struct mutex lock;
>   	u64 fence_context;
>   	u64 emit_seqno;
> @@ -383,11 +384,33 @@ static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
>   		queue->stopped = true;
>   		stopped = true;
>   	}
> +	queue->timedout = true;
>   	mutex_unlock(&queue->lock);
>   
>   	return stopped;
>   }
>   
> +static void panfrost_scheduler_start(struct panfrost_queue_state *queue)
> +{
> +	if (WARN_ON(!queue->stopped))

I *think* this can be hit, see below.

> +		return;
> +
> +	mutex_lock(&queue->lock);
> +	drm_sched_start(&queue->sched, true);
> +
> +	/*
> +	 * We might have missed fault-timeouts (AKA immediate timeouts) while
> +	 * the scheduler was stopped. Let's fake a new fault to trigger an
> +	 * immediate reset.
> +	 */
> +	if (queue->timedout)
> +		drm_sched_fault(&queue->sched);
> +
> +	queue->timedout = false;
> +	queue->stopped = false;
> +	mutex_unlock(&queue->lock);
> +}
> +
>   static void panfrost_job_timedout(struct drm_sched_job *sched_job)
>   {
>   	struct panfrost_job *job = to_panfrost_job(sched_job);
> @@ -437,12 +460,6 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
>   		 */
>   		if (panfrost_scheduler_stop(&pfdev->js->queue[i], NULL))
>   			cancel_delayed_work_sync(&sched->work_tdr);
> -
> -		/*
> -		 * Now that we cancelled the pending timeouts, we can safely
> -		 * reset the stopped state.
> -		 */
> -		pfdev->js->queue[i].stopped = false;
>   	}
>   
>   	spin_lock_irqsave(&pfdev->js->job_lock, flags);
> @@ -457,14 +474,23 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
>   
>   	panfrost_device_reset(pfdev);
>   
> -	for (i = 0; i < NUM_JOB_SLOTS; i++)
> +	for (i = 0; i < NUM_JOB_SLOTS; i++) {
> +		/*
> +		 * The GPU is idle, and the scheduler is stopped, we can safely
> +		 * reset the ->timedout state without taking any lock. We need
> +		 * to do that before calling drm_sched_resubmit_jobs() though,
> +		 * because the resubmission might trigger immediate faults
> +		 * which we want to catch.
> +		 */
> +		pfdev->js->queue[i].timedout = false;
>   		drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
> +	}
>   
>   	mutex_unlock(&pfdev->reset_lock);

At this point we've resubmitted the jobs and are no longer holding the
mutex. So AFAICT, if one of those jobs faults we may re-enter
panfrost_job_timedout() and stop the scheduler (a no-op, since it is
already stopped). The first thread could then proceed to start the
scheduler (possibly during the GPU reset handled by the second thread,
which could be interesting in itself), followed by the second thread
attempting to start the scheduler, which then hits the WARN_ON().
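
Concretely, the interleaving I have in mind (illustrative only, using
the helpers added by this patch):

  thread A (handles the first timeout)    thread B (fault from a resubmitted job)
  ------------------------------------    ---------------------------------------
  drm_sched_resubmit_jobs()
  mutex_unlock(&pfdev->reset_lock)
                                          panfrost_job_timedout()
                                          stops the scheduler (no-op, already stopped)
  panfrost_scheduler_start()
    -> queue->stopped = false
                                          GPU reset, then
                                          panfrost_scheduler_start()
                                            -> WARN_ON(!queue->stopped)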

Of course the above requires somewhat crazy scheduling, but I can't see 
anything preventing it from happening. Am I missing something?

Steve

>   
>   	/* restart scheduler after GPU is usable again */
>   	for (i = 0; i < NUM_JOB_SLOTS; i++)
> -		drm_sched_start(&pfdev->js->queue[i].sched, true);
> +		panfrost_scheduler_start(&pfdev->js->queue[i]);
>   }
>   
>   static const struct drm_sched_backend_ops panfrost_sched_ops = {
>
Boris Brezillon Oct. 26, 2020, 5:31 p.m. UTC | #2
On Mon, 26 Oct 2020 16:16:49 +0000
Steven Price <steven.price@arm.com> wrote:

> On 26/10/2020 15:32, Boris Brezillon wrote:
> > In our last attempt to fix races in the panfrost_job_timedout() path we
> > overlooked the case where a re-submitted job immediately triggers a
> > fault. This led to a situation where we try to stop a scheduler that's
> > not resumed yet and lose the 'timedout' event without restarting the
> > timeout, thus blocking the whole queue.
> > 
> > Let's fix that by tracking timeouts occurring between the
> > drm_sched_resubmit_jobs() and drm_sched_start() calls.
> > 
> > Fixes: 1a11a88cfd9a ("drm/panfrost: Fix job timeout handling")
> > Cc: <stable@vger.kernel.org>
> > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > ---
> >   drivers/gpu/drm/panfrost/panfrost_job.c | 42 ++++++++++++++++++++-----
> >   1 file changed, 34 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
> > index d0469e944143..96c2c21a4205 100644
> > --- a/drivers/gpu/drm/panfrost/panfrost_job.c
> > +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
> > @@ -26,6 +26,7 @@
> >   struct panfrost_queue_state {
> >   	struct drm_gpu_scheduler sched;
> >   	bool stopped;
> > +	bool timedout;
> >   	struct mutex lock;
> >   	u64 fence_context;
> >   	u64 emit_seqno;
> > @@ -383,11 +384,33 @@ static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
> >   		queue->stopped = true;
> >   		stopped = true;
> >   	}
> > +	queue->timedout = true;
> >   	mutex_unlock(&queue->lock);
> >   
> >   	return stopped;
> >   }
> >   
> > +static void panfrost_scheduler_start(struct panfrost_queue_state *queue)
> > +{
> > +	if (WARN_ON(!queue->stopped))  
> 
> I *think* this can be hit, see below.
> 
> > +		return;
> > +
> > +	mutex_lock(&queue->lock);
> > +	drm_sched_start(&queue->sched, true);
> > +
> > +	/*
> > +	 * We might have missed fault-timeouts (AKA immediate timeouts) while
> > +	 * the scheduler was stopped. Let's fake a new fault to trigger an
> > +	 * immediate reset.
> > +	 */
> > +	if (queue->timedout)
> > +		drm_sched_fault(&queue->sched);
> > +
> > +	queue->timedout = false;
> > +	queue->stopped = false;
> > +	mutex_unlock(&queue->lock);
> > +}
> > +
> >   static void panfrost_job_timedout(struct drm_sched_job *sched_job)
> >   {
> >   	struct panfrost_job *job = to_panfrost_job(sched_job);
> > @@ -437,12 +460,6 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
> >   		 */
> >   		if (panfrost_scheduler_stop(&pfdev->js->queue[i], NULL))
> >   			cancel_delayed_work_sync(&sched->work_tdr);
> > -
> > -		/*
> > -		 * Now that we cancelled the pending timeouts, we can safely
> > -		 * reset the stopped state.
> > -		 */
> > -		pfdev->js->queue[i].stopped = false;
> >   	}
> >   
> >   	spin_lock_irqsave(&pfdev->js->job_lock, flags);
> > @@ -457,14 +474,23 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
> >   
> >   	panfrost_device_reset(pfdev);
> >   
> > -	for (i = 0; i < NUM_JOB_SLOTS; i++)
> > +	for (i = 0; i < NUM_JOB_SLOTS; i++) {
> > +		/*
> > +		 * The GPU is idle, and the scheduler is stopped, we can safely
> > +		 * reset the ->timedout state without taking any lock. We need
> > +		 * to do that before calling drm_sched_resubmit_jobs() though,
> > +		 * because the resubmission might trigger immediate faults
> > +		 * which we want to catch.
> > +		 */
> > +		pfdev->js->queue[i].timedout = false;
> >   		drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
> > +	}
> >   
> >   	mutex_unlock(&pfdev->reset_lock);  
> 
> At this point we've resubmitted the jobs and are no longer holding the
> mutex. So AFAICT, if one of those jobs faults we may re-enter
> panfrost_job_timedout() and stop the scheduler (a no-op, since it is
> already stopped).

Actually, we won't even try to stop the scheduler, because the
scheduler is still marked as 'stopped', and we bail out early in that
case.
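
For reference, the bail-out in question is the panfrost_scheduler_stop()
check at the top of panfrost_job_timedout(). Roughly (paraphrased from
the existing code; this patch doesn't change that check):

	static void panfrost_job_timedout(struct drm_sched_job *sched_job)
	{
		struct panfrost_job *job = to_panfrost_job(sched_job);
		struct panfrost_device *pfdev = job->pfdev;
		int js = panfrost_job_get_slot(job);

		...

		/*
		 * The queue was already stopped by another timeout handler,
		 * which owns the reset and will restart the scheduler once
		 * the GPU is usable again: nothing left to do here.
		 */
		if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job))
			return;

		/* reset path: stop the other queues, reset the GPU,
		 * resubmit the jobs, restart the schedulers */
	}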

> The first thread could then proceed to start the scheduler (possibly
> during the GPU reset handled by the second thread, which could be
> interesting in itself), followed by the second thread attempting to
> start the scheduler, which then hits the WARN_ON().

Right, one of the queues might be started while another thread
(attached to a different queue) is resetting the GPU, but I'm wondering
if that's really an issue. The thread resetting the GPU will wait for
all pending timeout handlers to return before proceeding (the
cancel_delayed_work_sync() call). Things are then serialized until we
call drm_sched_resubmit_jobs(), which restarts the bad jobs and might
lead to new faults. But the queues are still marked as stopped between
drm_sched_resubmit_jobs() and panfrost_scheduler_start(), meaning that
the timeout handlers are effectively no-ops during that window.

This being said, I agree that the current implementation is
cumbersome, and I'd prefer to have something simpler if we can.

Patch

diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index d0469e944143..96c2c21a4205 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -26,6 +26,7 @@ 
 struct panfrost_queue_state {
 	struct drm_gpu_scheduler sched;
 	bool stopped;
+	bool timedout;
 	struct mutex lock;
 	u64 fence_context;
 	u64 emit_seqno;
@@ -383,11 +384,33 @@  static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
 		queue->stopped = true;
 		stopped = true;
 	}
+	queue->timedout = true;
 	mutex_unlock(&queue->lock);
 
 	return stopped;
 }
 
+static void panfrost_scheduler_start(struct panfrost_queue_state *queue)
+{
+	if (WARN_ON(!queue->stopped))
+		return;
+
+	mutex_lock(&queue->lock);
+	drm_sched_start(&queue->sched, true);
+
+	/*
+	 * We might have missed fault-timeouts (AKA immediate timeouts) while
+	 * the scheduler was stopped. Let's fake a new fault to trigger an
+	 * immediate reset.
+	 */
+	if (queue->timedout)
+		drm_sched_fault(&queue->sched);
+
+	queue->timedout = false;
+	queue->stopped = false;
+	mutex_unlock(&queue->lock);
+}
+
 static void panfrost_job_timedout(struct drm_sched_job *sched_job)
 {
 	struct panfrost_job *job = to_panfrost_job(sched_job);
@@ -437,12 +460,6 @@  static void panfrost_job_timedout(struct drm_sched_job *sched_job)
 		 */
 		if (panfrost_scheduler_stop(&pfdev->js->queue[i], NULL))
 			cancel_delayed_work_sync(&sched->work_tdr);
-
-		/*
-		 * Now that we cancelled the pending timeouts, we can safely
-		 * reset the stopped state.
-		 */
-		pfdev->js->queue[i].stopped = false;
 	}
 
 	spin_lock_irqsave(&pfdev->js->job_lock, flags);
@@ -457,14 +474,23 @@  static void panfrost_job_timedout(struct drm_sched_job *sched_job)
 
 	panfrost_device_reset(pfdev);
 
-	for (i = 0; i < NUM_JOB_SLOTS; i++)
+	for (i = 0; i < NUM_JOB_SLOTS; i++) {
+		/*
+		 * The GPU is idle, and the scheduler is stopped, we can safely
+		 * reset the ->timedout state without taking any lock. We need
+		 * to do that before calling drm_sched_resubmit_jobs() though,
+		 * because the resubmission might trigger immediate faults
+		 * which we want to catch.
+		 */
+		pfdev->js->queue[i].timedout = false;
 		drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
+	}
 
 	mutex_unlock(&pfdev->reset_lock);
 
 	/* restart scheduler after GPU is usable again */
 	for (i = 0; i < NUM_JOB_SLOTS; i++)
-		drm_sched_start(&pfdev->js->queue[i].sched, true);
+		panfrost_scheduler_start(&pfdev->js->queue[i]);
 }
 
 static const struct drm_sched_backend_ops panfrost_sched_ops = {