
[v2,5/8] drm/lima: handle spurious timeouts due to high irq latency

Message ID 20240124025947.2110659-6-nunes.erico@gmail.com (mailing list archive)
State New, archived
Series drm/lima: fixes and improvements to error recovery

Commit Message

Erico Nunes Jan. 24, 2024, 2:59 a.m. UTC
There are several unexplained and unreproduced cases of rendering
timeouts with lima, for which one theory is high IRQ latency coming from
somewhere else in the system.
This kind of occurrence may cause applications to trigger unnecessary
GPU resets, or even to hang if they hit an issue in the recovery path.
Panfrost already does some special handling to account for such
"spurious timeouts"; it makes sense to have this in lima too to reduce
the chance that it hits users.

Signed-off-by: Erico Nunes <nunes.erico@gmail.com>
---
 drivers/gpu/drm/lima/lima_sched.c | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)
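
In short, the change makes lima_sched_timedout_job() bail out early when the
job's fence has already signaled, and check once more after synchronizing the
GP/PP interrupt handlers. A condensed sketch of the resulting flow (simplified
from the full diff at the bottom of this page, not the literal driver code):

static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job)
{
	struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
	struct lima_sched_task *task = to_lima_task(job);
	struct lima_ip *ip = pipe->processor[0];
	int i;

	/* The fence already signaled: the timeout is spurious, nothing to recover. */
	if (dma_fence_is_signaled(task->fence)) {
		DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

	/*
	 * The interrupt may simply not have been serviced yet, e.g. because
	 * another IRQ handler hogged the CPU. Wait for any in-flight GP/PP
	 * handlers to finish, then check the fence again.
	 */
	for (i = 0; i < pipe->num_processor; i++)
		synchronize_irq(pipe->processor[i]->irq);

	if (dma_fence_is_signaled(task->fence)) {
		DRM_WARN("%s unexpectedly high interrupt latency\n", lima_ip_name(ip));
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

	/* Otherwise this is a real timeout; continue with the existing reset path. */
	...
}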

Comments

Qiang Yu Jan. 24, 2024, 12:38 p.m. UTC | #1
On Wed, Jan 24, 2024 at 11:00 AM Erico Nunes <nunes.erico@gmail.com> wrote:
>
> There are several unexplained and unreproduced cases of rendering
> timeouts with lima, for which one theory is high IRQ latency coming from
> somewhere else in the system.
> This kind of occurrence may cause applications to trigger unnecessary
> GPU resets, or even to hang if they hit an issue in the recovery path.
> Panfrost already does some special handling to account for such
> "spurious timeouts"; it makes sense to have this in lima too to reduce
> the chance that it hits users.
>
> Signed-off-by: Erico Nunes <nunes.erico@gmail.com>
> ---
>  drivers/gpu/drm/lima/lima_sched.c | 31 ++++++++++++++++++++++++++++---
>  1 file changed, 28 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> index c3bf8cda8498..814428564637 100644
> --- a/drivers/gpu/drm/lima/lima_sched.c
> +++ b/drivers/gpu/drm/lima/lima_sched.c
> @@ -1,6 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0 OR MIT
>  /* Copyright 2017-2019 Qiang Yu <yuq825@gmail.com> */
>
> +#include <linux/hardirq.h>
>  #include <linux/iosys-map.h>
>  #include <linux/kthread.h>
>  #include <linux/slab.h>
> @@ -401,9 +402,35 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
>         struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
>         struct lima_sched_task *task = to_lima_task(job);
>         struct lima_device *ldev = pipe->ldev;
> +       struct lima_ip *ip = pipe->processor[0];
> +       int i;
> +
> +       /*
> +        * If the GPU managed to complete this job's fence, the timeout is
> +        * spurious. Bail out.
> +        */
> +       if (dma_fence_is_signaled(task->fence)) {
> +               DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
> +               return DRM_GPU_SCHED_STAT_NOMINAL;
> +       }
> +
> +       /*
> +        * Lima IRQ handler may take a long time to process an interrupt
> +        * if there is another IRQ handler hogging the processing.
> +        * In order to catch such cases and not report spurious Lima job
> +        * timeouts, synchronize the IRQ handler and re-check the fence
> +        * status.
> +        */
> +       for (i = 0; i < pipe->num_processor; i++)
> +               synchronize_irq(pipe->processor[i]->irq);
> +
I have a question: this timeout handler will also be called on a GP/PP error IRQ.
If we call synchronize_irq() from within the IRQ handler, won't we block ourselves here?

> +       if (dma_fence_is_signaled(task->fence)) {
> +               DRM_WARN("%s unexpectedly high interrupt latency\n", lima_ip_name(ip));
> +               return DRM_GPU_SCHED_STAT_NOMINAL;
> +       }
>
>         if (!pipe->error)
> -               DRM_ERROR("lima job timeout\n");
> +               DRM_ERROR("%s job timeout\n", lima_ip_name(ip));
>
>         drm_sched_stop(&pipe->base, &task->base);
>
> @@ -417,8 +444,6 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
>         if (pipe->bcast_mmu)
>                 lima_mmu_page_fault_resume(pipe->bcast_mmu);
>         else {
> -               int i;
> -
>                 for (i = 0; i < pipe->num_mmu; i++)
>                         lima_mmu_page_fault_resume(pipe->mmu[i]);
>         }
> --
> 2.43.0
>
Erico Nunes Jan. 29, 2024, 10:55 p.m. UTC | #2
On Wed, Jan 24, 2024 at 1:38 PM Qiang Yu <yuq825@gmail.com> wrote:
>
> On Wed, Jan 24, 2024 at 11:00 AM Erico Nunes <nunes.erico@gmail.com> wrote:
> >
> > There are several unexplained and unreproduced cases of rendering
> > timeouts with lima, for which one theory is high IRQ latency coming from
> > somewhere else in the system.
> > This kind of occurrence may cause applications to trigger unnecessary
> > GPU resets, or even to hang if they hit an issue in the recovery path.
> > Panfrost already does some special handling to account for such
> > "spurious timeouts"; it makes sense to have this in lima too to reduce
> > the chance that it hits users.
> >
> > Signed-off-by: Erico Nunes <nunes.erico@gmail.com>
> > ---
> >  drivers/gpu/drm/lima/lima_sched.c | 31 ++++++++++++++++++++++++++++---
> >  1 file changed, 28 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> > index c3bf8cda8498..814428564637 100644
> > --- a/drivers/gpu/drm/lima/lima_sched.c
> > +++ b/drivers/gpu/drm/lima/lima_sched.c
> > @@ -1,6 +1,7 @@
> >  // SPDX-License-Identifier: GPL-2.0 OR MIT
> >  /* Copyright 2017-2019 Qiang Yu <yuq825@gmail.com> */
> >
> > +#include <linux/hardirq.h>
> >  #include <linux/iosys-map.h>
> >  #include <linux/kthread.h>
> >  #include <linux/slab.h>
> > @@ -401,9 +402,35 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
> >         struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
> >         struct lima_sched_task *task = to_lima_task(job);
> >         struct lima_device *ldev = pipe->ldev;
> > +       struct lima_ip *ip = pipe->processor[0];
> > +       int i;
> > +
> > +       /*
> > +        * If the GPU managed to complete this job's fence, the timeout is
> > +        * spurious. Bail out.
> > +        */
> > +       if (dma_fence_is_signaled(task->fence)) {
> > +               DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
> > +               return DRM_GPU_SCHED_STAT_NOMINAL;
> > +       }
> > +
> > +       /*
> > +        * Lima IRQ handler may take a long time to process an interrupt
> > +        * if there is another IRQ handler hogging the processing.
> > +        * In order to catch such cases and not report spurious Lima job
> > +        * timeouts, synchronize the IRQ handler and re-check the fence
> > +        * status.
> > +        */
> > +       for (i = 0; i < pipe->num_processor; i++)
> > +               synchronize_irq(pipe->processor[i]->irq);
> > +
> I have a question: this timeout handler will also be called on a GP/PP error IRQ.
> If we call synchronize_irq() from within the IRQ handler, won't we block ourselves here?

If I understand correctly, this handler is only called by drm_sched from
a workqueue, not from the gp or pp IRQ, and it does not run in any IRQ
context.
So I think this sort of lockup can't happen here.

I ran some additional tests with both timeouts and actual error IRQs
(locally modified Mesa to produce some errored jobs) and was not able
to cause any lockup related to this.

Erico
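
(For context on the question above: synchronize_irq() only waits for any
handler currently executing on that IRQ line to return, so it is safe to call
from process context such as the scheduler's timeout workqueue, but it would
wait on itself if called from inside that same hard IRQ handler. A minimal
illustration with hypothetical names, not lima code:

/* Safe: timedout_job() runs from the drm_sched timeout workqueue (process context). */
static enum drm_gpu_sched_stat foo_timedout_job(struct drm_sched_job *job)
{
	struct foo_device *foo = to_foo_device(job->sched);	/* hypothetical helper */

	/* Only waits for an in-flight foo_irq_handler() on another CPU to return. */
	synchronize_irq(foo->irq);
	return DRM_GPU_SCHED_STAT_NOMINAL;
}

/* Not safe: the handler would wait for its own completion. */
static irqreturn_t foo_irq_handler(int irq, void *data)
{
	synchronize_irq(irq);	/* never makes progress: waits for this very handler */
	return IRQ_HANDLED;
}
)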
Qiang Yu Jan. 30, 2024, 1:05 a.m. UTC | #3
On Tue, Jan 30, 2024 at 6:55 AM Erico Nunes <nunes.erico@gmail.com> wrote:
>
> On Wed, Jan 24, 2024 at 1:38 PM Qiang Yu <yuq825@gmail.com> wrote:
> >
> > On Wed, Jan 24, 2024 at 11:00 AM Erico Nunes <nunes.erico@gmail.com> wrote:
> > >
> > > There are several unexplained and unreproduced cases of rendering
> > > timeouts with lima, for which one theory is high IRQ latency coming from
> > > somewhere else in the system.
> > > This kind of occurrence may cause applications to trigger unnecessary
> > > GPU resets, or even to hang if they hit an issue in the recovery path.
> > > Panfrost already does some special handling to account for such
> > > "spurious timeouts"; it makes sense to have this in lima too to reduce
> > > the chance that it hits users.
> > >
> > > Signed-off-by: Erico Nunes <nunes.erico@gmail.com>
> > > ---
> > >  drivers/gpu/drm/lima/lima_sched.c | 31 ++++++++++++++++++++++++++++---
> > >  1 file changed, 28 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> > > index c3bf8cda8498..814428564637 100644
> > > --- a/drivers/gpu/drm/lima/lima_sched.c
> > > +++ b/drivers/gpu/drm/lima/lima_sched.c
> > > @@ -1,6 +1,7 @@
> > >  // SPDX-License-Identifier: GPL-2.0 OR MIT
> > >  /* Copyright 2017-2019 Qiang Yu <yuq825@gmail.com> */
> > >
> > > +#include <linux/hardirq.h>
> > >  #include <linux/iosys-map.h>
> > >  #include <linux/kthread.h>
> > >  #include <linux/slab.h>
> > > @@ -401,9 +402,35 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
> > >         struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
> > >         struct lima_sched_task *task = to_lima_task(job);
> > >         struct lima_device *ldev = pipe->ldev;
> > > +       struct lima_ip *ip = pipe->processor[0];
> > > +       int i;
> > > +
> > > +       /*
> > > +        * If the GPU managed to complete this job's fence, the timeout is
> > > +        * spurious. Bail out.
> > > +        */
> > > +       if (dma_fence_is_signaled(task->fence)) {
> > > +               DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
> > > +               return DRM_GPU_SCHED_STAT_NOMINAL;
> > > +       }
> > > +
> > > +       /*
> > > +        * Lima IRQ handler may take a long time to process an interrupt
> > > +        * if there is another IRQ handler hogging the processing.
> > > +        * In order to catch such cases and not report spurious Lima job
> > > +        * timeouts, synchronize the IRQ handler and re-check the fence
> > > +        * status.
> > > +        */
> > > +       for (i = 0; i < pipe->num_processor; i++)
> > > +               synchronize_irq(pipe->processor[i]->irq);
> > > +
> > I have a question: this timeout handler will also be called on a GP/PP error IRQ.
> > If we call synchronize_irq() from within the IRQ handler, won't we block ourselves here?
>
> If I understand correctly, this handler is only called by drm_sched from
> a workqueue, not from the gp or pp IRQ, and it does not run in any IRQ
> context.
> So I think this sort of lockup can't happen here.
>
Oh, right. I misunderstood drm_sched_fault(), which still calls the timeout
handler from a work queue instead of the caller thread.

> I ran some additional tests with both timeouts and actual error IRQs
> (locally modified Mesa to produce some errored jobs) and was not able
> to cause any lockup related to this.
>
> Erico
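
(For reference, drm_sched_fault() does not invoke the timeout handler
synchronously in the caller's context; it only queues the scheduler's timeout
work, and timedout_job() then runs from that workqueue. Roughly, paraphrased
and simplified rather than an exact copy of the scheduler code of this era:

void drm_sched_fault(struct drm_gpu_scheduler *sched)
{
	/* Kick the timeout work immediately; timedout_job() runs from there. */
	mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0);
}
)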

Patch

diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
index c3bf8cda8498..814428564637 100644
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -1,6 +1,7 @@ 
 // SPDX-License-Identifier: GPL-2.0 OR MIT
 /* Copyright 2017-2019 Qiang Yu <yuq825@gmail.com> */
 
+#include <linux/hardirq.h>
 #include <linux/iosys-map.h>
 #include <linux/kthread.h>
 #include <linux/slab.h>
@@ -401,9 +402,35 @@  static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
 	struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
 	struct lima_sched_task *task = to_lima_task(job);
 	struct lima_device *ldev = pipe->ldev;
+	struct lima_ip *ip = pipe->processor[0];
+	int i;
+
+	/*
+	 * If the GPU managed to complete this job's fence, the timeout is
+	 * spurious. Bail out.
+	 */
+	if (dma_fence_is_signaled(task->fence)) {
+		DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
+		return DRM_GPU_SCHED_STAT_NOMINAL;
+	}
+
+	/*
+	 * Lima IRQ handler may take a long time to process an interrupt
+	 * if there is another IRQ handler hogging the processing.
+	 * In order to catch such cases and not report spurious Lima job
+	 * timeouts, synchronize the IRQ handler and re-check the fence
+	 * status.
+	 */
+	for (i = 0; i < pipe->num_processor; i++)
+		synchronize_irq(pipe->processor[i]->irq);
+
+	if (dma_fence_is_signaled(task->fence)) {
+		DRM_WARN("%s unexpectedly high interrupt latency\n", lima_ip_name(ip));
+		return DRM_GPU_SCHED_STAT_NOMINAL;
+	}
 
 	if (!pipe->error)
-		DRM_ERROR("lima job timeout\n");
+		DRM_ERROR("%s job timeout\n", lima_ip_name(ip));
 
 	drm_sched_stop(&pipe->base, &task->base);
 
@@ -417,8 +444,6 @@  static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
 	if (pipe->bcast_mmu)
 		lima_mmu_page_fault_resume(pipe->bcast_mmu);
 	else {
-		int i;
-
 		for (i = 0; i < pipe->num_mmu; i++)
 			lima_mmu_page_fault_resume(pipe->mmu[i]);
 	}