
[v1,4/6] drm/lima: handle spurious timeouts due to high irq latency

Message ID 20240117031212.1104034-5-nunes.erico@gmail.com (mailing list archive)
State New, archived
Series drm/lima: fixes and improvements to error recovery

Commit Message

Erico Nunes Jan. 17, 2024, 3:12 a.m. UTC
There are several unexplained and unreproduced cases of rendering
timeouts with lima, for which one theory is high IRQ latency coming from
somewhere else in the system.
This kind of occurrence may cause applications to trigger unnecessary
GPU resets, or even to hang if they hit an issue in the recovery path.
Panfrost already does some special handling to account for such
"spurious timeouts"; it makes sense to have this in lima too, to reduce
the chance that it hits users.

Signed-off-by: Erico Nunes <nunes.erico@gmail.com>
---
 drivers/gpu/drm/lima/lima_sched.c | 32 ++++++++++++++++++++++++++-----
 drivers/gpu/drm/lima/lima_sched.h |  2 ++
 2 files changed, 29 insertions(+), 5 deletions(-)

Comments

Vasily Khoruzhick Jan. 17, 2024, 6:26 p.m. UTC | #1
On Tue, Jan 16, 2024 at 7:12 PM Erico Nunes <nunes.erico@gmail.com> wrote:
>
> There are several unexplained and unreproduced cases of rendering
> timeouts with lima, for which one theory is high IRQ latency coming from
> somewhere else in the system.
> This kind of occurrence may cause applications to trigger unnecessary
> GPU resets, or even to hang if they hit an issue in the recovery path.
> Panfrost already does some special handling to account for such
> "spurious timeouts"; it makes sense to have this in lima too, to reduce
> the chance that it hits users.
>
> Signed-off-by: Erico Nunes <nunes.erico@gmail.com>
Reviewed-by: Vasily Khoruzhick <anarsoul@gmail.com>

> ---
>  drivers/gpu/drm/lima/lima_sched.c | 32 ++++++++++++++++++++++++++-----
>  drivers/gpu/drm/lima/lima_sched.h |  2 ++
>  2 files changed, 29 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> index 66317296d831..9449b81bcd5b 100644
> --- a/drivers/gpu/drm/lima/lima_sched.c
> +++ b/drivers/gpu/drm/lima/lima_sched.c
> @@ -1,6 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0 OR MIT
>  /* Copyright 2017-2019 Qiang Yu <yuq825@gmail.com> */
>
> +#include <linux/hardirq.h>
>  #include <linux/iosys-map.h>
>  #include <linux/kthread.h>
>  #include <linux/slab.h>
> @@ -223,10 +224,7 @@ static struct dma_fence *lima_sched_run_job(struct drm_sched_job *job)
>
>         task->fence = &fence->base;
>
> -       /* for caller usage of the fence, otherwise irq handler
> -        * may consume the fence before caller use it
> -        */
> -       dma_fence_get(task->fence);
> +       task->done_fence = dma_fence_get(task->fence);
>
>         pipe->current_task = task;
>
> @@ -401,9 +399,33 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
>         struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
>         struct lima_sched_task *task = to_lima_task(job);
>         struct lima_device *ldev = pipe->ldev;
> +       struct lima_ip *ip = pipe->processor[0];
> +
> +       /*
> +        * If the GPU managed to complete this job's fence, the timeout is
> +        * spurious. Bail out.
> +        */
> +       if (dma_fence_is_signaled(task->done_fence)) {
> +               DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
> +               return DRM_GPU_SCHED_STAT_NOMINAL;
> +       }
> +
> +       /*
> +        * Lima IRQ handler may take a long time to process an interrupt
> +        * if there is another IRQ handler hogging the processing.
> +        * In order to catch such cases and not report spurious Lima job
> +        * timeouts, synchronize the IRQ handler and re-check the fence
> +        * status.
> +        */
> +       synchronize_irq(ip->irq);
> +
> +       if (dma_fence_is_signaled(task->done_fence)) {
> +               DRM_WARN("%s unexpectedly high interrupt latency\n", lima_ip_name(ip));
> +               return DRM_GPU_SCHED_STAT_NOMINAL;
> +       }
>
>         if (!pipe->error)
> -               DRM_ERROR("lima job timeout\n");
> +               DRM_ERROR("%s lima job timeout\n", lima_ip_name(ip));
>
>         drm_sched_stop(&pipe->base, &task->base);
>
> diff --git a/drivers/gpu/drm/lima/lima_sched.h b/drivers/gpu/drm/lima/lima_sched.h
> index 6a11764d87b3..34050facb110 100644
> --- a/drivers/gpu/drm/lima/lima_sched.h
> +++ b/drivers/gpu/drm/lima/lima_sched.h
> @@ -29,6 +29,8 @@ struct lima_sched_task {
>         bool recoverable;
>         struct lima_bo *heap;
>
> +       struct dma_fence *done_fence;
> +
>         /* pipe fence */
>         struct dma_fence *fence;
>  };
> --
> 2.43.0
>
Qiang Yu Jan. 18, 2024, 2:46 a.m. UTC | #2
On Wed, Jan 17, 2024 at 11:12 AM Erico Nunes <nunes.erico@gmail.com> wrote:
>
> There are several unexplained and unreproduced cases of rendering
> timeouts with lima, for which one theory is high IRQ latency coming from
> somewhere else in the system.
> This kind of occurrence may cause applications to trigger unnecessary
> GPU resets, or even to hang if they hit an issue in the recovery path.
> Panfrost already does some special handling to account for such
> "spurious timeouts"; it makes sense to have this in lima too, to reduce
> the chance that it hits users.
>
> Signed-off-by: Erico Nunes <nunes.erico@gmail.com>
> ---
>  drivers/gpu/drm/lima/lima_sched.c | 32 ++++++++++++++++++++++++++-----
>  drivers/gpu/drm/lima/lima_sched.h |  2 ++
>  2 files changed, 29 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> index 66317296d831..9449b81bcd5b 100644
> --- a/drivers/gpu/drm/lima/lima_sched.c
> +++ b/drivers/gpu/drm/lima/lima_sched.c
> @@ -1,6 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0 OR MIT
>  /* Copyright 2017-2019 Qiang Yu <yuq825@gmail.com> */
>
> +#include <linux/hardirq.h>
>  #include <linux/iosys-map.h>
>  #include <linux/kthread.h>
>  #include <linux/slab.h>
> @@ -223,10 +224,7 @@ static struct dma_fence *lima_sched_run_job(struct drm_sched_job *job)
>
>         task->fence = &fence->base;
>
> -       /* for caller usage of the fence, otherwise irq handler
> -        * may consume the fence before caller use it
> -        */
> -       dma_fence_get(task->fence);
> +       task->done_fence = dma_fence_get(task->fence);
>
>         pipe->current_task = task;
>
> @@ -401,9 +399,33 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
>         struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
>         struct lima_sched_task *task = to_lima_task(job);
>         struct lima_device *ldev = pipe->ldev;
> +       struct lima_ip *ip = pipe->processor[0];
> +
> +       /*
> +        * If the GPU managed to complete this job's fence, the timeout is
> +        * spurious. Bail out.
> +        */
> +       if (dma_fence_is_signaled(task->done_fence)) {
> +               DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
> +               return DRM_GPU_SCHED_STAT_NOMINAL;
> +       }
> +
> +       /*
> +        * Lima IRQ handler may take a long time to process an interrupt
> +        * if there is another IRQ handler hogging the processing.
> +        * In order to catch such cases and not report spurious Lima job
> +        * timeouts, synchronize the IRQ handler and re-check the fence
> +        * status.
> +        */
> +       synchronize_irq(ip->irq);
> +
> +       if (dma_fence_is_signaled(task->done_fence)) {
> +               DRM_WARN("%s unexpectedly high interrupt latency\n", lima_ip_name(ip));
> +               return DRM_GPU_SCHED_STAT_NOMINAL;
> +       }
>
>         if (!pipe->error)
> -               DRM_ERROR("lima job timeout\n");
> +               DRM_ERROR("%s lima job timeout\n", lima_ip_name(ip));
>
>         drm_sched_stop(&pipe->base, &task->base);
>
> diff --git a/drivers/gpu/drm/lima/lima_sched.h b/drivers/gpu/drm/lima/lima_sched.h
> index 6a11764d87b3..34050facb110 100644
> --- a/drivers/gpu/drm/lima/lima_sched.h
> +++ b/drivers/gpu/drm/lima/lima_sched.h
> @@ -29,6 +29,8 @@ struct lima_sched_task {
>         bool recoverable;
>         struct lima_bo *heap;
>
> +       struct dma_fence *done_fence;
This is the same as the following fence, do we really need a duplicated one?

> +
>         /* pipe fence */
>         struct dma_fence *fence;
>  };
> --
> 2.43.0
>
Erico Nunes Jan. 18, 2024, 11:38 a.m. UTC | #3
On Thu, Jan 18, 2024 at 3:46 AM Qiang Yu <yuq825@gmail.com> wrote:
>
> On Wed, Jan 17, 2024 at 11:12 AM Erico Nunes <nunes.erico@gmail.com> wrote:
> > diff --git a/drivers/gpu/drm/lima/lima_sched.h b/drivers/gpu/drm/lima/lima_sched.h
> > index 6a11764d87b3..34050facb110 100644
> > --- a/drivers/gpu/drm/lima/lima_sched.h
> > +++ b/drivers/gpu/drm/lima/lima_sched.h
> > @@ -29,6 +29,8 @@ struct lima_sched_task {
> >         bool recoverable;
> >         struct lima_bo *heap;
> >
> > +       struct dma_fence *done_fence;
> This is the same as the following fence, do we really need a duplicated one?

Checking again now, I think we can reuse the existing one.
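
A minimal sketch of what that could look like in lima_sched_timedout_job(),
reusing the reference already taken in lima_sched_run_job() instead of adding
a new field (not the posted patch, just an illustration):

	/* reuse the pipe fence instead of adding a done_fence field */
	if (dma_fence_is_signaled(task->fence)) {
		DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}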
Qiang Yu Jan. 19, 2024, 1:43 a.m. UTC | #4
On Wed, Jan 17, 2024 at 11:12 AM Erico Nunes <nunes.erico@gmail.com> wrote:
>
> There are several unexplained and unreproduced cases of rendering
> timeouts with lima, for which one theory is high IRQ latency coming from
> somewhere else in the system.
> This kind of occurrence may cause applications to trigger unnecessary
> GPU resets, or even to hang if they hit an issue in the recovery path.
> Panfrost already does some special handling to account for such
> "spurious timeouts"; it makes sense to have this in lima too, to reduce
> the chance that it hits users.
>
> Signed-off-by: Erico Nunes <nunes.erico@gmail.com>
> ---
>  drivers/gpu/drm/lima/lima_sched.c | 32 ++++++++++++++++++++++++++-----
>  drivers/gpu/drm/lima/lima_sched.h |  2 ++
>  2 files changed, 29 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> index 66317296d831..9449b81bcd5b 100644
> --- a/drivers/gpu/drm/lima/lima_sched.c
> +++ b/drivers/gpu/drm/lima/lima_sched.c
> @@ -1,6 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0 OR MIT
>  /* Copyright 2017-2019 Qiang Yu <yuq825@gmail.com> */
>
> +#include <linux/hardirq.h>
>  #include <linux/iosys-map.h>
>  #include <linux/kthread.h>
>  #include <linux/slab.h>
> @@ -223,10 +224,7 @@ static struct dma_fence *lima_sched_run_job(struct drm_sched_job *job)
>
>         task->fence = &fence->base;
>
> -       /* for caller usage of the fence, otherwise irq handler
> -        * may consume the fence before caller use it
> -        */
> -       dma_fence_get(task->fence);
> +       task->done_fence = dma_fence_get(task->fence);
>
>         pipe->current_task = task;
>
> @@ -401,9 +399,33 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
>         struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
>         struct lima_sched_task *task = to_lima_task(job);
>         struct lima_device *ldev = pipe->ldev;
> +       struct lima_ip *ip = pipe->processor[0];
> +
> +       /*
> +        * If the GPU managed to complete this job's fence, the timeout is
> +        * spurious. Bail out.
> +        */
> +       if (dma_fence_is_signaled(task->done_fence)) {
> +               DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
> +               return DRM_GPU_SCHED_STAT_NOMINAL;
> +       }
> +
You may just remove this check and leave the check after the sync irq.

> +       /*
> +        * Lima IRQ handler may take a long time to process an interrupt
> +        * if there is another IRQ handler hogging the processing.
> +        * In order to catch such cases and not report spurious Lima job
> +        * timeouts, synchronize the IRQ handler and re-check the fence
> +        * status.
> +        */
> +       synchronize_irq(ip->irq);
This should be done after drm_sched_stop() to prevent the drm scheduler
from running more jobs. And we need to disable the GP/PP interrupts so
that no more running jobs trigger a fresh INT.

PP may have more than one IRQ, so we need to wait on all of them.
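
A minimal sketch of waiting on every processor IRQ in the pipe rather than
only processor[0]; the field names pipe->num_processor and
pipe->processor[i]->irq are taken from the lima structures as they appear
today and should be double-checked against the tree:

	#include <linux/interrupt.h>

	static void lima_sched_pipe_sync_irqs(struct lima_sched_pipe *pipe)
	{
		int i;

		/* a pp pipe may drive several processors, each with its own IRQ */
		for (i = 0; i < pipe->num_processor; i++)
			synchronize_irq(pipe->processor[i]->irq);
	}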

> +
> +       if (dma_fence_is_signaled(task->done_fence)) {
> +               DRM_WARN("%s unexpectedly high interrupt latency\n", lima_ip_name(ip));
> +               return DRM_GPU_SCHED_STAT_NOMINAL;
> +       }
>
>         if (!pipe->error)
> -               DRM_ERROR("lima job timeout\n");
> +               DRM_ERROR("%s lima job timeout\n", lima_ip_name(ip));
>
>         drm_sched_stop(&pipe->base, &task->base);
>
> diff --git a/drivers/gpu/drm/lima/lima_sched.h b/drivers/gpu/drm/lima/lima_sched.h
> index 6a11764d87b3..34050facb110 100644
> --- a/drivers/gpu/drm/lima/lima_sched.h
> +++ b/drivers/gpu/drm/lima/lima_sched.h
> @@ -29,6 +29,8 @@ struct lima_sched_task {
>         bool recoverable;
>         struct lima_bo *heap;
>
> +       struct dma_fence *done_fence;
> +
>         /* pipe fence */
>         struct dma_fence *fence;
>  };
> --
> 2.43.0
>
Qiang Yu Jan. 21, 2024, 3:04 a.m. UTC | #5
On Fri, Jan 19, 2024 at 9:43 AM Qiang Yu <yuq825@gmail.com> wrote:
>
> On Wed, Jan 17, 2024 at 11:12 AM Erico Nunes <nunes.erico@gmail.com> wrote:
> >
> > There are several unexplained and unreproduced cases of rendering
> > timeouts with lima, for which one theory is high IRQ latency coming from
> > somewhere else in the system.
> > This kind of occurrence may cause applications to trigger unnecessary
> > GPU resets, or even to hang if they hit an issue in the recovery path.
> > Panfrost already does some special handling to account for such
> > "spurious timeouts"; it makes sense to have this in lima too, to reduce
> > the chance that it hits users.
> >
> > Signed-off-by: Erico Nunes <nunes.erico@gmail.com>
> > ---
> >  drivers/gpu/drm/lima/lima_sched.c | 32 ++++++++++++++++++++++++++-----
> >  drivers/gpu/drm/lima/lima_sched.h |  2 ++
> >  2 files changed, 29 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
> > index 66317296d831..9449b81bcd5b 100644
> > --- a/drivers/gpu/drm/lima/lima_sched.c
> > +++ b/drivers/gpu/drm/lima/lima_sched.c
> > @@ -1,6 +1,7 @@
> >  // SPDX-License-Identifier: GPL-2.0 OR MIT
> >  /* Copyright 2017-2019 Qiang Yu <yuq825@gmail.com> */
> >
> > +#include <linux/hardirq.h>
> >  #include <linux/iosys-map.h>
> >  #include <linux/kthread.h>
> >  #include <linux/slab.h>
> > @@ -223,10 +224,7 @@ static struct dma_fence *lima_sched_run_job(struct drm_sched_job *job)
> >
> >         task->fence = &fence->base;
> >
> > -       /* for caller usage of the fence, otherwise irq handler
> > -        * may consume the fence before caller use it
> > -        */
> > -       dma_fence_get(task->fence);
> > +       task->done_fence = dma_fence_get(task->fence);
> >
> >         pipe->current_task = task;
> >
> > @@ -401,9 +399,33 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
> >         struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
> >         struct lima_sched_task *task = to_lima_task(job);
> >         struct lima_device *ldev = pipe->ldev;
> > +       struct lima_ip *ip = pipe->processor[0];
> > +
> > +       /*
> > +        * If the GPU managed to complete this job's fence, the timeout is
> > +        * spurious. Bail out.
> > +        */
> > +       if (dma_fence_is_signaled(task->done_fence)) {
> > +               DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
> > +               return DRM_GPU_SCHED_STAT_NOMINAL;
> > +       }
> > +
> You may just remove this check and leave the check after the sync irq.
>
After thinking about it more, this is only about handling spurious timeouts
more efficiently, not about totally reliable timeout handling. So this check
should be OK.

> > +       /*
> > +        * Lima IRQ handler may take a long time to process an interrupt
> > +        * if there is another IRQ handler hogging the processing.
> > +        * In order to catch such cases and not report spurious Lima job
> > +        * timeouts, synchronize the IRQ handler and re-check the fence
> > +        * status.
> > +        */
> > +       synchronize_irq(ip->irq);
> This should be done after drm_sched_stop() to prevent the drm scheduler
> from running more jobs. And we need to disable the GP/PP interrupts so
> that no more running jobs trigger a fresh INT.
This is OK too. We just need to solve reliable timeout handling after
drm_sched_stop() in another patch.

>
> PP may have more than one IRQ, so we need to wait on all of them.
>
> > +
> > +       if (dma_fence_is_signaled(task->done_fence)) {
> > +               DRM_WARN("%s unexpectedly high interrupt latency\n", lima_ip_name(ip));
> > +               return DRM_GPU_SCHED_STAT_NOMINAL;
> > +       }
> >
> >         if (!pipe->error)
> > -               DRM_ERROR("lima job timeout\n");
> > +               DRM_ERROR("%s lima job timeout\n", lima_ip_name(ip));
> >
> >         drm_sched_stop(&pipe->base, &task->base);
> >
> > diff --git a/drivers/gpu/drm/lima/lima_sched.h b/drivers/gpu/drm/lima/lima_sched.h
> > index 6a11764d87b3..34050facb110 100644
> > --- a/drivers/gpu/drm/lima/lima_sched.h
> > +++ b/drivers/gpu/drm/lima/lima_sched.h
> > @@ -29,6 +29,8 @@ struct lima_sched_task {
> >         bool recoverable;
> >         struct lima_bo *heap;
> >
> > +       struct dma_fence *done_fence;
> > +
> >         /* pipe fence */
> >         struct dma_fence *fence;
> >  };
> > --
> > 2.43.0
> >
Hillf Danton Jan. 21, 2024, 9:56 a.m. UTC | #6
On Wed, 17 Jan 2024 04:12:10 +0100 Erico Nunes <nunes.erico@gmail.com>
>  
> @@ -401,9 +399,33 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
>  	struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
>  	struct lima_sched_task *task = to_lima_task(job);
>  	struct lima_device *ldev = pipe->ldev;
> +	struct lima_ip *ip = pipe->processor[0];
> +
> +	/*
> +	 * If the GPU managed to complete this job's fence, the timeout is
> +	 * spurious. Bail out.
> +	 */
> +	if (dma_fence_is_signaled(task->done_fence)) {
> +		DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
> +		return DRM_GPU_SCHED_STAT_NOMINAL;
> +	}

Given the 500ms in lima_sched_pipe_init(), no timeout is spurious by
definition, so stop selling band-aids like this when you have options like
locating the reasons behind the timeout.
> +
> +	/*
> +	 * Lima IRQ handler may take a long time to process an interrupt
> +	 * if there is another IRQ handler hogging the processing.
> +	 * In order to catch such cases and not report spurious Lima job
> +	 * timeouts, synchronize the IRQ handler and re-check the fence
> +	 * status.
> +	 */
> +	synchronize_irq(ip->irq);
> +
> +	if (dma_fence_is_signaled(task->done_fence)) {
> +		DRM_WARN("%s unexpectedly high interrupt latency\n", lima_ip_name(ip));
> +		return DRM_GPU_SCHED_STAT_NOMINAL;
> +	}
>  
>  	if (!pipe->error)
> -		DRM_ERROR("lima job timeout\n");
> +		DRM_ERROR("%s lima job timeout\n", lima_ip_name(ip));
>  
>  	drm_sched_stop(&pipe->base, &task->base);
>
Qiang Yu Jan. 21, 2024, 11:20 a.m. UTC | #7
On Sun, Jan 21, 2024 at 5:56 PM Hillf Danton <hdanton@sina.com> wrote:
>
> On Wed, 17 Jan 2024 04:12:10 +0100 Erico Nunes <nunes.erico@gmail.com>
> >
> > @@ -401,9 +399,33 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
> >       struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
> >       struct lima_sched_task *task = to_lima_task(job);
> >       struct lima_device *ldev = pipe->ldev;
> > +     struct lima_ip *ip = pipe->processor[0];
> > +
> > +     /*
> > +      * If the GPU managed to complete this job's fence, the timeout is
> > +      * spurious. Bail out.
> > +      */
> > +     if (dma_fence_is_signaled(task->done_fence)) {
> > +             DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
> > +             return DRM_GPU_SCHED_STAT_NOMINAL;
> > +     }
>
> Given the 500ms in lima_sched_pipe_init(), no timeout is spurious by
> definition, so stop selling band-aids like this when you have options like
> locating the reasons behind the timeout.

This change does look like it aims at 2FPS apps. Maybe 500ms is too short
for weak Mali 4x0 GPUs (2FPS apps appear more likely). AMD/NV GPUs use a
10s timeout. So increasing the timeout seems to be an equivalent and better
way?

> > +
> > +     /*
> > +      * Lima IRQ handler may take a long time to process an interrupt
> > +      * if there is another IRQ handler hogging the processing.
> > +      * In order to catch such cases and not report spurious Lima job
> > +      * timeouts, synchronize the IRQ handler and re-check the fence
> > +      * status.
> > +      */
> > +     synchronize_irq(ip->irq);
> > +
> > +     if (dma_fence_is_signaled(task->done_fence)) {
> > +             DRM_WARN("%s unexpectedly high interrupt latency\n", lima_ip_name(ip));
> > +             return DRM_GPU_SCHED_STAT_NOMINAL;
> > +     }
> >
> >       if (!pipe->error)
> > -             DRM_ERROR("lima job timeout\n");
> > +             DRM_ERROR("%s lima job timeout\n", lima_ip_name(ip));
> >
> >       drm_sched_stop(&pipe->base, &task->base);
> >
Erico Nunes Jan. 21, 2024, 3:11 p.m. UTC | #8
On Sun, Jan 21, 2024 at 12:20 PM Qiang Yu <yuq825@gmail.com> wrote:
>
> On Sun, Jan 21, 2024 at 5:56 PM Hillf Danton <hdanton@sina.com> wrote:
> >
> > On Wed, 17 Jan 2024 04:12:10 +0100 Erico Nunes <nunes.erico@gmail.com>
> > >
> > > @@ -401,9 +399,33 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
> > >       struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
> > >       struct lima_sched_task *task = to_lima_task(job);
> > >       struct lima_device *ldev = pipe->ldev;
> > > +     struct lima_ip *ip = pipe->processor[0];
> > > +
> > > +     /*
> > > +      * If the GPU managed to complete this job's fence, the timeout is
> > > +      * spurious. Bail out.
> > > +      */
> > > +     if (dma_fence_is_signaled(task->done_fence)) {
> > > +             DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
> > > +             return DRM_GPU_SCHED_STAT_NOMINAL;
> > > +     }
> >
> > Given the 500ms in lima_sched_pipe_init(), no timeout is spurious by
> > definition, so stop selling band-aids like this when you have options like
> > locating the reasons behind the timeout.
>
> This change does look like it aims at 2FPS apps. Maybe 500ms is too short
> for weak Mali 4x0 GPUs (2FPS apps appear more likely). AMD/NV GPUs use a
> 10s timeout. So increasing the timeout seems to be an equivalent and better
> way?

Indeed 500ms might be too optimistic for the sort of applications that
users expect to run on this hardware currently. For a closer reference
though, other embedded drivers like v3d and panfrost still set it to
500ms. Note that this patch is exactly the same as what exists in
Panfrost today and was already discussed with some common arguments in
the patches of this series:
https://patchwork.freedesktop.org/series/120820/

But I would agree to bump the timeout to a higher value for lima. Some
distributions are already doing this with module parameters anyway to
even be able to run some more demanding application stacks on a Mali
400.
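
A rough sketch of how a larger default could be exposed as a module
parameter; the parameter name, the 500ms fallback and the helper below are
illustrative assumptions rather than the exact lima code:

	#include <linux/jiffies.h>
	#include <linux/moduleparam.h>

	/* 0 means "use the driver default"; a distribution could override it
	 * with e.g. lima.sched_timeout_ms=2000 on the kernel command line.
	 */
	static int lima_sched_timeout_ms;
	module_param_named(sched_timeout_ms, lima_sched_timeout_ms, int, 0444);
	MODULE_PARM_DESC(sched_timeout_ms, "job run timeout in ms (0 = default)");

	static unsigned long lima_sched_timeout(void)
	{
		unsigned int ms = lima_sched_timeout_ms > 0 ?
				  lima_sched_timeout_ms : 500;

		return msecs_to_jiffies(ms);
	}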

Another thing we might consider (probably in a followup patchset to
not delay these fixes forever for the people hitting this issue) is to
configure the Mali hardware watchdog to the value that we would like
as a timeout. That way we would get timeout jobs going through the
same error irq path as other hardware error jobs and might be able to
delete(?)/simplify this software timeout code.
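
Not working code, just to illustrate that direction: program the PP
watchdog with the desired timeout so a stuck job raises an error IRQ
instead of relying on the software timer. The register offset, the helper
name and the clock-to-cycles conversion are hypothetical placeholders that
would need checking against the Mali-4x0 docs and the lima register
definitions:

	#include <linux/io.h>
	#include <linux/limits.h>
	#include <linux/minmax.h>

	/* hypothetical offset, to be taken from lima_regs.h / the TRM */
	#define LIMA_PP_WATCHDOG_TIMEOUT	0x1080

	static void lima_pp_setup_watchdog(struct lima_ip *ip, u32 gpu_clk_khz,
					   u32 timeout_ms)
	{
		/* the watchdog counts GPU cycles, so scale by the core clock
		 * and clamp to the 32-bit register width
		 */
		u64 cycles = (u64)timeout_ms * gpu_clk_khz;

		writel(min_t(u64, cycles, U32_MAX),
		       ip->iomem + LIMA_PP_WATCHDOG_TIMEOUT);
	}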

In the meantime for v2 of this series I'll make the change to account
for the multiple pp irqs. So do you agree it is ok to leave
drm_sched_stop() as it is at least for this series?

Thanks all for the reviews

Erico
Qiang Yu Jan. 23, 2024, 1:18 a.m. UTC | #9
On Sun, Jan 21, 2024 at 11:11 PM Erico Nunes <nunes.erico@gmail.com> wrote:
>
> On Sun, Jan 21, 2024 at 12:20 PM Qiang Yu <yuq825@gmail.com> wrote:
> >
> > On Sun, Jan 21, 2024 at 5:56 PM Hillf Danton <hdanton@sina.com> wrote:
> > >
> > > On Wed, 17 Jan 2024 04:12:10 +0100 Erico Nunes <nunes.erico@gmail.com>
> > > >
> > > > @@ -401,9 +399,33 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
> > > >       struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
> > > >       struct lima_sched_task *task = to_lima_task(job);
> > > >       struct lima_device *ldev = pipe->ldev;
> > > > +     struct lima_ip *ip = pipe->processor[0];
> > > > +
> > > > +     /*
> > > > +      * If the GPU managed to complete this job's fence, the timeout is
> > > > +      * spurious. Bail out.
> > > > +      */
> > > > +     if (dma_fence_is_signaled(task->done_fence)) {
> > > > +             DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
> > > > +             return DRM_GPU_SCHED_STAT_NOMINAL;
> > > > +     }
> > >
> > > Given the 500ms in lima_sched_pipe_init(), no timeout is spurious by
> > > definition, so stop selling band-aids like this when you have options like
> > > locating the reasons behind the timeout.
> >
> > This change does look like it aims at 2FPS apps. Maybe 500ms is too short
> > for weak Mali 4x0 GPUs (2FPS apps appear more likely). AMD/NV GPUs use a
> > 10s timeout. So increasing the timeout seems to be an equivalent and better
> > way?
>
> Indeed 500ms might be too optimistic for the sort of applications that
> users expect to run on this hardware currently. For a closer reference
> though, other embedded drivers like v3d and panfrost still set it to
> 500ms. Note that this patch is exactly the same as what exists in
> Panfrost today and was already discussed with some common arguments in
> the patches of this series:
> https://patchwork.freedesktop.org/series/120820/
>
> But I would agree to bump the timeout to a higher value for lima. Some
> distributions are already doing this with module parameters anyway to
> even be able to run some more demanding application stacks on a Mali
> 400.
>
> Another thing we might consider (probably in a followup patchset to
> not delay these fixes forever for the people hitting this issue) is to
> configure the Mali hardware watchdog to the value that we would like
> as a timeout. That way we would get timeout jobs going through the
> same error irq path as other hardware error jobs and might be able to
> delete(?)/simplify this software timeout code.
>
This way should be much simpler and more stable.

> In the meantime for v2 of this series I'll make the change to account
> for the multiple pp irqs. So do you agree it is ok to leave
> drm_sched_stop() as it is at least for this series?
>
I'm OK with this.

> Thanks all for the reviews
>
> Erico

Patch

diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c
index 66317296d831..9449b81bcd5b 100644
--- a/drivers/gpu/drm/lima/lima_sched.c
+++ b/drivers/gpu/drm/lima/lima_sched.c
@@ -1,6 +1,7 @@ 
 // SPDX-License-Identifier: GPL-2.0 OR MIT
 /* Copyright 2017-2019 Qiang Yu <yuq825@gmail.com> */
 
+#include <linux/hardirq.h>
 #include <linux/iosys-map.h>
 #include <linux/kthread.h>
 #include <linux/slab.h>
@@ -223,10 +224,7 @@  static struct dma_fence *lima_sched_run_job(struct drm_sched_job *job)
 
 	task->fence = &fence->base;
 
-	/* for caller usage of the fence, otherwise irq handler
-	 * may consume the fence before caller use it
-	 */
-	dma_fence_get(task->fence);
+	task->done_fence = dma_fence_get(task->fence);
 
 	pipe->current_task = task;
 
@@ -401,9 +399,33 @@  static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job
 	struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
 	struct lima_sched_task *task = to_lima_task(job);
 	struct lima_device *ldev = pipe->ldev;
+	struct lima_ip *ip = pipe->processor[0];
+
+	/*
+	 * If the GPU managed to complete this job's fence, the timeout is
+	 * spurious. Bail out.
+	 */
+	if (dma_fence_is_signaled(task->done_fence)) {
+		DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
+		return DRM_GPU_SCHED_STAT_NOMINAL;
+	}
+
+	/*
+	 * Lima IRQ handler may take a long time to process an interrupt
+	 * if there is another IRQ handler hogging the processing.
+	 * In order to catch such cases and not report spurious Lima job
+	 * timeouts, synchronize the IRQ handler and re-check the fence
+	 * status.
+	 */
+	synchronize_irq(ip->irq);
+
+	if (dma_fence_is_signaled(task->done_fence)) {
+		DRM_WARN("%s unexpectedly high interrupt latency\n", lima_ip_name(ip));
+		return DRM_GPU_SCHED_STAT_NOMINAL;
+	}
 
 	if (!pipe->error)
-		DRM_ERROR("lima job timeout\n");
+		DRM_ERROR("%s lima job timeout\n", lima_ip_name(ip));
 
 	drm_sched_stop(&pipe->base, &task->base);
 
diff --git a/drivers/gpu/drm/lima/lima_sched.h b/drivers/gpu/drm/lima/lima_sched.h
index 6a11764d87b3..34050facb110 100644
--- a/drivers/gpu/drm/lima/lima_sched.h
+++ b/drivers/gpu/drm/lima/lima_sched.h
@@ -29,6 +29,8 @@  struct lima_sched_task {
 	bool recoverable;
 	struct lima_bo *heap;
 
+	struct dma_fence *done_fence;
+
 	/* pipe fence */
 	struct dma_fence *fence;
 };