Message ID | 20241211075419.2333731-6-boris.brezillon@collabora.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | drm/panthor: Be robust against failures in the resume path | expand |
On Wed, Dec 11, 2024 at 08:54:19AM +0100, Boris Brezillon wrote: > If we do a GPU soft-reset, that's no longer fast reset. This also means > the slow reset fallback doesn't work because the MCU state is only reset > after a GPU soft-reset. > > Let's move the retry logic to panthor_device_resume() to issue a > soft-reset between the fast and slow attempts, and patch > panthor_gpu_suspend() to only power-off the L2 when a fast reset is > requested. > > v3: > - No changes > > v2: > - Add R-b > > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> > Reviewed-by: Steven Price <steven.price@arm.com> For reasons that are not clear yet to me my v2 R-bs emails seem to have not reached you or the dri-devel mailing lists. > --- > drivers/gpu/drm/panthor/panthor_device.c | 32 ++++++++++---- > drivers/gpu/drm/panthor/panthor_device.h | 11 +++++ > drivers/gpu/drm/panthor/panthor_fw.c | 54 ++++++------------------ > drivers/gpu/drm/panthor/panthor_gpu.c | 11 ++--- > 4 files changed, 53 insertions(+), 55 deletions(-) > > diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c > index 0362101ea896..2c817e65e6be 100644 > --- a/drivers/gpu/drm/panthor/panthor_device.c > +++ b/drivers/gpu/drm/panthor/panthor_device.c > @@ -431,6 +431,22 @@ int panthor_device_mmap_io(struct panthor_device *ptdev, struct vm_area_struct * > return 0; > } > > +static int panthor_device_resume_hw_components(struct panthor_device *ptdev) > +{ > + int ret; > + > + panthor_gpu_resume(ptdev); > + panthor_mmu_resume(ptdev); > + > + ret = panthor_fw_resume(ptdev); > + if (!ret) > + return 0; > + > + panthor_mmu_suspend(ptdev); > + panthor_gpu_suspend(ptdev); > + return ret; My only comment was a nit here where I prefer the construct: if (ret) { panthor_mmu_suspend(ptdev); panthor_gpu_suspend(ptdev); } return ret; but feel free to ignore it. For the whole series: Reviewed-by: Liviu Dudau <liviu.dudau@arm.com> Best regards, Liviu > +} > + > int panthor_device_resume(struct device *dev) > { > struct panthor_device *ptdev = dev_get_drvdata(dev); > @@ -457,16 +473,16 @@ int panthor_device_resume(struct device *dev) > > if (panthor_device_is_initialized(ptdev) && > drm_dev_enter(&ptdev->base, &cookie)) { > - panthor_gpu_resume(ptdev); > - panthor_mmu_resume(ptdev); > - ret = panthor_fw_resume(ptdev); > - if (!drm_WARN_ON(&ptdev->base, ret)) { > - panthor_sched_resume(ptdev); > - } else { > - panthor_mmu_suspend(ptdev); > - panthor_gpu_suspend(ptdev); > + ret = panthor_device_resume_hw_components(ptdev); > + if (ret && ptdev->reset.fast) { > + drm_err(&ptdev->base, "Fast reset failed, trying a slow reset"); > + ptdev->reset.fast = false; > + ret = panthor_device_resume_hw_components(ptdev); > } > > + if (!ret) > + panthor_sched_resume(ptdev); > + > drm_dev_exit(cookie); > > if (ret) > diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h > index b6c4f25a5d6e..da6574021664 100644 > --- a/drivers/gpu/drm/panthor/panthor_device.h > +++ b/drivers/gpu/drm/panthor/panthor_device.h > @@ -157,6 +157,17 @@ struct panthor_device { > > /** @pending: Set to true if a reset is pending. */ > atomic_t pending; > + > + /** > + * @fast: True if the post_reset logic can proceed with a fast reset. > + * > + * A fast reset is just a reset where the driver doesn't reload the FW sections. > + * > + * Any time the firmware is properly suspended, a fast reset can take place. > + * On the other hand, if the halt operation failed, the driver will reload > + * all FW sections to make sure we start from a fresh state. > + */ > + bool fast; > } reset; > > /** @pm: Power management related data. */ > diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c > index 02789558788d..5b68dc02b5ce 100644 > --- a/drivers/gpu/drm/panthor/panthor_fw.c > +++ b/drivers/gpu/drm/panthor/panthor_fw.c > @@ -263,17 +263,6 @@ struct panthor_fw { > /** @booted: True is the FW is booted */ > bool booted; > > - /** > - * @fast_reset: True if the post_reset logic can proceed with a fast reset. > - * > - * A fast reset is just a reset where the driver doesn't reload the FW sections. > - * > - * Any time the firmware is properly suspended, a fast reset can take place. > - * On the other hand, if the halt operation failed, the driver will reload > - * all sections to make sure we start from a fresh state. > - */ > - bool fast_reset; > - > /** @irq: Job irq data. */ > struct panthor_irq irq; > }; > @@ -1090,7 +1079,7 @@ void panthor_fw_pre_reset(struct panthor_device *ptdev, bool on_hang) > /* Make sure we won't be woken up by a ping. */ > cancel_delayed_work_sync(&ptdev->fw->watchdog.ping_work); > > - ptdev->fw->fast_reset = false; > + ptdev->reset.fast = false; > > if (!on_hang) { > struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); > @@ -1100,7 +1089,7 @@ void panthor_fw_pre_reset(struct panthor_device *ptdev, bool on_hang) > gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1); > if (!readl_poll_timeout(ptdev->iomem + MCU_STATUS, status, > status == MCU_STATUS_HALT, 10, 100000)) { > - ptdev->fw->fast_reset = true; > + ptdev->reset.fast = true; > } else { > drm_warn(&ptdev->base, "Failed to cleanly suspend MCU"); > } > @@ -1125,49 +1114,30 @@ int panthor_fw_post_reset(struct panthor_device *ptdev) > if (ret) > return ret; > > - /* If this is a fast reset, try to start the MCU without reloading > - * the FW sections. If it fails, go for a full reset. > - */ > - if (ptdev->fw->fast_reset) { > + if (!ptdev->reset.fast) { > + /* On a slow reset, reload all sections, including RO ones. > + * We're not supposed to end up here anyway, let's just assume > + * the overhead of reloading everything is acceptable. > + */ > + panthor_reload_fw_sections(ptdev, true); > + } else { > /* The FW detects 0 -> 1 transitions. Make sure we reset > * the HALT bit before the FW is rebooted. > * This is not needed on a slow reset because FW sections are > * re-initialized. > */ > struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); > + > panthor_fw_update_reqs(glb_iface, req, 0, GLB_HALT); > - > - ret = panthor_fw_start(ptdev); > - if (!ret) > - goto out; > - > - /* Forcibly reset the MCU and force a slow reset, so we get a > - * fresh boot on the next panthor_fw_start() call. > - */ > - panthor_fw_stop(ptdev); > - ptdev->fw->fast_reset = false; > - drm_err(&ptdev->base, "FW fast reset failed, trying a slow reset"); > - > - ret = panthor_vm_flush_all(ptdev->fw->vm); > - if (ret) { > - drm_err(&ptdev->base, "FW slow reset failed (couldn't flush FW's AS l2cache)"); > - return ret; > - } > } > > - /* Reload all sections, including RO ones. We're not supposed > - * to end up here anyway, let's just assume the overhead of > - * reloading everything is acceptable. > - */ > - panthor_reload_fw_sections(ptdev, true); > - > ret = panthor_fw_start(ptdev); > if (ret) { > - drm_err(&ptdev->base, "FW slow reset failed (couldn't start the FW )"); > + drm_err(&ptdev->base, "FW %s reset failed", > + ptdev->reset.fast ? "fast" : "slow"); > return ret; > } > > -out: > /* We must re-initialize the global interface even on fast-reset. */ > panthor_fw_init_global_iface(ptdev); > return 0; > diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c > index ee85a371bc38..671049020afa 100644 > --- a/drivers/gpu/drm/panthor/panthor_gpu.c > +++ b/drivers/gpu/drm/panthor/panthor_gpu.c > @@ -470,11 +470,12 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev) > */ > void panthor_gpu_suspend(struct panthor_device *ptdev) > { > - /* > - * It may be preferable to simply power down the L2, but for now just > - * soft-reset which will leave the L2 powered down. > - */ > - panthor_gpu_soft_reset(ptdev); > + /* On a fast reset, simply power down the L2. */ > + if (!ptdev->reset.fast) > + panthor_gpu_soft_reset(ptdev); > + else > + panthor_gpu_power_off(ptdev, L2, 1, 20000); > + > panthor_gpu_irq_suspend(&ptdev->gpu->irq); > } > > -- > 2.47.0 >
On Wed, 11 Dec 2024 09:57:07 +0000 Liviu Dudau <liviu.dudau@arm.com> wrote: > On Wed, Dec 11, 2024 at 08:54:19AM +0100, Boris Brezillon wrote: > > If we do a GPU soft-reset, that's no longer fast reset. This also means > > the slow reset fallback doesn't work because the MCU state is only reset > > after a GPU soft-reset. > > > > Let's move the retry logic to panthor_device_resume() to issue a > > soft-reset between the fast and slow attempts, and patch > > panthor_gpu_suspend() to only power-off the L2 when a fast reset is > > requested. > > > > v3: > > - No changes > > > > v2: > > - Add R-b > > > > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> > > Reviewed-by: Steven Price <steven.price@arm.com> > > For reasons that are not clear yet to me my v2 R-bs emails seem to have not > reached you or the dri-devel mailing lists. > > > > --- > > drivers/gpu/drm/panthor/panthor_device.c | 32 ++++++++++---- > > drivers/gpu/drm/panthor/panthor_device.h | 11 +++++ > > drivers/gpu/drm/panthor/panthor_fw.c | 54 ++++++------------------ > > drivers/gpu/drm/panthor/panthor_gpu.c | 11 ++--- > > 4 files changed, 53 insertions(+), 55 deletions(-) > > > > diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c > > index 0362101ea896..2c817e65e6be 100644 > > --- a/drivers/gpu/drm/panthor/panthor_device.c > > +++ b/drivers/gpu/drm/panthor/panthor_device.c > > @@ -431,6 +431,22 @@ int panthor_device_mmap_io(struct panthor_device *ptdev, struct vm_area_struct * > > return 0; > > } > > > > +static int panthor_device_resume_hw_components(struct panthor_device *ptdev) > > +{ > > + int ret; > > + > > + panthor_gpu_resume(ptdev); > > + panthor_mmu_resume(ptdev); > > + > > + ret = panthor_fw_resume(ptdev); > > + if (!ret) > > + return 0; > > + > > + panthor_mmu_suspend(ptdev); > > + panthor_gpu_suspend(ptdev); > > + return ret; > > My only comment was a nit here where I prefer the construct: > > if (ret) { > panthor_mmu_suspend(ptdev); > panthor_gpu_suspend(ptdev); > } > > return ret; > > but feel free to ignore it. > > For the whole series: Reviewed-by: Liviu Dudau <liviu.dudau@arm.com> I applied the series before seeing your replies. Sorry about that :-/.
diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c index 0362101ea896..2c817e65e6be 100644 --- a/drivers/gpu/drm/panthor/panthor_device.c +++ b/drivers/gpu/drm/panthor/panthor_device.c @@ -431,6 +431,22 @@ int panthor_device_mmap_io(struct panthor_device *ptdev, struct vm_area_struct * return 0; } +static int panthor_device_resume_hw_components(struct panthor_device *ptdev) +{ + int ret; + + panthor_gpu_resume(ptdev); + panthor_mmu_resume(ptdev); + + ret = panthor_fw_resume(ptdev); + if (!ret) + return 0; + + panthor_mmu_suspend(ptdev); + panthor_gpu_suspend(ptdev); + return ret; +} + int panthor_device_resume(struct device *dev) { struct panthor_device *ptdev = dev_get_drvdata(dev); @@ -457,16 +473,16 @@ int panthor_device_resume(struct device *dev) if (panthor_device_is_initialized(ptdev) && drm_dev_enter(&ptdev->base, &cookie)) { - panthor_gpu_resume(ptdev); - panthor_mmu_resume(ptdev); - ret = panthor_fw_resume(ptdev); - if (!drm_WARN_ON(&ptdev->base, ret)) { - panthor_sched_resume(ptdev); - } else { - panthor_mmu_suspend(ptdev); - panthor_gpu_suspend(ptdev); + ret = panthor_device_resume_hw_components(ptdev); + if (ret && ptdev->reset.fast) { + drm_err(&ptdev->base, "Fast reset failed, trying a slow reset"); + ptdev->reset.fast = false; + ret = panthor_device_resume_hw_components(ptdev); } + if (!ret) + panthor_sched_resume(ptdev); + drm_dev_exit(cookie); if (ret) diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h index b6c4f25a5d6e..da6574021664 100644 --- a/drivers/gpu/drm/panthor/panthor_device.h +++ b/drivers/gpu/drm/panthor/panthor_device.h @@ -157,6 +157,17 @@ struct panthor_device { /** @pending: Set to true if a reset is pending. */ atomic_t pending; + + /** + * @fast: True if the post_reset logic can proceed with a fast reset. + * + * A fast reset is just a reset where the driver doesn't reload the FW sections. + * + * Any time the firmware is properly suspended, a fast reset can take place. + * On the other hand, if the halt operation failed, the driver will reload + * all FW sections to make sure we start from a fresh state. + */ + bool fast; } reset; /** @pm: Power management related data. */ diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c index 02789558788d..5b68dc02b5ce 100644 --- a/drivers/gpu/drm/panthor/panthor_fw.c +++ b/drivers/gpu/drm/panthor/panthor_fw.c @@ -263,17 +263,6 @@ struct panthor_fw { /** @booted: True is the FW is booted */ bool booted; - /** - * @fast_reset: True if the post_reset logic can proceed with a fast reset. - * - * A fast reset is just a reset where the driver doesn't reload the FW sections. - * - * Any time the firmware is properly suspended, a fast reset can take place. - * On the other hand, if the halt operation failed, the driver will reload - * all sections to make sure we start from a fresh state. - */ - bool fast_reset; - /** @irq: Job irq data. */ struct panthor_irq irq; }; @@ -1090,7 +1079,7 @@ void panthor_fw_pre_reset(struct panthor_device *ptdev, bool on_hang) /* Make sure we won't be woken up by a ping. */ cancel_delayed_work_sync(&ptdev->fw->watchdog.ping_work); - ptdev->fw->fast_reset = false; + ptdev->reset.fast = false; if (!on_hang) { struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); @@ -1100,7 +1089,7 @@ void panthor_fw_pre_reset(struct panthor_device *ptdev, bool on_hang) gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1); if (!readl_poll_timeout(ptdev->iomem + MCU_STATUS, status, status == MCU_STATUS_HALT, 10, 100000)) { - ptdev->fw->fast_reset = true; + ptdev->reset.fast = true; } else { drm_warn(&ptdev->base, "Failed to cleanly suspend MCU"); } @@ -1125,49 +1114,30 @@ int panthor_fw_post_reset(struct panthor_device *ptdev) if (ret) return ret; - /* If this is a fast reset, try to start the MCU without reloading - * the FW sections. If it fails, go for a full reset. - */ - if (ptdev->fw->fast_reset) { + if (!ptdev->reset.fast) { + /* On a slow reset, reload all sections, including RO ones. + * We're not supposed to end up here anyway, let's just assume + * the overhead of reloading everything is acceptable. + */ + panthor_reload_fw_sections(ptdev, true); + } else { /* The FW detects 0 -> 1 transitions. Make sure we reset * the HALT bit before the FW is rebooted. * This is not needed on a slow reset because FW sections are * re-initialized. */ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); + panthor_fw_update_reqs(glb_iface, req, 0, GLB_HALT); - - ret = panthor_fw_start(ptdev); - if (!ret) - goto out; - - /* Forcibly reset the MCU and force a slow reset, so we get a - * fresh boot on the next panthor_fw_start() call. - */ - panthor_fw_stop(ptdev); - ptdev->fw->fast_reset = false; - drm_err(&ptdev->base, "FW fast reset failed, trying a slow reset"); - - ret = panthor_vm_flush_all(ptdev->fw->vm); - if (ret) { - drm_err(&ptdev->base, "FW slow reset failed (couldn't flush FW's AS l2cache)"); - return ret; - } } - /* Reload all sections, including RO ones. We're not supposed - * to end up here anyway, let's just assume the overhead of - * reloading everything is acceptable. - */ - panthor_reload_fw_sections(ptdev, true); - ret = panthor_fw_start(ptdev); if (ret) { - drm_err(&ptdev->base, "FW slow reset failed (couldn't start the FW )"); + drm_err(&ptdev->base, "FW %s reset failed", + ptdev->reset.fast ? "fast" : "slow"); return ret; } -out: /* We must re-initialize the global interface even on fast-reset. */ panthor_fw_init_global_iface(ptdev); return 0; diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c index ee85a371bc38..671049020afa 100644 --- a/drivers/gpu/drm/panthor/panthor_gpu.c +++ b/drivers/gpu/drm/panthor/panthor_gpu.c @@ -470,11 +470,12 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev) */ void panthor_gpu_suspend(struct panthor_device *ptdev) { - /* - * It may be preferable to simply power down the L2, but for now just - * soft-reset which will leave the L2 powered down. - */ - panthor_gpu_soft_reset(ptdev); + /* On a fast reset, simply power down the L2. */ + if (!ptdev->reset.fast) + panthor_gpu_soft_reset(ptdev); + else + panthor_gpu_power_off(ptdev, L2, 1, 20000); + panthor_gpu_irq_suspend(&ptdev->gpu->irq); }