Message ID | 20160712164934.1390-1-peter@lekensteyn.nl (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Jul 12, 2016 at 06:49:34PM +0200, Peter Wu wrote: > The FBIOPUT_CON2FBMAP ioctl takes a console_lock(). When this is called > while nouveau was runtime suspended, a deadlock would occur due to > nouveau_fbcon_set_suspend also trying to obtain console_lock(). > > Fix this by delaying the drm_fb_helper_set_suspend call. Based on the > i915 code (which was done for performance reasons though). > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > Signed-off-by: Peter Wu <peter@lekensteyn.nl> > --- > Tested on top of v4.7-rc5, the deadlock is gone. If we bother with this, it should imo be moved into the drm_fb_helper.c function drm_fb_helper_set_suspend(). But this also smells like some kind of bad duct-tape. I think Lukas is working on some other rpm vs. fbdev deadlocks, maybe we could fix them all with one proper fix? I've made some comments on Lukas' last patch series. Besides this, when fixing a deadlock pls provide more details about the precise callchain and the locks involved in the deadlock. If you discovered this using lockdep, then just add the entire lockdep splat to the commit message. Otherwise there's lots of guesswork involved here. 
-Daniel > --- > drivers/gpu/drm/nouveau/nouveau_drm.c | 4 +-- > drivers/gpu/drm/nouveau/nouveau_drv.h | 1 + > drivers/gpu/drm/nouveau/nouveau_fbcon.c | 54 ++++++++++++++++++++++++++++----- > drivers/gpu/drm/nouveau/nouveau_fbcon.h | 2 +- > 4 files changed, 50 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c > index 11f8dd9..f9a2c10 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_drm.c > +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c > @@ -552,7 +552,7 @@ nouveau_do_suspend(struct drm_device *dev, bool runtime) > > if (dev->mode_config.num_crtc) { > NV_INFO(drm, "suspending console...\n"); > - nouveau_fbcon_set_suspend(dev, 1); > + nouveau_fbcon_set_suspend(dev, FBINFO_STATE_SUSPENDED, true); > NV_INFO(drm, "suspending display...\n"); > ret = nouveau_display_suspend(dev, runtime); > if (ret) > @@ -635,7 +635,7 @@ nouveau_do_resume(struct drm_device *dev, bool runtime) > NV_INFO(drm, "resuming display...\n"); > nouveau_display_resume(dev, runtime); > NV_INFO(drm, "resuming console...\n"); > - nouveau_fbcon_set_suspend(dev, 0); > + nouveau_fbcon_set_suspend(dev, FBINFO_STATE_RUNNING, false); > } > > return 0; > diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h > index 822a021..a743d19 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_drv.h > +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h > @@ -147,6 +147,7 @@ struct nouveau_drm { > struct nouveau_channel *channel; > struct nvkm_gpuobj *notify; > struct nouveau_fbdev *fbcon; > + struct work_struct fbdev_suspend_work; > struct nvif_object nvsw; > struct nvif_object ntfy; > struct nvif_notify flip; > diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c > index d1f248f..089156a 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c > +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c > @@ -492,19 +492,53 @@ static const struct drm_fb_helper_funcs nouveau_fbcon_helper_funcs = { > 
.fb_probe = nouveau_fbcon_create, > }; > > +static void nouveau_fbcon_suspend_worker(struct work_struct *work) > +{ > + nouveau_fbcon_set_suspend(container_of(work, > + struct nouveau_drm, > + fbdev_suspend_work)->dev, > + FBINFO_STATE_RUNNING, > + true); > +} > + > void > -nouveau_fbcon_set_suspend(struct drm_device *dev, int state) > +nouveau_fbcon_set_suspend(struct drm_device *dev, int state, bool synchronous) > { > struct nouveau_drm *drm = nouveau_drm(dev); > - if (drm->fbcon) { > - console_lock(); > - if (state == FBINFO_STATE_RUNNING) > - nouveau_fbcon_accel_restore(dev); > - drm_fb_helper_set_suspend(&drm->fbcon->helper, state); > + if (!drm->fbcon) > + return; > + > + if (synchronous) { > + /* Flush any pending work to turn the console on, and then > + * wait to turn it off. It must be synchronous as we are > + * about to suspend or unload the driver. > + * > + * Note that from within the work-handler, we cannot flush > + * ourselves, so only flush outstanding work upon suspend! > + */ > if (state != FBINFO_STATE_RUNNING) > - nouveau_fbcon_accel_save_disable(dev); > - console_unlock(); > + flush_work(&drm->fbdev_suspend_work); > + console_lock(); > + } else { > + /* > + * The console lock can be pretty contented on resume due > + * to all the printk activity. Try to keep it out of the hot > + * path of resume if possible. This also prevents a deadlock > + * with FBIOPUT_CON2FBMAP. 
> + */ > + WARN_ON(state != FBINFO_STATE_RUNNING); > + if (!console_trylock()) { > + schedule_work(&drm->fbdev_suspend_work); > + return; > + } > } > + > + if (state == FBINFO_STATE_RUNNING) > + nouveau_fbcon_accel_restore(dev); > + drm_fb_helper_set_suspend(&drm->fbcon->helper, state); > + if (state != FBINFO_STATE_RUNNING) > + nouveau_fbcon_accel_save_disable(dev); > + console_unlock(); > } > > int > @@ -526,6 +560,8 @@ nouveau_fbcon_init(struct drm_device *dev) > fbcon->dev = dev; > drm->fbcon = fbcon; > > + INIT_WORK(&drm->fbdev_suspend_work, nouveau_fbcon_suspend_worker); > + > drm_fb_helper_prepare(dev, &fbcon->helper, &nouveau_fbcon_helper_funcs); > > ret = drm_fb_helper_init(dev, &fbcon->helper, > @@ -571,6 +607,8 @@ nouveau_fbcon_fini(struct drm_device *dev) > if (!drm->fbcon) > return; > > + flush_work(&drm->fbdev_suspend_work); > + > nouveau_fbcon_accel_fini(dev); > nouveau_fbcon_destroy(dev, drm->fbcon); > kfree(drm->fbcon); > diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.h b/drivers/gpu/drm/nouveau/nouveau_fbcon.h > index ca77ad0..34b2504 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.h > +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.h > @@ -66,7 +66,7 @@ void nouveau_fbcon_gpu_lockup(struct fb_info *info); > > int nouveau_fbcon_init(struct drm_device *dev); > void nouveau_fbcon_fini(struct drm_device *dev); > -void nouveau_fbcon_set_suspend(struct drm_device *dev, int state); > +void nouveau_fbcon_set_suspend(struct drm_device *dev, int state, bool synchronous); > void nouveau_fbcon_accel_save_disable(struct drm_device *dev); > void nouveau_fbcon_accel_restore(struct drm_device *dev); > > -- > 2.8.3 >
On Wed, Jul 13, 2016 at 11:54:49AM +0200, Daniel Vetter wrote: > On Tue, Jul 12, 2016 at 06:49:34PM +0200, Peter Wu wrote: > > The FBIOPUT_CON2FBMAP ioctl takes a console_lock(). When this is called > > while nouveau was runtime suspended, a deadlock would occur due to > > nouveau_fbcon_set_suspend also trying to obtain console_lock(). > > > > Fix this by delaying the drm_fb_helper_set_suspend call. Based on the > > i915 code (which was done for performance reasons though). > > > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > > Signed-off-by: Peter Wu <peter@lekensteyn.nl> > > --- > > Tested on top of v4.7-rc5, the deadlock is gone. > > If we bother with this, it should imo be moved into the drm_fb_helper.c > function drm_fb_helper_set_suspend(). But this also smells like some kind > of bad duct-tape. I think Lukas is working on some other rpm vs. fbdev > deadlocks, maybe we could fix them all with one proper fix? I've made some > comments on Lukas' last patch series. This patch is only needed for drivers that use console_lock (for drm_fb_helper_set_suspend) in their runtime resume functions. Lukas posted fixes for runtime PM reference leaks, those are different from this deadlock (see https://lists.freedesktop.org/archives/dri-devel/2016-July/113005.html for a backtrace for this issue). The deadlock could also be avoided if the device backing the fbcon is somehow runtime-resumed outside the lock, but that feels like a larger hack that does not seem easy. The i915 patch was done to reduce resume time (due to console_lock contention), that feature seems useful to all other drivers too even if the deadlock is fixed in a different way. My current plan is to move stuff out of the lock and allow (just) resuming the console to be delayed. 
Some drivers (nouveau, radeon/amdgpu, i915) do unnecessary stuff under the console lock: - nouveau: I *think* that clearing/setting FBINFO_HWACCEL_DISABLED (nouveau_fbcon_accel_restore) is safe outside the lock as the fb is already suspended before clearing/after setting the flag. - radeon: since the console is suspended, I don't think that all of the code in radeon_resume_kms is really needed. - amdgpu: same as radeon. Btw, console_lock is leaked on an error path. - i915: I think that clearing the fb memory can be done outside the lock too as the console is suspended. Please correct me if my assumptions are flawed. > Besides this, when fixing a deadlock pls provide more details about the > precise callchain and the locks involved in the deadlock. If you > discovered this using lockdep, then just add the entire lockdep splat to > the commit message. Otherwise there's lots of guesswork involved here. > -Daniel There was no lockdep splat, it was triggered via the ioctl in the commit message. I'll include the verbose trace from the previous mail in the next proposed patch to reduce hunting though. 
Peter > > --- > > drivers/gpu/drm/nouveau/nouveau_drm.c | 4 +-- > > drivers/gpu/drm/nouveau/nouveau_drv.h | 1 + > > drivers/gpu/drm/nouveau/nouveau_fbcon.c | 54 ++++++++++++++++++++++++++++----- > > drivers/gpu/drm/nouveau/nouveau_fbcon.h | 2 +- > > 4 files changed, 50 insertions(+), 11 deletions(-) > > > > diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c > > index 11f8dd9..f9a2c10 100644 > > --- a/drivers/gpu/drm/nouveau/nouveau_drm.c > > +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c > > @@ -552,7 +552,7 @@ nouveau_do_suspend(struct drm_device *dev, bool runtime) > > > > if (dev->mode_config.num_crtc) { > > NV_INFO(drm, "suspending console...\n"); > > - nouveau_fbcon_set_suspend(dev, 1); > > + nouveau_fbcon_set_suspend(dev, FBINFO_STATE_SUSPENDED, true); > > NV_INFO(drm, "suspending display...\n"); > > ret = nouveau_display_suspend(dev, runtime); > > if (ret) > > @@ -635,7 +635,7 @@ nouveau_do_resume(struct drm_device *dev, bool runtime) > > NV_INFO(drm, "resuming display...\n"); > > nouveau_display_resume(dev, runtime); > > NV_INFO(drm, "resuming console...\n"); > > - nouveau_fbcon_set_suspend(dev, 0); > > + nouveau_fbcon_set_suspend(dev, FBINFO_STATE_RUNNING, false); > > } > > > > return 0; > > diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h > > index 822a021..a743d19 100644 > > --- a/drivers/gpu/drm/nouveau/nouveau_drv.h > > +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h > > @@ -147,6 +147,7 @@ struct nouveau_drm { > > struct nouveau_channel *channel; > > struct nvkm_gpuobj *notify; > > struct nouveau_fbdev *fbcon; > > + struct work_struct fbdev_suspend_work; > > struct nvif_object nvsw; > > struct nvif_object ntfy; > > struct nvif_notify flip; > > diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c > > index d1f248f..089156a 100644 > > --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c > > +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c > > @@ 
-492,19 +492,53 @@ static const struct drm_fb_helper_funcs nouveau_fbcon_helper_funcs = { > > .fb_probe = nouveau_fbcon_create, > > }; > > > > +static void nouveau_fbcon_suspend_worker(struct work_struct *work) > > +{ > > + nouveau_fbcon_set_suspend(container_of(work, > > + struct nouveau_drm, > > + fbdev_suspend_work)->dev, > > + FBINFO_STATE_RUNNING, > > + true); > > +} > > + > > void > > -nouveau_fbcon_set_suspend(struct drm_device *dev, int state) > > +nouveau_fbcon_set_suspend(struct drm_device *dev, int state, bool synchronous) > > { > > struct nouveau_drm *drm = nouveau_drm(dev); > > - if (drm->fbcon) { > > - console_lock(); > > - if (state == FBINFO_STATE_RUNNING) > > - nouveau_fbcon_accel_restore(dev); > > - drm_fb_helper_set_suspend(&drm->fbcon->helper, state); > > + if (!drm->fbcon) > > + return; > > + > > + if (synchronous) { > > + /* Flush any pending work to turn the console on, and then > > + * wait to turn it off. It must be synchronous as we are > > + * about to suspend or unload the driver. > > + * > > + * Note that from within the work-handler, we cannot flush > > + * ourselves, so only flush outstanding work upon suspend! > > + */ > > if (state != FBINFO_STATE_RUNNING) > > - nouveau_fbcon_accel_save_disable(dev); > > - console_unlock(); > > + flush_work(&drm->fbdev_suspend_work); > > + console_lock(); > > + } else { > > + /* > > + * The console lock can be pretty contented on resume due > > + * to all the printk activity. Try to keep it out of the hot > > + * path of resume if possible. This also prevents a deadlock > > + * with FBIOPUT_CON2FBMAP. 
> > + */ > > + WARN_ON(state != FBINFO_STATE_RUNNING); > > + if (!console_trylock()) { > > + schedule_work(&drm->fbdev_suspend_work); > > + return; > > + } > > } > > + > > + if (state == FBINFO_STATE_RUNNING) > > + nouveau_fbcon_accel_restore(dev); > > + drm_fb_helper_set_suspend(&drm->fbcon->helper, state); > > + if (state != FBINFO_STATE_RUNNING) > > + nouveau_fbcon_accel_save_disable(dev); > > + console_unlock(); > > } > > > > int > > @@ -526,6 +560,8 @@ nouveau_fbcon_init(struct drm_device *dev) > > fbcon->dev = dev; > > drm->fbcon = fbcon; > > > > + INIT_WORK(&drm->fbdev_suspend_work, nouveau_fbcon_suspend_worker); > > + > > drm_fb_helper_prepare(dev, &fbcon->helper, &nouveau_fbcon_helper_funcs); > > > > ret = drm_fb_helper_init(dev, &fbcon->helper, > > @@ -571,6 +607,8 @@ nouveau_fbcon_fini(struct drm_device *dev) > > if (!drm->fbcon) > > return; > > > > + flush_work(&drm->fbdev_suspend_work); > > + > > nouveau_fbcon_accel_fini(dev); > > nouveau_fbcon_destroy(dev, drm->fbcon); > > kfree(drm->fbcon); > > diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.h b/drivers/gpu/drm/nouveau/nouveau_fbcon.h > > index ca77ad0..34b2504 100644 > > --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.h > > +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.h > > @@ -66,7 +66,7 @@ void nouveau_fbcon_gpu_lockup(struct fb_info *info); > > > > int nouveau_fbcon_init(struct drm_device *dev); > > void nouveau_fbcon_fini(struct drm_device *dev); > > -void nouveau_fbcon_set_suspend(struct drm_device *dev, int state); > > +void nouveau_fbcon_set_suspend(struct drm_device *dev, int state, bool synchronous); > > void nouveau_fbcon_accel_save_disable(struct drm_device *dev); > > void nouveau_fbcon_accel_restore(struct drm_device *dev); > > > > -- > > 2.8.3 > > > > -- > Daniel Vetter > Software Engineer, Intel Corporation > http://blog.ffwll.ch
On Wed, Jul 13, 2016 at 02:40:50PM +0200, Peter Wu wrote: > On Wed, Jul 13, 2016 at 11:54:49AM +0200, Daniel Vetter wrote: > > On Tue, Jul 12, 2016 at 06:49:34PM +0200, Peter Wu wrote: > > > The FBIOPUT_CON2FBMAP ioctl takes a console_lock(). When this is called > > > while nouveau was runtime suspended, a deadlock would occur due to > > > nouveau_fbcon_set_suspend also trying to obtain console_lock(). > > > > > > Fix this by delaying the drm_fb_helper_set_suspend call. Based on the > > > i915 code (which was done for performance reasons though). > > > > > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > > > Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > > > Signed-off-by: Peter Wu <peter@lekensteyn.nl> > > > --- > > > Tested on top of v4.7-rc5, the deadlock is gone. > > > > If we bother with this, it should imo be moved into the drm_fb_helper.c > > function drm_fb_helper_set_suspend(). But this also smells like some kind > > of bad duct-tape. I think Lukas is working on some other rpm vs. fbdev > > deadlocks, maybe we could fix them all with one proper fix? I've made some > > comments on Lukas' last patch series. > > This patch is only needed for drivers that use console_lock (for > drm_fb_helper_set_suspend) in their runtime resume functions. > Lukas posted fixes for runtime PM reference leaks, those are different > from this deadlock (see > https://lists.freedesktop.org/archives/dri-devel/2016-July/113005.html > for a backtrace for this issue). > > The deadlock could also be avoided if the device backing the fbcon is > somehow runtime-resumed outside the lock, but that feels like a larger > hack that does not seem easy. > > The i915 patch was done to reduce resume time (due to console_lock > contention), that feature seems useful to all other drivers too even if > the deadlock is fixed in a different way. I might have imagined something, but I thought Lukas is already working on some rpm vs. vga_switcheroo inversions. But looking again this was on the audio side. 
I think the proper solution for fbcon would be for the fbdev emulation to also hold a runtime pm reference when the console is in use. This should already happen correctly for vblank, the more tricky part is fbdev mmap and fbcon: - I have no idea, but there should be a way to intercept fbdev userspace mmaps and make sure that as long as an app has the fbdev mmapped we don't runtime suspend. No one really should be doing that (at least for normal setups), hence this shouldn't result in unsafe. - For fbcon, we could suspend it in the dpms off callbacks (maybe with a timer), and resume it only when enabling fbcon again - fbcon needs to redraw anyway on dpms on. Another solution for fbcon might be to untangle the suspend/resume stuff and protect it by something else than console_lock. But that means fixing up fbcon locking horror shows. > My current plan is to move stuff out of the lock and allow (just) > resuming the console to be delayed. Some drivers (nouveau, > radeon/amdgpu, i915) do unnecessary stuff under the console lock: > > - nouveau: I *think* that cleraing/setting FBINFO_HWACCEL_DISABLED > (nouveau_fbcon_accel_restore) is safe outside the lock as the fb is > already suspended before clearing/after setting the flag. > - radeon: since the console is suspended, I don't think that that all > of the code is radeon_resume_kms is really needed. > - amdgpu: same as radeon. Btw, console_lock is leaked on an error path. > - i915: I think that clearing the fb memory can be done outside the > lock too as the console is suspended. > > Please correct me if my assumptions are flawed. Yeah, fixing these independent issues should definitely help, irrespective of how we fix fb_set_suspend. > > Besides this, when fixing a deadlock pls provide more details about the > > precise callchain and the locks involved in the deadlock. If you > > discovered this using lockdep, then just add the entire lockdep splat to > > the commit message. 
Otherwise there's lots of guesswork involved here. > > -Daniel > > There was no lockdep splat, it was triggered via the ioctl in the commit > message. I'll include the verbose trace from the previous mail in the > next proposed patch to reduce hunting though. Sounds good too. -Daniel
On Tue, Jul 12, 2016 at 06:49:34PM +0200, Peter Wu wrote: > The FBIOPUT_CON2FBMAP ioctl takes a console_lock(). When this is called > while nouveau was runtime suspended, a deadlock would occur due to > nouveau_fbcon_set_suspend also trying to obtain console_lock(). > > Fix this by delaying the drm_fb_helper_set_suspend call. Based on the > i915 code (which was done for performance reasons though). > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > Signed-off-by: Peter Wu <peter@lekensteyn.nl> > --- > Tested on top of v4.7-rc5, the deadlock is gone. > --- > drivers/gpu/drm/nouveau/nouveau_drm.c | 4 +-- > drivers/gpu/drm/nouveau/nouveau_drv.h | 1 + > drivers/gpu/drm/nouveau/nouveau_fbcon.c | 54 ++++++++++++++++++++++++++++----- > drivers/gpu/drm/nouveau/nouveau_fbcon.h | 2 +- > 4 files changed, 50 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c > index 11f8dd9..f9a2c10 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_drm.c > +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c > @@ -552,7 +552,7 @@ nouveau_do_suspend(struct drm_device *dev, bool runtime) > > if (dev->mode_config.num_crtc) { > NV_INFO(drm, "suspending console...\n"); > - nouveau_fbcon_set_suspend(dev, 1); > + nouveau_fbcon_set_suspend(dev, FBINFO_STATE_SUSPENDED, true); > NV_INFO(drm, "suspending display...\n"); > ret = nouveau_display_suspend(dev, runtime); > if (ret) > @@ -635,7 +635,7 @@ nouveau_do_resume(struct drm_device *dev, bool runtime) > NV_INFO(drm, "resuming display...\n"); > nouveau_display_resume(dev, runtime); > NV_INFO(drm, "resuming console...\n"); > - nouveau_fbcon_set_suspend(dev, 0); > + nouveau_fbcon_set_suspend(dev, FBINFO_STATE_RUNNING, false); > } > > return 0; > diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h > index 822a021..a743d19 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_drv.h > +++ 
b/drivers/gpu/drm/nouveau/nouveau_drv.h > @@ -147,6 +147,7 @@ struct nouveau_drm { > struct nouveau_channel *channel; > struct nvkm_gpuobj *notify; > struct nouveau_fbdev *fbcon; > + struct work_struct fbdev_suspend_work; > struct nvif_object nvsw; > struct nvif_object ntfy; > struct nvif_notify flip; > diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c > index d1f248f..089156a 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c > +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c > @@ -492,19 +492,53 @@ static const struct drm_fb_helper_funcs nouveau_fbcon_helper_funcs = { > .fb_probe = nouveau_fbcon_create, > }; > > +static void nouveau_fbcon_suspend_worker(struct work_struct *work) > +{ > + nouveau_fbcon_set_suspend(container_of(work, > + struct nouveau_drm, > + fbdev_suspend_work)->dev, > + FBINFO_STATE_RUNNING, > + true); > +} > + > void > -nouveau_fbcon_set_suspend(struct drm_device *dev, int state) > +nouveau_fbcon_set_suspend(struct drm_device *dev, int state, bool synchronous) > { > struct nouveau_drm *drm = nouveau_drm(dev); > - if (drm->fbcon) { > - console_lock(); > - if (state == FBINFO_STATE_RUNNING) > - nouveau_fbcon_accel_restore(dev); > - drm_fb_helper_set_suspend(&drm->fbcon->helper, state); > + if (!drm->fbcon) > + return; > + > + if (synchronous) { > + /* Flush any pending work to turn the console on, and then > + * wait to turn it off. It must be synchronous as we are > + * about to suspend or unload the driver. > + * > + * Note that from within the work-handler, we cannot flush > + * ourselves, so only flush outstanding work upon suspend! > + */ > if (state != FBINFO_STATE_RUNNING) > - nouveau_fbcon_accel_save_disable(dev); > - console_unlock(); > + flush_work(&drm->fbdev_suspend_work); > + console_lock(); > + } else { > + /* > + * The console lock can be pretty contented on resume due > + * to all the printk activity. Try to keep it out of the hot > + * path of resume if possible. 
This also prevents a deadlock > + * with FBIOPUT_CON2FBMAP. > + */ > + WARN_ON(state != FBINFO_STATE_RUNNING); > + if (!console_trylock()) { > + schedule_work(&drm->fbdev_suspend_work); > + return; > + } > } > + > + if (state == FBINFO_STATE_RUNNING) > + nouveau_fbcon_accel_restore(dev); > + drm_fb_helper_set_suspend(&drm->fbcon->helper, state); > + if (state != FBINFO_STATE_RUNNING) > + nouveau_fbcon_accel_save_disable(dev); > + console_unlock(); > } > > int > @@ -526,6 +560,8 @@ nouveau_fbcon_init(struct drm_device *dev) > fbcon->dev = dev; > drm->fbcon = fbcon; > > + INIT_WORK(&drm->fbdev_suspend_work, nouveau_fbcon_suspend_worker); > + > drm_fb_helper_prepare(dev, &fbcon->helper, &nouveau_fbcon_helper_funcs); > > ret = drm_fb_helper_init(dev, &fbcon->helper, > @@ -571,6 +607,8 @@ nouveau_fbcon_fini(struct drm_device *dev) > if (!drm->fbcon) > return; > > + flush_work(&drm->fbdev_suspend_work); Hmm, since suspend_work can theoretically rearm itself, this should be cancel_work_sync(). The copy'n'paste of the code looks fine, so (other than the bug copied across): Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Whether you can convince the maintainers on the basis of this being a deadlock fix is another matter... I did test this patch, since I see a livelock on resume, but not the same console deadlock. Just in case anyone is interested: Jul 13 17:05:59 acer kernel: [24873.945839] NMI watchdog: BUG: soft lockup - CPU#2 stuck for 22s! 
[kworker/2:1:8370] Jul 13 17:05:59 acer kernel: [24873.946563] Modules linked in: rfcomm drbg ansi_cprng ctr ccm arc4 bnep ath10k_pci ath10k_core snd_hda_codec_hdmi snd_hda_codec_realtek ath snd_hda_co dec_generic snd_hda_intel mac80211 snd_hda_codec binfmt_misc snd_hda_core nls_iso8859_1 snd_hwdep btusb btrtl snd_pcm btbcm rtsx_usb_ms btintel x86_pkg_temp_thermal uvcvideo acer_wmi intel_powerclamp memstick snd_seq_midi bluetooth sparse_keymap snd_seq_midi_event coretemp videobuf2_vmalloc videobuf2_memops snd_rawmidi kvm_intel videobuf2_v4l2 kvm cfg80211 video buf2_core snd_seq videodev irqbypass snd_seq_device snd_timer media crct10dif_pclmul snd crc32_pclmul hid_multitouch ghash_clmulni_intel aesni_intel joydev aes_x86_64 lrw gf128mul gl ue_helper ablk_helper cryptd soundcore ie31200_edac edac_core mei_me shpchp mei input_leds acpi_als serio_raw kfifo_buf lpc_ich industrialio soc_button_array mac_hid parport_pc ppdev lp parport autofs4 btrfs xor raid6_pq dm_mirror dm_region_hash dm_log nouveau rtsx_usb_sdmmc rtsx_usb hid_generic usbhid hid i915 broadcom bcm_phy_lib mxm_wmi ttm i2c_algo_bit drm_k ms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm tg3 ahci ptp libahci pps_core video wmi fjes Jul 13 17:05:59 acer kernel: [24873.946598] CPU: 2 PID: 8370 Comm: kworker/2:1 Tainted: G L 4.7.0-rc6+ #4 Jul 13 17:05:59 acer kernel: [24873.946599] Hardware name: Acer Aspire VN7-791G/Aspire VN7-791G, BIOS V1.11 01/09/2015 Jul 13 17:05:59 acer kernel: [24873.946603] Workqueue: pm pm_runtime_work Jul 13 17:05:59 acer kernel: [24873.946604] task: ffff880023c59c80 ti: ffff880024058000 task.ti: ffff880024058000 Jul 13 17:05:59 acer kernel: [24873.946605] RIP: 0010:[<ffffffff8140a6c0>] [<ffffffff8140a6c0>] ioread32+0x30/0x40 Jul 13 17:05:59 acer kernel: [24873.946608] RSP: 0018:ffff88002405baf0 EFLAGS: 00000296 Jul 13 17:05:59 acer kernel: [24873.946609] RAX: 00000000ffffffff RBX: ffff88041734a000 RCX: 0000000000000018 Jul 13 17:05:59 acer kernel: [24873.946610] RDX: 
0012230aadf99e58 RSI: ffffc9000410a014 RDI: ffffc90004009410 Jul 13 17:05:59 acer kernel: [24873.946610] RBP: ffff88002405bb10 R08: 0000000000000009 R09: ffff880416ce0000 Jul 13 17:05:59 acer kernel: [24873.946611] R10: 000000000000000a R11: 0000000000000001 R12: 00000000ffffffff Jul 13 17:05:59 acer kernel: [24873.946612] R13: 00000000ffffffff R14: ffff880415de6600 R15: ffffffffffffffff Jul 13 17:05:59 acer kernel: [24873.946613] FS: 0000000000000000(0000) GS:ffff88045f280000(0000) knlGS:0000000000000000 Jul 13 17:05:59 acer kernel: [24873.946614] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 Jul 13 17:05:59 acer kernel: [24873.946614] CR2: 00007f566a5ac010 CR3: 0000000002e06000 CR4: 00000000001406e0 Jul 13 17:05:59 acer kernel: [24873.946615] Stack: Jul 13 17:05:59 acer kernel: [24873.946616] ffffffffc03bee71 ffff88041734a000 0000000000000000 ffff880415de7908 Jul 13 17:05:59 acer kernel: [24873.946617] ffff88002405bb20 ffffffffc03be9bf ffff88002405bb58 ffffffffc03b7340 Jul 13 17:05:59 acer kernel: [24873.946618] ffff880415de7908 ffff88041734a000 00000414ef5b0e40 0000000000000011 Jul 13 17:05:59 acer kernel: [24873.946620] Call Trace: Jul 13 17:05:59 acer kernel: [24873.946645] [<ffffffffc03bee71>] ? 
nv04_timer_read+0x51/0x70 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946661] [<ffffffffc03be9bf>] nvkm_timer_read+0xf/0x20 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946676] [<ffffffffc03b7340>] nvkm_pmu_init+0x50/0x450 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946685] [<ffffffffc0371ac1>] nvkm_subdev_init+0x91/0x200 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946701] [<ffffffffc03c2f26>] nvkm_device_init+0x146/0x280 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946715] [<ffffffffc03c6a18>] nvkm_udevice_init+0x48/0x60 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946724] [<ffffffffc0370440>] nvkm_object_init+0x40/0x190 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946733] [<ffffffffc03704b4>] nvkm_object_init+0xb4/0x190 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946742] [<ffffffffc036d56e>] nvkm_client_init+0xe/0x10 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946758] [<ffffffffc040ac0e>] nvkm_client_resume+0xe/0x10 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946767] [<ffffffffc036c7c7>] nvif_client_resume+0x17/0x20 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946782] [<ffffffffc04082fb>] nouveau_do_resume+0x4b/0x130 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946797] [<ffffffffc0408709>] nouveau_pmops_runtime_resume+0x79/0x120 [nouveau] Jul 13 17:05:59 acer kernel: [24873.946800] [<ffffffff814448eb>] pci_pm_runtime_resume+0x7b/0xa0 Jul 13 17:05:59 acer kernel: [24873.946801] [<ffffffff815676d3>] __rpm_callback+0x33/0x70 Jul 13 17:05:59 acer kernel: [24873.946803] [<ffffffff81444870>] ? pci_restore_standard_config+0x40/0x40 Jul 13 17:05:59 acer kernel: [24873.946804] [<ffffffff81567734>] rpm_callback+0x24/0x80 Jul 13 17:05:59 acer kernel: [24873.946806] [<ffffffff81444870>] ? 
pci_restore_standard_config+0x40/0x40 Jul 13 17:05:59 acer kernel: [24873.946807] [<ffffffff81567ee1>] rpm_resume+0x491/0x690 Jul 13 17:05:59 acer kernel: [24873.946808] [<ffffffff81568f08>] pm_runtime_work+0x58/0xa0 Jul 13 17:05:59 acer kernel: [24873.946811] [<ffffffff8109adbb>] process_one_work+0x16b/0x480 Jul 13 17:05:59 acer kernel: [24873.946812] [<ffffffff8109b11b>] worker_thread+0x4b/0x500 Jul 13 17:05:59 acer kernel: [24873.946814] [<ffffffff8109b0d0>] ? process_one_work+0x480/0x480 Jul 13 17:05:59 acer kernel: [24873.946815] [<ffffffff8109b0d0>] ? process_one_work+0x480/0x480 Jul 13 17:05:59 acer kernel: [24873.946816] [<ffffffff810a1348>] kthread+0xd8/0xf0 Jul 13 17:05:59 acer kernel: [24873.946818] [<ffffffff81845fdf>] ret_from_fork+0x1f/0x40 Jul 13 17:05:59 acer kernel: [24873.946819] [<ffffffff810a1270>] ? kthread_create_on_node+0x1a0/0x1a0 Jul 13 17:05:59 acer kernel: [24873.946820] Code: 03 00 77 25 48 81 ff 00 00 01 00 76 05 0f b7 d7 ed c3 55 48 c7 c6 e4 36 cc 81 48 89 e5 e8 19 ff ff ff b8 ff ff ff ff 5d c3 8b 07 <c3 > 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 81 fe ff ff -Chris
On Wed, Jul 13, 2016 at 04:57:19PM +0200, Daniel Vetter wrote: > On Wed, Jul 13, 2016 at 02:40:50PM +0200, Peter Wu wrote: > > On Wed, Jul 13, 2016 at 11:54:49AM +0200, Daniel Vetter wrote: > > > On Tue, Jul 12, 2016 at 06:49:34PM +0200, Peter Wu wrote: > > > > The FBIOPUT_CON2FBMAP ioctl takes a console_lock(). When this is called > > > > while nouveau was runtime suspended, a deadlock would occur due to > > > > nouveau_fbcon_set_suspend also trying to obtain console_lock(). > > > > > > > > Fix this by delaying the drm_fb_helper_set_suspend call. Based on the > > > > i915 code (which was done for performance reasons though). > > > > > > > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > > > > Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > > > > Signed-off-by: Peter Wu <peter@lekensteyn.nl> > > > > --- > > > > Tested on top of v4.7-rc5, the deadlock is gone. > > > > > > If we bother with this, it should imo be moved into the drm_fb_helper.c > > > function drm_fb_helper_set_suspend(). But this also smells like some kind > > > of bad duct-tape. I think Lukas is working on some other rpm vs. fbdev > > > deadlocks, maybe we could fix them all with one proper fix? I've made some > > > comments on Lukas' last patch series. > > > > This patch is only needed for drivers that use console_lock (for > > drm_fb_helper_set_suspend) in their runtime resume functions. > > Lukas posted fixes for runtime PM reference leaks, those are different > > from this deadlock (see > > https://lists.freedesktop.org/archives/dri-devel/2016-July/113005.html > > for a backtrace for this issue). > > > > The deadlock could also be avoided if the device backing the fbcon is > > somehow runtime-resumed outside the lock, but that feels like a larger > > hack that does not seem easy. > > > > The i915 patch was done to reduce resume time (due to console_lock > > contention), that feature seems useful to all other drivers too even if > > the deadlock is fixed in a different way. 
> > I might have imagined something, but I thought Lukas is already working on > some rpm vs. vga_switcheroo inversions. But looking again this was on the > audio side. > > I think the proper solution for fbcon would be for the fbdev emulation to > also hold a runtime pm references when the console is in use. nouveau does this by adding a fb_open and fb_release function that calls pm_runtime_get and pm_runtime_put respectively (and is the only driver doing this). So that is why it causes the console_lock issue for nouveau, but not for others. > This should already happen correctly for vblank, the more tricky part > is fbdev mmap and fbcon: > > - I have no idea, but there should be a way to intercept fbdev userspace > mmaps and make sure that as long as an app has the fbdev mmapped we > don't runtime suspend. No one really should be doing that (at least for > normal setups), hence this shouldn't result in unsafe. mmap normally needs a fd right? When the chardev /dev/fbX is opened, the fb_open function will be called. So nouveau should not have this issue with mmap/read/write to a fb while the device is suspended. (This RPM thing was added in f231976c2e89 ("drm/nouveau/fbcon: take runpm reference when userspace has an open fd"), maybe it is not a bad idea for other drivers with RPM support either.) > - For fbcon, we could suspend it in the dpms off callbacks (maybe with a > timer), and resume it only when enabling fbcon again - fbcon needs to > redraw anyway on dpms on. Would this guarantee that the fb is suspended before/during suspend (of the graphics device) and resumed somewhere during/after resume? Suspending the fb also has the effect that reads/writes to /dev/fbN fail, maybe that is a bit too restricted since the framebuffer is just accessible until the device is suspended. (Hmm, fb_read/fb_write does not seem to do any locking...) > Another solution for fbcon might be to untangle the suspend/resume stuff > and protect it by something else than console_lock. 
But that means > fixing up fbcon locking horror shows. console_lock seems needed for some code down the call stack, removing it risks some blow ups. Some archaeology: this locking problem was introduced with 054430e773c9 ("fbcon: fix locking harder"). In the past fb_set_suspend also took the fb_info lock but that was removed in 9e769ff3f585 ("fb: avoid possible deadlock caused by fb_set_suspend"). Peter > > My current plan is to move stuff out of the lock and allow (just) > > resuming the console to be delayed. Some drivers (nouveau, > > radeon/amdgpu, i915) do unnecessary stuff under the console lock: > > > > - nouveau: I *think* that cleraing/setting FBINFO_HWACCEL_DISABLED > > (nouveau_fbcon_accel_restore) is safe outside the lock as the fb is > > already suspended before clearing/after setting the flag. > > - radeon: since the console is suspended, I don't think that that all > > of the code is radeon_resume_kms is really needed. > > - amdgpu: same as radeon. Btw, console_lock is leaked on an error path. > > - i915: I think that clearing the fb memory can be done outside the > > lock too as the console is suspended. > > > > Please correct me if my assumptions are flawed. > > Yeah, fixing this independent issues should definitely help, irrespective > of how we fix fb_set_suspend. > > > > Besides this, when fixing a deadlock pls provide more details about the > > > precise callchain and the locks involved in the deadlock. If you > > > discovered this using lockdep, then just add the entire lockdep splat to > > > the commit message. Otherwise there's lots of guesswork involved here. > > > -Daniel > > > > There was no lockdep splat, it was triggered via the ioctl in the commit > > message. I'll include the verbose trace from the previous mail in the > > next proposed patch to reduce hunting though. > > Sounds good too. > -Daniel > -- > Daniel Vetter > Software Engineer, Intel Corporation > http://blog.ffwll.ch
On Wed, Jul 13, 2016 at 06:17:47PM +0100, Chris Wilson wrote: > On Tue, Jul 12, 2016 at 06:49:34PM +0200, Peter Wu wrote: > > The FBIOPUT_CON2FBMAP ioctl takes a console_lock(). When this is called > > while nouveau was runtime suspended, a deadlock would occur due to > > nouveau_fbcon_set_suspend also trying to obtain console_lock(). > > > > Fix this by delaying the drm_fb_helper_set_suspend call. Based on the > > i915 code (which was done for performance reasons though). > > > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > > Signed-off-by: Peter Wu <peter@lekensteyn.nl> > > --- > > Tested on top of v4.7-rc5, the deadlock is gone. > > --- > > drivers/gpu/drm/nouveau/nouveau_drm.c | 4 +-- > > drivers/gpu/drm/nouveau/nouveau_drv.h | 1 + > > drivers/gpu/drm/nouveau/nouveau_fbcon.c | 54 ++++++++++++++++++++++++++++----- > > drivers/gpu/drm/nouveau/nouveau_fbcon.h | 2 +- > > 4 files changed, 50 insertions(+), 11 deletions(-) > > > > diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c > > index 11f8dd9..f9a2c10 100644 > > --- a/drivers/gpu/drm/nouveau/nouveau_drm.c > > +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c > > @@ -552,7 +552,7 @@ nouveau_do_suspend(struct drm_device *dev, bool runtime) > > > > if (dev->mode_config.num_crtc) { > > NV_INFO(drm, "suspending console...\n"); > > - nouveau_fbcon_set_suspend(dev, 1); > > + nouveau_fbcon_set_suspend(dev, FBINFO_STATE_SUSPENDED, true); > > NV_INFO(drm, "suspending display...\n"); > > ret = nouveau_display_suspend(dev, runtime); > > if (ret) > > @@ -635,7 +635,7 @@ nouveau_do_resume(struct drm_device *dev, bool runtime) > > NV_INFO(drm, "resuming display...\n"); > > nouveau_display_resume(dev, runtime); > > NV_INFO(drm, "resuming console...\n"); > > - nouveau_fbcon_set_suspend(dev, 0); > > + nouveau_fbcon_set_suspend(dev, FBINFO_STATE_RUNNING, false); > > } > > > > return 0; > > diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h 
b/drivers/gpu/drm/nouveau/nouveau_drv.h > > index 822a021..a743d19 100644 > > --- a/drivers/gpu/drm/nouveau/nouveau_drv.h > > +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h > > @@ -147,6 +147,7 @@ struct nouveau_drm { > > struct nouveau_channel *channel; > > struct nvkm_gpuobj *notify; > > struct nouveau_fbdev *fbcon; > > + struct work_struct fbdev_suspend_work; > > struct nvif_object nvsw; > > struct nvif_object ntfy; > > struct nvif_notify flip; > > diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c > > index d1f248f..089156a 100644 > > --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c > > +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c > > @@ -492,19 +492,53 @@ static const struct drm_fb_helper_funcs nouveau_fbcon_helper_funcs = { > > .fb_probe = nouveau_fbcon_create, > > }; > > > > +static void nouveau_fbcon_suspend_worker(struct work_struct *work) > > +{ > > + nouveau_fbcon_set_suspend(container_of(work, > > + struct nouveau_drm, > > + fbdev_suspend_work)->dev, > > + FBINFO_STATE_RUNNING, > > + true); > > +} > > + > > void > > -nouveau_fbcon_set_suspend(struct drm_device *dev, int state) > > +nouveau_fbcon_set_suspend(struct drm_device *dev, int state, bool synchronous) > > { > > struct nouveau_drm *drm = nouveau_drm(dev); > > - if (drm->fbcon) { > > - console_lock(); > > - if (state == FBINFO_STATE_RUNNING) > > - nouveau_fbcon_accel_restore(dev); > > - drm_fb_helper_set_suspend(&drm->fbcon->helper, state); > > + if (!drm->fbcon) > > + return; > > + > > + if (synchronous) { > > + /* Flush any pending work to turn the console on, and then > > + * wait to turn it off. It must be synchronous as we are > > + * about to suspend or unload the driver. > > + * > > + * Note that from within the work-handler, we cannot flush > > + * ourselves, so only flush outstanding work upon suspend! 
> > + */ > > if (state != FBINFO_STATE_RUNNING) > > - nouveau_fbcon_accel_save_disable(dev); > > - console_unlock(); > > + flush_work(&drm->fbdev_suspend_work); > > + console_lock(); > > + } else { > > + /* > > + * The console lock can be pretty contented on resume due > > + * to all the printk activity. Try to keep it out of the hot > > + * path of resume if possible. This also prevents a deadlock > > + * with FBIOPUT_CON2FBMAP. > > + */ > > + WARN_ON(state != FBINFO_STATE_RUNNING); > > + if (!console_trylock()) { > > + schedule_work(&drm->fbdev_suspend_work); > > + return; > > + } > > } > > + > > + if (state == FBINFO_STATE_RUNNING) > > + nouveau_fbcon_accel_restore(dev); > > + drm_fb_helper_set_suspend(&drm->fbcon->helper, state); > > + if (state != FBINFO_STATE_RUNNING) > > + nouveau_fbcon_accel_save_disable(dev); > > + console_unlock(); > > } > > > > int > > @@ -526,6 +560,8 @@ nouveau_fbcon_init(struct drm_device *dev) > > fbcon->dev = dev; > > drm->fbcon = fbcon; > > > > + INIT_WORK(&drm->fbdev_suspend_work, nouveau_fbcon_suspend_worker); > > + > > drm_fb_helper_prepare(dev, &fbcon->helper, &nouveau_fbcon_helper_funcs); > > > > ret = drm_fb_helper_init(dev, &fbcon->helper, > > @@ -571,6 +607,8 @@ nouveau_fbcon_fini(struct drm_device *dev) > > if (!drm->fbcon) > > return; > > > > + flush_work(&drm->fbdev_suspend_work); > > Hmm, since suspend_work can theorectically rearm itself, this should be > cancel_work_sync(). How so? The worker calls with state = FBINFO_STATE_RUNNING and synchronous = true, so schedule_work() can never be called. > The copy'n'paste of the code looks fine, so (other than the bug copied > across): > > Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> > > Whether you can convince the maintainers on the basis of this being a > deadlock fix is another matter... > > I did test this patch, since I see a livelock on resume, but not the > same console deadlock. 
Just in case anyone is interested: This sounds like a device that is somehow still sleeping, resulting in failure to read the register. Can you always reproduce this somehow? Is this the mainline kernel with just this patch? I found an acpidump for your laptop on https://bugzilla.kernel.org/show_bug.cgi?id=99381 and it looks like you have a newer laptop designed for Win8 or newer. Were there any other ACPI messages (like an infinite loop) preceding this dmesg? Peter > Jul 13 17:05:59 acer kernel: [24873.945839] NMI watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [kworker/2:1:8370] > Jul 13 17:05:59 acer kernel: [24873.946563] Modules linked in: rfcomm drbg ansi_cprng ctr ccm arc4 bnep ath10k_pci ath10k_core snd_hda_codec_hdmi snd_hda_codec_realtek ath snd_hda_co > dec_generic snd_hda_intel mac80211 snd_hda_codec binfmt_misc snd_hda_core nls_iso8859_1 snd_hwdep btusb btrtl snd_pcm btbcm rtsx_usb_ms btintel x86_pkg_temp_thermal uvcvideo acer_wmi > intel_powerclamp memstick snd_seq_midi bluetooth sparse_keymap snd_seq_midi_event coretemp videobuf2_vmalloc videobuf2_memops snd_rawmidi kvm_intel videobuf2_v4l2 kvm cfg80211 video > buf2_core snd_seq videodev irqbypass snd_seq_device snd_timer media crct10dif_pclmul snd crc32_pclmul hid_multitouch ghash_clmulni_intel aesni_intel joydev aes_x86_64 lrw gf128mul gl > ue_helper ablk_helper cryptd soundcore ie31200_edac edac_core mei_me shpchp mei input_leds acpi_als serio_raw kfifo_buf lpc_ich industrialio soc_button_array mac_hid parport_pc ppdev > lp parport autofs4 btrfs xor raid6_pq dm_mirror dm_region_hash dm_log nouveau rtsx_usb_sdmmc rtsx_usb hid_generic usbhid hid i915 broadcom bcm_phy_lib mxm_wmi ttm i2c_algo_bit drm_k > ms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm tg3 ahci ptp libahci pps_core video wmi fjes > Jul 13 17:05:59 acer kernel: [24873.946598] CPU: 2 PID: 8370 Comm: kworker/2:1 Tainted: G L 4.7.0-rc6+ #4 > Jul 13 17:05:59 acer kernel: [24873.946599] Hardware name: Acer Aspire 
VN7-791G/Aspire VN7-791G, BIOS V1.11 01/09/2015 > Jul 13 17:05:59 acer kernel: [24873.946603] Workqueue: pm pm_runtime_work > Jul 13 17:05:59 acer kernel: [24873.946604] task: ffff880023c59c80 ti: ffff880024058000 task.ti: ffff880024058000 > Jul 13 17:05:59 acer kernel: [24873.946605] RIP: 0010:[<ffffffff8140a6c0>] [<ffffffff8140a6c0>] ioread32+0x30/0x40 > Jul 13 17:05:59 acer kernel: [24873.946608] RSP: 0018:ffff88002405baf0 EFLAGS: 00000296 > Jul 13 17:05:59 acer kernel: [24873.946609] RAX: 00000000ffffffff RBX: ffff88041734a000 RCX: 0000000000000018 > Jul 13 17:05:59 acer kernel: [24873.946610] RDX: 0012230aadf99e58 RSI: ffffc9000410a014 RDI: ffffc90004009410 > Jul 13 17:05:59 acer kernel: [24873.946610] RBP: ffff88002405bb10 R08: 0000000000000009 R09: ffff880416ce0000 > Jul 13 17:05:59 acer kernel: [24873.946611] R10: 000000000000000a R11: 0000000000000001 R12: 00000000ffffffff > Jul 13 17:05:59 acer kernel: [24873.946612] R13: 00000000ffffffff R14: ffff880415de6600 R15: ffffffffffffffff > Jul 13 17:05:59 acer kernel: [24873.946613] FS: 0000000000000000(0000) GS:ffff88045f280000(0000) knlGS:0000000000000000 > Jul 13 17:05:59 acer kernel: [24873.946614] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > Jul 13 17:05:59 acer kernel: [24873.946614] CR2: 00007f566a5ac010 CR3: 0000000002e06000 CR4: 00000000001406e0 > Jul 13 17:05:59 acer kernel: [24873.946615] Stack: > Jul 13 17:05:59 acer kernel: [24873.946616] ffffffffc03bee71 ffff88041734a000 0000000000000000 ffff880415de7908 > Jul 13 17:05:59 acer kernel: [24873.946617] ffff88002405bb20 ffffffffc03be9bf ffff88002405bb58 ffffffffc03b7340 > Jul 13 17:05:59 acer kernel: [24873.946618] ffff880415de7908 ffff88041734a000 00000414ef5b0e40 0000000000000011 > Jul 13 17:05:59 acer kernel: [24873.946620] Call Trace: > Jul 13 17:05:59 acer kernel: [24873.946645] [<ffffffffc03bee71>] ? 
nv04_timer_read+0x51/0x70 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946661] [<ffffffffc03be9bf>] nvkm_timer_read+0xf/0x20 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946676] [<ffffffffc03b7340>] nvkm_pmu_init+0x50/0x450 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946685] [<ffffffffc0371ac1>] nvkm_subdev_init+0x91/0x200 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946701] [<ffffffffc03c2f26>] nvkm_device_init+0x146/0x280 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946715] [<ffffffffc03c6a18>] nvkm_udevice_init+0x48/0x60 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946724] [<ffffffffc0370440>] nvkm_object_init+0x40/0x190 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946733] [<ffffffffc03704b4>] nvkm_object_init+0xb4/0x190 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946742] [<ffffffffc036d56e>] nvkm_client_init+0xe/0x10 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946758] [<ffffffffc040ac0e>] nvkm_client_resume+0xe/0x10 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946767] [<ffffffffc036c7c7>] nvif_client_resume+0x17/0x20 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946782] [<ffffffffc04082fb>] nouveau_do_resume+0x4b/0x130 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946797] [<ffffffffc0408709>] nouveau_pmops_runtime_resume+0x79/0x120 [nouveau] > Jul 13 17:05:59 acer kernel: [24873.946800] [<ffffffff814448eb>] pci_pm_runtime_resume+0x7b/0xa0 > Jul 13 17:05:59 acer kernel: [24873.946801] [<ffffffff815676d3>] __rpm_callback+0x33/0x70 > Jul 13 17:05:59 acer kernel: [24873.946803] [<ffffffff81444870>] ? pci_restore_standard_config+0x40/0x40 > Jul 13 17:05:59 acer kernel: [24873.946804] [<ffffffff81567734>] rpm_callback+0x24/0x80 > Jul 13 17:05:59 acer kernel: [24873.946806] [<ffffffff81444870>] ? 
pci_restore_standard_config+0x40/0x40 > Jul 13 17:05:59 acer kernel: [24873.946807] [<ffffffff81567ee1>] rpm_resume+0x491/0x690 > Jul 13 17:05:59 acer kernel: [24873.946808] [<ffffffff81568f08>] pm_runtime_work+0x58/0xa0 > Jul 13 17:05:59 acer kernel: [24873.946811] [<ffffffff8109adbb>] process_one_work+0x16b/0x480 > Jul 13 17:05:59 acer kernel: [24873.946812] [<ffffffff8109b11b>] worker_thread+0x4b/0x500 > Jul 13 17:05:59 acer kernel: [24873.946814] [<ffffffff8109b0d0>] ? process_one_work+0x480/0x480 > Jul 13 17:05:59 acer kernel: [24873.946815] [<ffffffff8109b0d0>] ? process_one_work+0x480/0x480 > Jul 13 17:05:59 acer kernel: [24873.946816] [<ffffffff810a1348>] kthread+0xd8/0xf0 > Jul 13 17:05:59 acer kernel: [24873.946818] [<ffffffff81845fdf>] ret_from_fork+0x1f/0x40 > Jul 13 17:05:59 acer kernel: [24873.946819] [<ffffffff810a1270>] ? kthread_create_on_node+0x1a0/0x1a0 > Jul 13 17:05:59 acer kernel: [24873.946820] Code: 03 00 77 25 48 81 ff 00 00 01 00 76 05 0f b7 d7 ed c3 55 48 c7 c6 e4 36 cc 81 48 89 e5 e8 19 ff ff ff b8 ff ff ff ff 5d c3 8b 07 <c3 > > 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 81 fe ff ff > -Chris > > -- > Chris Wilson, Intel Open Source Technology Centre
On Fri, Jul 15, 2016 at 01:26:22PM +0200, Peter Wu wrote: > On Wed, Jul 13, 2016 at 06:17:47PM +0100, Chris Wilson wrote: > > Hmm, since suspend_work can theorectically rearm itself, this should be > > cancel_work_sync(). > > How so? The worker calls with state = FBINFO_STATE_RUNNING and > synchronous = true, so schedule_work() can never be called. No worries then, I fell victim to having to read the code again. -Chris
On Fri, Jul 15, 2016 at 01:10:51PM +0200, Peter Wu wrote: > On Wed, Jul 13, 2016 at 04:57:19PM +0200, Daniel Vetter wrote: > > On Wed, Jul 13, 2016 at 02:40:50PM +0200, Peter Wu wrote: > > > On Wed, Jul 13, 2016 at 11:54:49AM +0200, Daniel Vetter wrote: > > > > On Tue, Jul 12, 2016 at 06:49:34PM +0200, Peter Wu wrote: > > > > > The FBIOPUT_CON2FBMAP ioctl takes a console_lock(). When this is called > > > > > while nouveau was runtime suspended, a deadlock would occur due to > > > > > nouveau_fbcon_set_suspend also trying to obtain console_lock(). > > > > > > > > > > Fix this by delaying the drm_fb_helper_set_suspend call. Based on the > > > > > i915 code (which was done for performance reasons though). > > > > > > > > > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > > > > > Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > > > > > Signed-off-by: Peter Wu <peter@lekensteyn.nl> > > > > > --- > > > > > Tested on top of v4.7-rc5, the deadlock is gone. > > > > > > > > If we bother with this, it should imo be moved into the drm_fb_helper.c > > > > function drm_fb_helper_set_suspend(). But this also smells like some kind > > > > of bad duct-tape. I think Lukas is working on some other rpm vs. fbdev > > > > deadlocks, maybe we could fix them all with one proper fix? I've made some > > > > comments on Lukas' last patch series. > > > > > > This patch is only needed for drivers that use console_lock (for > > > drm_fb_helper_set_suspend) in their runtime resume functions. > > > Lukas posted fixes for runtime PM reference leaks, those are different > > > from this deadlock (see > > > https://lists.freedesktop.org/archives/dri-devel/2016-July/113005.html > > > for a backtrace for this issue). > > > > > > The deadlock could also be avoided if the device backing the fbcon is > > > somehow runtime-resumed outside the lock, but that feels like a larger > > > hack that does not seem easy. 
> > > > > > The i915 patch was done to reduce resume time (due to console_lock > > > contention), that feature seems useful to all other drivers too even if > > > the deadlock is fixed in a different way. > > > > I might have imagined something, but I thought Lukas is already working on > > some rpm vs. vga_switcheroo inversions. But looking again this was on the > > audio side. > > > > I think the proper solution for fbcon would be for the fbdev emulation to > > also hold a runtime pm references when the console is in use. > > nouveau does this by adding a fb_open and fb_release function that calls > pm_runtime_get and pm_runtime_put respectively (and is the only driver > doing this). So that is why it causes the console_lock issue for > nouveau, but not for others. > > > This should already happen correctly for vblank, the more tricky part > > is fbdev mmap and fbcon: > > > > - I have no idea, but there should be a way to intercept fbdev userspace > > mmaps and make sure that as long as an app has the fbdev mmapped we > > don't runtime suspend. No one really should be doing that (at least for > > normal setups), hence this shouldn't result in unsafe. > > mmap normally needs a fd right? When the chardev /dev/fbX is opened, the > fb_open function will be called. So nouveau should not have this issue > with mmap/read/write to a fb while the device is suspended. > > (This RPM thing was added in f231976c2e89 ("drm/nouveau/fbcon: take > runpm reference when userspace has an open fd"), maybe it is not a bad > idea for other drivers with RPM support either.) Yes, looks like nouveau implements this correctly already. I guess we could ponder whether we should lift this into the shared fbdev emulation, using some fbdev_rpm_get/put functions in dev->mode_config.helper_private. 
We can't just do the rpm stuff directly because: - gross midlayer violation - with the "pile of devices" approach arm chips employ it's unclear what exactly can't be runtime suspended when fbdev is still in use. But as long as no one else cares I'd go meh. > > - For fbcon, we could suspend it in the dpms off callbacks (maybe with a > > timer), and resume it only when enabling fbcon again - fbcon needs to > > redraw anyway on dpms on. > > Would this guarantee that the fb is suspended before/during suspend (of > the graphics device) and resumed somewhere during/after resume? > > Suspending the fb also has the effect that reads/writes to /dev/fbN > fail, maybe that is a bit too restricted since the framebuffer is just > accessible until the device is suspended. > > (Hmm, fb_read/fb_write does not seem to do any locking...) Hm, annoying. That pretty much means runtime pm breaks fbcon. I guess in practice no one will notice, since it generally should keep working for as long as the output is on (if we ignore stuff like manual refresh panels for a bit here ...). > > Another solution for fbcon might be to untangle the suspend/resume stuff > > and protect it by something else than console_lock. But that means > > fixing up fbcon locking horror shows. > > console_lock seems needed for some code down the call stack, removing it > risks some blow ups. > > Some archaeology: this locking problem was introduced with 054430e773c9 > ("fbcon: fix locking harder"). In the past fb_set_suspend also took the > fb_info lock but that was removed in 9e769ff3f585 ("fb: avoid possible > deadlock caused by fb_set_suspend"). Yup, this is the horror show I mean and which I think we'd need to fix here as the underlying issue. Let me elaborate: fbdev has a notifier chain to essentially implement dynamic function lookup. It's used for a bunch of things, and those functions are indexed with a set of #defines: - fbcon setup/signalling. This is essentially fbdev->fbcon calls. 
There are various different locking contexts/rules for the different callbacks. - iirc it also goes the other way, where fbcon calls into the notifier to update stuff - a few aux functions like backlight control and other random bits. The reason for this seems to be that it allows you to load fbcon and fbdev drivers in any order, and you'll still end up with an fbcon on every fbdev driver. The problem is that the notifier itself has its own mutex, which means that through that mutex every calling context interferes with every other calling context, resulting in all these deadlocks. Thus far the approach was to shuffle random bits around, but I think that really doesn't work well any more. There are 2 real fixes: - Nuke the fbdev notifier, at least for the fbcon interaction. This means we'll make fbcon a compile-time selection, and you can't decide any more at runtime (by loading or not loading the module) whether you want fbcon. Since distros all build in fbcon anyway I don't think anyone cares about that, since embedded folks disable everything they don't need anyway. This would be the cleanest solution I think. - Split up the fb notifier into the different calling contexts. This would definitely help for e.g. the backlight stuff (another area), but I'm not 100% sure it would work in your case here. Once we have that we've essentially undone the damage in 054430e773c9 and we could try to implement some proper locking (i.e. restore the fb_info lock) again. I won't have time for this myself, but I can definitely promise to review patches and help push them through - this has been annoying me (and lots of other people) for a long time. -Daniel > > Peter > > > > My current plan is to move stuff out of the lock and allow (just) > > > resuming the console to be delayed.
Some drivers (nouveau, > > > radeon/amdgpu, i915) do unnecessary stuff under the console lock: > > > > > > - nouveau: I *think* that cleraing/setting FBINFO_HWACCEL_DISABLED > > > (nouveau_fbcon_accel_restore) is safe outside the lock as the fb is > > > already suspended before clearing/after setting the flag. > > > - radeon: since the console is suspended, I don't think that that all > > > of the code is radeon_resume_kms is really needed. > > > - amdgpu: same as radeon. Btw, console_lock is leaked on an error path. > > > - i915: I think that clearing the fb memory can be done outside the > > > lock too as the console is suspended. > > > > > > Please correct me if my assumptions are flawed. > > > > Yeah, fixing this independent issues should definitely help, irrespective > > of how we fix fb_set_suspend. > > > > > > Besides this, when fixing a deadlock pls provide more details about the > > > > precise callchain and the locks involved in the deadlock. If you > > > > discovered this using lockdep, then just add the entire lockdep splat to > > > > the commit message. Otherwise there's lots of guesswork involved here. > > > > -Daniel > > > > > > There was no lockdep splat, it was triggered via the ioctl in the commit > > > message. I'll include the verbose trace from the previous mail in the > > > next proposed patch to reduce hunting though. > > > > Sounds good too. > > -Daniel > > -- > > Daniel Vetter > > Software Engineer, Intel Corporation > > http://blog.ffwll.ch
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 11f8dd9..f9a2c10 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -552,7 +552,7 @@ nouveau_do_suspend(struct drm_device *dev, bool runtime) if (dev->mode_config.num_crtc) { NV_INFO(drm, "suspending console...\n"); - nouveau_fbcon_set_suspend(dev, 1); + nouveau_fbcon_set_suspend(dev, FBINFO_STATE_SUSPENDED, true); NV_INFO(drm, "suspending display...\n"); ret = nouveau_display_suspend(dev, runtime); if (ret) @@ -635,7 +635,7 @@ nouveau_do_resume(struct drm_device *dev, bool runtime) NV_INFO(drm, "resuming display...\n"); nouveau_display_resume(dev, runtime); NV_INFO(drm, "resuming console...\n"); - nouveau_fbcon_set_suspend(dev, 0); + nouveau_fbcon_set_suspend(dev, FBINFO_STATE_RUNNING, false); } return 0; diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h index 822a021..a743d19 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drv.h +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h @@ -147,6 +147,7 @@ struct nouveau_drm { struct nouveau_channel *channel; struct nvkm_gpuobj *notify; struct nouveau_fbdev *fbcon; + struct work_struct fbdev_suspend_work; struct nvif_object nvsw; struct nvif_object ntfy; struct nvif_notify flip; diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c index d1f248f..089156a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c @@ -492,19 +492,53 @@ static const struct drm_fb_helper_funcs nouveau_fbcon_helper_funcs = { .fb_probe = nouveau_fbcon_create, }; +static void nouveau_fbcon_suspend_worker(struct work_struct *work) +{ + nouveau_fbcon_set_suspend(container_of(work, + struct nouveau_drm, + fbdev_suspend_work)->dev, + FBINFO_STATE_RUNNING, + true); +} + void -nouveau_fbcon_set_suspend(struct drm_device *dev, int state) +nouveau_fbcon_set_suspend(struct drm_device *dev, int state, 
bool synchronous) { struct nouveau_drm *drm = nouveau_drm(dev); - if (drm->fbcon) { - console_lock(); - if (state == FBINFO_STATE_RUNNING) - nouveau_fbcon_accel_restore(dev); - drm_fb_helper_set_suspend(&drm->fbcon->helper, state); + if (!drm->fbcon) + return; + + if (synchronous) { + /* Flush any pending work to turn the console on, and then + * wait to turn it off. It must be synchronous as we are + * about to suspend or unload the driver. + * + * Note that from within the work-handler, we cannot flush + * ourselves, so only flush outstanding work upon suspend! + */ if (state != FBINFO_STATE_RUNNING) - nouveau_fbcon_accel_save_disable(dev); - console_unlock(); + flush_work(&drm->fbdev_suspend_work); + console_lock(); + } else { + /* + * The console lock can be pretty contented on resume due + * to all the printk activity. Try to keep it out of the hot + * path of resume if possible. This also prevents a deadlock + * with FBIOPUT_CON2FBMAP. + */ + WARN_ON(state != FBINFO_STATE_RUNNING); + if (!console_trylock()) { + schedule_work(&drm->fbdev_suspend_work); + return; + } } + + if (state == FBINFO_STATE_RUNNING) + nouveau_fbcon_accel_restore(dev); + drm_fb_helper_set_suspend(&drm->fbcon->helper, state); + if (state != FBINFO_STATE_RUNNING) + nouveau_fbcon_accel_save_disable(dev); + console_unlock(); } int @@ -526,6 +560,8 @@ nouveau_fbcon_init(struct drm_device *dev) fbcon->dev = dev; drm->fbcon = fbcon; + INIT_WORK(&drm->fbdev_suspend_work, nouveau_fbcon_suspend_worker); + drm_fb_helper_prepare(dev, &fbcon->helper, &nouveau_fbcon_helper_funcs); ret = drm_fb_helper_init(dev, &fbcon->helper, @@ -571,6 +607,8 @@ nouveau_fbcon_fini(struct drm_device *dev) if (!drm->fbcon) return; + flush_work(&drm->fbdev_suspend_work); + nouveau_fbcon_accel_fini(dev); nouveau_fbcon_destroy(dev, drm->fbcon); kfree(drm->fbcon); diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.h b/drivers/gpu/drm/nouveau/nouveau_fbcon.h index ca77ad0..34b2504 100644 --- 
a/drivers/gpu/drm/nouveau/nouveau_fbcon.h +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.h @@ -66,7 +66,7 @@ void nouveau_fbcon_gpu_lockup(struct fb_info *info); int nouveau_fbcon_init(struct drm_device *dev); void nouveau_fbcon_fini(struct drm_device *dev); -void nouveau_fbcon_set_suspend(struct drm_device *dev, int state); +void nouveau_fbcon_set_suspend(struct drm_device *dev, int state, bool synchronous); void nouveau_fbcon_accel_save_disable(struct drm_device *dev); void nouveau_fbcon_accel_restore(struct drm_device *dev);
The FBIOPUT_CON2FBMAP ioctl takes a console_lock(). When this is called while nouveau was runtime suspended, a deadlock would occur due to nouveau_fbcon_set_suspend also trying to obtain console_lock(). Fix this by delaying the drm_fb_helper_set_suspend call. Based on the i915 code (which was done for performance reasons though). Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Signed-off-by: Peter Wu <peter@lekensteyn.nl> --- Tested on top of v4.7-rc5, the deadlock is gone. --- drivers/gpu/drm/nouveau/nouveau_drm.c | 4 +-- drivers/gpu/drm/nouveau/nouveau_drv.h | 1 + drivers/gpu/drm/nouveau/nouveau_fbcon.c | 54 ++++++++++++++++++++++++++++----- drivers/gpu/drm/nouveau/nouveau_fbcon.h | 2 +- 4 files changed, 50 insertions(+), 11 deletions(-)