Message ID | 20210921174332.30784-1-matthew.brost@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/i915: fix blank screen booting crashes | expand |
On Tue, Sep 21, 2021 at 10:43:32AM -0700, Matthew Brost wrote: >From: Hugh Dickins <hughd@google.com> > >5.15-rc1 crashes with blank screen when booting up on two ThinkPads >using i915. Bisections converge convincingly, but arrive at different >and surprising "culprits", none of them the actual culprit. > >netconsole (with init_netconsole() hacked to call i915_init() when >logging has started, instead of by module_init()) tells the story: > >kernel BUG at drivers/gpu/drm/i915/i915_sw_fence.c:245! >with RSI: ffffffff814d408b pointing to sw_fence_dummy_notify(). >I've been building with CONFIG_CC_OPTIMIZE_FOR_SIZE=y, and that >function needs to be 4-byte aligned. > >v2: > (Jani Nikula) > - Change BUG_ON to WARN_ON >v3: > (Jani / Tvrtko) > - Short circuit __i915_sw_fence_init on WARN_ON > >Fixes: 62eaf0ae217d ("drm/i915/guc: Support request cancellation") >Signed-off-by: Hugh Dickins <hughd@google.com> >Signed-off-by: Matthew Brost <matthew.brost@intel.com> >Reviewed-by: Matthew Brost <matthew.brost@intel.com> >--- > drivers/gpu/drm/i915/gt/intel_context.c | 4 ++-- > drivers/gpu/drm/i915/i915_sw_fence.c | 17 ++++++++++------- > 2 files changed, 12 insertions(+), 9 deletions(-) > >diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c >index ff637147b1a9..e7f78bc7ebfc 100644 >--- a/drivers/gpu/drm/i915/gt/intel_context.c >+++ b/drivers/gpu/drm/i915/gt/intel_context.c >@@ -362,8 +362,8 @@ static int __intel_context_active(struct i915_active *active) > return 0; > } > >-static int sw_fence_dummy_notify(struct i915_sw_fence *sf, >- enum i915_sw_fence_notify state) >+static int __i915_sw_fence_call >+sw_fence_dummy_notify(struct i915_sw_fence *sf, enum i915_sw_fence_notify state) > { > return NOTIFY_DONE; > } >diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c >index c589a681da77..08cea73264e7 100644 >--- a/drivers/gpu/drm/i915/i915_sw_fence.c >+++ b/drivers/gpu/drm/i915/i915_sw_fence.c >@@ -13,9 +13,9 @@ > #include "i915_selftest.h" > > #if IS_ENABLED(CONFIG_DRM_I915_DEBUG) >-#define I915_SW_FENCE_BUG_ON(expr) BUG_ON(expr) >+#define I915_SW_FENCE_WARN_ON(expr) WARN_ON(expr) > #else >-#define I915_SW_FENCE_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr) >+#define I915_SW_FENCE_WARN_ON(expr) BUILD_BUG_ON_INVALID(expr) > #endif > > static DEFINE_SPINLOCK(i915_sw_fence_lock); >@@ -129,7 +129,10 @@ static int __i915_sw_fence_notify(struct i915_sw_fence *fence, > i915_sw_fence_notify_t fn; > > fn = (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK); >- return fn(fence, state); >+ if (likely(fn)) >+ return fn(fence, state); >+ else >+ return 0; since the knowledge for these being NULL (or with the wrong alignment) are in the init/reinit functions, wouldn't it be better to just add a fence_nop() and assign it there instead this likely() here? > } > > #ifdef CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS >@@ -242,9 +245,9 @@ void __i915_sw_fence_init(struct i915_sw_fence *fence, > const char *name, > struct lock_class_key *key) > { >- BUG_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK); >- > __init_waitqueue_head(&fence->wait, name, key); >+ if (WARN_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK)) >+ return; like: if (WARN_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK)) fence->flags = (unsigned long)sw_fence_dummy_notify; else fence->flags = (unsigned long)fn; f you return here instead of calling i915_sw_fence_reinit(), aren't you just going to use uninitialized memory later? At least in the selftests, which allocate it with kmalloc()... I didn't check others. For the bug fix we could just add the __aligned(4) and leave the rest to a separate patch. Lucas De Marchi > fence->flags = (unsigned long)fn; > > i915_sw_fence_reinit(fence); >@@ -257,8 +260,8 @@ void i915_sw_fence_reinit(struct i915_sw_fence *fence) > atomic_set(&fence->pending, 1); > fence->error = 0; > >- I915_SW_FENCE_BUG_ON(!fence->flags); >- I915_SW_FENCE_BUG_ON(!list_empty(&fence->wait.head)); >+ I915_SW_FENCE_WARN_ON(!fence->flags); >+ I915_SW_FENCE_WARN_ON(!list_empty(&fence->wait.head)); > } > > void i915_sw_fence_commit(struct i915_sw_fence *fence) >-- >2.32.0 >
On Tue, Sep 21, 2021 at 11:46:37AM -0700, Lucas De Marchi wrote: > On Tue, Sep 21, 2021 at 10:43:32AM -0700, Matthew Brost wrote: > > From: Hugh Dickins <hughd@google.com> > > > > 5.15-rc1 crashes with blank screen when booting up on two ThinkPads > > using i915. Bisections converge convincingly, but arrive at different > > and surprising "culprits", none of them the actual culprit. > > > > netconsole (with init_netconsole() hacked to call i915_init() when > > logging has started, instead of by module_init()) tells the story: > > > > kernel BUG at drivers/gpu/drm/i915/i915_sw_fence.c:245! > > with RSI: ffffffff814d408b pointing to sw_fence_dummy_notify(). > > I've been building with CONFIG_CC_OPTIMIZE_FOR_SIZE=y, and that > > function needs to be 4-byte aligned. > > > > v2: > > (Jani Nikula) > > - Change BUG_ON to WARN_ON > > v3: > > (Jani / Tvrtko) > > - Short circuit __i915_sw_fence_init on WARN_ON > > > > Fixes: 62eaf0ae217d ("drm/i915/guc: Support request cancellation") > > Signed-off-by: Hugh Dickins <hughd@google.com> > > Signed-off-by: Matthew Brost <matthew.brost@intel.com> > > Reviewed-by: Matthew Brost <matthew.brost@intel.com> > > --- > > drivers/gpu/drm/i915/gt/intel_context.c | 4 ++-- > > drivers/gpu/drm/i915/i915_sw_fence.c | 17 ++++++++++------- > > 2 files changed, 12 insertions(+), 9 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c > > index ff637147b1a9..e7f78bc7ebfc 100644 > > --- a/drivers/gpu/drm/i915/gt/intel_context.c > > +++ b/drivers/gpu/drm/i915/gt/intel_context.c > > @@ -362,8 +362,8 @@ static int __intel_context_active(struct i915_active *active) > > return 0; > > } > > > > > -static int sw_fence_dummy_notify(struct i915_sw_fence *sf, > > - enum i915_sw_fence_notify state) > > +static int __i915_sw_fence_call > > +sw_fence_dummy_notify(struct i915_sw_fence *sf, enum i915_sw_fence_notify state) > > { > > return NOTIFY_DONE; > > } > > diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c > > index c589a681da77..08cea73264e7 100644 > > --- a/drivers/gpu/drm/i915/i915_sw_fence.c > > +++ b/drivers/gpu/drm/i915/i915_sw_fence.c > > @@ -13,9 +13,9 @@ > > #include "i915_selftest.h" > > > > #if IS_ENABLED(CONFIG_DRM_I915_DEBUG) > > -#define I915_SW_FENCE_BUG_ON(expr) BUG_ON(expr) > > +#define I915_SW_FENCE_WARN_ON(expr) WARN_ON(expr) > > #else > > -#define I915_SW_FENCE_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr) > > +#define I915_SW_FENCE_WARN_ON(expr) BUILD_BUG_ON_INVALID(expr) > > #endif > > > > static DEFINE_SPINLOCK(i915_sw_fence_lock); > > @@ -129,7 +129,10 @@ static int __i915_sw_fence_notify(struct i915_sw_fence *fence, > > i915_sw_fence_notify_t fn; > > > > fn = (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK); > > - return fn(fence, state); > > + if (likely(fn)) > > + return fn(fence, state); > > + else > > + return 0; > > since the knowledge for these being NULL (or with the wrong alignment) > are in the init/reinit functions, wouldn't it be better to just add a > fence_nop() and assign it there instead this likely() here? > Maybe? I prefer the way it is. > > } > > > > #ifdef CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS > > @@ -242,9 +245,9 @@ void __i915_sw_fence_init(struct i915_sw_fence *fence, > > const char *name, > > struct lock_class_key *key) > > { > > - BUG_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK); > > - > > __init_waitqueue_head(&fence->wait, name, key); > > + if (WARN_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK)) > > + return; > > like: > if (WARN_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK)) > fence->flags = (unsigned long)sw_fence_dummy_notify; > else > fence->flags = (unsigned long)fn; > > > f you return here instead of calling i915_sw_fence_reinit(), aren't you > just going to use uninitialized memory later? At least in the selftests, > which allocate it with kmalloc()... I didn't check others. > I don't think so, maybe the fence won't work but it won't blow up either. > > For the bug fix we could just add the __aligned(4) and leave the rest to a > separate patch. > The bug was sw_fence_dummy_notify in gt/intel_context.c was not 4 byte align which triggered a BUG_ON during boot which blank screened a laptop. Jani / Tvrtko suggested that we make the BUG_ON to WARN_ONs so if someone makes this mistake in the future kernel should boot albiet with a WARNING. The long term fix is just pull out the I915_SW_FENCE_MASK (stealing bits from a poitner) and we don't have to worry any of this. Matt > > Lucas De Marchi > > > fence->flags = (unsigned long)fn; > > > > i915_sw_fence_reinit(fence); > > @@ -257,8 +260,8 @@ void i915_sw_fence_reinit(struct i915_sw_fence *fence) > > atomic_set(&fence->pending, 1); > > fence->error = 0; > > > > - I915_SW_FENCE_BUG_ON(!fence->flags); > > - I915_SW_FENCE_BUG_ON(!list_empty(&fence->wait.head)); > > + I915_SW_FENCE_WARN_ON(!fence->flags); > > + I915_SW_FENCE_WARN_ON(!list_empty(&fence->wait.head)); > > } > > > > void i915_sw_fence_commit(struct i915_sw_fence *fence) > > -- > > 2.32.0 > >
On Tue, Sep 21, 2021 at 03:55:15PM -0700, Matthew Brost wrote: >On Tue, Sep 21, 2021 at 11:46:37AM -0700, Lucas De Marchi wrote: >> On Tue, Sep 21, 2021 at 10:43:32AM -0700, Matthew Brost wrote: >> > From: Hugh Dickins <hughd@google.com> >> > >> > 5.15-rc1 crashes with blank screen when booting up on two ThinkPads >> > using i915. Bisections converge convincingly, but arrive at different >> > and surprising "culprits", none of them the actual culprit. >> > >> > netconsole (with init_netconsole() hacked to call i915_init() when >> > logging has started, instead of by module_init()) tells the story: >> > >> > kernel BUG at drivers/gpu/drm/i915/i915_sw_fence.c:245! >> > with RSI: ffffffff814d408b pointing to sw_fence_dummy_notify(). >> > I've been building with CONFIG_CC_OPTIMIZE_FOR_SIZE=y, and that >> > function needs to be 4-byte aligned. >> > >> > v2: >> > (Jani Nikula) >> > - Change BUG_ON to WARN_ON >> > v3: >> > (Jani / Tvrtko) >> > - Short circuit __i915_sw_fence_init on WARN_ON >> > >> > Fixes: 62eaf0ae217d ("drm/i915/guc: Support request cancellation") >> > Signed-off-by: Hugh Dickins <hughd@google.com> >> > Signed-off-by: Matthew Brost <matthew.brost@intel.com> >> > Reviewed-by: Matthew Brost <matthew.brost@intel.com> >> > --- >> > drivers/gpu/drm/i915/gt/intel_context.c | 4 ++-- >> > drivers/gpu/drm/i915/i915_sw_fence.c | 17 ++++++++++------- >> > 2 files changed, 12 insertions(+), 9 deletions(-) >> > >> > diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c >> > index ff637147b1a9..e7f78bc7ebfc 100644 >> > --- a/drivers/gpu/drm/i915/gt/intel_context.c >> > +++ b/drivers/gpu/drm/i915/gt/intel_context.c >> > @@ -362,8 +362,8 @@ static int __intel_context_active(struct i915_active *active) >> > return 0; >> > } >> > >> >> > -static int sw_fence_dummy_notify(struct i915_sw_fence *sf, >> > - enum i915_sw_fence_notify state) >> > +static int __i915_sw_fence_call >> > +sw_fence_dummy_notify(struct i915_sw_fence *sf, enum i915_sw_fence_notify state) >> > { >> > return NOTIFY_DONE; >> > } >> > diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c >> > index c589a681da77..08cea73264e7 100644 >> > --- a/drivers/gpu/drm/i915/i915_sw_fence.c >> > +++ b/drivers/gpu/drm/i915/i915_sw_fence.c >> > @@ -13,9 +13,9 @@ >> > #include "i915_selftest.h" >> > >> > #if IS_ENABLED(CONFIG_DRM_I915_DEBUG) >> > -#define I915_SW_FENCE_BUG_ON(expr) BUG_ON(expr) >> > +#define I915_SW_FENCE_WARN_ON(expr) WARN_ON(expr) >> > #else >> > -#define I915_SW_FENCE_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr) >> > +#define I915_SW_FENCE_WARN_ON(expr) BUILD_BUG_ON_INVALID(expr) >> > #endif >> > >> > static DEFINE_SPINLOCK(i915_sw_fence_lock); >> > @@ -129,7 +129,10 @@ static int __i915_sw_fence_notify(struct i915_sw_fence *fence, >> > i915_sw_fence_notify_t fn; >> > >> > fn = (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK); >> > - return fn(fence, state); >> > + if (likely(fn)) >> > + return fn(fence, state); >> > + else >> > + return 0; >> >> since the knowledge for these being NULL (or with the wrong alignment) >> are in the init/reinit functions, wouldn't it be better to just add a >> fence_nop() and assign it there instead this likely() here? >> > >Maybe? I prefer the way it is. > >> > } >> > >> > #ifdef CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS >> > @@ -242,9 +245,9 @@ void __i915_sw_fence_init(struct i915_sw_fence *fence, >> > const char *name, >> > struct lock_class_key *key) >> > { >> > - BUG_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK); >> > - >> > __init_waitqueue_head(&fence->wait, name, key); >> > + if (WARN_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK)) >> > + return; >> >> like: >> if (WARN_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK)) >> fence->flags = (unsigned long)sw_fence_dummy_notify; >> else >> fence->flags = (unsigned long)fn; >> >> >> f you return here instead of calling i915_sw_fence_reinit(), aren't you >> just going to use uninitialized memory later? At least in the selftests, >> which allocate it with kmalloc()... I didn't check others. >> > >I don't think so, maybe the fence won't work but it won't blow up >either. > >> >> For the bug fix we could just add the __aligned(4) and leave the rest to a >> separate patch. >> > >The bug was sw_fence_dummy_notify in gt/intel_context.c was not 4 byte >align which triggered a BUG_ON during boot which blank screened a >laptop. Jani / Tvrtko suggested that we make the BUG_ON to WARN_ONs so >if someone makes this mistake in the future kernel should boot albiet >with a WARNING. yes, I understood. But afaics with WARN_ON you are allowing it to continue and may be using uninitialized memory later, just causing other problems down the line, which may be equally difficult to debug. what I suggested is that there is the easy fix to apply to the current rcX kernel, adding __aligned(4) to sw_fence_dummy_notify() (patch 1). And there is the additional protection being added here (patch 2) which is subject to the debate. > >The long term fix is just pull out the I915_SW_FENCE_MASK (stealing bits >from a poitner) and we don't have to worry any of this. Patch 2 may not even be needed if you're going that route. But we are not only protecting against unaligned, but also from code calling i915_sw_fence_init() with a NULL fn. Lucas De Marchi > >Matt > >> >> Lucas De Marchi >> >> > fence->flags = (unsigned long)fn; >> > >> > i915_sw_fence_reinit(fence); >> > @@ -257,8 +260,8 @@ void i915_sw_fence_reinit(struct i915_sw_fence *fence) >> > atomic_set(&fence->pending, 1); >> > fence->error = 0; >> > >> > - I915_SW_FENCE_BUG_ON(!fence->flags); >> > - I915_SW_FENCE_BUG_ON(!list_empty(&fence->wait.head)); >> > + I915_SW_FENCE_WARN_ON(!fence->flags); >> > + I915_SW_FENCE_WARN_ON(!list_empty(&fence->wait.head)); >> > } >> > >> > void i915_sw_fence_commit(struct i915_sw_fence *fence) >> > -- >> > 2.32.0 >> >
On Tue, Sep 21, 2021 at 04:29:31PM -0700, Lucas De Marchi wrote: > On Tue, Sep 21, 2021 at 03:55:15PM -0700, Matthew Brost wrote: > > On Tue, Sep 21, 2021 at 11:46:37AM -0700, Lucas De Marchi wrote: > > > On Tue, Sep 21, 2021 at 10:43:32AM -0700, Matthew Brost wrote: > > > > From: Hugh Dickins <hughd@google.com> > > > > > > > > 5.15-rc1 crashes with blank screen when booting up on two ThinkPads > > > > using i915. Bisections converge convincingly, but arrive at different > > > > and surprising "culprits", none of them the actual culprit. > > > > > > > > netconsole (with init_netconsole() hacked to call i915_init() when > > > > logging has started, instead of by module_init()) tells the story: > > > > > > > > kernel BUG at drivers/gpu/drm/i915/i915_sw_fence.c:245! > > > > with RSI: ffffffff814d408b pointing to sw_fence_dummy_notify(). > > > > I've been building with CONFIG_CC_OPTIMIZE_FOR_SIZE=y, and that > > > > function needs to be 4-byte aligned. > > > > > > > > v2: > > > > (Jani Nikula) > > > > - Change BUG_ON to WARN_ON > > > > v3: > > > > (Jani / Tvrtko) > > > > - Short circuit __i915_sw_fence_init on WARN_ON > > > > > > > > Fixes: 62eaf0ae217d ("drm/i915/guc: Support request cancellation") > > > > Signed-off-by: Hugh Dickins <hughd@google.com> > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com> > > > > Reviewed-by: Matthew Brost <matthew.brost@intel.com> > > > > --- > > > > drivers/gpu/drm/i915/gt/intel_context.c | 4 ++-- > > > > drivers/gpu/drm/i915/i915_sw_fence.c | 17 ++++++++++------- > > > > 2 files changed, 12 insertions(+), 9 deletions(-) > > > > > > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c > > > > index ff637147b1a9..e7f78bc7ebfc 100644 > > > > --- a/drivers/gpu/drm/i915/gt/intel_context.c > > > > +++ b/drivers/gpu/drm/i915/gt/intel_context.c > > > > @@ -362,8 +362,8 @@ static int __intel_context_active(struct i915_active *active) > > > > return 0; > > > > } > > > > > > > > > > > -static int sw_fence_dummy_notify(struct i915_sw_fence *sf, > > > > - enum i915_sw_fence_notify state) > > > > +static int __i915_sw_fence_call > > > > +sw_fence_dummy_notify(struct i915_sw_fence *sf, enum i915_sw_fence_notify state) > > > > { > > > > return NOTIFY_DONE; > > > > } > > > > diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c > > > > index c589a681da77..08cea73264e7 100644 > > > > --- a/drivers/gpu/drm/i915/i915_sw_fence.c > > > > +++ b/drivers/gpu/drm/i915/i915_sw_fence.c > > > > @@ -13,9 +13,9 @@ > > > > #include "i915_selftest.h" > > > > > > > > #if IS_ENABLED(CONFIG_DRM_I915_DEBUG) > > > > -#define I915_SW_FENCE_BUG_ON(expr) BUG_ON(expr) > > > > +#define I915_SW_FENCE_WARN_ON(expr) WARN_ON(expr) > > > > #else > > > > -#define I915_SW_FENCE_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr) > > > > +#define I915_SW_FENCE_WARN_ON(expr) BUILD_BUG_ON_INVALID(expr) > > > > #endif > > > > > > > > static DEFINE_SPINLOCK(i915_sw_fence_lock); > > > > @@ -129,7 +129,10 @@ static int __i915_sw_fence_notify(struct i915_sw_fence *fence, > > > > i915_sw_fence_notify_t fn; > > > > > > > > fn = (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK); > > > > - return fn(fence, state); > > > > + if (likely(fn)) > > > > + return fn(fence, state); > > > > + else > > > > + return 0; > > > > > > since the knowledge for these being NULL (or with the wrong alignment) > > > are in the init/reinit functions, wouldn't it be better to just add a > > > fence_nop() and assign it there instead this likely() here? > > > > > > > Maybe? I prefer the way it is. > > > > > > } > > > > > > > > #ifdef CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS > > > > @@ -242,9 +245,9 @@ void __i915_sw_fence_init(struct i915_sw_fence *fence, > > > > const char *name, > > > > struct lock_class_key *key) > > > > { > > > > - BUG_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK); > > > > - > > > > __init_waitqueue_head(&fence->wait, name, key); > > > > + if (WARN_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK)) > > > > + return; > > > > > > like: > > > if (WARN_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK)) > > > fence->flags = (unsigned long)sw_fence_dummy_notify; > > > else > > > fence->flags = (unsigned long)fn; > > > > > > > > > f you return here instead of calling i915_sw_fence_reinit(), aren't you > > > just going to use uninitialized memory later? At least in the selftests, > > > which allocate it with kmalloc()... I didn't check others. > > > > > > > I don't think so, maybe the fence won't work but it won't blow up > > either. > > > > > > > > For the bug fix we could just add the __aligned(4) and leave the rest to a > > > separate patch. > > > > > > > The bug was sw_fence_dummy_notify in gt/intel_context.c was not 4 byte > > align which triggered a BUG_ON during boot which blank screened a > > laptop. Jani / Tvrtko suggested that we make the BUG_ON to WARN_ONs so > > if someone makes this mistake in the future kernel should boot albiet > > with a WARNING. > > yes, I understood. But afaics with WARN_ON you are allowing it to > continue and may be using uninitialized memory later, just causing other > problems down the line, which may be equally difficult to debug. > > what I suggested is that there is the easy fix to apply to the current > rcX kernel, adding __aligned(4) to sw_fence_dummy_notify() (patch 1). > And there is the additional protection being added here (patch 2) which > is subject to the debate. > Got it. Will post as 2 different patches. > > > > The long term fix is just pull out the I915_SW_FENCE_MASK (stealing bits > > from a poitner) and we don't have to worry any of this. > > Patch 2 may not even be needed if you're going that route. But we are > not only protecting against unaligned, but also from code calling > i915_sw_fence_init() with a NULL fn. > Maybe, I'll just do the proper fix in patch #2 right away. Matt > Lucas De Marchi > > > > > Matt > > > > > > > > Lucas De Marchi > > > > > > > fence->flags = (unsigned long)fn; > > > > > > > > i915_sw_fence_reinit(fence); > > > > @@ -257,8 +260,8 @@ void i915_sw_fence_reinit(struct i915_sw_fence *fence) > > > > atomic_set(&fence->pending, 1); > > > > fence->error = 0; > > > > > > > > - I915_SW_FENCE_BUG_ON(!fence->flags); > > > > - I915_SW_FENCE_BUG_ON(!list_empty(&fence->wait.head)); > > > > + I915_SW_FENCE_WARN_ON(!fence->flags); > > > > + I915_SW_FENCE_WARN_ON(!list_empty(&fence->wait.head)); > > > > } > > > > > > > > void i915_sw_fence_commit(struct i915_sw_fence *fence) > > > > -- > > > > 2.32.0 > > > >
On Tue, Sep 21, 2021 at 06:40:41PM -0700, Matthew Brost wrote: >On Tue, Sep 21, 2021 at 04:29:31PM -0700, Lucas De Marchi wrote: >> On Tue, Sep 21, 2021 at 03:55:15PM -0700, Matthew Brost wrote: >> > On Tue, Sep 21, 2021 at 11:46:37AM -0700, Lucas De Marchi wrote: >> > > On Tue, Sep 21, 2021 at 10:43:32AM -0700, Matthew Brost wrote: >> > > > From: Hugh Dickins <hughd@google.com> >> > > > >> > > > 5.15-rc1 crashes with blank screen when booting up on two ThinkPads >> > > > using i915. Bisections converge convincingly, but arrive at different >> > > > and surprising "culprits", none of them the actual culprit. >> > > > >> > > > netconsole (with init_netconsole() hacked to call i915_init() when >> > > > logging has started, instead of by module_init()) tells the story: >> > > > >> > > > kernel BUG at drivers/gpu/drm/i915/i915_sw_fence.c:245! >> > > > with RSI: ffffffff814d408b pointing to sw_fence_dummy_notify(). >> > > > I've been building with CONFIG_CC_OPTIMIZE_FOR_SIZE=y, and that >> > > > function needs to be 4-byte aligned. >> > > > >> > > > v2: >> > > > (Jani Nikula) >> > > > - Change BUG_ON to WARN_ON >> > > > v3: >> > > > (Jani / Tvrtko) >> > > > - Short circuit __i915_sw_fence_init on WARN_ON >> > > > >> > > > Fixes: 62eaf0ae217d ("drm/i915/guc: Support request cancellation") >> > > > Signed-off-by: Hugh Dickins <hughd@google.com> >> > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com> >> > > > Reviewed-by: Matthew Brost <matthew.brost@intel.com> >> > > > --- >> > > > drivers/gpu/drm/i915/gt/intel_context.c | 4 ++-- >> > > > drivers/gpu/drm/i915/i915_sw_fence.c | 17 ++++++++++------- >> > > > 2 files changed, 12 insertions(+), 9 deletions(-) >> > > > >> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c >> > > > index ff637147b1a9..e7f78bc7ebfc 100644 >> > > > --- a/drivers/gpu/drm/i915/gt/intel_context.c >> > > > +++ b/drivers/gpu/drm/i915/gt/intel_context.c >> > > > @@ -362,8 +362,8 @@ static int __intel_context_active(struct i915_active *active) >> > > > return 0; >> > > > } >> > > > >> > > >> > > > -static int sw_fence_dummy_notify(struct i915_sw_fence *sf, >> > > > - enum i915_sw_fence_notify state) >> > > > +static int __i915_sw_fence_call >> > > > +sw_fence_dummy_notify(struct i915_sw_fence *sf, enum i915_sw_fence_notify state) >> > > > { >> > > > return NOTIFY_DONE; >> > > > } >> > > > diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c >> > > > index c589a681da77..08cea73264e7 100644 >> > > > --- a/drivers/gpu/drm/i915/i915_sw_fence.c >> > > > +++ b/drivers/gpu/drm/i915/i915_sw_fence.c >> > > > @@ -13,9 +13,9 @@ >> > > > #include "i915_selftest.h" >> > > > >> > > > #if IS_ENABLED(CONFIG_DRM_I915_DEBUG) >> > > > -#define I915_SW_FENCE_BUG_ON(expr) BUG_ON(expr) >> > > > +#define I915_SW_FENCE_WARN_ON(expr) WARN_ON(expr) >> > > > #else >> > > > -#define I915_SW_FENCE_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr) >> > > > +#define I915_SW_FENCE_WARN_ON(expr) BUILD_BUG_ON_INVALID(expr) >> > > > #endif >> > > > >> > > > static DEFINE_SPINLOCK(i915_sw_fence_lock); >> > > > @@ -129,7 +129,10 @@ static int __i915_sw_fence_notify(struct i915_sw_fence *fence, >> > > > i915_sw_fence_notify_t fn; >> > > > >> > > > fn = (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK); >> > > > - return fn(fence, state); >> > > > + if (likely(fn)) >> > > > + return fn(fence, state); >> > > > + else >> > > > + return 0; >> > > >> > > since the knowledge for these being NULL (or with the wrong alignment) >> > > are in the init/reinit functions, wouldn't it be better to just add a >> > > fence_nop() and assign it there instead this likely() here? >> > > >> > >> > Maybe? I prefer the way it is. >> > >> > > > } >> > > > >> > > > #ifdef CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS >> > > > @@ -242,9 +245,9 @@ void __i915_sw_fence_init(struct i915_sw_fence *fence, >> > > > const char *name, >> > > > struct lock_class_key *key) >> > > > { >> > > > - BUG_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK); >> > > > - >> > > > __init_waitqueue_head(&fence->wait, name, key); >> > > > + if (WARN_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK)) >> > > > + return; >> > > >> > > like: >> > > if (WARN_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK)) >> > > fence->flags = (unsigned long)sw_fence_dummy_notify; >> > > else >> > > fence->flags = (unsigned long)fn; >> > > >> > > >> > > f you return here instead of calling i915_sw_fence_reinit(), aren't you >> > > just going to use uninitialized memory later? At least in the selftests, >> > > which allocate it with kmalloc()... I didn't check others. >> > > >> > >> > I don't think so, maybe the fence won't work but it won't blow up >> > either. >> > >> > > >> > > For the bug fix we could just add the __aligned(4) and leave the rest to a >> > > separate patch. >> > > >> > >> > The bug was sw_fence_dummy_notify in gt/intel_context.c was not 4 byte >> > align which triggered a BUG_ON during boot which blank screened a >> > laptop. Jani / Tvrtko suggested that we make the BUG_ON to WARN_ONs so >> > if someone makes this mistake in the future kernel should boot albiet >> > with a WARNING. >> >> yes, I understood. But afaics with WARN_ON you are allowing it to >> continue and may be using uninitialized memory later, just causing other >> problems down the line, which may be equally difficult to debug. >> >> what I suggested is that there is the easy fix to apply to the current >> rcX kernel, adding __aligned(4) to sw_fence_dummy_notify() (patch 1). >> And there is the additional protection being added here (patch 2) which >> is subject to the debate. >> > >Got it. Will post as 2 different patches. > >> > >> > The long term fix is just pull out the I915_SW_FENCE_MASK (stealing bits >> > from a poitner) and we don't have to worry any of this. >> >> Patch 2 may not even be needed if you're going that route. But we are >> not only protecting against unaligned, but also from code calling >> i915_sw_fence_init() with a NULL fn. >> > >Maybe, I'll just do the proper fix in patch #2 right away. makes sense. Thanks Lucas De Marchi
diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index ff637147b1a9..e7f78bc7ebfc 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -362,8 +362,8 @@ static int __intel_context_active(struct i915_active *active) return 0; } -static int sw_fence_dummy_notify(struct i915_sw_fence *sf, - enum i915_sw_fence_notify state) +static int __i915_sw_fence_call +sw_fence_dummy_notify(struct i915_sw_fence *sf, enum i915_sw_fence_notify state) { return NOTIFY_DONE; } diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c index c589a681da77..08cea73264e7 100644 --- a/drivers/gpu/drm/i915/i915_sw_fence.c +++ b/drivers/gpu/drm/i915/i915_sw_fence.c @@ -13,9 +13,9 @@ #include "i915_selftest.h" #if IS_ENABLED(CONFIG_DRM_I915_DEBUG) -#define I915_SW_FENCE_BUG_ON(expr) BUG_ON(expr) +#define I915_SW_FENCE_WARN_ON(expr) WARN_ON(expr) #else -#define I915_SW_FENCE_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr) +#define I915_SW_FENCE_WARN_ON(expr) BUILD_BUG_ON_INVALID(expr) #endif static DEFINE_SPINLOCK(i915_sw_fence_lock); @@ -129,7 +129,10 @@ static int __i915_sw_fence_notify(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn; fn = (i915_sw_fence_notify_t)(fence->flags & I915_SW_FENCE_MASK); - return fn(fence, state); + if (likely(fn)) + return fn(fence, state); + else + return 0; } #ifdef CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS @@ -242,9 +245,9 @@ void __i915_sw_fence_init(struct i915_sw_fence *fence, const char *name, struct lock_class_key *key) { - BUG_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK); - __init_waitqueue_head(&fence->wait, name, key); + if (WARN_ON(!fn || (unsigned long)fn & ~I915_SW_FENCE_MASK)) + return; fence->flags = (unsigned long)fn; i915_sw_fence_reinit(fence); @@ -257,8 +260,8 @@ void i915_sw_fence_reinit(struct i915_sw_fence *fence) atomic_set(&fence->pending, 1); fence->error = 0; - I915_SW_FENCE_BUG_ON(!fence->flags); - I915_SW_FENCE_BUG_ON(!list_empty(&fence->wait.head)); + I915_SW_FENCE_WARN_ON(!fence->flags); + I915_SW_FENCE_WARN_ON(!list_empty(&fence->wait.head)); } void i915_sw_fence_commit(struct i915_sw_fence *fence)