diff mbox series

drm/i915: Fix NPD in PMU during driver teardown

Message ID d88ae0c8d7dda0d17ad52e48c4c2352df71fe85d.1656528043.git.stuart.summers@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915: Fix NPD in PMU during driver teardown | expand

Commit Message

Summers, Stuart June 29, 2022, 6:46 p.m. UTC
In the driver teardown, we are unregistering the gt prior
to unregistering the PMU. This means there is a small window
of time in which the application can request metrics from the
PMU, some of which are calling into the uapi engines list,
while the engines are not available. In this case we can
see null pointer dereferences.

Prevent this by simply checking if the engines are present
when those PMU events come through. Print a debug message
indicating when they aren't available.

Signed-off-by: Stuart Summers <stuart.summers@intel.com>
---
 drivers/gpu/drm/i915/i915_pmu.c | 72 +++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 30 deletions(-)

Comments

Tvrtko Ursulin June 30, 2022, 10:29 a.m. UTC | #1
On 29/06/2022 19:46, Stuart Summers wrote:
> In the driver teardown, we are unregistering the gt prior
> to unregistering the PMU. This means there is a small window
> of time in which the application can request metrics from the
> PMU, some of which are calling into the uapi engines list,
> while the engines are not available. In this case we can
> see null pointer dereferences.
> 
> Prevent this by simply checking if the engines are present
> when those PMU events come through. Print a debug message
> indicating when they aren't available.

Obvious question - can we just move PMU unregister PMU to before 
unregister GT?

Regards,

Tvrtko

> Signed-off-by: Stuart Summers <stuart.summers@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_pmu.c | 72 +++++++++++++++++++--------------
>   1 file changed, 42 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> index 958b37123bf1..796a1d8e36f2 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -670,21 +670,28 @@ static void i915_pmu_enable(struct perf_event *event)
>   	if (is_engine_event(event)) {
>   		u8 sample = engine_event_sample(event);
>   		struct intel_engine_cs *engine;
> -
> -		engine = intel_engine_lookup_user(i915,
> -						  engine_event_class(event),
> -						  engine_event_instance(event));
> -
> -		BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.enable_count) !=
> -			     I915_ENGINE_SAMPLE_COUNT);
> -		BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.sample) !=
> -			     I915_ENGINE_SAMPLE_COUNT);
> -		GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.enable_count));
> -		GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
> -		GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
> -
> -		engine->pmu.enable |= BIT(sample);
> -		engine->pmu.enable_count[sample]++;
> +		u8 class = engine_event_class(event);
> +		u8 instance = engine_event_instance(event);
> +
> +		engine = intel_engine_lookup_user(i915, class, instance);
> +		if (engine) {
> +			BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.enable_count) !=
> +				     I915_ENGINE_SAMPLE_COUNT);
> +			BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.sample) !=
> +				     I915_ENGINE_SAMPLE_COUNT);
> +			GEM_BUG_ON(sample >=
> +				   ARRAY_SIZE(engine->pmu.enable_count));
> +			GEM_BUG_ON(sample >=
> +				   ARRAY_SIZE(engine->pmu.sample));
> +			GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
> +
> +			engine->pmu.enable |= BIT(sample);
> +			engine->pmu.enable_count[sample]++;
> +		} else {
> +			drm_dbg(&i915->drm,
> +				"Invalid engine event: { class:%d, inst:%d }\n",
> +				class, instance);
> +		}
>   	}
>   
>   	spin_unlock_irqrestore(&pmu->lock, flags);
> @@ -714,21 +721,26 @@ static void i915_pmu_disable(struct perf_event *event)
>   	if (is_engine_event(event)) {
>   		u8 sample = engine_event_sample(event);
>   		struct intel_engine_cs *engine;
> -
> -		engine = intel_engine_lookup_user(i915,
> -						  engine_event_class(event),
> -						  engine_event_instance(event));
> -
> -		GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.enable_count));
> -		GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
> -		GEM_BUG_ON(engine->pmu.enable_count[sample] == 0);
> -
> -		/*
> -		 * Decrement the reference count and clear the enabled
> -		 * bitmask when the last listener on an event goes away.
> -		 */
> -		if (--engine->pmu.enable_count[sample] == 0)
> -			engine->pmu.enable &= ~BIT(sample);
> +		u8 class = engine_event_class(event);
> +		u8 instance = engine_event_instance(event);
> +
> +		engine = intel_engine_lookup_user(i915, class, instance);
> +		if (engine) {
> +			GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.enable_count));
> +			GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
> +			GEM_BUG_ON(engine->pmu.enable_count[sample] == 0);
> +
> +			/*
> +			 * Decrement the reference count and clear the enabled
> +			 * bitmask when the last listener on an event goes away.
> +			 */
> +			if (--engine->pmu.enable_count[sample] == 0)
> +				engine->pmu.enable &= ~BIT(sample);
> +		} else {
> +			drm_dbg(&i915->drm,
> +				"Invalid engine event: { class:%d, inst:%d }\n",
> +				class, instance);
> +		}
>   	}
>   
>   	GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
Summers, Stuart June 30, 2022, 7:04 p.m. UTC | #2
On Thu, 2022-06-30 at 11:29 +0100, Tvrtko Ursulin wrote:
> On 29/06/2022 19:46, Stuart Summers wrote:
> > In the driver teardown, we are unregistering the gt prior
> > to unregistering the PMU. This means there is a small window
> > of time in which the application can request metrics from the
> > PMU, some of which are calling into the uapi engines list,
> > while the engines are not available. In this case we can
> > see null pointer dereferences.
> > 
> > Prevent this by simply checking if the engines are present
> > when those PMU events come through. Print a debug message
> > indicating when they aren't available.
> 
> Obvious question - can we just move PMU unregister PMU to before 
> unregister GT?

Well I wanted to push the workaround asap to get feedback. I submitted
to trybot in parallel and it looks valid. I agree that's a more valid
solution, but IMO we should have these checks anyway.

At any rate, I'll resubmit with the move of the registrations.

Thanks,
Stuart

> 
> Regards,
> 
> Tvrtko
> 
> > Signed-off-by: Stuart Summers <stuart.summers@intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_pmu.c | 72 +++++++++++++++++++---------
> > -----
> >   1 file changed, 42 insertions(+), 30 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_pmu.c
> > b/drivers/gpu/drm/i915/i915_pmu.c
> > index 958b37123bf1..796a1d8e36f2 100644
> > --- a/drivers/gpu/drm/i915/i915_pmu.c
> > +++ b/drivers/gpu/drm/i915/i915_pmu.c
> > @@ -670,21 +670,28 @@ static void i915_pmu_enable(struct perf_event
> > *event)
> >   	if (is_engine_event(event)) {
> >   		u8 sample = engine_event_sample(event);
> >   		struct intel_engine_cs *engine;
> > -
> > -		engine = intel_engine_lookup_user(i915,
> > -						  engine_event_class(ev
> > ent),
> > -						  engine_event_instance
> > (event));
> > -
> > -		BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.enable_count) !=
> > -			     I915_ENGINE_SAMPLE_COUNT);
> > -		BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.sample) !=
> > -			     I915_ENGINE_SAMPLE_COUNT);
> > -		GEM_BUG_ON(sample >= ARRAY_SIZE(engine-
> > >pmu.enable_count));
> > -		GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
> > -		GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
> > -
> > -		engine->pmu.enable |= BIT(sample);
> > -		engine->pmu.enable_count[sample]++;
> > +		u8 class = engine_event_class(event);
> > +		u8 instance = engine_event_instance(event);
> > +
> > +		engine = intel_engine_lookup_user(i915, class,
> > instance);
> > +		if (engine) {
> > +			BUILD_BUG_ON(ARRAY_SIZE(engine-
> > >pmu.enable_count) !=
> > +				     I915_ENGINE_SAMPLE_COUNT);
> > +			BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.sample) !=
> > +				     I915_ENGINE_SAMPLE_COUNT);
> > +			GEM_BUG_ON(sample >=
> > +				   ARRAY_SIZE(engine-
> > >pmu.enable_count));
> > +			GEM_BUG_ON(sample >=
> > +				   ARRAY_SIZE(engine->pmu.sample));
> > +			GEM_BUG_ON(engine->pmu.enable_count[sample] ==
> > ~0);
> > +
> > +			engine->pmu.enable |= BIT(sample);
> > +			engine->pmu.enable_count[sample]++;
> > +		} else {
> > +			drm_dbg(&i915->drm,
> > +				"Invalid engine event: { class:%d,
> > inst:%d }\n",
> > +				class, instance);
> > +		}
> >   	}
> >   
> >   	spin_unlock_irqrestore(&pmu->lock, flags);
> > @@ -714,21 +721,26 @@ static void i915_pmu_disable(struct
> > perf_event *event)
> >   	if (is_engine_event(event)) {
> >   		u8 sample = engine_event_sample(event);
> >   		struct intel_engine_cs *engine;
> > -
> > -		engine = intel_engine_lookup_user(i915,
> > -						  engine_event_class(ev
> > ent),
> > -						  engine_event_instance
> > (event));
> > -
> > -		GEM_BUG_ON(sample >= ARRAY_SIZE(engine-
> > >pmu.enable_count));
> > -		GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
> > -		GEM_BUG_ON(engine->pmu.enable_count[sample] == 0);
> > -
> > -		/*
> > -		 * Decrement the reference count and clear the enabled
> > -		 * bitmask when the last listener on an event goes
> > away.
> > -		 */
> > -		if (--engine->pmu.enable_count[sample] == 0)
> > -			engine->pmu.enable &= ~BIT(sample);
> > +		u8 class = engine_event_class(event);
> > +		u8 instance = engine_event_instance(event);
> > +
> > +		engine = intel_engine_lookup_user(i915, class,
> > instance);
> > +		if (engine) {
> > +			GEM_BUG_ON(sample >= ARRAY_SIZE(engine-
> > >pmu.enable_count));
> > +			GEM_BUG_ON(sample >= ARRAY_SIZE(engine-
> > >pmu.sample));
> > +			GEM_BUG_ON(engine->pmu.enable_count[sample] ==
> > 0);
> > +
> > +			/*
> > +			 * Decrement the reference count and clear the
> > enabled
> > +			 * bitmask when the last listener on an event
> > goes away.
> > +			 */
> > +			if (--engine->pmu.enable_count[sample] == 0)
> > +				engine->pmu.enable &= ~BIT(sample);
> > +		} else {
> > +			drm_dbg(&i915->drm,
> > +				"Invalid engine event: { class:%d,
> > inst:%d }\n",
> > +				class, instance);
> > +		}
> >   	}
> >   
> >   	GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 958b37123bf1..796a1d8e36f2 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -670,21 +670,28 @@  static void i915_pmu_enable(struct perf_event *event)
 	if (is_engine_event(event)) {
 		u8 sample = engine_event_sample(event);
 		struct intel_engine_cs *engine;
-
-		engine = intel_engine_lookup_user(i915,
-						  engine_event_class(event),
-						  engine_event_instance(event));
-
-		BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.enable_count) !=
-			     I915_ENGINE_SAMPLE_COUNT);
-		BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.sample) !=
-			     I915_ENGINE_SAMPLE_COUNT);
-		GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.enable_count));
-		GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
-		GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
-
-		engine->pmu.enable |= BIT(sample);
-		engine->pmu.enable_count[sample]++;
+		u8 class = engine_event_class(event);
+		u8 instance = engine_event_instance(event);
+
+		engine = intel_engine_lookup_user(i915, class, instance);
+		if (engine) {
+			BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.enable_count) !=
+				     I915_ENGINE_SAMPLE_COUNT);
+			BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.sample) !=
+				     I915_ENGINE_SAMPLE_COUNT);
+			GEM_BUG_ON(sample >=
+				   ARRAY_SIZE(engine->pmu.enable_count));
+			GEM_BUG_ON(sample >=
+				   ARRAY_SIZE(engine->pmu.sample));
+			GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
+
+			engine->pmu.enable |= BIT(sample);
+			engine->pmu.enable_count[sample]++;
+		} else {
+			drm_dbg(&i915->drm,
+				"Invalid engine event: { class:%d, inst:%d }\n",
+				class, instance);
+		}
 	}
 
 	spin_unlock_irqrestore(&pmu->lock, flags);
@@ -714,21 +721,26 @@  static void i915_pmu_disable(struct perf_event *event)
 	if (is_engine_event(event)) {
 		u8 sample = engine_event_sample(event);
 		struct intel_engine_cs *engine;
-
-		engine = intel_engine_lookup_user(i915,
-						  engine_event_class(event),
-						  engine_event_instance(event));
-
-		GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.enable_count));
-		GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
-		GEM_BUG_ON(engine->pmu.enable_count[sample] == 0);
-
-		/*
-		 * Decrement the reference count and clear the enabled
-		 * bitmask when the last listener on an event goes away.
-		 */
-		if (--engine->pmu.enable_count[sample] == 0)
-			engine->pmu.enable &= ~BIT(sample);
+		u8 class = engine_event_class(event);
+		u8 instance = engine_event_instance(event);
+
+		engine = intel_engine_lookup_user(i915, class, instance);
+		if (engine) {
+			GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.enable_count));
+			GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
+			GEM_BUG_ON(engine->pmu.enable_count[sample] == 0);
+
+			/*
+			 * Decrement the reference count and clear the enabled
+			 * bitmask when the last listener on an event goes away.
+			 */
+			if (--engine->pmu.enable_count[sample] == 0)
+				engine->pmu.enable &= ~BIT(sample);
+		} else {
+			drm_dbg(&i915->drm,
+				"Invalid engine event: { class:%d, inst:%d }\n",
+				class, instance);
+		}
 	}
 
 	GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));