[v5,09/10] drm/i915/perf: execute OA configuration from command stream

Message ID 20190627080045.8814-10-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived
Series drm/i915: Vulkan performance query support

Commit Message

Lionel Landwerlin June 27, 2019, 8 a.m. UTC
We haven't run into issues with writing the global OA/NOA register
configuration from the CPU so far, but HW engineers actually recommend
doing this from the command streamer.

Since we have a command buffer prepared for the execbuffer side of
things, we can reuse that approach here too.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 drivers/gpu/drm/i915/i915_perf.c | 203 +++++++++++++++----------------
 1 file changed, 100 insertions(+), 103 deletions(-)

Comments

Chris Wilson June 27, 2019, 10:02 a.m. UTC | #1
Quoting Lionel Landwerlin (2019-06-27 09:00:44)
> We haven't run into issues with writing the global OA/NOA register
> configuration from the CPU so far, but HW engineers actually recommend
> doing this from the command streamer.
> 
> Since we have a command buffer prepared for the execbuffer side of
> things, we can reuse that approach here too.
> 
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_perf.c | 203 +++++++++++++++----------------
>  1 file changed, 100 insertions(+), 103 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index bf4f5fee6764..7e636463e1f5 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -389,6 +389,19 @@ void i915_oa_config_release(struct kref *ref)
>         kfree(oa_config);
>  }
>  
> +static void i915_oa_config_dispose_buffers(struct drm_i915_private *i915)
> +{
> +       struct i915_oa_config *oa_config, *next;
> +
> +       mutex_lock(&i915->perf.metrics_lock);
> +       list_for_each_entry_safe(oa_config, next, &i915->perf.metrics_buffers, vma_link) {
> +               list_del(&oa_config->vma_link);
> +               i915_gem_object_put(oa_config->obj);
> +               oa_config->obj = NULL;
> +       }
> +       mutex_unlock(&i915->perf.metrics_lock);
> +}
> +
>  static u32 *write_cs_mi_lri(u32 *cs, const struct i915_oa_reg *reg_data, u32 n_regs)
>  {
>         u32 i;
> @@ -1813,67 +1826,86 @@ static int alloc_noa_wait(struct drm_i915_private *i915)
>         return ret;
>  }
>  
> -static void config_oa_regs(struct drm_i915_private *dev_priv,
> -                          const struct i915_oa_reg *regs,
> -                          u32 n_regs)
> +static int config_oa_regs(struct drm_i915_private *i915,
> +                         struct i915_oa_config *oa_config)
>  {
> -       u32 i;
> +       struct i915_request *rq;
> +       struct i915_vma *vma;
> +       long timeout;
> +       u32 *cs;
> +       int err;
>  
> -       for (i = 0; i < n_regs; i++) {
> -               const struct i915_oa_reg *reg = regs + i;
> +       rq = i915_request_create(i915->engine[RCS0]->kernel_context);
> +       if (IS_ERR(rq))
> +               return PTR_ERR(rq);
>  
> -               I915_WRITE(reg->addr, reg->value);
> +       err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
> +                                     rq);

This will need an explicit mutex as it is not serialised by any single
timeline.

i915->perf.metrics_lock viable? Needs to be taken here and in execbuf.
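
For illustration only (a sketch, not part of the posted patch), the
suggestion might look roughly like this, assuming i915->perf.metrics_lock
can indeed be taken both here and in the execbuf path:

        mutex_lock(&i915->perf.metrics_lock);
        err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config, rq);
        mutex_unlock(&i915->perf.metrics_lock);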

> +       if (err) {
> +               i915_request_add(rq);
> +               return err;
> +       }
> +
> +       vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
> +       if (unlikely(IS_ERR(vma))) {
> +               i915_request_add(rq);
> +               return PTR_ERR(vma);
> +       }
> +
> +       err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
> +       if (err) {
> +               i915_request_add(rq);
> +               return err;
>         }

This time there is no excuse for not pinning outside of the
timeline->mutex.
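
A minimal sketch of that reordering (illustrative only, reusing the
patch's own calls): instantiate and pin the vma before creating the
request, so no pinning happens inside the request/timeline critical
section:

        vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
        if (IS_ERR(vma))
                return PTR_ERR(vma);

        err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
        if (err)
                return err;

        rq = i915_request_create(i915->engine[RCS0]->kernel_context);
        if (IS_ERR(rq)) {
                i915_vma_unpin(vma);
                return PTR_ERR(rq);
        }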

> +
> +       err = i915_vma_move_to_active(vma, rq, 0);
> +       if (err) {
> +               i915_vma_unpin(vma);
> +               i915_request_add(rq);
> +               return err;
> +       }

> +
> +       cs = intel_ring_begin(rq, INTEL_GEN(i915) >= 8 ? 4 : 2);
> +       if (IS_ERR(cs)) {
> +               i915_vma_unpin(vma);
> +               i915_request_add(rq);
> +               return PTR_ERR(cs);
> +       }
> +
> +       if (INTEL_GEN(i915) > 8) {
> +               *cs++ = MI_BATCH_BUFFER_START_GEN8;
> +               *cs++ = lower_32_bits(vma->node.start);
> +               *cs++ = upper_32_bits(vma->node.start);
> +               *cs++ = MI_NOOP;
> +       } else {
> +               *cs++ = MI_BATCH_BUFFER_START;
> +               *cs++ = vma->node.start;
> +       }
> +
> +       intel_ring_advance(rq, cs);
> +
> +       i915_vma_unpin(vma);
> +
> +       i915_request_add(rq);
> +
> +       i915_request_get(rq);

Strictly, before the add, as the add is the xfer of the ref.
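
That is, roughly (sketch only):

        i915_request_get(rq);   /* take our own reference while rq is still ours */
        i915_request_add(rq);   /* add() hands off the submission reference */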

> +       timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
> +                                   MAX_SCHEDULE_TIMEOUT);
> +       i915_request_put(rq);
> +
> +       return timeout < 0 ? err : 0;

Report err, but timeline carries the errno?
-Chris
Lionel Landwerlin June 27, 2019, 2:50 p.m. UTC | #2
On 27/06/2019 13:02, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2019-06-27 09:00:44)
>> We haven't run into issues with writing the global OA/NOA register
>> configuration from the CPU so far, but HW engineers actually recommend
>> doing this from the command streamer.
>>
>> Since we have a command buffer prepared for the execbuffer side of
>> things, we can reuse that approach here too.
>>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_perf.c | 203 +++++++++++++++----------------
>>   1 file changed, 100 insertions(+), 103 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index bf4f5fee6764..7e636463e1f5 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -389,6 +389,19 @@ void i915_oa_config_release(struct kref *ref)
>>          kfree(oa_config);
>>   }
>>   
>> +static void i915_oa_config_dispose_buffers(struct drm_i915_private *i915)
>> +{
>> +       struct i915_oa_config *oa_config, *next;
>> +
>> +       mutex_lock(&i915->perf.metrics_lock);
>> +       list_for_each_entry_safe(oa_config, next, &i915->perf.metrics_buffers, vma_link) {
>> +               list_del(&oa_config->vma_link);
>> +               i915_gem_object_put(oa_config->obj);
>> +               oa_config->obj = NULL;
>> +       }
>> +       mutex_unlock(&i915->perf.metrics_lock);
>> +}
>> +
>>   static u32 *write_cs_mi_lri(u32 *cs, const struct i915_oa_reg *reg_data, u32 n_regs)
>>   {
>>          u32 i;
>> @@ -1813,67 +1826,86 @@ static int alloc_noa_wait(struct drm_i915_private *i915)
>>          return ret;
>>   }
>>   
>> -static void config_oa_regs(struct drm_i915_private *dev_priv,
>> -                          const struct i915_oa_reg *regs,
>> -                          u32 n_regs)
>> +static int config_oa_regs(struct drm_i915_private *i915,
>> +                         struct i915_oa_config *oa_config)
>>   {
>> -       u32 i;
>> +       struct i915_request *rq;
>> +       struct i915_vma *vma;
>> +       long timeout;
>> +       u32 *cs;
>> +       int err;
>>   
>> -       for (i = 0; i < n_regs; i++) {
>> -               const struct i915_oa_reg *reg = regs + i;
>> +       rq = i915_request_create(i915->engine[RCS0]->kernel_context);
>> +       if (IS_ERR(rq))
>> +               return PTR_ERR(rq);
>>   
>> -               I915_WRITE(reg->addr, reg->value);
>> +       err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
>> +                                     rq);
> This will need an explicit mutex as it is not serialised by any single
> timeline.
>
> i915->perf.metrics_lock viable? Needs to be taken here and in execbuf.


Not sure that I understand. This function is called under struct_mutex lock.


>
>> +       if (err) {
>> +               i915_request_add(rq);
>> +               return err;
>> +       }
>> +
>> +       vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
>> +       if (unlikely(IS_ERR(vma))) {
>> +               i915_request_add(rq);
>> +               return PTR_ERR(vma);
>> +       }
>> +
>> +       err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
>> +       if (err) {
>> +               i915_request_add(rq);
>> +               return err;
>>          }
> This time there is no excuse for not pinning outside of the
> timeline->mutex.


Right, okay, but there is so much going on when turning on i915-perf:
draining + idling + context image editing.

Do we really need to optimize this in this patch?


After all, this removes the CPU wait that was done under the struct_mutex lock.


>
>> +
>> +       err = i915_vma_move_to_active(vma, rq, 0);
>> +       if (err) {
>> +               i915_vma_unpin(vma);
>> +               i915_request_add(rq);
>> +               return err;
>> +       }
>> +
>> +       cs = intel_ring_begin(rq, INTEL_GEN(i915) >= 8 ? 4 : 2);
>> +       if (IS_ERR(cs)) {
>> +               i915_vma_unpin(vma);
>> +               i915_request_add(rq);
>> +               return PTR_ERR(cs);
>> +       }
>> +
>> +       if (INTEL_GEN(i915) > 8) {
>> +               *cs++ = MI_BATCH_BUFFER_START_GEN8;
>> +               *cs++ = lower_32_bits(vma->node.start);
>> +               *cs++ = upper_32_bits(vma->node.start);
>> +               *cs++ = MI_NOOP;
>> +       } else {
>> +               *cs++ = MI_BATCH_BUFFER_START;
>> +               *cs++ = vma->node.start;
>> +       }
>> +
>> +       intel_ring_advance(rq, cs);
>> +
>> +       i915_vma_unpin(vma);
>> +
>> +       i915_request_add(rq);
>> +
>> +       i915_request_get(rq);
> Strictly, before the add, as the add is the xfer of the ref.


Oops.


>
>> +       timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
>> +                                   MAX_SCHEDULE_TIMEOUT);
>> +       i915_request_put(rq);
>> +
>> +       return timeout < 0 ? err : 0;
> Report err, but timeline carries the errno?


I meant to return timeout.
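
i.e. something along these lines (a sketch of the intended fix, flags
kept as in the posted patch):

        timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
                                    MAX_SCHEDULE_TIMEOUT);
        i915_request_put(rq);

        return timeout < 0 ? timeout : 0;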


> -Chris
>
Chris Wilson June 27, 2019, 3:04 p.m. UTC | #3
Quoting Lionel Landwerlin (2019-06-27 15:50:38)
> On 27/06/2019 13:02, Chris Wilson wrote:
> > Quoting Lionel Landwerlin (2019-06-27 09:00:44)
> >> @@ -1813,67 +1826,86 @@ static int alloc_noa_wait(struct drm_i915_private *i915)
> >>          return ret;
> >>   }
> >>   
> >> -static void config_oa_regs(struct drm_i915_private *dev_priv,
> >> -                          const struct i915_oa_reg *regs,
> >> -                          u32 n_regs)
> >> +static int config_oa_regs(struct drm_i915_private *i915,
> >> +                         struct i915_oa_config *oa_config)
> >>   {
> >> -       u32 i;
> >> +       struct i915_request *rq;
> >> +       struct i915_vma *vma;
> >> +       long timeout;
> >> +       u32 *cs;
> >> +       int err;
> >>   
> >> -       for (i = 0; i < n_regs; i++) {
> >> -               const struct i915_oa_reg *reg = regs + i;
> >> +       rq = i915_request_create(i915->engine[RCS0]->kernel_context);
> >> +       if (IS_ERR(rq))
> >> +               return PTR_ERR(rq);
> >>   
> >> -               I915_WRITE(reg->addr, reg->value);
> >> +       err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
> >> +                                     rq);
> > This will need an explicit mutex as it is not serialised by any single
> > timeline.
> >
> > i915->perf.metrics_lock viable? Needs to be taken here and in execbuf.
> 
> 
> Not sure that I understand. This function is called under struct_mutex lock.

The active_request is serialised by the timeline lock, for which
struct_mutex remains a stand-in for about the next 10 minutes. I want you
to nominate an explicit lock for serialisation of the last_oa_config
barrier.

> >
> >> +       if (err) {
> >> +               i915_request_add(rq);
> >> +               return err;
> >> +       }
> >> +
> >> +       vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
> >> +       if (unlikely(IS_ERR(vma))) {
> >> +               i915_request_add(rq);
> >> +               return PTR_ERR(vma);
> >> +       }
> >> +
> >> +       err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
> >> +       if (err) {
> >> +               i915_request_add(rq);
> >> +               return err;
> >>          }
> > This time there is no excuse for not pinning outside of the
> > timeline->mutex.
> 
> 
> Right, okay, but there is so much going on when turning on i915-perf, 
> draining + idling + context image edition.
> 
> Do we really need to optimize this in this patch?

It's not optimizing, it's about the lock nesting. To pin a vma, we may
ultimately need to emit requests, at which point trying to pin from
inside a request critical section is going to make lockdep rightfully
complain.

> After all, this removes the CPU wait that was done under the struct_mutex lock.

Puzzling, as all of this is under struct_mutex and may require waits :(
Fear not, removing struct_mutex here is easier than execbuf, so only 50
patches!

So long as you haven't used struct_mutex for serialising perf :)

> >> +       timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
> >> +                                   MAX_SCHEDULE_TIMEOUT);

One other thing, I915_WAIT_LOCKED is no longer used.
-Chris
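
(Following up on the I915_WAIT_LOCKED note above, a sketch only: with the
stale flag dropped, the wait would simply become

        timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE,
                                    MAX_SCHEDULE_TIMEOUT);

with the rest of the call unchanged.)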

Patch

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index bf4f5fee6764..7e636463e1f5 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -389,6 +389,19 @@  void i915_oa_config_release(struct kref *ref)
 	kfree(oa_config);
 }
 
+static void i915_oa_config_dispose_buffers(struct drm_i915_private *i915)
+{
+	struct i915_oa_config *oa_config, *next;
+
+	mutex_lock(&i915->perf.metrics_lock);
+	list_for_each_entry_safe(oa_config, next, &i915->perf.metrics_buffers, vma_link) {
+		list_del(&oa_config->vma_link);
+		i915_gem_object_put(oa_config->obj);
+		oa_config->obj = NULL;
+	}
+	mutex_unlock(&i915->perf.metrics_lock);
+}
+
 static u32 *write_cs_mi_lri(u32 *cs, const struct i915_oa_reg *reg_data, u32 n_regs)
 {
 	u32 i;
@@ -1813,67 +1826,86 @@  static int alloc_noa_wait(struct drm_i915_private *i915)
 	return ret;
 }
 
-static void config_oa_regs(struct drm_i915_private *dev_priv,
-			   const struct i915_oa_reg *regs,
-			   u32 n_regs)
+static int config_oa_regs(struct drm_i915_private *i915,
+			  struct i915_oa_config *oa_config)
 {
-	u32 i;
+	struct i915_request *rq;
+	struct i915_vma *vma;
+	long timeout;
+	u32 *cs;
+	int err;
 
-	for (i = 0; i < n_regs; i++) {
-		const struct i915_oa_reg *reg = regs + i;
+	rq = i915_request_create(i915->engine[RCS0]->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
 
-		I915_WRITE(reg->addr, reg->value);
+	err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
+				      rq);
+	if (err) {
+		i915_request_add(rq);
+		return err;
+	}
+
+	vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
+	if (unlikely(IS_ERR(vma))) {
+		i915_request_add(rq);
+		return PTR_ERR(vma);
+	}
+
+	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
+	if (err) {
+		i915_request_add(rq);
+		return err;
 	}
+
+	err = i915_vma_move_to_active(vma, rq, 0);
+	if (err) {
+		i915_vma_unpin(vma);
+		i915_request_add(rq);
+		return err;
+	}
+
+	cs = intel_ring_begin(rq, INTEL_GEN(i915) >= 8 ? 4 : 2);
+	if (IS_ERR(cs)) {
+		i915_vma_unpin(vma);
+		i915_request_add(rq);
+		return PTR_ERR(cs);
+	}
+
+	if (INTEL_GEN(i915) > 8) {
+		*cs++ = MI_BATCH_BUFFER_START_GEN8;
+		*cs++ = lower_32_bits(vma->node.start);
+		*cs++ = upper_32_bits(vma->node.start);
+		*cs++ = MI_NOOP;
+	} else {
+		*cs++ = MI_BATCH_BUFFER_START;
+		*cs++ = vma->node.start;
+	}
+
+	intel_ring_advance(rq, cs);
+
+	i915_vma_unpin(vma);
+
+	i915_request_add(rq);
+
+	i915_request_get(rq);
+	timeout = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
+				    MAX_SCHEDULE_TIMEOUT);
+	i915_request_put(rq);
+
+	return timeout < 0 ? err : 0;
 }
 
 static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	const struct i915_oa_config *oa_config = stream->oa_config;
 
-	/* PRM:
-	 *
-	 * OA unit is using “crclk” for its functionality. When trunk
-	 * level clock gating takes place, OA clock would be gated,
-	 * unable to count the events from non-render clock domain.
-	 * Render clock gating must be disabled when OA is enabled to
-	 * count the events from non-render domain. Unit level clock
-	 * gating for RCS should also be disabled.
-	 */
 	I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
 				    ~GEN7_DOP_CLOCK_GATE_ENABLE));
 	I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) |
 				  GEN6_CSUNIT_CLOCK_GATE_DISABLE));
 
-	config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
-
-	/* It apparently takes a fairly long time for a new MUX
-	 * configuration to be be applied after these register writes.
-	 * This delay duration was derived empirically based on the
-	 * render_basic config but hopefully it covers the maximum
-	 * configuration latency.
-	 *
-	 * As a fallback, the checks in _append_oa_reports() to skip
-	 * invalid OA reports do also seem to work to discard reports
-	 * generated before this config has completed - albeit not
-	 * silently.
-	 *
-	 * Unfortunately this is essentially a magic number, since we
-	 * don't currently know of a reliable mechanism for predicting
-	 * how long the MUX config will take to apply and besides
-	 * seeing invalid reports we don't know of a reliable way to
-	 * explicitly check that the MUX config has landed.
-	 *
-	 * It's even possible we've miss characterized the underlying
-	 * problem - it just seems like the simplest explanation why
-	 * a delay at this location would mitigate any invalid reports.
-	 */
-	usleep_range(15000, 20000);
-
-	config_oa_regs(dev_priv, oa_config->b_counter_regs,
-		       oa_config->b_counter_regs_len);
-
-	return 0;
+	return config_oa_regs(dev_priv, stream->oa_config);
 }
 
 static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
@@ -1978,7 +2010,6 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 {
 	unsigned int map_type = i915_coherent_map_type(dev_priv);
 	struct i915_gem_context *ctx;
-	struct i915_request *rq;
 	int ret;
 
 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
@@ -2037,14 +2068,9 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 	}
 
 	/*
-	 * Apply the configuration by doing one context restore of the edited
-	 * context image.
+	 * The above configuration will be applied when config_oa_regs()
+	 * is called.
 	 */
-	rq = i915_request_create(dev_priv->engine[RCS0]->kernel_context);
-	if (IS_ERR(rq))
-		return PTR_ERR(rq);
-
-	i915_request_add(rq);
 
 	return 0;
 }
@@ -2093,35 +2119,7 @@  static int gen8_enable_metric_set(struct i915_perf_stream *stream)
 	if (ret)
 		return ret;
 
-	config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
-
-	/* It apparently takes a fairly long time for a new MUX
-	 * configuration to be be applied after these register writes.
-	 * This delay duration was derived empirically based on the
-	 * render_basic config but hopefully it covers the maximum
-	 * configuration latency.
-	 *
-	 * As a fallback, the checks in _append_oa_reports() to skip
-	 * invalid OA reports do also seem to work to discard reports
-	 * generated before this config has completed - albeit not
-	 * silently.
-	 *
-	 * Unfortunately this is essentially a magic number, since we
-	 * don't currently know of a reliable mechanism for predicting
-	 * how long the MUX config will take to apply and besides
-	 * seeing invalid reports we don't know of a reliable way to
-	 * explicitly check that the MUX config has landed.
-	 *
-	 * It's even possible we've miss characterized the underlying
-	 * problem - it just seems like the simplest explanation why
-	 * a delay at this location would mitigate any invalid reports.
-	 */
-	usleep_range(15000, 20000);
-
-	config_oa_regs(dev_priv, oa_config->b_counter_regs,
-		       oa_config->b_counter_regs_len);
-
-	return 0;
+	return config_oa_regs(dev_priv, stream->oa_config);
 }
 
 static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
@@ -2292,6 +2290,7 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 			       struct perf_open_properties *props)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct drm_i915_gem_object *obj;
 	int format_size;
 	int ret;
 
@@ -2376,13 +2375,6 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		}
 	}
 
-	ret = i915_perf_get_oa_config(dev_priv, props->metrics_set,
-				      &stream->oa_config, NULL);
-	if (ret) {
-		DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
-		goto err_config;
-	}
-
 	ret = alloc_noa_wait(dev_priv);
 	if (ret) {
 		DRM_DEBUG("Unable to allocate NOA wait batch buffer\n");
@@ -2412,6 +2404,19 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	if (ret)
 		goto err_lock;
 
+	ret = i915_perf_get_oa_config(dev_priv, props->metrics_set,
+				      &stream->oa_config, &obj);
+	if (ret) {
+		DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
+		goto err_config;
+	}
+
+	/*
+	 * We just need the buffer to be created, but not our own reference on
+	 * it as the oa_config already has one.
+	 */
+	i915_gem_object_put(obj);
+
 	stream->ops = &i915_oa_stream_ops;
 	dev_priv->perf.oa.exclusive_stream = stream;
 
@@ -2430,14 +2435,16 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 err_enable:
 	dev_priv->perf.oa.exclusive_stream = NULL;
 	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
+
+err_config:
+	i915_oa_config_put(stream->oa_config);
+	i915_oa_config_dispose_buffers(dev_priv);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
 err_lock:
 	free_oa_buffer(dev_priv);
 
 err_oa_buf_alloc:
-	i915_oa_config_put(stream->oa_config);
-
 	intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
 	intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref);
 
@@ -2446,9 +2453,6 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
 err_noa_wait_alloc:
-	i915_oa_config_put(stream->oa_config);
-
-err_config:
 	if (stream->ctx)
 		oa_put_render_ctx_id(stream);
 
@@ -2810,20 +2814,13 @@  static int i915_perf_release(struct inode *inode, struct file *file)
 {
 	struct i915_perf_stream *stream = file->private_data;
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	struct i915_oa_config *oa_config, *next;
 
 	mutex_lock(&dev_priv->perf.lock);
 
 	i915_perf_destroy_locked(stream);
 
 	/* Dispose of all oa config batch buffers. */
-	mutex_lock(&dev_priv->perf.metrics_lock);
-	list_for_each_entry_safe(oa_config, next, &dev_priv->perf.metrics_buffers, vma_link) {
-		list_del(&oa_config->vma_link);
-		i915_gem_object_put(oa_config->obj);
-		oa_config->obj = NULL;
-	}
-	mutex_unlock(&dev_priv->perf.metrics_lock);
+	i915_oa_config_dispose_buffers(dev_priv);
 
 	mutex_unlock(&dev_priv->perf.lock);