diff mbox series

[v7,11/12] drm/i915/perf: execute OA configuration from command stream

Message ID 20190709093208.20470-12-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915: Vulkan performance query support | expand

Commit Message

Lionel Landwerlin July 9, 2019, 9:32 a.m. UTC
We haven't run into issues with programming the global OA/NOA
registers configuration from CPU so far, but HW engineers actually
recommend doing this from the command streamer.

Since we have a command buffer prepared for the execbuffer side of
things, we can reuse that approach here too.

This also allows us to significantly reduce the amount of time we hold
the main lock.

v2: Drop the global lock as much as possible

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h  |   7 +
 drivers/gpu/drm/i915/i915_perf.c | 261 ++++++++++++++++++-------------
 2 files changed, 161 insertions(+), 107 deletions(-)

Comments

Chris Wilson July 9, 2019, 10:02 a.m. UTC | #1
Quoting Lionel Landwerlin (2019-07-09 10:32:07)
> +static int emit_oa_config(struct drm_i915_private *i915,
> +                         struct i915_perf_stream *stream)
>  {
> -       u32 i;
> +       struct i915_oa_config *oa_config = stream->oa_config;
> +       struct i915_request *rq = stream->initial_config_rq;
> +       struct i915_vma *vma;
> +       u32 *cs;
> +       int err;
>  
> -       for (i = 0; i < n_regs; i++) {
> -               const struct i915_oa_reg *reg = regs + i;
> +       vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
> +       if (unlikely(IS_ERR(vma)))
> +               return PTR_ERR(vma);
> +
> +       err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
> +       if (err)
> +               return err;

Hmm, still a lock inversion here as we will not be allowed to pin from
underneath rq->timeline->mutex.
-Chris
Lionel Landwerlin July 9, 2019, 11:04 a.m. UTC | #2
On 09/07/2019 13:02, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2019-07-09 10:32:07)
>> +static int emit_oa_config(struct drm_i915_private *i915,
>> +                         struct i915_perf_stream *stream)
>>   {
>> -       u32 i;
>> +       struct i915_oa_config *oa_config = stream->oa_config;
>> +       struct i915_request *rq = stream->initial_config_rq;
>> +       struct i915_vma *vma;
>> +       u32 *cs;
>> +       int err;
>>   
>> -       for (i = 0; i < n_regs; i++) {
>> -               const struct i915_oa_reg *reg = regs + i;
>> +       vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
>> +       if (unlikely(IS_ERR(vma)))
>> +               return PTR_ERR(vma);
>> +
>> +       err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
>> +       if (err)
>> +               return err;
> Hmm, still a lock inversion here as we will not be allowed to pin from
> underneath rq->timeline->mutex.
> -Chris
>
I can put this function back under the i915->drm.struct_mutex.

We still drop the lock to do the wait.


Sounds ok?


-Lionel
Lionel Landwerlin July 9, 2019, 11:16 a.m. UTC | #3
On 09/07/2019 14:04, Lionel Landwerlin wrote:
> On 09/07/2019 13:02, Chris Wilson wrote:
>> Quoting Lionel Landwerlin (2019-07-09 10:32:07)
>>> +static int emit_oa_config(struct drm_i915_private *i915,
>>> +                         struct i915_perf_stream *stream)
>>>   {
>>> -       u32 i;
>>> +       struct i915_oa_config *oa_config = stream->oa_config;
>>> +       struct i915_request *rq = stream->initial_config_rq;
>>> +       struct i915_vma *vma;
>>> +       u32 *cs;
>>> +       int err;
>>>   -       for (i = 0; i < n_regs; i++) {
>>> -               const struct i915_oa_reg *reg = regs + i;
>>> +       vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
>>> +       if (unlikely(IS_ERR(vma)))
>>> +               return PTR_ERR(vma);
>>> +
>>> +       err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
>>> +       if (err)
>>> +               return err;
>> Hmm, still a lock inversion here as we will not be allowed to pin from
>> underneath rq->timeline->mutex.
>> -Chris
>>
> I can put this function back under the i915->drm.struct_mutex.
>
> We still drop the lock to do the wait.
>
>
> Sounds ok?
>
>
> -Lionel


When with locking around the global pin.


-Lionel
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index dc0e4982c672..c72e7c746b57 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1205,6 +1205,13 @@  struct i915_perf_stream {
 	 */
 	intel_wakeref_t wakeref;
 
+	/**
+	 * @initial_config_rq: First request run at the opening of the i915
+	 * perf stream to configure the HW. Should be NULL after the perf
+	 * stream has been opened successfully.
+	 */
+	struct i915_request *initial_config_rq;
+
 	/**
 	 * @sample_flags: Flags representing the `DRM_I915_PERF_PROP_SAMPLE_*`
 	 * properties given when opening a stream, representing the contents
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 1e5011d4cdae..048a9abad26a 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -392,6 +392,19 @@  void i915_oa_config_release(struct kref *ref)
 	kfree(oa_config);
 }
 
+static void i915_oa_config_dispose_buffers(struct drm_i915_private *i915)
+{
+	struct i915_oa_config *oa_config, *next;
+
+	mutex_lock(&i915->perf.metrics_lock);
+	list_for_each_entry_safe(oa_config, next, &i915->perf.metrics_buffers, vma_link) {
+		list_del(&oa_config->vma_link);
+		i915_gem_object_put(oa_config->obj);
+		oa_config->obj = NULL;
+	}
+	mutex_unlock(&i915->perf.metrics_lock);
+}
+
 static u32 *write_cs_mi_lri(u32 *cs, const struct i915_oa_reg *reg_data, u32 n_regs)
 {
 	u32 i;
@@ -1449,6 +1462,14 @@  static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
 	}
 }
 
+static void free_noa_wait(struct drm_i915_private *i915)
+{
+	mutex_lock(&i915->drm.struct_mutex);
+	i915_vma_unpin_and_release(&i915->perf.oa.noa_wait,
+				   I915_VMA_RELEASE_MAP);
+	mutex_unlock(&i915->drm.struct_mutex);
+}
+
 static void
 free_oa_buffer(struct drm_i915_private *i915)
 {
@@ -1468,16 +1489,17 @@  static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 
 	BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
 
+	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
+
 	/*
 	 * Unset exclusive_stream first, it will be checked while disabling
 	 * the metric set on gen8+.
 	 */
 	mutex_lock(&dev_priv->drm.struct_mutex);
 	dev_priv->perf.oa.exclusive_stream = NULL;
-	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
-	i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
+	free_noa_wait(dev_priv);
 	free_oa_buffer(dev_priv);
 
 	intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
@@ -1710,6 +1732,10 @@  static int alloc_noa_wait(struct drm_i915_private *i915)
 		return PTR_ERR(bo);
 	}
 
+	ret = i915_mutex_lock_interruptible(&i915->drm);
+	if (ret)
+		goto err_unref;
+
 	/*
 	 * We pin in GGTT because we jump into this buffer now because
 	 * multiple OA config BOs will have a jump to this address and it
@@ -1717,10 +1743,13 @@  static int alloc_noa_wait(struct drm_i915_private *i915)
 	 */
 	vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0);
 	if (IS_ERR(vma)) {
+		mutex_unlock(&i915->drm.struct_mutex);
 		ret = PTR_ERR(vma);
 		goto err_unref;
 	}
 
+	mutex_unlock(&i915->drm.struct_mutex);
+
 	batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
 	if (IS_ERR(batch)) {
 		ret = PTR_ERR(batch);
@@ -1854,7 +1883,11 @@  static int alloc_noa_wait(struct drm_i915_private *i915)
 	return 0;
 
 err_unpin:
-	__i915_vma_unpin(vma);
+	mutex_lock(&i915->drm.struct_mutex);
+	i915_vma_unpin_and_release(&i915->perf.oa.noa_wait, 0);
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	return ret;
 
 err_unref:
 	i915_gem_object_put(bo);
@@ -1862,23 +1895,55 @@  static int alloc_noa_wait(struct drm_i915_private *i915)
 	return ret;
 }
 
-static void config_oa_regs(struct drm_i915_private *dev_priv,
-			   const struct i915_oa_reg *regs,
-			   u32 n_regs)
+static int emit_oa_config(struct drm_i915_private *i915,
+			  struct i915_perf_stream *stream)
 {
-	u32 i;
+	struct i915_oa_config *oa_config = stream->oa_config;
+	struct i915_request *rq = stream->initial_config_rq;
+	struct i915_vma *vma;
+	u32 *cs;
+	int err;
 
-	for (i = 0; i < n_regs; i++) {
-		const struct i915_oa_reg *reg = regs + i;
+	vma = i915_vma_instance(oa_config->obj, &i915->ggtt.vm, NULL);
+	if (unlikely(IS_ERR(vma)))
+		return PTR_ERR(vma);
+
+	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
+	if (err)
+		return err;
 
-		I915_WRITE(reg->addr, reg->value);
+	err = i915_vma_move_to_active(vma, rq, 0);
+	if (err) {
+		i915_vma_unpin(vma);
+		return err;
 	}
+
+	cs = intel_ring_begin(rq, INTEL_GEN(i915) >= 8 ? 4 : 2);
+	if (IS_ERR(cs)) {
+		i915_vma_unpin(vma);
+		return PTR_ERR(cs);
+	}
+
+	if (INTEL_GEN(i915) > 8) {
+		*cs++ = MI_BATCH_BUFFER_START_GEN8;
+		*cs++ = lower_32_bits(vma->node.start);
+		*cs++ = upper_32_bits(vma->node.start);
+		*cs++ = MI_NOOP;
+	} else {
+		*cs++ = MI_BATCH_BUFFER_START;
+		*cs++ = vma->node.start;
+	}
+
+	intel_ring_advance(rq, cs);
+
+	i915_vma_unpin(vma);
+
+	return 0;
 }
 
 static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	const struct i915_oa_config *oa_config = stream->oa_config;
 
 	/* PRM:
 	 *
@@ -1894,35 +1959,7 @@  static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 	I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) |
 				  GEN6_CSUNIT_CLOCK_GATE_DISABLE));
 
-	config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
-
-	/* It apparently takes a fairly long time for a new MUX
-	 * configuration to be be applied after these register writes.
-	 * This delay duration was derived empirically based on the
-	 * render_basic config but hopefully it covers the maximum
-	 * configuration latency.
-	 *
-	 * As a fallback, the checks in _append_oa_reports() to skip
-	 * invalid OA reports do also seem to work to discard reports
-	 * generated before this config has completed - albeit not
-	 * silently.
-	 *
-	 * Unfortunately this is essentially a magic number, since we
-	 * don't currently know of a reliable mechanism for predicting
-	 * how long the MUX config will take to apply and besides
-	 * seeing invalid reports we don't know of a reliable way to
-	 * explicitly check that the MUX config has landed.
-	 *
-	 * It's even possible we've miss characterized the underlying
-	 * problem - it just seems like the simplest explanation why
-	 * a delay at this location would mitigate any invalid reports.
-	 */
-	usleep_range(15000, 20000);
-
-	config_oa_regs(dev_priv, oa_config->b_counter_regs,
-		       oa_config->b_counter_regs_len);
-
-	return 0;
+	return emit_oa_config(dev_priv, stream);
 }
 
 static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
@@ -2027,10 +2064,18 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 {
 	unsigned int map_type = i915_coherent_map_type(dev_priv);
 	struct i915_gem_context *ctx;
-	struct i915_request *rq;
 	int ret;
 
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+	/* When calling without a configuration, we're tearing down the i915
+	 * perf stream. Don't be interruptible in that case.
+	 */
+	if (oa_config) {
+		ret = i915_mutex_lock_interruptible(&dev_priv->drm);
+		if (ret)
+			return ret;
+	} else {
+		mutex_lock(&dev_priv->drm.struct_mutex);
+	}
 
 	/*
 	 * The OA register config is setup through the context image. This image
@@ -2049,7 +2094,7 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 				     I915_WAIT_LOCKED,
 				     MAX_SCHEDULE_TIMEOUT);
 	if (ret)
-		return ret;
+		goto unlock;
 
 	/* Update all contexts now that we've stalled the submission. */
 	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
@@ -2072,7 +2117,8 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 						       map_type);
 			if (IS_ERR(regs)) {
 				i915_gem_context_unlock_engines(ctx);
-				return PTR_ERR(regs);
+				ret = PTR_ERR(regs);
+				goto unlock;
 			}
 
 			ce->state->obj->mm.dirty = true;
@@ -2086,16 +2132,14 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 	}
 
 	/*
-	 * Apply the configuration by doing one context restore of the edited
-	 * context image.
+	 * The above configuration will be applied when called
+	 * config_oa_regs().
 	 */
-	rq = i915_request_create(dev_priv->engine[RCS0]->kernel_context);
-	if (IS_ERR(rq))
-		return PTR_ERR(rq);
 
-	i915_request_add(rq);
+unlock:
+	mutex_unlock(&dev_priv->drm.struct_mutex);
 
-	return 0;
+	return ret;
 }
 
 static int gen8_enable_metric_set(struct i915_perf_stream *stream)
@@ -2142,35 +2186,7 @@  static int gen8_enable_metric_set(struct i915_perf_stream *stream)
 	if (ret)
 		return ret;
 
-	config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
-
-	/* It apparently takes a fairly long time for a new MUX
-	 * configuration to be be applied after these register writes.
-	 * This delay duration was derived empirically based on the
-	 * render_basic config but hopefully it covers the maximum
-	 * configuration latency.
-	 *
-	 * As a fallback, the checks in _append_oa_reports() to skip
-	 * invalid OA reports do also seem to work to discard reports
-	 * generated before this config has completed - albeit not
-	 * silently.
-	 *
-	 * Unfortunately this is essentially a magic number, since we
-	 * don't currently know of a reliable mechanism for predicting
-	 * how long the MUX config will take to apply and besides
-	 * seeing invalid reports we don't know of a reliable way to
-	 * explicitly check that the MUX config has landed.
-	 *
-	 * It's even possible we've miss characterized the underlying
-	 * problem - it just seems like the simplest explanation why
-	 * a delay at this location would mitigate any invalid reports.
-	 */
-	usleep_range(15000, 20000);
-
-	config_oa_regs(dev_priv, oa_config->b_counter_regs,
-		       oa_config->b_counter_regs_len);
-
-	return 0;
+	return emit_oa_config(dev_priv, stream);
 }
 
 static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
@@ -2341,7 +2357,9 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 			       struct perf_open_properties *props)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct drm_i915_gem_object *obj;
 	int format_size;
+	long timeout;
 	int ret;
 
 	/* If the sysfs metrics/ directory wasn't registered for some
@@ -2425,13 +2443,6 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		}
 	}
 
-	ret = i915_perf_get_oa_config(dev_priv, props->metrics_set,
-				      &stream->oa_config, NULL);
-	if (ret) {
-		DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
-		goto err_config;
-	}
-
 	ret = alloc_noa_wait(dev_priv);
 	if (ret) {
 		DRM_DEBUG("Unable to allocate NOA wait batch buffer\n");
@@ -2457,47 +2468,90 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	if (ret)
 		goto err_oa_buf_alloc;
 
+	ret = i915_perf_get_oa_config(dev_priv, props->metrics_set,
+				      &stream->oa_config, &obj);
+	if (ret) {
+		DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
+		goto err_config;
+	}
+
+	/*
+	 * We just need the buffer to be created, but not our own reference on
+	 * it as the oa_config already has one.
+	 */
+	i915_gem_object_put(obj);
+
+	stream->initial_config_rq =
+		i915_request_create(dev_priv->engine[RCS0]->kernel_context);
+	if (IS_ERR(stream->initial_config_rq)) {
+		ret = PTR_ERR(stream->initial_config_rq);
+		goto err_initial_config;
+	}
+
+	stream->ops = &i915_oa_stream_ops;
+
 	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
 	if (ret)
 		goto err_lock;
 
-	stream->ops = &i915_oa_stream_ops;
+	ret = i915_active_request_set(&dev_priv->engine[RCS0]->last_oa_config,
+				      stream->initial_config_rq);
+	if (ret) {
+		mutex_unlock(&dev_priv->drm.struct_mutex);
+		goto err_lock;
+	}
+
 	dev_priv->perf.oa.exclusive_stream = stream;
 
+	mutex_unlock(&dev_priv->drm.struct_mutex);
+
 	ret = dev_priv->perf.oa.ops.enable_metric_set(stream);
 	if (ret) {
 		DRM_DEBUG("Unable to enable metric set\n");
 		goto err_enable;
 	}
 
-	DRM_DEBUG("opening stream oa config uuid=%s\n", stream->oa_config->uuid);
+	i915_request_get(stream->initial_config_rq);
 
-	mutex_unlock(&dev_priv->drm.struct_mutex);
+	i915_request_add(stream->initial_config_rq);
+
+	timeout = i915_request_wait(stream->initial_config_rq,
+				    I915_WAIT_INTERRUPTIBLE,
+				    MAX_SCHEDULE_TIMEOUT);
+	i915_request_put(stream->initial_config_rq);
+	stream->initial_config_rq = NULL;
+
+	ret = timeout < 0 ? timeout : 0;
+	if (ret)
+		goto err_enable;
+
+	DRM_DEBUG("opening stream oa config uuid=%s\n", stream->oa_config->uuid);
 
 	return 0;
 
 err_enable:
+	mutex_lock(&dev_priv->drm.struct_mutex);
 	dev_priv->perf.oa.exclusive_stream = NULL;
-	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
+	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
 
 err_lock:
-	free_oa_buffer(dev_priv);
+	i915_request_add(stream->initial_config_rq);
 
-err_oa_buf_alloc:
+err_initial_config:
 	i915_oa_config_put(stream->oa_config);
+	i915_oa_config_dispose_buffers(dev_priv);
+
+err_config:
+	free_oa_buffer(dev_priv);
 
+err_oa_buf_alloc:
 	intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
 	intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref);
 
-	mutex_lock(&dev_priv->drm.struct_mutex);
-	i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0);
-	mutex_unlock(&dev_priv->drm.struct_mutex);
+	free_noa_wait(dev_priv);
 
 err_noa_wait_alloc:
-	i915_oa_config_put(stream->oa_config);
-
-err_config:
 	if (stream->ctx)
 		oa_put_render_ctx_id(stream);
 
@@ -2859,20 +2913,13 @@  static int i915_perf_release(struct inode *inode, struct file *file)
 {
 	struct i915_perf_stream *stream = file->private_data;
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	struct i915_oa_config *oa_config, *next;
 
 	mutex_lock(&dev_priv->perf.lock);
 
 	i915_perf_destroy_locked(stream);
 
 	/* Dispose of all oa config batch buffers. */
-	mutex_lock(&dev_priv->perf.metrics_lock);
-	list_for_each_entry_safe(oa_config, next, &dev_priv->perf.metrics_buffers, vma_link) {
-		list_del(&oa_config->vma_link);
-		i915_gem_object_put(oa_config->obj);
-		oa_config->obj = NULL;
-	}
-	mutex_unlock(&dev_priv->perf.metrics_lock);
+	i915_oa_config_dispose_buffers(dev_priv);
 
 	mutex_unlock(&dev_priv->perf.lock);