
[v3,5/7] drm/i915: add a new perf configuration execbuf parameter

Message ID 20190604131140.12647-6-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived
Series drm/i915: Vulkan performance query support

Commit Message

Lionel Landwerlin June 4, 2019, 1:11 p.m. UTC
We want the ability to dispatch a set of command buffers to the
hardware, each with a different OA configuration. To achieve this, we
reuse a couple of fields from the execbuf2 struct (I CAN HAZ
execbuf3?) to specify which OA configuration should be used for a
batch buffer. This requires the process issuing the execbuf with this
flag to also own the perf fd at the time of execbuf.
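
For illustration, here is a rough sketch of the userspace side (not part
of this patch; it assumes the uapi additions from this series are
available via <drm/i915_drm.h>): query the new
I915_PARAM_HAS_EXEC_PERF_CONFIG getparam, then fill a
drm_i915_gem_execbuffer_perf_ext with the perf stream fd and the OA
config ID. How the extension is chained into drm_i915_gem_execbuffer2 is
defined by the earlier patches of this series and is left out here.

/* Hypothetical userspace sketch -- not part of the patch. */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Returns non-zero if the kernel advertises per-execbuf OA config support. */
static int has_exec_perf_config(int drm_fd)
{
	int value = 0;
	struct drm_i915_getparam gp = {
		.param = I915_PARAM_HAS_EXEC_PERF_CONFIG,
		.value = &value,
	};

	if (ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp))
		return 0;
	return value;
}

/*
 * perf_fd is the fd returned by DRM_IOCTL_I915_PERF_OPEN (the stream must
 * be the exclusive OA stream), oa_config_id a metric set ID as used with
 * DRM_IOCTL_I915_PERF_ADD_CONFIG.  Chaining ext->base into the execbuf2
 * extension list is omitted here.
 */
static void fill_perf_ext(struct drm_i915_gem_execbuffer_perf_ext *ext,
			  int perf_fd, uint64_t oa_config_id)
{
	memset(ext, 0, sizeof(*ext));
	ext->perf_fd = perf_fd;
	ext->oa_config = oa_config_id;
}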

v2: Add an emit_oa_config() vfunc in the intel_engine_cs (Chris)
    Move oa_config vma to active (Chris)

v3: Don't drop the lock for engine lookup (Chris)
    Move OA config vma to active before writing the ringbuffer (Chris)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 110 +++++++++++++++++-
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |   7 ++
 drivers/gpu/drm/i915/gt/intel_lrc.c           |   1 +
 drivers/gpu/drm/i915/gt/intel_ringbuffer.c    |   4 +-
 drivers/gpu/drm/i915/i915_drv.c               |   4 +
 drivers/gpu/drm/i915/i915_drv.h               |   1 +
 drivers/gpu/drm/i915/i915_perf.c              |  14 +--
 include/uapi/drm/i915_drm.h                   |  37 ++++++
 8 files changed, 169 insertions(+), 9 deletions(-)

Comments

Chris Wilson June 4, 2019, 1:40 p.m. UTC | #1
Quoting Lionel Landwerlin (2019-06-04 14:11:38)
>         list_add_tail(&rq->client_link, &rq->file_priv->mm.request_list);
>  }
>  
> +static int eb_oa_config(struct i915_execbuffer *eb)
> +{
> +       struct i915_vma *oa_vma;
> +       int err;
> +
> +       if (!eb->oa_config)
> +               return 0;
> +
> +       /*
> +        * If the config hasn't changed, skip reconfiguring the HW (this is
> +        * subject to a delay we want to avoid as much as possible).
> +        */
> +       if (eb->oa_config == eb->i915->perf.oa.exclusive_stream->oa_config)
> +               return 0;

But you don't order the execution so it may not be the right oa_config.
Just add the barrier. It is virtually no cost for the exclusive oa
userspace.

How does this interact with the global oa_config being changed via the
ioctl? What significance is there for this per-execbuf oa_config being
applied to other users?
-Chris
Lionel Landwerlin June 4, 2019, 1:51 p.m. UTC | #2
On 04/06/2019 16:40, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2019-06-04 14:11:38)
>>          list_add_tail(&rq->client_link, &rq->file_priv->mm.request_list);
>>   }
>>   
>> +static int eb_oa_config(struct i915_execbuffer *eb)
>> +{
>> +       struct i915_vma *oa_vma;
>> +       int err;
>> +
>> +       if (!eb->oa_config)
>> +               return 0;
>> +
>> +       /*
>> +        * If the config hasn't changed, skip reconfiguring the HW (this is
>> +        * subject to a delay we want to avoid as much as possible).
>> +        */
>> +       if (eb->oa_config == eb->i915->perf.oa.exclusive_stream->oa_config)
>> +               return 0;
> But you don't order the execution so it may not be the right oa_config.
> Just add the barrier. It is virtually no cost for the exclusive oa
> userspace.


Ah right sorry :(


>
> How does this interact with the global oa_config being changed via the
> ioctl?


eb_oa_config() should be called under the global lock right?
Or are you referring to something else?


>   What significance is there for this per-execbuf oa_config being
> applied to other users?


The expectation is that a single application is using this, and the
data other users get from MI_REPORT_PERF_COUNT is undefined (much like
when OA is turned off).

Now I see there is a problem for an application with multiple contexts,
because the Flex EU counter configurations are per context.

I can split the config into two parts and execute the Flex EU counter
programming regardless of whether the config changed.


We really want to minimize the NOA reprogramming because it takes an
undefined amount of time to apply (I've been told below 1ms).


Thanks for spotting those issues.


-Lionel


> -Chris
>
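
As background for the MI_REPORT_PERF_COUNT point above, a rough sketch of
how userspace typically emits that command into a batch buffer on gen8+.
The encoding here (opcode 0x28, 4 dwords) is an assumption based on the
public PRMs and Mesa, so double-check it for a given platform; the
destination buffer is assumed to be 64-byte aligned and bound by other
means.

#include <stdint.h>

/*
 * Assumed gen8+ MI_REPORT_PERF_COUNT layout: 4 dwords -- header, 64-bit
 * destination address, report ID.  The OA unit snapshots the counters of
 * whatever configuration is currently loaded, which is why the data is
 * undefined for processes that do not own the configuration.
 */
#define MI_REPORT_PERF_COUNT_GEN8 ((0x28u << 23) | (4 - 2))

static uint32_t *
emit_mi_report_perf_count(uint32_t *cs, uint64_t dst_addr, uint32_t report_id)
{
	*cs++ = MI_REPORT_PERF_COUNT_GEN8;
	*cs++ = (uint32_t)dst_addr;         /* address low (64-byte aligned) */
	*cs++ = (uint32_t)(dst_addr >> 32); /* address high */
	*cs++ = report_id;                  /* echoed back in the OA report */
	return cs;
}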

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 02dc5480e8fe..4a785999a9c5 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -280,7 +280,11 @@  struct i915_execbuffer {
 	struct {
 		u64 flags; /** Available extensions parameters */
 		struct drm_i915_gem_exec_timeline_fences timeline_fences;
+		struct drm_i915_gem_execbuffer_perf_ext perf_config;
 	} extensions;
+
+	struct i915_oa_config *oa_config; /** HW configuration for OA, NULL if not needed. */
+	struct drm_i915_gem_object *oa_bo;
 };
 
 #define exec_entry(EB, VMA) (&(EB)->exec[(VMA)->exec_flags - (EB)->flags])
@@ -1198,6 +1202,34 @@  static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma)
 	return err;
 }
 
+
+static int
+get_execbuf_oa_config(struct drm_i915_private *dev_priv,
+		      s32 perf_fd, u64 oa_config_id,
+		      struct i915_oa_config **out_oa_config,
+		      struct drm_i915_gem_object **out_oa_obj)
+{
+	struct file *perf_file;
+	int ret;
+
+	if (!dev_priv->perf.oa.exclusive_stream)
+		return -EINVAL;
+
+	perf_file = fget(perf_fd);
+	if (!perf_file)
+		return -EINVAL;
+
+	if (perf_file->private_data != dev_priv->perf.oa.exclusive_stream)
+		return -EINVAL;
+
+	fput(perf_file);
+
+	ret = i915_perf_get_oa_config(dev_priv, oa_config_id,
+				      out_oa_config, out_oa_obj);
+
+	return ret;
+}
+
 static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
 			     struct i915_vma *vma,
 			     unsigned int len)
@@ -2060,6 +2092,51 @@  add_to_client(struct i915_request *rq, struct drm_file *file)
 	list_add_tail(&rq->client_link, &rq->file_priv->mm.request_list);
 }
 
+static int eb_oa_config(struct i915_execbuffer *eb)
+{
+	struct i915_vma *oa_vma;
+	int err;
+
+	if (!eb->oa_config)
+		return 0;
+
+	/*
+	 * If the config hasn't changed, skip reconfiguring the HW (this is
+	 * subject to a delay we want to avoid as much as possible).
+	 */
+	if (eb->oa_config == eb->i915->perf.oa.exclusive_stream->oa_config)
+		return 0;
+
+	oa_vma = i915_vma_instance(eb->oa_bo,
+				   &eb->engine->i915->ggtt.vm, NULL);
+	if (unlikely(IS_ERR(oa_vma)))
+		return PTR_ERR(oa_vma);
+
+	err = i915_vma_pin(oa_vma, 0, 0, PIN_GLOBAL);
+	if (err)
+		return err;
+
+	err = i915_vma_move_to_active(oa_vma, eb->request, 0);
+	if (err) {
+		i915_vma_unpin(oa_vma);
+		return err;
+	}
+
+	err = eb->engine->emit_bb_start(eb->request,
+					oa_vma->node.start,
+					0, I915_DISPATCH_SECURE);
+	if (err) {
+		i915_vma_unpin(oa_vma);
+		return err;
+	}
+
+	i915_vma_unpin(oa_vma);
+
+	swap(eb->oa_config, eb->i915->perf.oa.exclusive_stream->oa_config);
+
+	return 0;
+}
+
 static int eb_submit(struct i915_execbuffer *eb)
 {
 	int err;
@@ -2086,6 +2163,10 @@  static int eb_submit(struct i915_execbuffer *eb)
 			return err;
 	}
 
+	err = eb_oa_config(eb);
+	if (err)
+		return err;
+
 	err = eb->engine->emit_bb_start(eb->request,
 					eb->batch->node.start +
 					eb->batch_start_offset,
@@ -2519,6 +2600,13 @@  parse_execbuf2_extensions(struct drm_i915_gem_execbuffer2 *args,
 				return -EFAULT;
 			break;
 
+		case DRM_I915_GEM_BASE_EXECBUFFER_TYPE_PERF:
+			if (copy_from_user(&eb->extensions.perf_config,
+					   u64_to_user_ptr(iter_ptr),
+					   sizeof(eb->extensions.perf_config)))
+				return -EFAULT;
+			break;
+
 		default:
 			return -EINVAL;
 		}
@@ -2567,6 +2655,7 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 	eb.buffer_count = args->buffer_count;
 	eb.batch_start_offset = args->batch_start_offset;
 	eb.batch_len = args->batch_len;
+	eb.oa_config = NULL;
 
 	eb.batch_flags = 0;
 	if (args->flags & I915_EXEC_SECURE) {
@@ -2651,9 +2740,23 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 	if (unlikely(err))
 		goto err_unlock;
 
+	if (eb.extensions.flags & BIT(DRM_I915_GEM_BASE_EXECBUFFER_TYPE_PERF)) {
+		if (!intel_engine_has_oa(eb.engine)) {
+			err = -ENODEV;
+			goto err_engine;
+		}
+
+		err = get_execbuf_oa_config(eb.i915,
+					    eb.extensions.perf_config.perf_fd,
+					    eb.extensions.perf_config.oa_config,
+					    &eb.oa_config, &eb.oa_bo);
+		if (err)
+			goto err_engine;
+	}
+
 	err = eb_wait_for_ring(&eb); /* may temporarily drop struct_mutex */
 	if (unlikely(err))
-		goto err_engine;
+		goto err_oa;
 
 	err = eb_relocate(&eb);
 	if (err) {
@@ -2806,6 +2909,11 @@  i915_gem_do_execbuffer(struct drm_device *dev,
 err_vma:
 	if (eb.exec)
 		eb_release_vmas(&eb);
+err_oa:
+	if (eb.oa_config) {
+		i915_gem_object_put(eb.oa_bo);
+		i915_oa_config_put(eb.oa_config);
+	}
 err_engine:
 	eb_unpin_context(&eb);
 err_unlock:
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 01223864237a..97badb185eb2 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -457,6 +457,7 @@  struct intel_engine_cs {
 #define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
 #define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4)
 #define I915_ENGINE_IS_VIRTUAL       BIT(5)
+#define I915_ENGINE_HAS_OA           BIT(6)
 	unsigned int flags;
 
 	/*
@@ -552,6 +553,12 @@  intel_engine_is_virtual(const struct intel_engine_cs *engine)
 	return engine->flags & I915_ENGINE_IS_VIRTUAL;
 }
 
+static inline bool
+intel_engine_has_oa(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_HAS_OA;
+}
+
 #define instdone_slice_mask(dev_priv__) \
 	(IS_GEN(dev_priv__, 7) ? \
 	 1 : RUNTIME_INFO(dev_priv__)->sseu.slice_mask)
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index fed704802c57..ed19f4e53d31 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2732,6 +2732,7 @@  int intel_execlists_submission_setup(struct intel_engine_cs *engine)
 		engine->init_context = gen8_init_rcs_context;
 		engine->emit_flush = gen8_emit_flush_render;
 		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
+		engine->flags |= I915_ENGINE_HAS_OA;
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
index ff58d658e3e2..972193cfbf41 100644
--- a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
@@ -2212,8 +2212,10 @@  static void setup_rcs(struct intel_engine_cs *engine)
 		engine->irq_enable_mask = I915_USER_INTERRUPT;
 	}
 
-	if (IS_HASWELL(i915))
+	if (IS_HASWELL(i915)) {
 		engine->emit_bb_start = hsw_emit_bb_start;
+		engine->flags |= I915_ENGINE_HAS_OA;
+	}
 
 	engine->resume = rcs_resume;
 }
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index a872791f98bd..75029d1a3802 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -478,6 +478,10 @@  static int i915_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_PARAM_PERF_REVISION:
 		value = 1;
 		break;
+	case I915_PARAM_HAS_EXEC_PERF_CONFIG:
+		/* Obviously requires perf support. */
+		value = dev_priv->perf.initialized;
+		break;
 	default:
 		DRM_DEBUG("Unknown parameter %d\n", param->param);
 		return -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f1e51307253a..aefdae856e77 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2833,6 +2833,7 @@  int i915_perf_get_oa_config(struct drm_i915_private *i915,
 			    int metrics_set,
 			    struct i915_oa_config **out_config,
 			    struct drm_i915_gem_object **out_obj);
+void i915_oa_config_put(struct i915_oa_config *oa_config);
 
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct i915_address_space *vm,
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index e0071e44de3d..82a282f668c0 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -366,7 +366,7 @@  struct perf_open_properties {
 	int oa_period_exponent;
 };
 
-static void put_oa_config(struct i915_oa_config *oa_config)
+void i915_oa_config_put(struct i915_oa_config *oa_config)
 {
 	if (!atomic_dec_and_test(&oa_config->ref_count))
 		return;
@@ -500,7 +500,7 @@  int i915_perf_get_oa_config(struct drm_i915_private *i915,
 
 err_buf_alloc:
 	if (out_config) {
-		put_oa_config(oa_config);
+		i915_oa_config_put(oa_config);
 		*out_config = NULL;
 	}
 unlock:
@@ -1475,7 +1475,7 @@  static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 	if (stream->ctx)
 		oa_put_render_ctx_id(stream);
 
-	put_oa_config(stream->oa_config);
+	i915_oa_config_put(stream->oa_config);
 
 	if (dev_priv->perf.oa.spurious_report_rs.missed) {
 		DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n",
@@ -2243,7 +2243,7 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	free_oa_buffer(dev_priv);
 
 err_oa_buf_alloc:
-	put_oa_config(stream->oa_config);
+	i915_oa_config_put(stream->oa_config);
 
 	intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
 	intel_runtime_pm_put(dev_priv, stream->wakeref);
@@ -3406,7 +3406,7 @@  int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
 sysfs_err:
 	mutex_unlock(&dev_priv->perf.metrics_lock);
 reg_err:
-	put_oa_config(oa_config);
+	i915_oa_config_put(oa_config);
 	DRM_DEBUG("Failed to add new OA config\n");
 	return err;
 }
@@ -3460,7 +3460,7 @@  int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
 
 	DRM_DEBUG("Removed config %s id=%i\n", oa_config->uuid, oa_config->id);
 
-	put_oa_config(oa_config);
+	i915_oa_config_put(oa_config);
 
 config_err:
 	mutex_unlock(&dev_priv->perf.metrics_lock);
@@ -3622,7 +3622,7 @@  static int destroy_config(int id, void *p, void *data)
 {
 	struct i915_oa_config *oa_config = p;
 
-	put_oa_config(oa_config);
+	i915_oa_config_put(oa_config);
 
 	return 0;
 }
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index ba1b02859346..7f770183ee31 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -623,6 +623,16 @@  typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_EXEC_TIMELINE_FENCES 55
 
+/*
+ * Request an OA performance configuration change before running the commands
+ * given in an execbuf.
+ *
+ * Performance configuration ID is given in the DR4 field of
+ * drm_i915_gem_execbuffer2 and the file descriptor of the i915 perf stream is
+ * given in DR1. Execbuffer will fail if any of these parameters is invalid.
+ */
+#define I915_PARAM_HAS_EXEC_PERF_CONFIG 56
+
 /* Must be kept compact -- no holes and well documented */
 
 typedef struct drm_i915_getparam {
@@ -1026,6 +1036,12 @@  enum drm_i915_gem_base_execbuffer_type {
 	 */
 	DRM_I915_GEM_BASE_EXECBUFFER_TYPE_TIMELINE_FENCES,
 
+	/**
+	 * This identifier is associated with
+	 * drm_i915_gem_execbuffer_perf_ext.
+	 */
+	DRM_I915_GEM_BASE_EXECBUFFER_TYPE_PERF,
+
 	DRM_I915_GEM_BASE_EXECBUFFER_TYPE_MAX /* non-ABI */
 };
 
@@ -1073,6 +1089,27 @@  struct drm_i915_gem_exec_timeline_fences {
 	__u64 values_ptr;
 };
 
+struct drm_i915_gem_execbuffer_perf_ext {
+	struct drm_i915_gem_base_execbuffer_ext base;
+
+	/**
+	 * Performance file descriptor returned by DRM_IOCTL_I915_PERF_OPEN.
+	 * This is used to verify that the caller also owns the perf stream.
+	 */
+	__s32 perf_fd;
+
+	/**
+	 * Unused for now. Must be cleared to zero.
+	 */
+	__u32 pad;
+
+	/**
+	 * OA configuration ID to switch to before executing the commands
+	 * associated with the execbuf.
+	 */
+	__u64 oa_config;
+};
+
 struct drm_i915_gem_execbuffer2 {
 	/**
 	 * List of gem_exec_object2 structs