diff mbox

[RFC,6/8] drm/i915: Insert commands for capture of OA counters in the ring

Message ID 1436950023-13940-7-git-send-email-sourab.gupta@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

sourab.gupta@intel.com July 15, 2015, 8:47 a.m. UTC
From: Sourab Gupta <sourab.gupta@intel.com>

This patch adds the routines which insert commands for capturing OA
snapshots into the ringbuffer of RCS engine.

The MI_REPORT_PERF_COUNT command, which captures snapshots of the OA
counters, is inserted at batch buffer (BB) boundaries.
While inserting the commands, we keep a reference of associated request.
This will be released when we are forwarding the samples to userspace
(or when the event is being destroyed).
Also, an active reference of the destination buffer is taken here, so that
we can be assured that the buffer is freed up only after GPU is done with
it, even if the local reference of the buffer is released.

Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            | 11 +++++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  6 +++
 drivers/gpu/drm/i915/i915_oa_perf.c        | 77 ++++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+)

Comments

Chris Wilson July 15, 2015, 10:26 a.m. UTC | #1
On Wed, Jul 15, 2015 at 02:17:01PM +0530, sourab.gupta@intel.com wrote:
> +void i915_oa_insert_cmd(struct intel_ringbuffer *ringbuf, u32 ctx_id)
You need to pass in the request here instead.

A better name would be i915_oa_emit_perf_report(). insert_cmd() is a
little too generic (i.e. which cmd do you mean?).

> +{
> +	struct intel_engine_cs *ring = ringbuf->ring;
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +	struct drm_i915_gem_object *obj = dev_priv->oa_pmu.oa_rcs_buffer.obj;
> +	struct i915_oa_rcs_node *entry;
> +	unsigned long lock_flags;
> +	u32 addr = 0;
> +	int ret;
> +
> +	/* OA counters are only supported on the render ring */
> +	if (ring->id != RCS)
> +		return;
> +
> +	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> +	if (entry == NULL) {
> +		DRM_ERROR("alloc failed\n");
> +		return;
> +	}
> +	entry->ctx_id = ctx_id;
> +	i915_gem_request_assign(&entry->req, ring->outstanding_lazy_request);
> +
> +	spin_lock_irqsave(&dev_priv->oa_pmu.lock, lock_flags);
> +	if (list_empty(&dev_priv->oa_pmu.node_list))
> +		entry->offset = 0;
> +	else {
> +		struct i915_oa_rcs_node *last_entry;
> +		int max_offset = dev_priv->oa_pmu.oa_rcs_buffer.node_count *
> +				dev_priv->oa_pmu.oa_rcs_buffer.node_size;
> +
> +		last_entry = list_last_entry(&dev_priv->oa_pmu.node_list,
> +					struct i915_oa_rcs_node, head);
> +		entry->offset = last_entry->offset +
> +				dev_priv->oa_pmu.oa_rcs_buffer.node_size;
> +
> +		if (entry->offset > max_offset)
> +			entry->offset = 0;
> +	}
> +	list_add_tail(&entry->head, &dev_priv->oa_pmu.node_list);
> +	spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags);
> +
> +	addr = i915_gem_obj_ggtt_offset(obj) + entry->offset;

Don't do more than one i915_gem_obj_to_ggtt() please (preferably none
and just keep hold of your pinned vma from the start).

> +	/* addr should be 64 byte aligned */
> +	BUG_ON(addr & 0x3f);
> +
> +	ret = intel_ring_begin(ring, 4);
> +	if (ret)
> +		return;

You've committed the request to the sample list, but have just erred out.

> +
> +	intel_ring_emit(ring, MI_REPORT_PERF_COUNT | (1<<0));
> +	intel_ring_emit(ring, addr | MI_REPORT_PERF_COUNT_GGTT);
> +	intel_ring_emit(ring, ring->outstanding_lazy_request->seqno);
> +	intel_ring_emit(ring, MI_NOOP);
> +	intel_ring_advance(ring);
> +
> +	obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
> +	i915_vma_move_to_active(i915_gem_obj_to_ggtt(obj), ring);

That's the magic I have been looking for.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b3d5f7e..fb296ae 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1660,6 +1660,11 @@  enum i915_oa_event_state {
 	I915_OA_EVENT_STOPPED,
 };
 
+enum i915_profile_mode {
+	I915_PROFILE_OA = 0,
+	I915_PROFILE_MAX,
+};
+
 struct i915_oa_rcs_node {
 	struct list_head head;
 	struct drm_i915_gem_request *req;
@@ -1974,6 +1979,9 @@  struct drm_i915_private {
 		struct work_struct work_timer;
 		struct work_struct work_event_destroy;
 	} oa_pmu;
+
+	void (*insert_profile_cmd[I915_PROFILE_MAX])
+		(struct intel_ringbuffer *ringbuf, u32 ctx_id);
 #endif
 
 	/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
@@ -3154,6 +3162,7 @@  void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
 				struct intel_context *context);
 void i915_oa_context_unpin_notify(struct drm_i915_private *dev_priv,
 				  struct intel_context *context);
+void i915_insert_profiling_cmd(struct intel_ringbuffer *ringbuf, u32 ctx_id);
 #else
 static inline void
 i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
@@ -3161,6 +3170,8 @@  i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
 static inline void
 i915_oa_context_unpin_notify(struct drm_i915_private *dev_priv,
 			     struct intel_context *context) {}
+void i915_insert_profiling_cmd(struct intel_ringbuffer *ringbuf,
+				u32 ctx_id) {};
 #endif
 
 /* i915_gem_evict.c */
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 3336e1c..2f8971b 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1317,6 +1317,9 @@  i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
 			goto error;
 	}
 
+	i915_insert_profiling_cmd(ring->buffer,
+		i915_execbuffer2_get_context_id(*args));
+
 	exec_len = args->batch_len;
 	if (cliprects) {
 		for (i = 0; i < args->num_cliprects; i++) {
@@ -1339,6 +1342,9 @@  i915_gem_ringbuffer_submission(struct drm_device *dev, struct drm_file *file,
 			return ret;
 	}
 
+	i915_insert_profiling_cmd(ring->buffer,
+		i915_execbuffer2_get_context_id(*args));
+
 	trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), dispatch_flags);
 
 	i915_gem_execbuffer_move_to_active(vmas, ring);
diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
index c1e3bea..9966e54 100644
--- a/drivers/gpu/drm/i915/i915_oa_perf.c
+++ b/drivers/gpu/drm/i915/i915_oa_perf.c
@@ -25,6 +25,78 @@  static int hsw_perf_format_sizes[] = {
 	64   /* C4_B8_HSW */
 };
 
+void i915_insert_profiling_cmd(struct intel_ringbuffer *ringbuf, u32 ctx_id)
+{
+	struct intel_engine_cs *ring = ringbuf->ring;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	int i;
+
+	for (i = I915_PROFILE_OA; i < I915_PROFILE_MAX; i++) {
+		if (dev_priv->insert_profile_cmd[i])
+			dev_priv->insert_profile_cmd[i](ringbuf, ctx_id);
+	}
+}
+
+void i915_oa_insert_cmd(struct intel_ringbuffer *ringbuf, u32 ctx_id)
+{
+	struct intel_engine_cs *ring = ringbuf->ring;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct drm_i915_gem_object *obj = dev_priv->oa_pmu.oa_rcs_buffer.obj;
+	struct i915_oa_rcs_node *entry;
+	unsigned long lock_flags;
+	u32 addr = 0;
+	int ret;
+
+	/* OA counters are only supported on the render ring */
+	if (ring->id != RCS)
+		return;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (entry == NULL) {
+		DRM_ERROR("alloc failed\n");
+		return;
+	}
+	entry->ctx_id = ctx_id;
+	i915_gem_request_assign(&entry->req, ring->outstanding_lazy_request);
+
+	spin_lock_irqsave(&dev_priv->oa_pmu.lock, lock_flags);
+	if (list_empty(&dev_priv->oa_pmu.node_list))
+		entry->offset = 0;
+	else {
+		struct i915_oa_rcs_node *last_entry;
+		int max_offset = dev_priv->oa_pmu.oa_rcs_buffer.node_count *
+				dev_priv->oa_pmu.oa_rcs_buffer.node_size;
+
+		last_entry = list_last_entry(&dev_priv->oa_pmu.node_list,
+					struct i915_oa_rcs_node, head);
+		entry->offset = last_entry->offset +
+				dev_priv->oa_pmu.oa_rcs_buffer.node_size;
+
+		if (entry->offset > max_offset)
+			entry->offset = 0;
+	}
+	list_add_tail(&entry->head, &dev_priv->oa_pmu.node_list);
+	spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags);
+
+	addr = i915_gem_obj_ggtt_offset(obj) + entry->offset;
+
+	/* addr should be 64 byte aligned */
+	BUG_ON(addr & 0x3f);
+
+	ret = intel_ring_begin(ring, 4);
+	if (ret)
+		return;
+
+	intel_ring_emit(ring, MI_REPORT_PERF_COUNT | (1<<0));
+	intel_ring_emit(ring, addr | MI_REPORT_PERF_COUNT_GGTT);
+	intel_ring_emit(ring, ring->outstanding_lazy_request->seqno);
+	intel_ring_emit(ring, MI_NOOP);
+	intel_ring_advance(ring);
+
+	obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
+	i915_vma_move_to_active(i915_gem_obj_to_ggtt(obj), ring);
+}
+
 static void forward_one_oa_snapshot_to_event(struct drm_i915_private *dev_priv,
 					     u8 *snapshot,
 					     struct perf_event *event)
@@ -1025,6 +1097,10 @@  static void i915_oa_event_start(struct perf_event *event, int flags)
 	dev_priv->oa_pmu.event_state = I915_OA_EVENT_STARTED;
 	update_oacontrol(dev_priv);
 
+	if (dev_priv->oa_pmu.multiple_ctx_mode)
+		dev_priv->insert_profile_cmd[I915_PROFILE_OA] =
+				i915_oa_insert_cmd;
+
 	/* Reset the head ptr to ensure we don't forward reports relating
 	 * to a previous perf event */
 	oastatus1 = I915_READ(GEN7_OASTATUS1);
@@ -1061,6 +1137,7 @@  static void i915_oa_event_stop(struct perf_event *event, int flags)
 
 		spin_lock_irqsave(&dev_priv->oa_pmu.lock, lock_flags);
 
+		dev_priv->insert_profile_cmd[I915_PROFILE_OA] = NULL;
 		dev_priv->oa_pmu.event_state = I915_OA_EVENT_STOP_IN_PROGRESS;
 		list_for_each_entry(entry, &dev_priv->oa_pmu.node_list, head)
 			entry->discard = true;