diff mbox

[RFC,3/8] drm/i915: Add mechanism for forwarding CS based OA counter snapshots through perf

Message ID 1436950023-13940-4-git-send-email-sourab.gupta@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

sourab.gupta@intel.com July 15, 2015, 8:46 a.m. UTC
From: Sourab Gupta <sourab.gupta@intel.com>

This patch adds the mechanism for forwarding the CS based OA snapshots
through the perf event interface.

The OA snapshots will be captured in a gem buffer object. The metadata
information (ctx_id right now) pertaining to each snapshot is maintained in
a list, which holds the offset into the gem buffer object of each captured
snapshot.

Each snapshot collected is forwarded as a separate perf sample. The perf
sample will have the raw OA report followed by the metadata information
pertaining to that sample. The size of the OA report is the one specified
during event init.

In order to track whether the gpu has completed processing the node, a
field for the corresponding gem request is added to the node. The request
is expected to be referenced whenever the gpu command is submitted.

While forwarding the samples, we check whether the gem request is completed
and dereference the corresponding request. The need to dereference the
request necessitates a worker here, which will be scheduled when the
hrtimer triggers.

While flushing the samples, we have to wait for the requests already
scheduled, before forwarding the samples. This wait is done in a lockless
fashion.

Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |  11 ++++
 drivers/gpu/drm/i915/i915_oa_perf.c | 128 +++++++++++++++++++++++++++++++++++-
 include/uapi/drm/i915_drm.h         |   5 ++
 3 files changed, 143 insertions(+), 1 deletion(-)
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 740148d..eb72f95 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1653,6 +1653,13 @@  struct i915_oa_reg {
 	u32 value;
 };
 
+struct i915_oa_rcs_node {	/* metadata for one CS-captured OA snapshot */
+	struct list_head head;	/* link in oa_pmu.node_list */
+	struct drm_i915_gem_request *req; /* completion gates forwarding */
+	u32 offset;		/* snapshot offset into oa_rcs_buffer.addr */
+	u32 ctx_id;		/* copied into the forwarded sample's metadata */
+};
+
 extern const struct i915_oa_reg i915_oa_3d_mux_config_hsw[];
 extern const int i915_oa_3d_mux_config_hsw_len;
 extern const struct i915_oa_reg i915_oa_3d_b_counter_config_hsw[];
@@ -1952,7 +1959,11 @@  struct drm_i915_private {
 			u8 *addr;
 			int format;
 			int format_size;
+			u32 node_size;
+			u32 node_count;
 		} oa_rcs_buffer;
+		struct list_head node_list;
+		struct work_struct work_timer;
 	} oa_pmu;
 #endif
 
diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
index b79582b..a4fdca3 100644
--- a/drivers/gpu/drm/i915/i915_oa_perf.c
+++ b/drivers/gpu/drm/i915/i915_oa_perf.c
@@ -58,6 +58,14 @@  static u32 forward_oa_snapshots(struct drm_i915_private *dev_priv,
 	u8 *snapshot;
 	u32 taken;
 
+	/*
+	 * Schedule a worker to forward the RCS based OA reports collected.
+	 * A worker is needed since it requires device mutex to be taken
+	 * which can't be done here because of atomic context
+	 */
+	if (dev_priv->oa_pmu.multiple_ctx_mode)
+		schedule_work(&dev_priv->oa_pmu.work_timer);
+
 	head -= dev_priv->oa_pmu.oa_buffer.gtt_offset;
 	tail -= dev_priv->oa_pmu.oa_buffer.gtt_offset;
 
@@ -165,6 +173,103 @@  static void flush_oa_snapshots(struct drm_i915_private *dev_priv,
 	spin_unlock_irqrestore(&dev_priv->oa_pmu.oa_buffer.flush_lock, flags);
 }
 
+/* Wait for the newest queued node's request; returns 0 or the wait error. */
+int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
+{
+	struct drm_i915_gem_request *req = NULL;
+	struct i915_oa_rcs_node *last_entry;
+	unsigned long lock_flags;
+	int ret;
+
+	/*
+	 * Waiting on the last scheduled request implicitly waits for the
+	 * prior ones. Hold a temporary reference while waiting: the worker
+	 * may unlink and free the node once oa_pmu.lock is dropped.
+	 */
+	spin_lock_irqsave(&dev_priv->oa_pmu.lock, lock_flags);
+	if (!list_empty(&dev_priv->oa_pmu.node_list)) {
+		last_entry = list_last_entry(&dev_priv->oa_pmu.node_list,
+				struct i915_oa_rcs_node, head);
+		if (last_entry->req)
+			req = i915_gem_request_reference(last_entry->req);
+	}
+	spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags);
+	if (!req)
+		return 0;
+
+	ret = __i915_wait_request(req,
+			atomic_read(&dev_priv->gpu_error.reset_counter),
+			dev_priv->mm.interruptible, NULL, NULL);
+	if (ret)
+		DRM_ERROR("failed to wait\n");
+	i915_gem_request_unreference__unlocked(req);
+	return ret;
+}
+
+/* Emit one perf sample: the node's raw OA report followed by its ctx_id. */
+static void forward_one_oa_rcs_sample(struct drm_i915_private *dev_priv,
+				struct i915_oa_rcs_node *node)
+{
+	struct perf_event *event = dev_priv->oa_pmu.exclusive_event;
+	int format_size = dev_priv->oa_pmu.oa_rcs_buffer.format_size;
+	u8 *snapshot = dev_priv->oa_pmu.oa_rcs_buffer.addr + node->offset;
+	struct drm_i915_oa_node_ctx_id *ctx_info;
+	struct perf_sample_data data;
+	struct perf_raw_record raw;
+
+	/* The metadata is written in place, right after the raw report. */
+	ctx_info = (struct drm_i915_oa_node_ctx_id *)(snapshot + format_size);
+	ctx_info->ctx_id = node->ctx_id;
+	/* pad is forwarded to userspace too, so never leave it stale. */
+	ctx_info->pad = 0;
+
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+
+	/* Note: the combined u32 raw->size member + raw data itself must be 8
+	 * byte aligned. (See note in init_oa_buffer for more details) */
+	raw.size = format_size + sizeof(*ctx_info) + 4;
+	raw.data = snapshot;
+
+	data.raw = &raw;
+
+	perf_event_overflow(event, &data, &dev_priv->oa_pmu.dummy_regs);
+}
+
+/* Worker: forward completed nodes' snapshots to perf and free the nodes. */
+void forward_oa_rcs_snapshots_work(struct work_struct *__work)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(__work, typeof(*dev_priv), oa_pmu.work_timer);
+	struct i915_oa_rcs_node *entry, *next;
+	unsigned long lock_flags;
+
+	list_for_each_entry_safe
+		(entry, next, &dev_priv->oa_pmu.node_list, head) {
+		/* Stop at the first node whose request has not completed. */
+		if (!entry->req ||
+		    !i915_gem_request_completed(entry->req, true))
+			break;
+
+		/*
+		 * Drop the request before forwarding: if the device mutex
+		 * cannot be taken the node stays queued, so a sample sent
+		 * earlier would be forwarded again on the next run.
+		 */
+		if (i915_mutex_lock_interruptible(dev_priv->dev))
+			break;
+		i915_gem_request_assign(&entry->req, NULL);
+		mutex_unlock(&dev_priv->dev->struct_mutex);
+		forward_one_oa_rcs_sample(dev_priv, entry);
+
+		/* Open question from the RFC: should oa_pmu.lock cover the
+		 * whole walk (via a deferred list), not just the unlink? */
+		spin_lock_irqsave(&dev_priv->oa_pmu.lock, lock_flags);
+		list_del(&entry->head);
+		spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags);
+		kfree(entry);
+	}
+}
+
 static void
 oa_rcs_buffer_destroy(struct drm_i915_private *i915)
 {
@@ -364,7 +469,7 @@  static int init_oa_rcs_buffer(struct perf_event *event)
 	struct drm_i915_private *dev_priv =
 		container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
 	struct drm_i915_gem_object *bo;
-	int ret;
+	int ret, node_size;
 
 	BUG_ON(dev_priv->oa_pmu.oa_rcs_buffer.obj);
 
@@ -375,6 +480,16 @@  static int init_oa_rcs_buffer(struct perf_event *event)
 	dev_priv->oa_pmu.oa_rcs_buffer.obj = bo;
 
 	dev_priv->oa_pmu.oa_rcs_buffer.addr = vmap_oa_buffer(bo);
+	INIT_LIST_HEAD(&dev_priv->oa_pmu.node_list);
+
+	node_size = dev_priv->oa_pmu.oa_rcs_buffer.format_size +
+			sizeof(struct drm_i915_oa_node_ctx_id);
+
+	/* node size has to be aligned to 64 bytes, since only 64 byte aligned
+	 * addresses can be given to OA unit for dumping OA reports */
+	node_size = ALIGN(node_size, 64);
+	dev_priv->oa_pmu.oa_rcs_buffer.node_size = node_size;
+	dev_priv->oa_pmu.oa_rcs_buffer.node_count = bo->base.size / node_size;
 
 	DRM_DEBUG_DRIVER("OA RCS Buffer initialized, vaddr = %p",
 			 dev_priv->oa_pmu.oa_rcs_buffer.addr);
@@ -849,7 +964,13 @@  static int i915_oa_event_flush(struct perf_event *event)
 	if (event->attr.sample_period) {
 		struct drm_i915_private *i915 =
 			container_of(event->pmu, typeof(*i915), oa_pmu.pmu);
+		int ret;
 
+		if (i915->oa_pmu.multiple_ctx_mode) {
+			ret = i915_oa_rcs_wait_gpu(i915);
+			if (ret)
+				return ret;
+		}
 		flush_oa_snapshots(i915, true);
 	}
 
@@ -945,6 +1066,8 @@  void i915_oa_pmu_register(struct drm_device *dev)
 	hrtimer_init(&i915->oa_pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	i915->oa_pmu.timer.function = hrtimer_sample;
 
+	INIT_WORK(&i915->oa_pmu.work_timer, forward_oa_rcs_snapshots_work);
+
 	spin_lock_init(&i915->oa_pmu.lock);
 
 	i915->oa_pmu.pmu.capabilities  = PERF_PMU_CAP_IS_DEVICE;
@@ -974,6 +1097,9 @@  void i915_oa_pmu_unregister(struct drm_device *dev)
 	if (i915->oa_pmu.pmu.event_init == NULL)
 		return;
 
+	if (i915->oa_pmu.multiple_ctx_mode)
+		cancel_work_sync(&i915->oa_pmu.work_timer);
+
 	unregister_sysctl_table(i915->oa_pmu.sysctl_header);
 
 	perf_pmu_unregister(&i915->oa_pmu.pmu);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index dcf7c87..e97b2fd 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -123,6 +123,11 @@  enum drm_i915_oa_event_type {
 	I915_OA_RECORD_MAX,			/* non-ABI */
 };
 
+struct drm_i915_oa_node_ctx_id {	/* follows the raw OA report */
+	__u32 ctx_id;	/* context id recorded for the snapshot */
+	__u32 pad;	/* padding; keeps the struct at 8 bytes */
+};
+
 /* Each region is a minimum of 16k, and there are at most 255 of them.
  */
 #define I915_NR_TEX_REGIONS 255	/* table size 2k - maximum due to use