
[RFC,2/8] drm/i915: Add mechanism for forwarding the timestamp data through perf

Message ID 1438754144-20435-3-git-send-email-sourab.gupta@intel.com
State New, archived

Commit Message

sourab.gupta@intel.com Aug. 5, 2015, 5:55 a.m. UTC
From: Sourab Gupta <sourab.gupta@intel.com>

This patch adds the mechanism for forwarding the timestamp data to
userspace using the Gen PMU perf event interface.

The timestamps will be captured in a gem buffer object. The metadata
(currently just the ctx global id) pertaining to each snapshot is
maintained in a list, where each node holds the offset into the gem
buffer object of the snapshot it describes.
In order to track whether the GPU has finished processing a node, each
node also holds the corresponding gem request. The request is expected
to be referenced whenever the GPU command is submitted.

Each snapshot collected is forwarded to userspace as a separate perf
sample. The perf sample carries the raw timestamp data followed by the
metadata pertaining to that sample.
While forwarding the samples, we check whether the gem request has
completed and drop the reference on the corresponding request. Dropping
the reference may sleep (it needs struct_mutex), so it cannot be done
from the hrtimer directly; a worker is used instead, scheduled when the
hrtimer fires.
While flushing the samples, we have to wait for the requests already
scheduled before forwarding the samples. This wait is done in a
lockless fashion.
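
For illustration only (not part of this patch), a userspace consumer of
these samples could decode the raw payload roughly as sketched below,
assuming the uapi layouts added by this patch and treating ts_low and
ts_high as the two halves of a 64-bit timestamp. The perf_event_open()
setup and ring-buffer walking are omitted, and decode_gen_pmu_sample()
is an invented helper name:

#include <stdint.h>
#include <stdio.h>

/* mirrors the uapi structs added in include/uapi/drm/i915_drm.h below */
struct drm_i915_ts_data {
	uint32_t ts_low;
	uint32_t ts_high;
};

struct drm_i915_ts_node_ctx_id {
	uint32_t ctx_id;
	uint32_t pad;
};

/* raw/raw_size: payload of one PERF_SAMPLE_RAW record */
static void decode_gen_pmu_sample(const void *raw, uint32_t raw_size)
{
	const struct drm_i915_ts_data *ts = raw;
	const struct drm_i915_ts_node_ctx_id *ctx =
		(const void *)((const uint8_t *)raw + sizeof(*ts));
	uint64_t timestamp;

	if (raw_size < sizeof(*ts) + sizeof(*ctx))
		return; /* truncated sample */

	/* assumption: ts_high:ts_low form one 64-bit timestamp value */
	timestamp = ((uint64_t)ts->ts_high << 32) | ts->ts_low;
	printf("ctx %u: ts %llu\n", ctx->ctx_id,
	       (unsigned long long)timestamp);
}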

v2: Changes in this patch (as suggested by Chris):
    - Forwarding functionality implemented in a separate function. The
      work item (scheduled from hrtimer/event stop) calls that function.
      The event flush calls the forwarding function directly, which
      meets the flush semantics.
    - use spin_lock instead of spin_lock_irqsave
    - Code restructuring & better nomenclature

Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |  11 +++
 drivers/gpu/drm/i915/i915_oa_perf.c | 131 ++++++++++++++++++++++++++++++++++--
 include/uapi/drm/i915_drm.h         |  10 +++
 3 files changed, 147 insertions(+), 5 deletions(-)

Comments

Chris Wilson Aug. 5, 2015, 9:55 a.m. UTC | #1
On Wed, Aug 05, 2015 at 11:25:38AM +0530, sourab.gupta@intel.com wrote:
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 66f9ee9..08235582 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1676,6 +1676,13 @@ struct i915_oa_rcs_node {
>  	u32 tag;
>  };
>  
> +struct i915_gen_pmu_node {
> +	struct list_head head;

This is not the head of the list, but a node.

> +	struct drm_i915_gem_request *req;

Use request since this is a less often used member name and brevity
isn't saving thousands of keystrokes.

> +	u32 offset;
> +	u32 ctx_id;
> +};

> +static int i915_gen_pmu_wait_gpu(struct drm_i915_private *dev_priv)
> +{
> +	struct i915_gen_pmu_node *last_entry = NULL;
> +	int ret;
> +
> +	/*
> +	 * Wait for the last scheduled request to complete. This would
> +	 * implicitly wait for the prior submitted requests. The refcount
> +	 * of the requests is not decremented here.
> +	 */
> +	spin_lock(&dev_priv->gen_pmu.lock);
> +
> +	if (!list_empty(&dev_priv->gen_pmu.node_list)) {
> +		last_entry = list_last_entry(&dev_priv->gen_pmu.node_list,
> +			struct i915_gen_pmu_node, head);
> +	}
> +	spin_unlock(&dev_priv->gen_pmu.lock);

Because you issue requests on all rings that are not actually
serialised, the order of writes and retirements is also not serialised,
i.e. this does not do a complete wait for all activity on the object.

>  static void forward_gen_pmu_snapshots(struct drm_i915_private *dev_priv)
>  {
> -	WARN_ON(!dev_priv->gen_pmu.buffer.addr);
> +	struct i915_gen_pmu_node *entry, *next;
> +	LIST_HEAD(deferred_list_free);
> +	int ret;
>  
> -	/* TODO: routine for forwarding snapshots to userspace */
> +	list_for_each_entry_safe
> +		(entry, next, &dev_priv->gen_pmu.node_list, head) {
> +		if (!i915_gem_request_completed(entry->req, true))
> +			break;

Again the list is not actually in retirement order since you combine
multiple rings into one list.

These problems magically disappear with a list per-ring and a page
per-ring. You also need to be more careful with overwriting unflushed
entries. A dynamic approach to page allocation overcomes that.
-Chris
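
As a rough sketch of the per-ring split suggested above (illustrative
only, not part of this series; the structure and member names below are
invented), the bookkeeping could be grouped per ring so that each list
is naturally in that ring's retirement order:

#include <linux/list.h>
#include <linux/types.h>

/* hypothetical per-ring bookkeeping: one node list and buffer area per ring */
struct i915_gen_pmu_ring {
	struct list_head node_list;	/* nodes submitted on this ring only */
	u32 head, tail;			/* offsets into this ring's pages */
};

struct i915_gen_pmu_rings {
	/* I915_NUM_RINGS as defined by the driver */
	struct i915_gen_pmu_ring ring[I915_NUM_RINGS];
};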

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 66f9ee9..08235582 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1676,6 +1676,13 @@  struct i915_oa_rcs_node {
 	u32 tag;
 };
 
+struct i915_gen_pmu_node {
+	struct list_head head;
+	struct drm_i915_gem_request *req;
+	u32 offset;
+	u32 ctx_id;
+};
+
 extern const struct i915_oa_reg i915_oa_3d_mux_config_hsw[];
 extern const int i915_oa_3d_mux_config_hsw_len;
 extern const struct i915_oa_reg i915_oa_3d_b_counter_config_hsw[];
@@ -2000,7 +2007,11 @@  struct drm_i915_private {
 			struct drm_i915_gem_object *obj;
 			u32 gtt_offset;
 			u8 *addr;
+			u32 node_size;
+			u32 node_count;
 		} buffer;
+		struct list_head node_list;
+		struct work_struct forward_work;
 	} gen_pmu;
 
 	void (*emit_profiling_data[I915_PROFILE_MAX])
diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
index 37ff0a9..3add862 100644
--- a/drivers/gpu/drm/i915/i915_oa_perf.c
+++ b/drivers/gpu/drm/i915/i915_oa_perf.c
@@ -11,6 +11,9 @@ 
 #define FREQUENCY 200
 #define PERIOD max_t(u64, 10000, NSEC_PER_SEC / FREQUENCY)
 
+#define TS_DATA_SIZE sizeof(struct drm_i915_ts_data)
+#define CTX_INFO_SIZE sizeof(struct drm_i915_ts_node_ctx_id)
+
 static u32 i915_oa_event_paranoid = true;
 
 static int hsw_perf_format_sizes[] = {
@@ -414,11 +417,113 @@  static void forward_oa_rcs_work_fn(struct work_struct *__work)
 	forward_oa_rcs_snapshots(dev_priv);
 }
 
+static int i915_gen_pmu_wait_gpu(struct drm_i915_private *dev_priv)
+{
+	struct i915_gen_pmu_node *last_entry = NULL;
+	int ret;
+
+	/*
+	 * Wait for the last scheduled request to complete. This would
+	 * implicitly wait for the prior submitted requests. The refcount
+	 * of the requests is not decremented here.
+	 */
+	spin_lock(&dev_priv->gen_pmu.lock);
+
+	if (!list_empty(&dev_priv->gen_pmu.node_list)) {
+		last_entry = list_last_entry(&dev_priv->gen_pmu.node_list,
+			struct i915_gen_pmu_node, head);
+	}
+	spin_unlock(&dev_priv->gen_pmu.lock);
+
+	if (!last_entry)
+		return 0;
+
+	ret = __i915_wait_request(last_entry->req, atomic_read(
+			&dev_priv->gpu_error.reset_counter),
+			true, NULL, NULL);
+	if (ret) {
+		DRM_ERROR("failed to wait\n");
+		return ret;
+	}
+	return 0;
+}
+
+static void forward_one_gen_pmu_sample(struct drm_i915_private *dev_priv,
+				struct i915_gen_pmu_node *node)
+{
+	struct perf_sample_data data;
+	struct perf_event *event = dev_priv->gen_pmu.exclusive_event;
+	int snapshot_size;
+	u8 *snapshot;
+	struct drm_i915_ts_node_ctx_id *ctx_info;
+	struct perf_raw_record raw;
+
+	BUILD_BUG_ON(TS_DATA_SIZE != 8);
+	BUILD_BUG_ON(CTX_INFO_SIZE != 8);
+
+	snapshot = dev_priv->gen_pmu.buffer.addr + node->offset;
+	snapshot_size = TS_DATA_SIZE + CTX_INFO_SIZE;
+
+	ctx_info = (struct drm_i915_ts_node_ctx_id *)(snapshot + TS_DATA_SIZE);
+	ctx_info->ctx_id = node->ctx_id;
+
+	/* Note: the raw sample consists of a u32 size member and raw data. The
+	 * combined size of these two fields is required to be 8 byte aligned.
+	 * The size of raw data field is assumed to be 8 byte aligned already.
+	 * Therefore, adding 4 bytes to the raw sample size here.
+	 */
+	BUILD_BUG_ON(((snapshot_size + 4 + sizeof(raw.size)) % 8) != 0);
+
+	perf_sample_data_init(&data, 0, event->hw.last_period);
+	raw.size = snapshot_size + 4;
+	raw.data = snapshot;
+
+	data.raw = &raw;
+	perf_event_overflow(event, &data, &dev_priv->gen_pmu.dummy_regs);
+}
+
+/*
+ * Routine to forward the samples to perf. This may be called from the event
+ * flush and worker thread. This function may sleep, hence can't be called from
+ * atomic contexts directly.
+ */
 static void forward_gen_pmu_snapshots(struct drm_i915_private *dev_priv)
 {
-	WARN_ON(!dev_priv->gen_pmu.buffer.addr);
+	struct i915_gen_pmu_node *entry, *next;
+	LIST_HEAD(deferred_list_free);
+	int ret;
 
-	/* TODO: routine for forwarding snapshots to userspace */
+	list_for_each_entry_safe
+		(entry, next, &dev_priv->gen_pmu.node_list, head) {
+		if (!i915_gem_request_completed(entry->req, true))
+			break;
+
+		forward_one_gen_pmu_sample(dev_priv, entry);
+
+		spin_lock(&dev_priv->gen_pmu.lock);
+		list_move_tail(&entry->head, &deferred_list_free);
+		spin_unlock(&dev_priv->gen_pmu.lock);
+	}
+
+	ret = i915_mutex_lock_interruptible(dev_priv->dev);
+	if (ret)
+		return;
+	while (!list_empty(&deferred_list_free)) {
+		entry = list_first_entry(&deferred_list_free,
+					struct i915_gen_pmu_node, head);
+		i915_gem_request_unreference(entry->req);
+		list_del(&entry->head);
+		kfree(entry);
+	}
+	mutex_unlock(&dev_priv->dev->struct_mutex);
+}
+
+static void forward_gen_pmu_work_fn(struct work_struct *__work)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(__work, typeof(*dev_priv), gen_pmu.forward_work);
+
+	forward_gen_pmu_snapshots(dev_priv);
 }
 
 static void
@@ -752,7 +857,7 @@  static int init_gen_pmu_buffer(struct perf_event *event)
 	struct drm_i915_private *dev_priv =
 		container_of(event->pmu, typeof(*dev_priv), gen_pmu.pmu);
 	struct drm_i915_gem_object *bo;
-	int ret;
+	int ret, node_size;
 
 	BUG_ON(dev_priv->gen_pmu.buffer.obj);
 
@@ -764,6 +869,14 @@  static int init_gen_pmu_buffer(struct perf_event *event)
 	dev_priv->gen_pmu.buffer.gtt_offset =
 				i915_gem_obj_ggtt_offset(bo);
 	dev_priv->gen_pmu.buffer.addr = vmap_oa_buffer(bo);
+	INIT_LIST_HEAD(&dev_priv->gen_pmu.node_list);
+
+	node_size = TS_DATA_SIZE + CTX_INFO_SIZE;
+
+	/* size has to be aligned to 8 bytes */
+	node_size = ALIGN(node_size, 8);
+	dev_priv->gen_pmu.buffer.node_size = node_size;
+	dev_priv->gen_pmu.buffer.node_count = bo->base.size / node_size;
 
 	DRM_DEBUG_DRIVER("Gen PMU Buffer initialized, vaddr = %p",
 			 dev_priv->gen_pmu.buffer.addr);
@@ -776,7 +889,7 @@  static enum hrtimer_restart hrtimer_sample_gen(struct hrtimer *hrtimer)
 	struct drm_i915_private *i915 =
 		container_of(hrtimer, typeof(*i915), gen_pmu.timer);
 
-	forward_gen_pmu_snapshots(i915);
+	schedule_work(&i915->gen_pmu.forward_work);
 
 	hrtimer_forward_now(hrtimer, ns_to_ktime(PERIOD));
 	return HRTIMER_RESTART;
@@ -1353,7 +1466,7 @@  static void i915_gen_event_stop(struct perf_event *event, int flags)
 	spin_unlock(&dev_priv->gen_pmu.lock);
 
 	hrtimer_cancel(&dev_priv->gen_pmu.timer);
-	forward_gen_pmu_snapshots(dev_priv);
+	schedule_work(&dev_priv->gen_pmu.forward_work);
 
 	event->hw.state = PERF_HES_STOPPED;
 }
@@ -1384,6 +1497,11 @@  static int i915_gen_event_flush(struct perf_event *event)
 {
 	struct drm_i915_private *i915 =
 		container_of(event->pmu, typeof(*i915), gen_pmu.pmu);
+	int ret;
+
+	ret = i915_gen_pmu_wait_gpu(i915);
+	if (ret)
+		return ret;
 
 	forward_gen_pmu_snapshots(i915);
 	return 0;
@@ -1535,6 +1653,7 @@  void i915_gen_pmu_register(struct drm_device *dev)
 	hrtimer_init(&i915->gen_pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	i915->gen_pmu.timer.function = hrtimer_sample_gen;
 
+	INIT_WORK(&i915->gen_pmu.forward_work, forward_gen_pmu_work_fn);
 	spin_lock_init(&i915->gen_pmu.lock);
 
 	i915->gen_pmu.pmu.capabilities  = PERF_PMU_CAP_IS_DEVICE;
@@ -1564,6 +1683,8 @@  void i915_gen_pmu_unregister(struct drm_device *dev)
 	if (i915->gen_pmu.pmu.event_init == NULL)
 		return;
 
+	cancel_work_sync(&i915->gen_pmu.forward_work);
+
 	perf_pmu_unregister(&i915->gen_pmu.pmu);
 	i915->gen_pmu.pmu.event_init = NULL;
 }
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index abe5826..4a19c9b 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -140,6 +140,16 @@  struct drm_i915_oa_node_tag {
 	__u32 pad;
 };
 
+struct drm_i915_ts_data {
+	__u32 ts_low;
+	__u32 ts_high;
+};
+
+struct drm_i915_ts_node_ctx_id {
+	__u32 ctx_id;
+	__u32 pad;
+};
+
 /* Each region is a minimum of 16k, and there are at most 255 of them.
  */
 #define I915_NR_TEX_REGIONS 255	/* table size 2k - maximum due to use