[RFC,3/8] drm/i915: Add mechanism for forwarding CS based OA counter snapshots through perf

Message ID	1436950023-13940-4-git-send-email-sourab.gupta@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: sourab.gupta@intel.com To: intel-gfx@lists.freedesktop.org Date: Wed, 15 Jul 2015 14:16:58 +0530 Message-Id: <1436950023-13940-4-git-send-email-sourab.gupta@intel.com> In-Reply-To: <1436950023-13940-1-git-send-email-sourab.gupta@intel.com> References: <1436950023-13940-1-git-send-email-sourab.gupta@intel.com> Cc: Insoo Woo <insoo.woo@intel.com>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Jabin Wu <jabin.wu@intel.com>, Sourab Gupta <sourab.gupta@intel.com> Subject: [Intel-gfx] [RFC 3/8] drm/i915: Add mechanism for forwarding CS based OA counter snapshots through perf Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 740148d..eb72f95 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1653,6 +1653,13 @@ struct i915_oa_reg { u32 value; }; +struct i915_oa_rcs_node { + struct list_head head; + struct drm_i915_gem_request *req; + u32 offset; + u32 ctx_id; +}; + extern const struct i915_oa_reg i915_oa_3d_mux_config_hsw[]; extern const int i915_oa_3d_mux_config_hsw_len; extern const struct i915_oa_reg i915_oa_3d_b_counter_config_hsw[]; @@ -1952,7 +1959,11 @@ struct drm_i915_private { u8 *addr; int format; int format_size; + u32 node_size; + u32 node_count; } oa_rcs_buffer; + struct list_head node_list; + struct work_struct work_timer; } oa_pmu; #endif diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c index b79582b..a4fdca3 100644 --- a/drivers/gpu/drm/i915/i915_oa_perf.c +++ b/drivers/gpu/drm/i915/i915_oa_perf.c @@ -58,6 +58,14 @@ static u32 forward_oa_snapshots(struct drm_i915_private *dev_priv, u8 *snapshot; u32 taken; + /* + * Schedule a worker to forward the RCS based OA reports collected. + * A worker is needed since it requires device mutex to be taken + * which can't be done here because of atomic context + */ + if (dev_priv->oa_pmu.multiple_ctx_mode) + schedule_work(&dev_priv->oa_pmu.work_timer); + head -= dev_priv->oa_pmu.oa_buffer.gtt_offset; tail -= dev_priv->oa_pmu.oa_buffer.gtt_offset; @@ -165,6 +173,103 @@ static void flush_oa_snapshots(struct drm_i915_private *dev_priv, spin_unlock_irqrestore(&dev_priv->oa_pmu.oa_buffer.flush_lock, flags); } +int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv) +{ + struct i915_oa_rcs_node *last_entry; + unsigned long lock_flags; + int ret; + + /* + * Wait for the last scheduled request to complete. This would + * implicitly wait for the prior submitted requests. The refcount + * of the requests is not decremented here. 
+ */ + spin_lock_irqsave(&dev_priv->oa_pmu.lock, lock_flags); + + if (list_empty(&dev_priv->oa_pmu.node_list)) { + spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags); + return 0; + } + last_entry = list_last_entry(&dev_priv->oa_pmu.node_list, + struct i915_oa_rcs_node, head); + spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags); + + if (last_entry && last_entry->req) { + ret = __i915_wait_request(last_entry->req, atomic_read( + &dev_priv->gpu_error.reset_counter), + dev_priv->mm.interruptible, NULL, NULL); + if (ret) { + DRM_ERROR("failed to wait\n"); + return ret; + } + } + return 0; +} + +static void forward_one_oa_rcs_sample(struct drm_i915_private *dev_priv, + struct i915_oa_rcs_node *node) +{ + struct perf_sample_data data; + struct perf_event *event = dev_priv->oa_pmu.exclusive_event; + int format_size, snapshot_size; + u8 *snapshot; + struct drm_i915_oa_node_ctx_id *ctx_info; + struct perf_raw_record raw; + + format_size = dev_priv->oa_pmu.oa_rcs_buffer.format_size; + snapshot_size = format_size + sizeof(*ctx_info); + snapshot = dev_priv->oa_pmu.oa_rcs_buffer.addr + node->offset; + + ctx_info = (struct drm_i915_oa_node_ctx_id *)(snapshot + format_size); + ctx_info->ctx_id = node->ctx_id; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + /* Note: the combined u32 raw->size member + raw data itself must be 8 + * byte aligned. 
(See note in init_oa_buffer for more details) */ + raw.size = snapshot_size + 4; + raw.data = snapshot; + + data.raw = &raw; + + perf_event_overflow(event, &data, &dev_priv->oa_pmu.dummy_regs); +} + +void forward_oa_rcs_snapshots_work(struct work_struct *__work) +{ + struct drm_i915_private *dev_priv = + container_of(__work, typeof(*dev_priv), oa_pmu.work_timer); + struct i915_oa_rcs_node *entry, *next; + struct drm_i915_gem_request *req; + unsigned long lock_flags; + int ret; + + list_for_each_entry_safe + (entry, next, &dev_priv->oa_pmu.node_list, head) { + req = entry->req; + if (req && i915_gem_request_completed(req, true)) { + forward_one_oa_rcs_sample(dev_priv, entry); + ret = i915_mutex_lock_interruptible(dev_priv->dev); + if (ret) + break; + i915_gem_request_assign(&entry->req, NULL); + mutex_unlock(&dev_priv->dev->struct_mutex); + } else + break; + + /* + * Do we instead need to protect whole loop? If so, we would + * need to *list_move_tail* to a deferred list, from where + * i915 device mutex could be taken to dereference the requests, + * and free the node. 
+ */ + spin_lock_irqsave(&dev_priv->oa_pmu.lock, lock_flags); + list_del(&entry->head); + spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags); + kfree(entry); + } +} + static void oa_rcs_buffer_destroy(struct drm_i915_private *i915) { @@ -364,7 +469,7 @@ static int init_oa_rcs_buffer(struct perf_event *event) struct drm_i915_private *dev_priv = container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu); struct drm_i915_gem_object *bo; - int ret; + int ret, node_size; BUG_ON(dev_priv->oa_pmu.oa_rcs_buffer.obj); @@ -375,6 +480,16 @@ static int init_oa_rcs_buffer(struct perf_event *event) dev_priv->oa_pmu.oa_rcs_buffer.obj = bo; dev_priv->oa_pmu.oa_rcs_buffer.addr = vmap_oa_buffer(bo); + INIT_LIST_HEAD(&dev_priv->oa_pmu.node_list); + + node_size = dev_priv->oa_pmu.oa_rcs_buffer.format_size + + sizeof(struct drm_i915_oa_node_ctx_id); + + /* node size has to be aligned to 64 bytes, since only 64 byte aligned + * addresses can be given to OA unit for dumping OA reports */ + node_size = ALIGN(node_size, 64); + dev_priv->oa_pmu.oa_rcs_buffer.node_size = node_size; + dev_priv->oa_pmu.oa_rcs_buffer.node_count = bo->base.size / node_size; DRM_DEBUG_DRIVER("OA RCS Buffer initialized, vaddr = %p", dev_priv->oa_pmu.oa_rcs_buffer.addr); @@ -849,7 +964,13 @@ static int i915_oa_event_flush(struct perf_event *event) if (event->attr.sample_period) { struct drm_i915_private *i915 = container_of(event->pmu, typeof(*i915), oa_pmu.pmu); + int ret; + if (i915->oa_pmu.multiple_ctx_mode) { + ret = i915_oa_rcs_wait_gpu(i915); + if (ret) + return ret; + } flush_oa_snapshots(i915, true); } @@ -945,6 +1066,8 @@ void i915_oa_pmu_register(struct drm_device *dev) hrtimer_init(&i915->oa_pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); i915->oa_pmu.timer.function = hrtimer_sample; + INIT_WORK(&i915->oa_pmu.work_timer, forward_oa_rcs_snapshots_work); + spin_lock_init(&i915->oa_pmu.lock); i915->oa_pmu.pmu.capabilities = PERF_PMU_CAP_IS_DEVICE; @@ -974,6 +1097,9 @@ void i915_oa_pmu_unregister(struct 
drm_device *dev) if (i915->oa_pmu.pmu.event_init == NULL) return; + if (i915->oa_pmu.multiple_ctx_mode) + cancel_work_sync(&i915->oa_pmu.work_timer); + unregister_sysctl_table(i915->oa_pmu.sysctl_header); perf_pmu_unregister(&i915->oa_pmu.pmu); diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index dcf7c87..e97b2fd 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -123,6 +123,11 @@ enum drm_i915_oa_event_type { I915_OA_RECORD_MAX, /* non-ABI */ }; +struct drm_i915_oa_node_ctx_id { + __u32 ctx_id; + __u32 pad; +}; + /* Each region is a minimum of 16k, and there are at most 255 of them. */ #define I915_NR_TEX_REGIONS 255 /* table size 2k - maximum due to use

[RFC,3/8] drm/i915: Add mechanism for forwarding CS based OA counter snapshots through perf

Commit Message

Patch