[1/6] drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines

Message ID	1489645211-25729-2-git-send-email-sourab.gupta@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: sourab.gupta@intel.com To: intel-gfx@lists.freedesktop.org Date: Thu, 16 Mar 2017 11:50:06 +0530 Message-Id: <1489645211-25729-2-git-send-email-sourab.gupta@intel.com> In-Reply-To: <1489645211-25729-1-git-send-email-sourab.gupta@intel.com> References: <1489645211-25729-1-git-send-email-sourab.gupta@intel.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>, Sourab Gupta <sourab.gupta@intel.com>, Matthew Auld <matthew.auld@intel.com> Subject: [Intel-gfx] [PATCH 1/6] drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 414afa5..f730477 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2045,6 +2045,11 @@ struct i915_perf_stream { bool cs_mode; /** + * @using_oa: Whether OA unit is in use for this particular stream + */ + bool using_oa; + + /** * @last_request: Contains an RCU guarded pointer to the last request * associated with the stream, when command stream mode is enabled for * the stream. @@ -2156,11 +2161,33 @@ struct i915_perf_cs_sample { struct drm_i915_gem_request *request; /** - * @offset: Offset into ``&drm_i915_private->perf.command_stream_buf`` - * where the perf metrics will be collected, when the commands inserted - * into the command stream are executed by GPU. + * @start_offset: Offset into ``&drm_i915_private-> + * perf.command_stream_buf`` where the metrics for this perf sample + * will be collected, when the commands inserted into the command stream + * are executed by GPU. + */ + u32 start_offset; + + /** + * @oa_offset: Offset into ``&drm_i915_private-> + * perf.command_stream_buf`` where the OA report for this perf sample + * will be collected (if the stream is configured for collection of OA + * samples). */ - u32 offset; + u32 oa_offset; + + /** + * @ts_offset: Offset into ``&drm_i915_private-> + * perf.command_stream_buf`` where the timestamps for this perf sample + * will be collected (if the stream is configured for collection of + * timestamp data) + */ + u32 ts_offset; + + /** + * @size: buffer size corresponding to this perf sample + */ + u32 size; /** * @ctx_id: Context ID associated with this perf sample @@ -2519,15 +2546,14 @@ struct drm_i915_private { spinlock_t hook_lock; - struct { - struct i915_perf_stream *exclusive_stream; + struct hrtimer poll_check_timer; + struct i915_perf_stream *exclusive_stream; + wait_queue_head_t poll_wq[I915_NUM_ENGINES]; + bool pollin[I915_NUM_ENGINES]; + struct { u32 specific_ctx_id; - struct hrtimer poll_check_timer; - wait_queue_head_t poll_wq; - bool pollin; - bool periodic; int period_exponent; @@ -2635,13 +2661,13 @@ struct drm_i915_private { u8 *vaddr; #define I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW (1<<0) u32 status; - } command_stream_buf; + } command_stream_buf[I915_NUM_ENGINES]; u32 last_cmd_stream_ctx_id; u32 last_pid; u32 last_tag; - struct list_head cs_samples; - spinlock_t sample_lock; + struct list_head cs_samples[I915_NUM_ENGINES]; + spinlock_t sample_lock[I915_NUM_ENGINES]; } perf; /* Abstract the submission mechanism (legacy ringbuffer or execlists) away */ diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 759865e..995f7a1 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -285,12 +285,17 @@ #define OAREPORT_REASON_CTX_SWITCH (1<<3) #define OAREPORT_REASON_CLK_RATIO (1<<5) -/* Data common to periodic and RCS based OA samples */ -struct oa_sample_data { +#define OA_ADDR_ALIGN 64 +#define TS_ADDR_ALIGN 8 +#define I915_PERF_TS_SAMPLE_SIZE 8 + +/*Data common to perf samples (periodic OA / CS based OA / Timestamps)*/ +struct sample_data { u32 source; u32 ctx_id; u32 pid; u32 tag; + u64 ts; const u8 *report; }; @@ -346,6 +351,7 @@ struct oa_sample_data { #define SAMPLE_CTX_ID (1<<2) #define SAMPLE_PID (1<<3) #define SAMPLE_TAG (1<<4) +#define SAMPLE_TS (1<<5) /** * struct perf_open_properties - for validated properties given to open a stream @@ -402,7 +408,7 @@ void i915_perf_command_stream_hook(struct drm_i915_gem_request *request, mutex_lock(&dev_priv->perf.streams_lock); list_for_each_entry(stream, &dev_priv->perf.streams, link) { if ((stream->state == I915_PERF_STREAM_ENABLED) && - stream->cs_mode) + stream->cs_mode && (stream->engine == engine->id)) stream->ops->command_stream_hook(stream, request, tag); } mutex_unlock(&dev_priv->perf.streams_lock); @@ -411,7 +417,7 @@ void i915_perf_command_stream_hook(struct drm_i915_gem_request *request, /** * release_perf_samples - Release old perf samples to make space for new * sample data. - * @dev_priv: i915 device private + * @stream: An i915-perf stream associated with the samples * @target_size: Space required to be freed up. * * We also dereference the associated request before deleting the sample. @@ -420,17 +426,17 @@ void i915_perf_command_stream_hook(struct drm_i915_gem_request *request, * to be replaced by a new sample, and gpu will eventually overwrite the buffer * contents, when the request associated with new sample completes. */ -static void release_perf_samples(struct drm_i915_private *dev_priv, +static void release_perf_samples(struct i915_perf_stream *stream, u32 target_size) { + struct drm_i915_private *dev_priv = stream->dev_priv; struct i915_perf_cs_sample *sample, *next; - u32 sample_size = dev_priv->perf.oa.oa_buffer.format_size; u32 size = 0; - list_for_each_entry_safe - (sample, next, &dev_priv->perf.cs_samples, link) { + list_for_each_entry_safe (sample, next, + &dev_priv->perf.cs_samples[stream->engine], link) { - size += sample_size; + size += sample->size; i915_gem_request_put(sample->request); list_del(&sample->link); kfree(sample); @@ -449,109 +455,125 @@ static void release_perf_samples(struct drm_i915_private *dev_priv, * If the space is exhausted in the buffer, it will remove the older * entries in order to make space. */ -static void insert_perf_sample(struct drm_i915_private *dev_priv, +static void insert_perf_sample(struct i915_perf_stream *stream, struct i915_perf_cs_sample *sample) { + struct drm_i915_private *dev_priv = stream->dev_priv; struct i915_perf_cs_sample *first, *last; - int max_offset = dev_priv->perf.command_stream_buf.vma->obj->base.size; - u32 sample_size = dev_priv->perf.oa.oa_buffer.format_size; - - spin_lock(&dev_priv->perf.sample_lock); - if (list_empty(&dev_priv->perf.cs_samples)) { - sample->offset = 0; - list_add_tail(&sample->link, &dev_priv->perf.cs_samples); - spin_unlock(&dev_priv->perf.sample_lock); - return; + u32 sample_flags = stream->sample_flags; + enum intel_engine_id id = stream->engine; + int max_offset = + dev_priv->perf.command_stream_buf[id].vma->obj->base.size; + u32 offset, sample_size = 0; + bool sample_ts = false; + + if (stream->sample_flags & SAMPLE_OA_REPORT) + sample_size += dev_priv->perf.oa.oa_buffer.format_size; + else if (sample_flags & SAMPLE_TS) { + /* + * XXX: Since TS data can anyways be derived from OA report, so + * no need to capture it for RCS engine, if capture oa data is + * called already. + */ + sample_size += I915_PERF_TS_SAMPLE_SIZE; + sample_ts = true; + } + + spin_lock(&dev_priv->perf.sample_lock[id]); + if (list_empty(&dev_priv->perf.cs_samples[id])) { + offset = 0; + goto out; } - first = list_first_entry(&dev_priv->perf.cs_samples,typeof(*first), + first = list_first_entry(&dev_priv->perf.cs_samples[id],typeof(*first), link); - last = list_last_entry(&dev_priv->perf.cs_samples, typeof(*last), + last = list_last_entry(&dev_priv->perf.cs_samples[id], typeof(*last), link); - if (last->offset >= first->offset) { + if (last->start_offset >= first->start_offset) { /* Sufficient space available at the end of buffer? */ - if (last->offset + 2*sample_size < max_offset) - sample->offset = last->offset + sample_size; + if (last->start_offset + last->size + sample_size < max_offset) + offset = last->start_offset + last->size; /* * Wraparound condition. Is sufficient space available at * beginning of buffer? */ - else if (sample_size < first->offset) - sample->offset = 0; + else if (sample_size < first->start_offset) + offset = 0; /* Insufficient space. Overwrite existing old entries */ else { - u32 target_size = sample_size - first->offset; + u32 target_size = sample_size - first->start_offset; - dev_priv->perf.command_stream_buf.status |= + dev_priv->perf.command_stream_buf[id].status |= I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW; - release_perf_samples(dev_priv, target_size); - sample->offset = 0; + release_perf_samples(stream, target_size); + offset = 0; } } else { /* Sufficient space available? */ - if (last->offset + 2*sample_size < first->offset) - sample->offset = last->offset + sample_size; + if (last->start_offset + last->size + sample_size + < first->start_offset) + offset = last->start_offset + last->size; /* Insufficient space. Overwrite existing old entries */ else { u32 target_size = sample_size - - (first->offset - last->offset - - sample_size); + (first->start_offset - last->start_offset - + last->size); - dev_priv->perf.command_stream_buf.status |= + dev_priv->perf.command_stream_buf[id].status |= I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW; - release_perf_samples(dev_priv, target_size); - sample->offset = last->offset + sample_size; + release_perf_samples(stream, target_size); + offset = last->start_offset + last->size; } } - list_add_tail(&sample->link, &dev_priv->perf.cs_samples); - spin_unlock(&dev_priv->perf.sample_lock); + +out: + sample->start_offset = offset; + sample->size = sample_size; + if (stream->sample_flags & SAMPLE_OA_REPORT) { + sample->oa_offset = offset; + /* Ensure 64 byte alignment of oa_offset */ + sample->oa_offset = ALIGN(sample->oa_offset, OA_ADDR_ALIGN); + offset = sample->oa_offset + + dev_priv->perf.oa.oa_buffer.format_size; + } + if (sample_ts) { + sample->ts_offset = offset; + /* Ensure 8 byte alignment of ts_offset */ + sample->ts_offset = ALIGN(sample->ts_offset, TS_ADDR_ALIGN); + offset = sample->ts_offset + I915_PERF_TS_SAMPLE_SIZE; + } + + list_add_tail(&sample->link, &dev_priv->perf.cs_samples[id]); + + spin_unlock(&dev_priv->perf.sample_lock[id]); } /** - * i915_perf_command_stream_hook_oa - Insert the commands to capture OA reports - * metrics into the render command stream - * @stream: An i915-perf stream opened for OA metrics - * @request: request in whose context the metrics are being collected. - * @tag: userspace provided tag to be associated with the perf sample + * i915_engine_stream_capture_oa - Insert the commands to capture OA reports + * into the render command stream + * @request: request in whose context the OA metrics are being collected. + * @offset: command stream buffer offset where the OA metrics need to be + * collected */ -static void i915_perf_command_stream_hook_oa(struct i915_perf_stream *stream, - struct drm_i915_gem_request *request, - u32 tag) +static int i915_engine_stream_capture_oa(struct drm_i915_gem_request *request, + u32 offset) { struct drm_i915_private *dev_priv = request->i915; - struct i915_gem_context *ctx = request->ctx; - struct i915_perf_cs_sample *sample; u32 addr = 0; u32 cmd, *cs; - sample = kzalloc(sizeof(*sample), GFP_KERNEL); - if (sample == NULL) { - DRM_ERROR("Perf sample alloc failed\n"); - return; - } - - cs = intel_ring_begin(request, 4); - if (IS_ERR(cs)) { - kfree(sample); - return; - } - - sample->ctx_id = ctx->hw_id; - sample->pid = current->pid; - sample->tag = tag; - i915_gem_request_assign(&sample->request, request); - - insert_perf_sample(dev_priv, sample); - - addr = dev_priv->perf.command_stream_buf.vma->node.start + - sample->offset; + addr = dev_priv->perf.command_stream_buf[RCS].vma->node.start + offset; if (WARN_ON(addr & 0x3f)) { DRM_ERROR("OA buffer address not aligned to 64 byte"); - return; + return -EINVAL; } + cs = intel_ring_begin(request, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + cmd = MI_REPORT_PERF_COUNT | (1<<0); if (INTEL_GEN(dev_priv) >= 8) cmd |= (2<<0); @@ -567,29 +589,151 @@ static void i915_perf_command_stream_hook_oa(struct i915_perf_stream *stream, intel_ring_advance(request, cs); + return 0; +} + +/** + * i915_engine_stream_capture_ts - Insert the commands to capture timestamp + * data into the GPU command stream + * @request: request in whose context the timestamps are being collected. + * @offset: command stream buffer offset where the timestamp data needs to be + * collected + */ +static int i915_engine_stream_capture_ts(struct drm_i915_gem_request *request, + u32 offset) +{ + struct drm_i915_private *dev_priv = request->i915; + enum intel_engine_id id = request->engine->id; + u32 addr = 0; + u32 cmd, *cs; + + cs = intel_ring_begin(request, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + addr = dev_priv->perf.command_stream_buf[id].vma->node.start + offset; + + if (id == RCS) { + if (INTEL_GEN(dev_priv) >= 8) + cmd = GFX_OP_PIPE_CONTROL(6); + else + cmd = GFX_OP_PIPE_CONTROL(5); + + *cs++ = cmd; + *cs++ = PIPE_CONTROL_GLOBAL_GTT_IVB | + PIPE_CONTROL_TIMESTAMP_WRITE; + *cs++ = addr | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + *cs++ = 0; + + if (INTEL_GEN(dev_priv) >= 8) + *cs++ = 0; + else + *cs++ = MI_NOOP; + } else { + uint32_t cmd; + + cmd = MI_FLUSH_DW + 1; + if (INTEL_GEN(dev_priv) >= 8) + cmd += 1; + + cmd |= MI_FLUSH_DW_OP_STAMP; + + *cs++ = cmd; + *cs++ = addr | MI_FLUSH_DW_USE_GTT; + *cs++ = 0; + *cs++ = 0; + + if (INTEL_GEN(dev_priv) >= 8) + *cs++ = 0; + else + *cs++ = MI_NOOP; + *cs++ = MI_NOOP; + } + intel_ring_advance(request, cs); + + return 0; +} + +/** + * i915_engine_stream_capture_cs_data - Insert the commands to capture perf + * metrics into the GPU command stream + * @stream: An i915-perf stream opened for GPU metrics + * @request: request in whose context the metrics are being collected. + * @tag: userspace provided tag to be associated with the perf sample + */ +static void i915_engine_stream_capture_cs_data(struct i915_perf_stream *stream, + struct drm_i915_gem_request *request, + u32 tag) +{ + struct drm_i915_private *dev_priv = request->i915; + struct i915_gem_context *ctx = request->ctx; + struct i915_perf_cs_sample *sample; + enum intel_engine_id id = stream->engine; + u32 sample_flags = stream->sample_flags; + int ret; + + sample = kzalloc(sizeof(*sample), GFP_KERNEL); + if (sample == NULL) { + DRM_ERROR("Perf sample alloc failed\n"); + return; + } + + sample->ctx_id = ctx->hw_id; + sample->pid = current->pid; + sample->tag = tag; + i915_gem_request_assign(&sample->request, request); + + insert_perf_sample(stream, sample); + + if (sample_flags & SAMPLE_OA_REPORT) { + ret = i915_engine_stream_capture_oa(request, sample->oa_offset); + if (ret) + goto err_unref; + } else if (sample_flags & SAMPLE_TS) { + /* + * XXX: Since TS data can anyways be derived from OA report, so + * no need to capture it for RCS engine, if capture oa data is + * called already. + */ + ret = i915_engine_stream_capture_ts(request, sample->ts_offset); + if (ret) + goto err_unref; + } + i915_gem_active_set(&stream->last_request, request); - i915_vma_move_to_active(dev_priv->perf.command_stream_buf.vma, request, - EXEC_OBJECT_WRITE); + i915_vma_move_to_active(dev_priv->perf.command_stream_buf[id].vma, + request, EXEC_OBJECT_WRITE); + return; + +err_unref: + i915_gem_request_put(sample->request); + spin_lock(&dev_priv->perf.sample_lock[id]); + list_del(&sample->link); + spin_unlock(&dev_priv->perf.sample_lock[id]); + kfree(sample); } /** - * i915_oa_rcs_release_samples - Release the perf command stream samples - * @dev_priv: i915 device private + * i915_engine_stream_release_samples - Release the perf command stream samples + * @stream: An i915-perf stream opened for GPU metrics * * Note: The associated requests should be completed before releasing the * references here. */ -static void i915_oa_rcs_release_samples(struct drm_i915_private *dev_priv) +static void i915_engine_stream_release_samples(struct i915_perf_stream *stream) { + struct drm_i915_private *dev_priv = stream->dev_priv; + enum intel_engine_id id = stream->engine; struct i915_perf_cs_sample *entry, *next; list_for_each_entry_safe - (entry, next, &dev_priv->perf.cs_samples, link) { + (entry, next, &dev_priv->perf.cs_samples[id], link) { i915_gem_request_put(entry->request); - spin_lock(&dev_priv->perf.sample_lock); + spin_lock(&dev_priv->perf.sample_lock[id]); list_del(&entry->link); - spin_unlock(&dev_priv->perf.sample_lock); + spin_unlock(&dev_priv->perf.sample_lock[id]); kfree(entry); } } @@ -933,13 +1077,13 @@ static int append_oa_status(struct i915_perf_stream *stream, } /** - * append_oa_sample - Copies single OA report into userspace read() buffer. - * @stream: An i915-perf stream opened for OA metrics + * append_sample - Copies single perf sample into userspace read() buffer. + * @stream: An i915-perf stream opened for perf samples * @buf: destination buffer given by userspace * @count: the number of bytes userspace wants to read * @offset: (inout): the current position for writing into @buf - * @data: OA sample data, which contains (optionally) OA report with other - * sample metadata. + * @data: perf sample data which contains (optionally) metrics configured + * earlier when opening a stream * * The contents of a sample are configured through `DRM_I915_PERF_PROP_SAMPLE_*` * properties when opening a stream, tracked as `stream->sample_flags`. This @@ -950,11 +1094,11 @@ static int append_oa_status(struct i915_perf_stream *stream, * * Returns: 0 on success, negative error code on failure. */ -static int append_oa_sample(struct i915_perf_stream *stream, +static int append_sample(struct i915_perf_stream *stream, char __user *buf, size_t count, size_t *offset, - const struct oa_sample_data *data) + const struct sample_data *data) { struct drm_i915_private *dev_priv = stream->dev_priv; int report_size = dev_priv->perf.oa.oa_buffer.format_size; @@ -997,6 +1141,12 @@ static int append_oa_sample(struct i915_perf_stream *stream, buf += 4; } + if (sample_flags & SAMPLE_TS) { + if (copy_to_user(buf, &data->ts, I915_PERF_TS_SAMPLE_SIZE)) + return -EFAULT; + buf += I915_PERF_TS_SAMPLE_SIZE; + } + if (sample_flags & SAMPLE_OA_REPORT) { if (copy_to_user(buf, data->report, report_size)) return -EFAULT; @@ -1025,7 +1175,7 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, { struct drm_i915_private *dev_priv = stream->dev_priv; u32 sample_flags = stream->sample_flags; - struct oa_sample_data data = { 0 }; + struct sample_data data = { 0 }; if (sample_flags & SAMPLE_OA_SOURCE_INFO) data.source = I915_PERF_OA_EVENT_SOURCE_PERIODIC; @@ -1039,10 +1189,16 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, if (sample_flags & SAMPLE_TAG) data.tag = dev_priv->perf.last_tag; + /* TODO: Derive timestamp from OA report, + * after scaling with the ts base + */ + if (sample_flags & SAMPLE_TS) + data.ts = 0; + if (sample_flags & SAMPLE_OA_REPORT) data.report = report; - return append_oa_sample(stream, buf, count, offset, &data); + return append_sample(stream, buf, count, offset, &data); } @@ -1209,7 +1365,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, * switches since it's not-uncommon for periodic samples to * identify a switch before any 'context switch' report. */ - if (!dev_priv->perf.oa.exclusive_stream->ctx || + if (!dev_priv->perf.exclusive_stream->ctx || dev_priv->perf.oa.specific_ctx_id == ctx_id || (dev_priv->perf.oa.oa_buffer.last_ctx_id == dev_priv->perf.oa.specific_ctx_id) || @@ -1218,7 +1374,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, /* While filtering for a single context we avoid * leaking the IDs of other contexts. */ - if (dev_priv->perf.oa.exclusive_stream->ctx && + if (dev_priv->perf.exclusive_stream->ctx && dev_priv->perf.oa.specific_ctx_id != ctx_id) { report32[2] = INVALID_CTX_ID; } @@ -1576,36 +1732,43 @@ static int gen7_oa_read(struct i915_perf_stream *stream, } /** - * append_oa_rcs_sample - Copies single RCS based OA report into userspace - * read() buffer. - * @stream: An i915-perf stream opened for OA metrics + * append_one_cs_sample - Copies single perf sample data associated with + * GPU command stream, into userspace read() buffer. + * @stream: An i915-perf stream opened for perf CS metrics * @buf: destination buffer given by userspace * @count: the number of bytes userspace wants to read * @offset: (inout): the current position for writing into @buf - * @node: Sample data associated with RCS based OA report + * @node: Sample data associated with perf metrics * * Returns: 0 on success, negative error code on failure. */ -static int append_oa_rcs_sample(struct i915_perf_stream *stream, +static int append_one_cs_sample(struct i915_perf_stream *stream, char __user *buf, size_t count, size_t *offset, struct i915_perf_cs_sample *node) { struct drm_i915_private *dev_priv = stream->dev_priv; - struct oa_sample_data data = { 0 }; - const u8 *report = dev_priv->perf.command_stream_buf.vaddr + - node->offset; + struct sample_data data = { 0 }; + enum intel_engine_id id = stream->engine; u32 sample_flags = stream->sample_flags; - u32 report_ts; - int ret; + int ret = 0; - /* First, append the periodic OA samples having lower timestamps */ - report_ts = *(u32 *)(report + 4); - ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset, - report_ts, U32_MAX); - if (ret) - return ret; + if (sample_flags & SAMPLE_OA_REPORT) { + const u8 *report = dev_priv->perf.command_stream_buf[id].vaddr + + node->oa_offset; + u32 sample_ts = *(u32 *)(report + 4); + + data.report = report; + + /* First, append the periodic OA samples having lower + * timestamp values + */ + ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset, + sample_ts, U32_MAX); + if (ret) + return ret; + } if (sample_flags & SAMPLE_OA_SOURCE_INFO) data.source = I915_PERF_OA_EVENT_SOURCE_RCS; @@ -1625,16 +1788,28 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream, dev_priv->perf.last_tag = node->tag; } - if (sample_flags & SAMPLE_OA_REPORT) - data.report = report; + if (sample_flags & SAMPLE_TS) { + /* For RCS, if OA samples are also being collected, derive the + * timestamp from OA report, after scaling with the TS base. + * Else, forward the timestamp collected via command stream. + */ + + /* TODO: derive the timestamp from OA report */ + if (sample_flags & SAMPLE_OA_REPORT) + data.ts = 0; + else + data.ts = *(u64 *) + (dev_priv->perf.command_stream_buf[id].vaddr + + node->ts_offset); + } - return append_oa_sample(stream, buf, count, offset, &data); + return append_sample(stream, buf, count, offset, &data); } /** - * oa_rcs_append_reports: Copies all comand stream based OA reports into - * userspace read() buffer. - * @stream: An i915-perf stream opened for OA metrics + * append_command_stream_samples: Copies all comand stream based perf samples + * into userspace read() buffer. + * @stream: An i915-perf stream opened for perf CS metrics * @buf: destination buffer given by userspace * @count: the number of bytes userspace wants to read * @offset: (inout): the current position for writing into @buf @@ -1647,16 +1822,17 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream, * * Returns: 0 on success, negative error code on failure. */ -static int oa_rcs_append_reports(struct i915_perf_stream *stream, +static int append_command_stream_samples(struct i915_perf_stream *stream, char __user *buf, size_t count, size_t *offset) { struct drm_i915_private *dev_priv = stream->dev_priv; struct i915_perf_cs_sample *entry, *next; + enum intel_engine_id id = stream->engine; LIST_HEAD(free_list); int ret = 0; - u32 status = dev_priv->perf.command_stream_buf.status; + u32 status = dev_priv->perf.command_stream_buf[id].status; if (unlikely(status & I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW)) { ret = append_oa_status(stream, buf, count, offset, @@ -1664,28 +1840,28 @@ static int oa_rcs_append_reports(struct i915_perf_stream *stream, if (ret) return ret; - dev_priv->perf.command_stream_buf.status &= + dev_priv->perf.command_stream_buf[id].status &= ~I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW; } - spin_lock(&dev_priv->perf.sample_lock); - if (list_empty(&dev_priv->perf.cs_samples)) { - spin_unlock(&dev_priv->perf.sample_lock); + spin_lock(&dev_priv->perf.sample_lock[id]); + if (list_empty(&dev_priv->perf.cs_samples[id])) { + spin_unlock(&dev_priv->perf.sample_lock[id]); goto pending_periodic; } list_for_each_entry_safe(entry, next, - &dev_priv->perf.cs_samples, link) { + &dev_priv->perf.cs_samples[id], link) { if (!i915_gem_request_completed(entry->request)) break; list_move_tail(&entry->link, &free_list); } - spin_unlock(&dev_priv->perf.sample_lock); + spin_unlock(&dev_priv->perf.sample_lock[id]); if (list_empty(&free_list)) goto pending_periodic; list_for_each_entry_safe(entry, next, &free_list, link) { - ret = append_oa_rcs_sample(stream, buf, count, offset, entry); + ret = append_one_cs_sample(stream, buf, count, offset, entry); if (ret) break; @@ -1695,14 +1871,15 @@ static int oa_rcs_append_reports(struct i915_perf_stream *stream, } /* Don't discard remaining entries, keep them for next read */ - spin_lock(&dev_priv->perf.sample_lock); - list_splice(&free_list, &dev_priv->perf.cs_samples); - spin_unlock(&dev_priv->perf.sample_lock); + spin_lock(&dev_priv->perf.sample_lock[id]); + list_splice(&free_list, &dev_priv->perf.cs_samples[id]); + spin_unlock(&dev_priv->perf.sample_lock[id]); return ret; pending_periodic: - if (!dev_priv->perf.oa.n_pending_periodic_samples) + if (!((stream->sample_flags & SAMPLE_OA_REPORT) && + dev_priv->perf.oa.n_pending_periodic_samples)) return 0; ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset, @@ -1733,15 +1910,16 @@ static enum cs_buf_state command_stream_buf_state( struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; + enum intel_engine_id id = stream->engine; struct i915_perf_cs_sample *entry = NULL; struct drm_i915_gem_request *request = NULL; - spin_lock(&dev_priv->perf.sample_lock); - entry = list_first_entry_or_null(&dev_priv->perf.cs_samples, + spin_lock(&dev_priv->perf.sample_lock[id]); + entry = list_first_entry_or_null(&dev_priv->perf.cs_samples[id], struct i915_perf_cs_sample, link); if (entry) request = entry->request; - spin_unlock(&dev_priv->perf.sample_lock); + spin_unlock(&dev_priv->perf.sample_lock[id]); if (!entry) return CS_BUF_EMPTY; @@ -1766,23 +1944,26 @@ static enum cs_buf_state command_stream_buf_state( static bool stream_have_data_unlocked(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - enum cs_buf_state state; - u32 num_samples, last_ts = 0; - - dev_priv->perf.oa.n_pending_periodic_samples = 0; - dev_priv->perf.oa.pending_periodic_ts = 0; - num_samples = dev_priv->perf.oa.ops.oa_buffer_num_reports(dev_priv, - &last_ts); - if (stream->cs_mode) + enum cs_buf_state state = CS_BUF_EMPTY; + u32 num_samples = 0, last_ts = 0; + + if (stream->sample_flags & SAMPLE_OA_REPORT) { + dev_priv->perf.oa.n_pending_periodic_samples = 0; + dev_priv->perf.oa.pending_periodic_ts = 0; + num_samples = dev_priv->perf.oa.ops.oa_buffer_num_reports( + dev_priv, &last_ts); + } else if (stream->cs_mode) state = command_stream_buf_state(stream); - else - state = CS_BUF_EMPTY; switch (state) { case CS_BUF_EMPTY: - dev_priv->perf.oa.n_pending_periodic_samples = num_samples; - dev_priv->perf.oa.pending_periodic_ts = last_ts; - return (num_samples != 0); + if (stream->sample_flags & SAMPLE_OA_REPORT) { + dev_priv->perf.oa.n_pending_periodic_samples = + num_samples; + dev_priv->perf.oa.pending_periodic_ts = last_ts; + return (num_samples != 0); + } else + return false; case CS_BUF_HAVE_DATA: return true; @@ -1795,12 +1976,12 @@ static bool stream_have_data_unlocked(struct i915_perf_stream *stream) } /** - * i915_oa_wait_unlocked - handles blocking IO until OA data available - * @stream: An i915-perf stream opened for OA metrics + * i915_engine_stream_wait_unlocked - handles blocking IO until data available + * @stream: An i915-perf stream opened for GPU metrics * * Called when userspace tries to read() from a blocking stream FD opened - * for OA metrics. It waits until the hrtimer callback finds a non-empty - * OA buffer and wakes us. + * for perf metrics. It waits until the hrtimer callback finds a non-empty + * command stream buffer / OA buffer and wakes us. * * Note: it's acceptable to have this return with some false positives * since any subsequent read handling will return -EAGAIN if there isn't @@ -1808,9 +1989,10 @@ static bool stream_have_data_unlocked(struct i915_perf_stream *stream) * * Returns: zero on success or a negative error code */ -static int i915_oa_wait_unlocked(struct i915_perf_stream *stream) +static int i915_engine_stream_wait_unlocked(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; + enum intel_engine_id id = stream->engine; /* We would wait indefinitely if periodic sampling is not enabled */ if (!dev_priv->perf.oa.periodic) @@ -1830,32 +2012,34 @@ static int i915_oa_wait_unlocked(struct i915_perf_stream *stream) } } - return wait_event_interruptible(dev_priv->perf.oa.poll_wq, + return wait_event_interruptible(dev_priv->perf.poll_wq[id], stream_have_data_unlocked(stream)); } /** - * i915_oa_poll_wait - call poll_wait() for an OA stream poll() - * @stream: An i915-perf stream opened for OA metrics + * i915_engine_stream_poll_wait - call poll_wait() for an stream poll() + * @stream: An i915-perf stream opened for GPU metrics * @file: An i915 perf stream file * @wait: poll() state table * - * For handling userspace polling on an i915 perf stream opened for OA metrics, + * For handling userspace polling on an i915 perf stream opened for metrics, * this starts a poll_wait with the wait queue that our hrtimer callback wakes - * when it sees data ready to read in the circular OA buffer. + * when it sees data ready to read either in command stream buffer or in the + * circular OA buffer. */ -static void i915_oa_poll_wait(struct i915_perf_stream *stream, +static void i915_engine_stream_poll_wait(struct i915_perf_stream *stream, struct file *file, poll_table *wait) { struct drm_i915_private *dev_priv = stream->dev_priv; - poll_wait(file, &dev_priv->perf.oa.poll_wq, wait); + poll_wait(file, &dev_priv->perf.poll_wq[stream->engine], wait); } /** - * i915_oa_read - just calls through to &i915_oa_ops->read - * @stream: An i915-perf stream opened for OA metrics + * i915_engine_stream_read - Reads perf metrics available into userspace read + * buffer + * @stream: An i915-perf stream opened for GPU metrics * @buf: destination buffer given by userspace * @count: the number of bytes userspace wants to read * @offset: (inout): the current position for writing into @buf @@ -1865,7 +2049,7 @@ static void i915_oa_poll_wait(struct i915_perf_stream *stream, * * Returns: zero on success or a negative error code */ -static int i915_oa_read(struct i915_perf_stream *stream, +static int i915_engine_stream_read(struct i915_perf_stream *stream, char __user *buf, size_t count, size_t *offset) @@ -1874,10 +2058,13 @@ static int i915_oa_read(struct i915_perf_stream *stream, if (stream->cs_mode) - return oa_rcs_append_reports(stream, buf, count, offset); - else + return append_command_stream_samples(stream, buf, count, + offset); + else if (stream->sample_flags & SAMPLE_OA_REPORT) return dev_priv->perf.oa.ops.read(stream, buf, count, offset, U32_MAX, U32_MAX); + else + return -EINVAL; } /** @@ -1951,16 +2138,18 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream) } static void -free_command_stream_buf(struct drm_i915_private *dev_priv) +free_command_stream_buf(struct drm_i915_private *dev_priv, + enum intel_engine_id id) { mutex_lock(&dev_priv->drm.struct_mutex); - i915_gem_object_unpin_map(dev_priv->perf.command_stream_buf.vma->obj); - i915_vma_unpin(dev_priv->perf.command_stream_buf.vma); - i915_gem_object_put(dev_priv->perf.command_stream_buf.vma->obj); + i915_gem_object_unpin_map( + dev_priv->perf.command_stream_buf[id].vma->obj); + i915_vma_unpin(dev_priv->perf.command_stream_buf[id].vma); + i915_gem_object_put(dev_priv->perf.command_stream_buf[id].vma->obj); - dev_priv->perf.command_stream_buf.vma = NULL; - dev_priv->perf.command_stream_buf.vaddr = NULL; + dev_priv->perf.command_stream_buf[id].vma = NULL; + dev_priv->perf.command_stream_buf[id].vaddr = NULL; mutex_unlock(&dev_priv->drm.struct_mutex); } @@ -1980,27 +2169,29 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream) mutex_unlock(&i915->drm.struct_mutex); } -static void i915_oa_stream_destroy(struct i915_perf_stream *stream) +static void i915_engine_stream_destroy(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - if (WARN_ON(stream != dev_priv->perf.oa.exclusive_stream)) + if (WARN_ON(stream != dev_priv->perf.exclusive_stream)) return; - dev_priv->perf.oa.ops.disable_metric_set(dev_priv); + if (stream->using_oa) { + dev_priv->perf.oa.ops.disable_metric_set(dev_priv); - free_oa_buffer(dev_priv); + free_oa_buffer(dev_priv); - intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); - intel_runtime_pm_put(dev_priv); + intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); + intel_runtime_pm_put(dev_priv); - if (stream->ctx) - oa_put_render_ctx_id(stream); + if (stream->ctx) + oa_put_render_ctx_id(stream); + } if (stream->cs_mode) - free_command_stream_buf(dev_priv); + free_command_stream_buf(dev_priv, stream->engine); - dev_priv->perf.oa.exclusive_stream = NULL; + dev_priv->perf.exclusive_stream = NULL; } static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv) @@ -2048,7 +2239,7 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv) /* Maybe make ->pollin per-stream state if we support multiple * concurrent streams in the future. */ - dev_priv->perf.oa.pollin = false; + dev_priv->perf.pollin[RCS] = false; } static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv) @@ -2098,7 +2289,7 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv) /* Maybe make ->pollin per-stream state if we support multiple * concurrent streams in the future. */ - dev_priv->perf.oa.pollin = false; + dev_priv->perf.pollin[RCS] = false; } static int alloc_obj(struct drm_i915_private *dev_priv, @@ -2179,28 +2370,32 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv) return 0; } -static int alloc_command_stream_buf(struct drm_i915_private *dev_priv) +static int alloc_command_stream_buf(struct i915_perf_stream *stream) { + struct drm_i915_private *dev_priv = stream->dev_priv; + enum intel_engine_id id = stream->engine; struct i915_vma *vma; u8 *vaddr; int ret; - if (WARN_ON(dev_priv->perf.command_stream_buf.vma)) + if (WARN_ON(dev_priv->perf.command_stream_buf[id].vma)) return -ENODEV; ret = alloc_obj(dev_priv, &vma, &vaddr); if (ret) return ret; - dev_priv->perf.command_stream_buf.vma = vma; - dev_priv->perf.command_stream_buf.vaddr = vaddr; - if (WARN_ON(!list_empty(&dev_priv->perf.cs_samples))) - INIT_LIST_HEAD(&dev_priv->perf.cs_samples); + dev_priv->perf.command_stream_buf[id].vma = vma; + dev_priv->perf.command_stream_buf[id].vaddr = vaddr; + if (WARN_ON(!list_empty(&dev_priv->perf.cs_samples[id]))) + INIT_LIST_HEAD(&dev_priv->perf.cs_samples[id]); + + dev_priv->perf.pollin[id] = false; DRM_DEBUG_DRIVER( "command stream buf initialized, gtt offset = 0x%x, vaddr = %p", - i915_ggtt_offset(dev_priv->perf.command_stream_buf.vma), - dev_priv->perf.command_stream_buf.vaddr); + i915_ggtt_offset(dev_priv->perf.command_stream_buf[id].vma), + dev_priv->perf.command_stream_buf[id].vaddr); return 0; } @@ -2543,10 +2738,10 @@ static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv) { assert_spin_locked(&dev_priv->perf.hook_lock); - if (dev_priv->perf.oa.exclusive_stream->state != + if (dev_priv->perf.exclusive_stream->state != I915_PERF_STREAM_DISABLED) { struct i915_gem_context *ctx = - dev_priv->perf.oa.exclusive_stream->ctx; + dev_priv->perf.exclusive_stream->ctx; u32 ctx_id = dev_priv->perf.oa.specific_ctx_id; bool periodic = dev_priv->perf.oa.periodic; @@ -2608,22 +2803,23 @@ static void gen8_oa_enable(struct drm_i915_private *dev_priv) } /** - * i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream - * @stream: An i915 perf stream opened for OA metrics + * i915_engine_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for perf stream + * @stream: An i915 perf stream opened for GPU metrics * * [Re]enables hardware periodic sampling according to the period configured * when opening the stream. This also starts a hrtimer that will periodically * check for data in the circular OA buffer for notifying userspace (e.g. * during a read() or poll()). */ -static void i915_oa_stream_enable(struct i915_perf_stream *stream) +static void i915_engine_stream_enable(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - dev_priv->perf.oa.ops.oa_enable(dev_priv); + if (stream->sample_flags & SAMPLE_OA_REPORT) + dev_priv->perf.oa.ops.oa_enable(dev_priv); - if (dev_priv->perf.oa.periodic) - hrtimer_start(&dev_priv->perf.oa.poll_check_timer, + if (stream->cs_mode || dev_priv->perf.oa.periodic) + hrtimer_start(&dev_priv->perf.poll_check_timer, ns_to_ktime(POLL_PERIOD), HRTIMER_MODE_REL_PINNED); } @@ -2639,19 +2835,19 @@ static void gen8_oa_disable(struct drm_i915_private *dev_priv) } /** - * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream - * @stream: An i915 perf stream opened for OA metrics + * i915_engine_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for perf stream + * @stream: An i915 perf stream opened for GPU metrics * * Stops the OA unit from periodically writing counter reports into the * circular OA buffer. This also stops the hrtimer that periodically checks for * data in the circular OA buffer, for notifying userspace. */ -static void i915_oa_stream_disable(struct i915_perf_stream *stream) +static void i915_engine_stream_disable(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - if (dev_priv->perf.oa.periodic) - hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer); + if (stream->cs_mode || dev_priv->perf.oa.periodic) + hrtimer_cancel(&dev_priv->perf.poll_check_timer); if (stream->cs_mode) { /* @@ -2660,24 +2856,25 @@ static void i915_oa_stream_disable(struct i915_perf_stream *stream) */ i915_gem_active_wait(&stream->last_request, I915_WAIT_INTERRUPTIBLE); - i915_oa_rcs_release_samples(dev_priv); + i915_engine_stream_release_samples(stream); } - dev_priv->perf.oa.ops.oa_disable(dev_priv); + if (stream->sample_flags & SAMPLE_OA_REPORT) + dev_priv->perf.oa.ops.oa_disable(dev_priv); } -static const struct i915_perf_stream_ops i915_oa_stream_ops = { - .destroy = i915_oa_stream_destroy, - .enable = i915_oa_stream_enable, - .disable = i915_oa_stream_disable, - .wait_unlocked = i915_oa_wait_unlocked, - .poll_wait = i915_oa_poll_wait, - .read = i915_oa_read, - .command_stream_hook = i915_perf_command_stream_hook_oa, +static const struct i915_perf_stream_ops i915_engine_stream_ops = { + .destroy = i915_engine_stream_destroy, + .enable = i915_engine_stream_enable, + .disable = i915_engine_stream_disable, + .wait_unlocked = i915_engine_stream_wait_unlocked, + .poll_wait = i915_engine_stream_poll_wait, + .read = i915_engine_stream_read, + .command_stream_hook = i915_engine_stream_capture_cs_data, }; /** - * i915_oa_stream_init - validate combined props for OA stream and init + * i915_engine_stream_init - validate combined props for stream and init * @stream: An i915 perf stream * @param: The open parameters passed to `DRM_I915_PERF_OPEN` * @props: The property state that configures stream (individually validated) @@ -2686,15 +2883,15 @@ static void i915_oa_stream_disable(struct i915_perf_stream *stream) * doesn't ensure that the combination necessarily makes sense. * * At this point it has been determined that userspace wants a stream of - * OA metrics, but still we need to further validate the combined + * perf metrics, but still we need to further validate the combined * properties are OK. * * If the configuration makes sense then we can allocate memory for - * a circular OA buffer and apply the requested metric set configuration. + * a circular perf buffer and apply the requested metric set configuration. * * Returns: zero on success or a negative error code. */ -static int i915_oa_stream_init(struct i915_perf_stream *stream, +static int i915_engine_stream_init(struct i915_perf_stream *stream, struct drm_i915_perf_open_param *param, struct perf_open_properties *props) { @@ -2703,15 +2900,16 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, SAMPLE_OA_SOURCE_INFO); bool require_cs_mode = props->sample_flags & (SAMPLE_PID | SAMPLE_TAG); - bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT; + bool cs_sample_data = props->sample_flags & (SAMPLE_OA_REPORT | + SAMPLE_TS); int ret; /* To avoid the complexity of having to accurately filter * counter reports and marshal to the appropriate client * we currently only allow exclusive access */ - if (dev_priv->perf.oa.exclusive_stream) { - DRM_DEBUG("OA unit already in use\n"); + if (dev_priv->perf.exclusive_stream) { + DRM_DEBUG("I915 perf stream already in use\n"); return -EBUSY; } @@ -2762,6 +2960,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, return -EINVAL; } stream->engine= RCS; + stream->using_oa = true; format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size; @@ -2838,8 +3037,21 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, require_cs_mode = true; } + if (props->sample_flags & SAMPLE_TS) { + stream->sample_flags |= SAMPLE_TS; + stream->sample_size += I915_PERF_TS_SAMPLE_SIZE; + + /* + * NB: it's meaningful to request SAMPLE_TS with just CS + * mode or periodic OA mode sampling but we don't allow + * SAMPLE_TS without either mode + */ + if (!require_oa_unit) + require_cs_mode = true; + } + if (require_cs_mode && !props->cs_mode) { - DRM_ERROR("PID/TAG sampling requires a ring to be specified"); + DRM_ERROR("PID/TAG/TS sampling requires engine to be specified"); ret = -EINVAL; goto err_enable; } @@ -2854,17 +3066,18 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, /* * The only time we should allow enabling CS mode if it's not - * strictly required, is if SAMPLE_CTX_ID has been requested - * as it's usable with periodic OA or CS sampling. + * strictly required, is if SAMPLE_CTX_ID/SAMPLE_TS has been + * requested as they're usable with periodic OA or CS sampling. */ if (!require_cs_mode && - !(props->sample_flags & SAMPLE_CTX_ID)) { + !(props->sample_flags & (SAMPLE_CTX_ID | SAMPLE_TS))) { DRM_ERROR( "Stream engine given without requesting any CS specific property"); ret = -EINVAL; goto err_enable; } stream->cs_mode = true; + stream->engine = props->engine; if (props->sample_flags & SAMPLE_PID) { stream->sample_flags |= SAMPLE_PID; @@ -2876,15 +3089,15 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, stream->sample_size += 4; } - ret = alloc_command_stream_buf(dev_priv); + ret = alloc_command_stream_buf(stream); if (ret) goto err_enable; init_request_active(&stream->last_request, NULL); } - stream->ops = &i915_oa_stream_ops; - dev_priv->perf.oa.exclusive_stream = stream; + stream->ops = &i915_engine_stream_ops; + dev_priv->perf.exclusive_stream = stream; return 0; @@ -3101,18 +3314,18 @@ static ssize_t i915_perf_read(struct file *file, /* Maybe make ->pollin per-stream state if we support multiple * concurrent streams in the future. */ - dev_priv->perf.oa.pollin = false; + dev_priv->perf.pollin[stream->engine] = false; } return ret; } -static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer) +static enum hrtimer_restart poll_check_timer_cb(struct hrtimer *hrtimer) { struct i915_perf_stream *stream; struct drm_i915_private *dev_priv = container_of(hrtimer, typeof(*dev_priv), - perf.oa.poll_check_timer); + perf.poll_check_timer); /* No need to protect the streams list here, since the hrtimer is * disabled before the stream is removed from list, and currently a @@ -3121,8 +3334,8 @@ static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer) */ list_for_each_entry(stream, &dev_priv->perf.streams, link) { if (stream_have_data_unlocked(stream)) { - dev_priv->perf.oa.pollin = true; - wake_up(&dev_priv->perf.oa.poll_wq); + dev_priv->perf.pollin[stream->engine] = true; + wake_up(&dev_priv->perf.poll_wq[stream->engine]); } } @@ -3162,7 +3375,7 @@ static unsigned int i915_perf_poll_locked(struct drm_i915_private *dev_priv, * the hrtimer/oa_poll_check_timer_cb to notify us when there are * samples to read. */ - if (dev_priv->perf.oa.pollin) + if (dev_priv->perf.pollin[stream->engine]) events |= POLLIN; return events; @@ -3471,7 +3684,7 @@ static int i915_perf_release(struct inode *inode, struct file *file) stream->dev_priv = dev_priv; stream->ctx = specific_ctx; - ret = i915_oa_stream_init(stream, param, props); + ret = i915_engine_stream_init(stream, param, props); if (ret) goto err_alloc; @@ -3650,21 +3863,12 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv, case DRM_I915_PERF_PROP_ENGINE: { unsigned int user_ring_id = value & I915_EXEC_RING_MASK; - enum intel_engine_id engine; if (user_ring_id > I915_USER_RINGS) return -EINVAL; - /* XXX: Currently only RCS is supported. - * Remove this check when support for other - * engines is added - */ - engine = user_ring_map[user_ring_id]; - if (engine != RCS) - return -EINVAL; - props->cs_mode = true; - props->engine = engine; + props->engine = user_ring_map[user_ring_id]; } break; case DRM_I915_PERF_PROP_SAMPLE_CTX_ID: @@ -3676,6 +3880,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv, case DRM_I915_PERF_PROP_SAMPLE_TAG: props->sample_flags |= SAMPLE_TAG; break; + case DRM_I915_PERF_PROP_SAMPLE_TS: + props->sample_flags |= SAMPLE_TS; + break; default: MISSING_CASE(id); DRM_DEBUG("Unknown i915 perf property ID\n"); @@ -3980,17 +4187,22 @@ void i915_perf_init(struct drm_i915_private *dev_priv) } if (dev_priv->perf.oa.n_builtin_sets) { - hrtimer_init(&dev_priv->perf.oa.poll_check_timer, + int count; + + hrtimer_init(&dev_priv->perf.poll_check_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb; - init_waitqueue_head(&dev_priv->perf.oa.poll_wq); + dev_priv->perf.poll_check_timer.function = poll_check_timer_cb; + + for (count = 0; count < I915_NUM_ENGINES; count++) { + INIT_LIST_HEAD(&dev_priv->perf.cs_samples[count]); + spin_lock_init(&dev_priv->perf.sample_lock[count]); + init_waitqueue_head(&dev_priv->perf.poll_wq[count]); + } INIT_LIST_HEAD(&dev_priv->perf.streams); - INIT_LIST_HEAD(&dev_priv->perf.cs_samples); mutex_init(&dev_priv->perf.lock); mutex_init(&dev_priv->perf.streams_lock); spin_lock_init(&dev_priv->perf.hook_lock); - spin_lock_init(&dev_priv->perf.sample_lock); spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock); dev_priv->perf.sysctl_header = register_sysctl_table(dev_root); diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 1af2e93..ea312fc 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -417,6 +417,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define MI_FLUSH_DW_STORE_INDEX (1<<21) #define MI_INVALIDATE_TLB (1<<18) #define MI_FLUSH_DW_OP_STOREDW (1<<14) +#define MI_FLUSH_DW_OP_STAMP (3<<14) #define MI_FLUSH_DW_OP_MASK (3<<14) #define MI_FLUSH_DW_NOTIFY (1<<8) #define MI_INVALIDATE_BSD (1<<7) @@ -500,6 +501,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define PIPE_CONTROL_TLB_INVALIDATE (1<<18) #define PIPE_CONTROL_MEDIA_STATE_CLEAR (1<<16) #define PIPE_CONTROL_QW_WRITE (1<<14) +#define PIPE_CONTROL_TIMESTAMP_WRITE (3<<14) #define PIPE_CONTROL_POST_SYNC_OP_MASK (3<<14) #define PIPE_CONTROL_DEPTH_STALL (1<<13) #define PIPE_CONTROL_WRITE_FLUSH (1<<12) diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index ab5fdc0..011d989 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1390,6 +1390,12 @@ enum drm_i915_perf_property_id { */ DRM_I915_PERF_PROP_SAMPLE_TAG, + /** + * The value of this property set to 1 requests inclusion of timestamp + * in the perf sample data. + */ + DRM_I915_PERF_PROP_SAMPLE_TS, + DRM_I915_PERF_PROP_MAX /* non-ABI */ }; @@ -1459,6 +1465,7 @@ enum drm_i915_perf_record_type { * { u32 ctx_id; } && DRM_I915_PERF_PROP_SAMPLE_CTX_ID * { u32 pid; } && DRM_I915_PERF_PROP_SAMPLE_PID * { u32 tag; } && DRM_I915_PERF_PROP_SAMPLE_TAG + * { u64 timestamp; } && DRM_I915_PERF_PROP_SAMPLE_TS * { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA * }; */

[1/6] drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines

Commit Message

Comments

Patch