@@ -1725,6 +1725,7 @@ struct i915_perf_stream {
struct list_head link;
+ enum intel_ring_id ring_id;
u32 sample_flags;
int sample_size;
@@ -1734,6 +1735,9 @@ struct i915_perf_stream {
/* Whether command stream based data collection is enabled */
bool cs_mode;
+ /* Whether the OA unit is in use */
+ bool using_oa;
+
/* Enables the collection of HW samples, either in response to
* I915_PERF_IOCTL_ENABLE or implicitly called when stream is
* opened without I915_PERF_FLAG_DISABLED */
@@ -1782,7 +1786,8 @@ struct i915_perf_stream {
* Routine to emit the commands in the command streamer associated
* with the corresponding gpu engine.
*/
- void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag);
+ void (*command_stream_hook)(struct i915_perf_stream *stream,
+ struct drm_i915_gem_request *req, u32 tag);
};
struct i915_oa_ops {
@@ -1807,7 +1812,16 @@ struct i915_oa_ops {
struct i915_perf_cs_data_node {
struct list_head link;
struct drm_i915_gem_request *request;
- u32 offset;
+
+ /* Offsets into the GEM obj holding the data */
+ u32 start_offset;
+ u32 oa_offset;
+ u32 ts_offset;
+
+	/* Size of the data captured into the buffer for this entry */
+ u32 size;
+
+ /* Other metadata */
u32 ctx_id;
u32 pid;
u32 tag;
@@ -2071,14 +2085,13 @@ struct drm_i915_private {
spinlock_t hook_lock;
- struct {
- struct i915_perf_stream *exclusive_stream;
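+	/*
+	 * Note: the exclusive stream, poll timer and per-engine wait queues
+	 * are shared between OA and command stream based capture, so they
+	 * live outside the OA-specific state below.
+	 */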
+ struct hrtimer poll_check_timer;
+ struct i915_perf_stream *exclusive_stream;
+ wait_queue_head_t poll_wq[I915_NUM_RINGS];
+ struct {
u32 specific_ctx_id;
- struct hrtimer poll_check_timer;
- wait_queue_head_t poll_wq;
-
bool periodic;
u32 period_exponent;
@@ -2115,10 +2128,10 @@ struct drm_i915_private {
struct drm_i915_gem_object *obj;
struct i915_vma *vma;
u8 *addr;
- } command_stream_buf;
+ } command_stream_buf[I915_NUM_RINGS];
- struct list_head node_list;
- spinlock_t node_list_lock;
+ struct list_head node_list[I915_NUM_RINGS];
+ spinlock_t node_list_lock[I915_NUM_RINGS];
} perf;
/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
@@ -51,12 +51,17 @@ static u32 i915_perf_stream_paranoid = true;
#define GEN8_OAREPORT_REASON_GO_TRANSITION (1<<23)
#define GEN9_OAREPORT_REASON_CLK_RATIO (1<<24)
-/* Data common to periodic and RCS based samples */
-struct oa_sample_data {
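+/*
+ * Alignment and size constraints for data captured into the command stream
+ * buffer: OA report addresses must be 64 byte aligned, and each timestamp is
+ * a single 8 byte value written at an 8 byte aligned address.
+ */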
+#define OA_ADDR_ALIGN 64
+#define TS_ADDR_ALIGN 8
+#define I915_PERF_TS_SAMPLE_SIZE 8
+
+/* Data common to all samples (periodic OA / CS based OA / Timestamps) */
+struct sample_data {
u32 source;
u32 ctx_id;
u32 pid;
u32 tag;
+ u64 ts;
const u8 *report;
};
@@ -100,6 +105,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
#define SAMPLE_CTX_ID (1<<2)
#define SAMPLE_PID (1<<3)
#define SAMPLE_TAG (1<<4)
+#define SAMPLE_TS (1<<5)
struct perf_open_properties
{
@@ -136,8 +142,9 @@ void i915_perf_command_stream_hook(struct drm_i915_gem_request *req, u32 tag)
mutex_lock(&dev_priv->perf.streams_lock);
list_for_each_entry(stream, &dev_priv->perf.streams, link) {
- if (stream->enabled && stream->command_stream_hook)
- stream->command_stream_hook(req, tag);
+ if (stream->enabled && (stream->ring_id == ring->id) &&
+ stream->command_stream_hook)
+ stream->command_stream_hook(stream, req, tag);
}
mutex_unlock(&dev_priv->perf.streams_lock);
}
@@ -150,16 +157,15 @@ void i915_perf_command_stream_hook(struct drm_i915_gem_request *req, u32 tag)
* eventually, when the request associated with new entry completes.
*/
static void release_some_perf_entries(struct drm_i915_private *dev_priv,
- u32 target_size)
+ enum intel_ring_id id, u32 target_size)
{
struct i915_perf_cs_data_node *entry, *next;
- u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size;
u32 size = 0;
list_for_each_entry_safe
- (entry, next, &dev_priv->perf.node_list, link) {
+ (entry, next, &dev_priv->perf.node_list[id], link) {
- size += entry_size;
+ size += entry->size;
i915_gem_request_unreference(entry->request);
list_del(&entry->link);
kfree(entry);
@@ -175,99 +181,117 @@ static void release_some_perf_entries(struct drm_i915_private *dev_priv,
* the buffer, it will remove the oldest entries in order to make space.
*/
static void insert_perf_entry(struct drm_i915_private *dev_priv,
+ struct i915_perf_stream *stream,
struct i915_perf_cs_data_node *entry)
{
struct i915_perf_cs_data_node *first_entry, *last_entry;
- int max_offset = dev_priv->perf.command_stream_buf.obj->base.size;
- u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size;
-
- spin_lock(&dev_priv->perf.node_list_lock);
- if (list_empty(&dev_priv->perf.node_list)) {
- entry->offset = 0;
- list_add_tail(&entry->link, &dev_priv->perf.node_list);
- spin_unlock(&dev_priv->perf.node_list_lock);
- return;
+ u32 sample_flags = stream->sample_flags;
+ enum intel_ring_id id = stream->ring_id;
+ int max_offset = dev_priv->perf.command_stream_buf[id].obj->base.size;
+ u32 offset, entry_size = 0;
+ bool sample_ts = false;
+
+	if (sample_flags & SAMPLE_OA_REPORT)
+ entry_size += dev_priv->perf.oa.oa_buffer.format_size;
+ else if (sample_flags & SAMPLE_TS) {
+		/*
+		 * XXX: The timestamp can anyway be derived from the OA
+		 * report, so there is no need to capture it separately for
+		 * the RCS ring when OA data is already being captured.
+		 */
+ entry_size += I915_PERF_TS_SAMPLE_SIZE;
+ sample_ts = true;
}
- first_entry = list_first_entry(&dev_priv->perf.node_list,
+ spin_lock(&dev_priv->perf.node_list_lock[id]);
+ if (list_empty(&dev_priv->perf.node_list[id])) {
+ offset = 0;
+ goto out;
+ }
+
+ first_entry = list_first_entry(&dev_priv->perf.node_list[id],
typeof(*first_entry), link);
- last_entry = list_last_entry(&dev_priv->perf.node_list,
- typeof(*first_entry), link);
+ last_entry = list_last_entry(&dev_priv->perf.node_list[id],
+ typeof(*last_entry), link);
- if (last_entry->offset >= first_entry->offset) {
+ if (last_entry->start_offset >= first_entry->start_offset) {
/* Sufficient space available at the end of buffer? */
- if (last_entry->offset + 2*entry_size < max_offset)
- entry->offset = last_entry->offset + entry_size;
+ if (last_entry->start_offset + last_entry->size + entry_size
+ < max_offset)
+ offset = last_entry->start_offset + last_entry->size;
/*
* Wraparound condition. Is sufficient space available at
* beginning of buffer?
*/
- else if (entry_size < first_entry->offset)
- entry->offset = 0;
+ else if (entry_size < first_entry->start_offset)
+ offset = 0;
/* Insufficient space. Overwrite existing old entries */
else {
- u32 target_size = entry_size - first_entry->offset;
+ u32 target_size = entry_size -
+ first_entry->start_offset;
- release_some_perf_entries(dev_priv, target_size);
- entry->offset = 0;
+ release_some_perf_entries(dev_priv, id, target_size);
+ offset = 0;
}
} else {
/* Sufficient space available? */
- if (last_entry->offset + 2*entry_size < first_entry->offset)
- entry->offset = last_entry->offset + entry_size;
+ if (last_entry->start_offset + last_entry->size + entry_size
+ < first_entry->start_offset)
+ offset = last_entry->start_offset + last_entry->size;
/* Insufficient space. Overwrite existing old entries */
else {
u32 target_size = entry_size -
- (first_entry->offset - last_entry->offset -
- entry_size);
+ (first_entry->start_offset -
+ last_entry->start_offset -
+ last_entry->size);
- release_some_perf_entries(dev_priv, target_size);
- entry->offset = last_entry->offset + entry_size;
+ release_some_perf_entries(dev_priv, id, target_size);
+ offset = last_entry->start_offset + last_entry->size;
}
}
- list_add_tail(&entry->link, &dev_priv->perf.node_list);
- spin_unlock(&dev_priv->perf.node_list_lock);
+
+out:
+ entry->start_offset = offset;
+ entry->size = entry_size;
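+	/*
+	 * Lay out the individual fields (OA report, timestamp) within this
+	 * entry, honouring their respective alignment requirements.
+	 */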
+	if (sample_flags & SAMPLE_OA_REPORT) {
+ entry->oa_offset = offset;
+ /* Ensure 64 byte alignment of oa_offset */
+ entry->oa_offset = ALIGN(entry->oa_offset, OA_ADDR_ALIGN);
+ offset = entry->oa_offset +
+ dev_priv->perf.oa.oa_buffer.format_size;
+ }
+ if (sample_ts) {
+ entry->ts_offset = offset;
+ /* Ensure 8 byte alignment of ts_offset */
+ entry->ts_offset = ALIGN(entry->ts_offset, TS_ADDR_ALIGN);
+ offset = entry->ts_offset + I915_PERF_TS_SAMPLE_SIZE;
+ }
+
+ list_add_tail(&entry->link, &dev_priv->perf.node_list[id]);
+ spin_unlock(&dev_priv->perf.node_list_lock[id]);
}
-static void i915_perf_command_stream_hook_oa(struct drm_i915_gem_request *req,
- u32 tag)
+static int i915_perf_stream_capture_oa_report(struct drm_i915_gem_request *req,
+ u32 offset)
{
struct intel_engine_cs *ring = req->ring;
struct intel_ringbuffer *ringbuf = req->ringbuf;
- struct intel_context *ctx = req->ctx;
struct drm_i915_private *dev_priv = ring->dev->dev_private;
- struct i915_perf_cs_data_node *entry;
u32 addr = 0;
int ret;
/* OA counters are only supported on the render ring */
BUG_ON(ring->id != RCS);
- entry = kzalloc(sizeof(*entry), GFP_KERNEL);
- if (entry == NULL) {
- DRM_ERROR("alloc failed\n");
- return;
- }
-
if (i915.enable_execlists)
ret = intel_logical_ring_begin(req, 4);
else
ret = intel_ring_begin(req, 4);
- if (ret) {
- kfree(entry);
- return;
- }
-
- entry->ctx_id = ctx->global_id;
- entry->pid = current->pid;
- entry->tag = tag;
- i915_gem_request_assign(&entry->request, req);
-
- insert_perf_entry(dev_priv, entry);
+ if (ret)
+ return ret;
- addr = dev_priv->perf.command_stream_buf.vma->node.start +
- entry->offset;
+ addr = dev_priv->perf.command_stream_buf[RCS].vma->node.start + offset;
/* addr should be 64 byte aligned */
BUG_ON(addr & 0x3f);
@@ -295,10 +319,154 @@ static void i915_perf_command_stream_hook_oa(struct drm_i915_gem_request *req,
}
intel_ring_advance(ring);
}
- i915_vma_move_to_active(dev_priv->perf.command_stream_buf.vma, req);
+ return 0;
+}
+
+static int i915_perf_stream_capture_ts_data(struct drm_i915_gem_request *req,
+ u32 offset)
+{
+ struct intel_engine_cs *ring = req->ring;
+ struct intel_ringbuffer *ringbuf = req->ringbuf;
+ struct drm_i915_private *dev_priv = ring->dev->dev_private;
+ u32 addr = 0;
+ int ret;
+
+ if (i915.enable_execlists)
+ ret = intel_logical_ring_begin(req, 6);
+ else
+ ret = intel_ring_begin(req, 6);
+
+ if (ret)
+ return ret;
+
+ addr = dev_priv->perf.command_stream_buf[ring->id].vma->node.start +
+ offset;
+
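+	/*
+	 * On RCS the timestamp is captured with a PIPE_CONTROL timestamp
+	 * write; the other engines use MI_FLUSH_DW with the timestamp
+	 * post-sync operation.
+	 */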
+ if (i915.enable_execlists) {
+ if (ring->id == RCS) {
+ intel_logical_ring_emit(ringbuf,
+ GFX_OP_PIPE_CONTROL(6));
+ intel_logical_ring_emit(ringbuf,
+ PIPE_CONTROL_GLOBAL_GTT_IVB |
+ PIPE_CONTROL_TIMESTAMP_WRITE);
+ intel_logical_ring_emit(ringbuf, addr |
+ PIPE_CONTROL_GLOBAL_GTT);
+ intel_logical_ring_emit(ringbuf, 0);
+ intel_logical_ring_emit(ringbuf, 0);
+ intel_logical_ring_emit(ringbuf, 0);
+ } else {
+ uint32_t cmd;
+
+ cmd = MI_FLUSH_DW + 2; /* Gen8+ */
+
+ cmd |= MI_FLUSH_DW_OP_STAMP;
+
+ intel_logical_ring_emit(ringbuf, cmd);
+ intel_logical_ring_emit(ringbuf, addr |
+ MI_FLUSH_DW_USE_GTT);
+ intel_logical_ring_emit(ringbuf, 0);
+ intel_logical_ring_emit(ringbuf, 0);
+ intel_logical_ring_emit(ringbuf, 0);
+ intel_logical_ring_emit(ringbuf, MI_NOOP);
+ }
+ intel_logical_ring_advance(ringbuf);
+ } else {
+ if (ring->id == RCS) {
+ if (INTEL_INFO(ring->dev)->gen >= 8)
+ intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
+ else
+ intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
+ intel_ring_emit(ring,
+ PIPE_CONTROL_GLOBAL_GTT_IVB |
+ PIPE_CONTROL_TIMESTAMP_WRITE);
+ intel_ring_emit(ring, addr | PIPE_CONTROL_GLOBAL_GTT);
+ intel_ring_emit(ring, 0);
+ if (INTEL_INFO(ring->dev)->gen >= 8) {
+ intel_ring_emit(ring, 0);
+ intel_ring_emit(ring, 0);
+ } else {
+ intel_ring_emit(ring, 0);
+ intel_ring_emit(ring, MI_NOOP);
+ }
+ } else {
+ uint32_t cmd;
+
+ cmd = MI_FLUSH_DW + 1;
+ if (INTEL_INFO(ring->dev)->gen >= 8)
+ cmd += 1;
+
+ cmd |= MI_FLUSH_DW_OP_STAMP;
+
+ intel_ring_emit(ring, cmd);
+ intel_ring_emit(ring, addr | MI_FLUSH_DW_USE_GTT);
+ if (INTEL_INFO(ring->dev)->gen >= 8) {
+ intel_ring_emit(ring, 0);
+ intel_ring_emit(ring, 0);
+ intel_ring_emit(ring, 0);
+ } else {
+ intel_ring_emit(ring, 0);
+ intel_ring_emit(ring, 0);
+ intel_ring_emit(ring, MI_NOOP);
+ }
+ intel_ring_emit(ring, MI_NOOP);
+ }
+ intel_ring_advance(ring);
+ }
+ return 0;
+}
+
+static void i915_perf_stream_cs_hook(struct i915_perf_stream *stream,
+ struct drm_i915_gem_request *req, u32 tag)
+{
+ struct intel_engine_cs *ring = req->ring;
+ struct intel_context *ctx = req->ctx;
+ struct drm_i915_private *dev_priv = ring->dev->dev_private;
+ enum intel_ring_id id = stream->ring_id;
+ u32 sample_flags = stream->sample_flags;
+ struct i915_perf_cs_data_node *entry;
+ int ret = 0;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (entry == NULL) {
+ DRM_ERROR("alloc failed\n");
+ return;
+ }
+
+ entry->ctx_id = ctx->global_id;
+ entry->pid = current->pid;
+ entry->tag = tag;
+ i915_gem_request_assign(&entry->request, req);
+
+ insert_perf_entry(dev_priv, stream, entry);
+
+ if (sample_flags & SAMPLE_OA_REPORT) {
+ ret = i915_perf_stream_capture_oa_report(req, entry->oa_offset);
+ if (ret)
+ goto err;
+ } else if (sample_flags & SAMPLE_TS) {
+		/*
+		 * XXX: The timestamp can anyway be derived from the OA
+		 * report, so there is no need to capture it separately for
+		 * the RCS ring when OA data is already being captured.
+		 */
+ ret = i915_perf_stream_capture_ts_data(req, entry->ts_offset);
+ if (ret)
+ goto err;
+ }
+
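+	/*
+	 * Keep the command stream buffer's vma busy against this request so
+	 * it isn't unbound before the capture commands have executed.
+	 */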
+ i915_vma_move_to_active(dev_priv->perf.command_stream_buf[id].vma, req);
+ return;
+
+err:
+ i915_gem_request_unreference(entry->request);
+	spin_lock(&dev_priv->perf.node_list_lock[id]);
+	list_del(&entry->link);
+	spin_unlock(&dev_priv->perf.node_list_lock[id]);
+	kfree(entry);
}
-static int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
+static int i915_perf_wait_gpu(struct drm_i915_private *dev_priv,
+ enum intel_ring_id id)
{
struct i915_perf_cs_data_node *last_entry = NULL;
struct drm_i915_gem_request *req = NULL;
@@ -309,14 +477,14 @@ static int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
* implicitly wait for the prior submitted requests. The refcount
* of the requests is not decremented here.
*/
- spin_lock(&dev_priv->perf.node_list_lock);
+ spin_lock(&dev_priv->perf.node_list_lock[id]);
- if (!list_empty(&dev_priv->perf.node_list)) {
- last_entry = list_last_entry(&dev_priv->perf.node_list,
+ if (!list_empty(&dev_priv->perf.node_list[id])) {
+ last_entry = list_last_entry(&dev_priv->perf.node_list[id],
struct i915_perf_cs_data_node, link);
req = last_entry->request;
}
- spin_unlock(&dev_priv->perf.node_list_lock);
+ spin_unlock(&dev_priv->perf.node_list_lock[id]);
if (!req)
return 0;
@@ -331,17 +499,18 @@ static int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
return 0;
}
-static void i915_oa_rcs_free_requests(struct drm_i915_private *dev_priv)
+static void i915_perf_free_requests(struct drm_i915_private *dev_priv,
+ enum intel_ring_id id)
{
struct i915_perf_cs_data_node *entry, *next;
list_for_each_entry_safe
- (entry, next, &dev_priv->perf.node_list, link) {
+ (entry, next, &dev_priv->perf.node_list[id], link) {
i915_gem_request_unreference__unlocked(entry->request);
- spin_lock(&dev_priv->perf.node_list_lock);
+ spin_lock(&dev_priv->perf.node_list_lock[id]);
list_del(&entry->link);
- spin_unlock(&dev_priv->perf.node_list_lock);
+ spin_unlock(&dev_priv->perf.node_list_lock[id]);
kfree(entry);
}
}
@@ -381,9 +550,9 @@ static bool append_oa_status(struct i915_perf_stream *stream,
return true;
}
-static bool append_oa_sample(struct i915_perf_stream *stream,
+static bool append_sample(struct i915_perf_stream *stream,
struct i915_perf_read_state *read_state,
- struct oa_sample_data *data)
+ struct sample_data *data)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -424,6 +593,13 @@ static bool append_oa_sample(struct i915_perf_stream *stream,
read_state->buf += 4;
}
+ if (sample_flags & SAMPLE_TS) {
+ if (copy_to_user(read_state->buf, &data->ts,
+ I915_PERF_TS_SAMPLE_SIZE))
+ return false;
+ read_state->buf += I915_PERF_TS_SAMPLE_SIZE;
+ }
+
if (sample_flags & SAMPLE_OA_REPORT) {
if (copy_to_user(read_state->buf, data->report, report_size))
return false;
@@ -441,7 +617,7 @@ static bool append_oa_buffer_sample(struct i915_perf_stream *stream,
{
struct drm_i915_private *dev_priv = stream->dev_priv;
u32 sample_flags = stream->sample_flags;
- struct oa_sample_data data = { 0 };
+ struct sample_data data = { 0 };
if (sample_flags & SAMPLE_OA_SOURCE_INFO) {
enum drm_i915_perf_oa_event_source source;
@@ -473,10 +649,15 @@ static bool append_oa_buffer_sample(struct i915_perf_stream *stream,
if (sample_flags & SAMPLE_TAG)
data.tag = 0;
+	/* Derive timestamp from OA report, after scaling with the timestamp base */
+#warning "FIXME: append_oa_buffer_sample: derive the timestamp from OA report"
+ if (sample_flags & SAMPLE_TS)
+ data.ts = 0;
+
if (sample_flags & SAMPLE_OA_REPORT)
data.report = report;
- append_oa_sample(stream, read_state, &data);
+ append_sample(stream, read_state, &data);
return true;
}
@@ -528,7 +709,7 @@ static u32 gen8_append_oa_reports(struct i915_perf_stream *stream,
ctx_id &= 0xfffff;
}
- if (dev_priv->perf.oa.exclusive_stream->enabled) {
+ if (stream->enabled) {
/* NB: For Gen 8 we handle per-context report filtering
* ourselves instead of programming the OA unit with a
@@ -539,7 +720,7 @@ static u32 gen8_append_oa_reports(struct i915_perf_stream *stream,
* first report belonging to any subsequently
* switched-too context.
*/
- if (!dev_priv->perf.oa.exclusive_stream->ctx ||
+ if (!stream->ctx ||
(dev_priv->perf.oa.specific_ctx_id == ctx_id ||
(dev_priv->perf.oa.specific_ctx_id !=
dev_priv->perf.oa.oa_buffer.last_ctx_id))) {
@@ -630,7 +811,7 @@ static u32 gen7_append_oa_reports(struct i915_perf_stream *stream,
if (report_ts > ts)
break;
- if (dev_priv->perf.oa.exclusive_stream->enabled) {
+ if (stream->enabled) {
if (!append_oa_buffer_sample(stream, read_state,
report))
break;
@@ -687,24 +868,32 @@ static void gen7_oa_read(struct i915_perf_stream *stream,
OA_MEM_SELECT_GGTT);
}
-static bool append_oa_rcs_sample(struct i915_perf_stream *stream,
+static bool append_one_cs_sample(struct i915_perf_stream *stream,
struct i915_perf_read_state *read_state,
struct i915_perf_cs_data_node *node)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- struct oa_sample_data data = { 0 };
- const u8 *report = dev_priv->perf.command_stream_buf.addr +
- node->offset;
+ enum intel_ring_id id = stream->ring_id;
+ struct sample_data data = { 0 };
u32 sample_flags = stream->sample_flags;
- u32 report_ts;
- /*
- * Forward the periodic OA samples which have the timestamp lower
- * than timestamp of this sample, before forwarding this sample.
- * This ensures samples read by user are order acc. to their timestamps
- */
- report_ts = *(u32 *)(report + 4);
- dev_priv->perf.oa.ops.read(stream, read_state, report_ts);
+ if (sample_flags & SAMPLE_OA_REPORT) {
+ const u8 *report = dev_priv->perf.command_stream_buf[id].addr +
+ node->oa_offset;
+ u32 sample_ts = *(u32 *)(report + 4);
+
+ BUG_ON(id != RCS);
+
+ data.report = report;
+
+		/*
+		 * Forward the periodic OA samples which have a timestamp
+		 * lower than this sample's, before forwarding this sample.
+		 * This ensures the samples read by userspace are ordered
+		 * according to their timestamps.
+		 */
+ dev_priv->perf.oa.ops.read(stream, read_state, sample_ts);
+ }
if (sample_flags & SAMPLE_OA_SOURCE_INFO)
data.source = I915_PERF_OA_EVENT_SOURCE_RCS;
@@ -718,38 +907,51 @@ static bool append_oa_rcs_sample(struct i915_perf_stream *stream,
if (sample_flags & SAMPLE_TAG)
data.tag = node->tag;
- if (sample_flags & SAMPLE_OA_REPORT)
- data.report = report;
+ if (sample_flags & SAMPLE_TS) {
+		/*
+		 * For RCS, derive the timestamp from the OA report, after
+		 * scaling with the timestamp base. For other rings, forward
+		 * the timestamp collected via the command stream.
+		 */
+#warning "FIXME: append_one_cs_sample: derive the timestamp from OA report"
+ if (sample_flags & SAMPLE_OA_REPORT)
+ data.ts = 0;
+ else
+ data.ts = *(u64 *)
+ (dev_priv->perf.command_stream_buf[id].addr +
+ node->ts_offset);
+ }
- append_oa_sample(stream, read_state, &data);
+ append_sample(stream, read_state, &data);
return true;
}
-static void oa_rcs_append_reports(struct i915_perf_stream *stream,
+static void append_command_stream_samples(struct i915_perf_stream *stream,
struct i915_perf_read_state *read_state)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
+ enum intel_ring_id id = stream->ring_id;
struct i915_perf_cs_data_node *entry, *next;
list_for_each_entry_safe(entry, next,
- &dev_priv->perf.node_list, link) {
+ &dev_priv->perf.node_list[id], link) {
if (!i915_gem_request_completed(entry->request, true))
break;
- if (!append_oa_rcs_sample(stream, read_state, entry))
+ if (!append_one_cs_sample(stream, read_state, entry))
break;
- spin_lock(&dev_priv->perf.node_list_lock);
+ spin_lock(&dev_priv->perf.node_list_lock[id]);
list_del(&entry->link);
- spin_unlock(&dev_priv->perf.node_list_lock);
+ spin_unlock(&dev_priv->perf.node_list_lock[id]);
i915_gem_request_unreference__unlocked(entry->request);
kfree(entry);
}
- /* Flush any remaining periodic reports */
- dev_priv->perf.oa.ops.read(stream, read_state, U32_MAX);
+	/* Flush any remaining periodic OA reports, in the RCS case */
+ if (stream->sample_flags & SAMPLE_OA_REPORT)
+ dev_priv->perf.oa.ops.read(stream, read_state, U32_MAX);
}
static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
@@ -757,7 +959,7 @@ static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
struct drm_i915_private *dev_priv = stream->dev_priv;
if (stream->cs_mode)
- return list_empty(&dev_priv->perf.node_list);
+ return list_empty(&dev_priv->perf.node_list[stream->ring_id]);
else
return true;
}
@@ -772,63 +974,69 @@ static bool stream_have_data__unlocked(struct i915_perf_stream *stream)
* can't be destroyed until completion (such as a read()) that ensures
* the device + OA buffer can't disappear
*/
- return !(dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv) &&
- command_stream_buf_is_empty(stream));
+ if (stream->sample_flags & SAMPLE_OA_REPORT)
+ return !(dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv) &&
+ command_stream_buf_is_empty(stream));
+ else
+ return !command_stream_buf_is_empty(stream);
}
-static bool i915_oa_can_read(struct i915_perf_stream *stream)
+static bool i915_perf_stream_can_read(struct i915_perf_stream *stream)
{
return stream_have_data__unlocked(stream);
}
-static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
+static int i915_perf_stream_wait_unlocked(struct i915_perf_stream *stream)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
+ enum intel_ring_id id = stream->ring_id;
int ret;
if (stream->cs_mode) {
- ret = i915_oa_rcs_wait_gpu(dev_priv);
+ ret = i915_perf_wait_gpu(dev_priv, id);
if (ret)
return ret;
}
- return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
+ return wait_event_interruptible(dev_priv->perf.poll_wq[id],
stream_have_data__unlocked(stream));
}
-static void i915_oa_poll_wait(struct i915_perf_stream *stream,
+static void i915_perf_stream_poll_wait(struct i915_perf_stream *stream,
struct file *file,
poll_table *wait)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- poll_wait(file, &dev_priv->perf.oa.poll_wq, wait);
+ poll_wait(file, &dev_priv->perf.poll_wq[stream->ring_id], wait);
}
-static void i915_oa_read(struct i915_perf_stream *stream,
+static void i915_perf_stream_read(struct i915_perf_stream *stream,
struct i915_perf_read_state *read_state)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
if (stream->cs_mode)
- oa_rcs_append_reports(stream, read_state);
- else
+ append_command_stream_samples(stream, read_state);
+ else if (stream->ring_id == RCS)
dev_priv->perf.oa.ops.read(stream, read_state, U32_MAX);
}
static void
-free_command_stream_buf(struct drm_i915_private *i915)
+free_command_stream_buf(struct drm_i915_private *i915,
+ enum intel_ring_id id)
{
mutex_lock(&i915->dev->struct_mutex);
- vunmap(i915->perf.command_stream_buf.addr);
- i915_gem_object_ggtt_unpin(i915->perf.command_stream_buf.obj);
- drm_gem_object_unreference(&i915->perf.command_stream_buf.obj->base);
+ vunmap(i915->perf.command_stream_buf[id].addr);
+ i915_gem_object_ggtt_unpin(i915->perf.command_stream_buf[id].obj);
+ drm_gem_object_unreference(
+ &i915->perf.command_stream_buf[id].obj->base);
- i915->perf.command_stream_buf.obj = NULL;
- i915->perf.command_stream_buf.vma = NULL;
- i915->perf.command_stream_buf.addr = NULL;
+ i915->perf.command_stream_buf[id].obj = NULL;
+ i915->perf.command_stream_buf[id].vma = NULL;
+ i915->perf.command_stream_buf[id].addr = NULL;
mutex_unlock(&i915->dev->struct_mutex);
}
@@ -849,16 +1057,13 @@ free_oa_buffer(struct drm_i915_private *i915)
mutex_unlock(&i915->dev->struct_mutex);
}
-static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
+static void i915_perf_stream_destroy(struct i915_perf_stream *stream)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
+ BUG_ON(stream != dev_priv->perf.exclusive_stream);
- if (stream->cs_mode)
- free_command_stream_buf(dev_priv);
-
- if (dev_priv->perf.oa.oa_buffer.obj) {
+ if (stream->using_oa) {
dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
free_oa_buffer(dev_priv);
@@ -867,7 +1072,10 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
intel_runtime_pm_put(dev_priv);
}
- dev_priv->perf.oa.exclusive_stream = NULL;
+ if (stream->cs_mode)
+ free_command_stream_buf(dev_priv, stream->ring_id);
+
+ dev_priv->perf.exclusive_stream = NULL;
}
static void *vmap_oa_buffer(struct drm_i915_gem_object *obj)
@@ -993,27 +1201,28 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
return 0;
}
-static int alloc_command_stream_buf(struct drm_i915_private *dev_priv)
+static int alloc_command_stream_buf(struct drm_i915_private *dev_priv,
+ enum intel_ring_id id)
{
struct drm_i915_gem_object *bo;
int ret;
- BUG_ON(dev_priv->perf.command_stream_buf.obj);
+ BUG_ON(dev_priv->perf.command_stream_buf[id].obj);
ret = alloc_obj(dev_priv, &bo);
if (ret)
return ret;
- dev_priv->perf.command_stream_buf.obj = bo;
- dev_priv->perf.command_stream_buf.vma = i915_gem_obj_to_ggtt(bo);
- dev_priv->perf.command_stream_buf.addr = vmap_oa_buffer(bo);
- INIT_LIST_HEAD(&dev_priv->perf.node_list);
+ dev_priv->perf.command_stream_buf[id].obj = bo;
+ dev_priv->perf.command_stream_buf[id].vma = i915_gem_obj_to_ggtt(bo);
+ dev_priv->perf.command_stream_buf[id].addr = vmap_oa_buffer(bo);
+ INIT_LIST_HEAD(&dev_priv->perf.node_list[id]);
DRM_DEBUG_DRIVER(
"command stream buf initialized, gtt offset = 0x%x, vaddr = %p",
(unsigned int)
- dev_priv->perf.command_stream_buf.vma->node.start,
- dev_priv->perf.command_stream_buf.addr);
+ dev_priv->perf.command_stream_buf[id].vma->node.start,
+ dev_priv->perf.command_stream_buf[id].addr);
return 0;
}
@@ -1225,17 +1434,17 @@ static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
{
assert_spin_locked(&dev_priv->perf.hook_lock);
- if (dev_priv->perf.oa.exclusive_stream->enabled) {
+ if (dev_priv->perf.exclusive_stream->enabled) {
unsigned long ctx_id = 0;
bool pinning_ok = false;
- if (dev_priv->perf.oa.exclusive_stream->ctx &&
+ if (dev_priv->perf.exclusive_stream->ctx &&
dev_priv->perf.oa.specific_ctx_id) {
ctx_id = dev_priv->perf.oa.specific_ctx_id;
pinning_ok = true;
}
- if (dev_priv->perf.oa.exclusive_stream->ctx == NULL ||
+ if (dev_priv->perf.exclusive_stream->ctx == NULL ||
pinning_ok) {
bool periodic = dev_priv->perf.oa.periodic;
u32 period_exponent = dev_priv->perf.oa.period_exponent;
@@ -1292,17 +1501,18 @@ static void gen8_oa_enable(struct drm_i915_private *dev_priv)
I915_WRITE(GEN8_OAHEADPTR, tail);
}
-static void i915_oa_stream_enable(struct i915_perf_stream *stream)
+static void i915_perf_stream_enable(struct i915_perf_stream *stream)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- dev_priv->perf.oa.ops.oa_enable(dev_priv);
+ if (stream->sample_flags & SAMPLE_OA_REPORT)
+ dev_priv->perf.oa.ops.oa_enable(dev_priv);
if (stream->cs_mode)
- stream->command_stream_hook = i915_perf_command_stream_hook_oa;
+ stream->command_stream_hook = i915_perf_stream_cs_hook;
- if (dev_priv->perf.oa.periodic)
- hrtimer_start(&dev_priv->perf.oa.poll_check_timer,
+ if (stream->cs_mode || dev_priv->perf.oa.periodic)
+ hrtimer_start(&dev_priv->perf.poll_check_timer,
ns_to_ktime(POLL_PERIOD),
HRTIMER_MODE_REL_PINNED);
}
@@ -1317,23 +1527,24 @@ static void gen8_oa_disable(struct drm_i915_private *dev_priv)
I915_WRITE(GEN8_OACONTROL, 0);
}
-static void i915_oa_stream_disable(struct i915_perf_stream *stream)
+static void i915_perf_stream_disable(struct i915_perf_stream *stream)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- if (dev_priv->perf.oa.periodic)
- hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
+ if (stream->cs_mode || dev_priv->perf.oa.periodic)
+ hrtimer_cancel(&dev_priv->perf.poll_check_timer);
if (stream->cs_mode) {
stream->command_stream_hook = NULL;
- i915_oa_rcs_wait_gpu(dev_priv);
- i915_oa_rcs_free_requests(dev_priv);
+ i915_perf_wait_gpu(dev_priv, stream->ring_id);
+ i915_perf_free_requests(dev_priv, stream->ring_id);
}
- dev_priv->perf.oa.ops.oa_disable(dev_priv);
+ if (stream->sample_flags & SAMPLE_OA_REPORT)
+ dev_priv->perf.oa.ops.oa_disable(dev_priv);
}
-static int i915_oa_stream_init(struct i915_perf_stream *stream,
+static int i915_perf_stream_init(struct i915_perf_stream *stream,
struct drm_i915_perf_open_param *param,
struct perf_open_properties *props)
{
@@ -1341,15 +1552,15 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT |
SAMPLE_OA_SOURCE_INFO);
bool require_cs_mode = props->sample_flags & (SAMPLE_PID |
- SAMPLE_TAG);
- int format_size;
+ SAMPLE_TAG |
+ SAMPLE_TS);
int ret;
/* To avoid the complexity of having to accurately filter
* counter reports and marshal to the appropriate client
* we currently only allow exclusive access */
- if (dev_priv->perf.oa.exclusive_stream) {
- DRM_ERROR("OA unit already in use\n");
+ if (dev_priv->perf.exclusive_stream) {
+ DRM_ERROR("Stream already in use\n");
return -EBUSY;
}
@@ -1364,6 +1575,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
stream->sample_size = sizeof(struct drm_i915_perf_record_header);
if (require_oa_unit) {
+ int format_size;
if (!dev_priv->perf.oa.ops.init_oa_buffer) {
DRM_ERROR("OA unit not supported\n");
return -ENODEV;
@@ -1386,6 +1598,8 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
return -EINVAL;
}
+ stream->using_oa = true;
+
format_size =
dev_priv->perf.oa.oa_formats[props->oa_format].size;
@@ -1452,7 +1666,8 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
}
if (require_cs_mode && !props->cs_mode) {
- DRM_ERROR("PID or TAG sampling require a ring to be specified");
+ DRM_ERROR(
+			"PID, TAG or TS sampling requires a ring to be specified");
ret = -EINVAL;
goto cs_error;
}
@@ -1472,6 +1687,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
}
stream->cs_mode = true;
+ stream->ring_id = props->ring_id;
if (props->sample_flags & SAMPLE_PID) {
stream->sample_flags |= SAMPLE_PID;
@@ -1483,20 +1699,25 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
stream->sample_size += 4;
}
- ret = alloc_command_stream_buf(dev_priv);
+ if (props->sample_flags & SAMPLE_TS) {
+ stream->sample_flags |= SAMPLE_TS;
+ stream->sample_size += I915_PERF_TS_SAMPLE_SIZE;
+ }
+
+ ret = alloc_command_stream_buf(dev_priv, stream->ring_id);
if (ret)
goto cs_error;
}
- dev_priv->perf.oa.exclusive_stream = stream;
+ dev_priv->perf.exclusive_stream = stream;
- stream->destroy = i915_oa_stream_destroy;
- stream->enable = i915_oa_stream_enable;
- stream->disable = i915_oa_stream_disable;
- stream->can_read = i915_oa_can_read;
- stream->wait_unlocked = i915_oa_wait_unlocked;
- stream->poll_wait = i915_oa_poll_wait;
- stream->read = i915_oa_read;
+ stream->destroy = i915_perf_stream_destroy;
+ stream->enable = i915_perf_stream_enable;
+ stream->disable = i915_perf_stream_disable;
+ stream->can_read = i915_perf_stream_can_read;
+ stream->wait_unlocked = i915_perf_stream_wait_unlocked;
+ stream->poll_wait = i915_perf_stream_poll_wait;
+ stream->read = i915_perf_stream_read;
return 0;
@@ -1530,8 +1751,8 @@ static void i915_oa_context_pin_notify_locked(struct drm_i915_private *dev_priv,
dev_priv->perf.oa.ops.update_hw_ctx_id_locked == NULL)
return;
- if (dev_priv->perf.oa.exclusive_stream &&
- dev_priv->perf.oa.exclusive_stream->ctx == context) {
+ if (dev_priv->perf.exclusive_stream &&
+ dev_priv->perf.exclusive_stream->ctx == context) {
struct drm_i915_gem_object *obj =
context->legacy_hw_ctx.rcs_state;
u32 ctx_id = i915_gem_obj_ggtt_offset(obj);
@@ -1599,8 +1820,8 @@ void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req)
if (dev_priv->perf.oa.ops.legacy_ctx_switch_unlocked == NULL)
return;
- if (dev_priv->perf.oa.exclusive_stream &&
- dev_priv->perf.oa.exclusive_stream->enabled) {
+ if (dev_priv->perf.exclusive_stream &&
+ dev_priv->perf.exclusive_stream->enabled) {
/* XXX: We don't take a lock here and this may run
* async with respect to stream methods. Notably we
@@ -1729,7 +1950,7 @@ static enum hrtimer_restart poll_check_timer_cb(struct hrtimer *hrtimer)
struct drm_i915_private *dev_priv =
container_of(hrtimer, typeof(*dev_priv),
- perf.oa.poll_check_timer);
+ perf.poll_check_timer);
/* No need to protect the streams list here, since the hrtimer is
* disabled before the stream is removed from list, and currently a
@@ -1738,7 +1959,7 @@ static enum hrtimer_restart poll_check_timer_cb(struct hrtimer *hrtimer)
*/
list_for_each_entry(stream, &dev_priv->perf.streams, link) {
if (stream_have_data__unlocked(stream))
- wake_up(&dev_priv->perf.oa.poll_wq);
+ wake_up(&dev_priv->perf.poll_wq[stream->ring_id]);
}
hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
@@ -1947,7 +2168,7 @@ int i915_perf_open_ioctl_locked(struct drm_device *dev,
stream->dev_priv = dev_priv;
stream->ctx = specific_ctx;
- ret = i915_oa_stream_init(stream, param, props);
+ ret = i915_perf_stream_init(stream, param, props);
if (ret)
goto err_alloc;
@@ -2088,13 +2309,6 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
if (ring_id >= LAST_USER_RING)
return -EINVAL;
- /* XXX: Currently only RCS is supported.
- * Remove this check when support for other
- * rings is added
- */
- if (ring_id != RCS)
- return -EINVAL;
-
props->cs_mode = true;
props->ring_id = ring_id;
}
@@ -2108,6 +2322,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
case DRM_I915_PERF_SAMPLE_TAG_PROP:
props->sample_flags |= SAMPLE_TAG;
break;
+ case DRM_I915_PERF_SAMPLE_TS_PROP:
+ props->sample_flags |= SAMPLE_TS;
+ break;
case DRM_I915_PERF_PROP_MAX:
BUG();
}
@@ -2193,6 +2410,7 @@ static struct ctl_table dev_root[] = {
void i915_perf_init(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = to_i915(dev);
+ int i;
if (!(IS_HASWELL(dev) ||
IS_BROADWELL(dev) || IS_CHERRYVIEW(dev) ||
@@ -2204,16 +2422,18 @@ void i915_perf_init(struct drm_device *dev)
if (!dev_priv->perf.metrics_kobj)
return;
- hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
+ hrtimer_init(&dev_priv->perf.poll_check_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- dev_priv->perf.oa.poll_check_timer.function = poll_check_timer_cb;
- init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
+ dev_priv->perf.poll_check_timer.function = poll_check_timer_cb;
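+	/* Per-engine command stream capture lists and poll wait queues */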
+ for (i = 0; i < I915_NUM_RINGS; i++) {
+ spin_lock_init(&dev_priv->perf.node_list_lock[i]);
+ init_waitqueue_head(&dev_priv->perf.poll_wq[i]);
+ }
INIT_LIST_HEAD(&dev_priv->perf.streams);
mutex_init(&dev_priv->perf.lock);
mutex_init(&dev_priv->perf.streams_lock);
spin_lock_init(&dev_priv->perf.hook_lock);
- spin_lock_init(&dev_priv->perf.node_list_lock);
if (IS_HASWELL(dev)) {
dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
@@ -359,6 +359,7 @@
#define MI_FLUSH_DW_STORE_INDEX (1<<21)
#define MI_INVALIDATE_TLB (1<<18)
#define MI_FLUSH_DW_OP_STOREDW (1<<14)
+#define MI_FLUSH_DW_OP_STAMP (3<<14)
#define MI_FLUSH_DW_OP_MASK (3<<14)
#define MI_FLUSH_DW_NOTIFY (1<<8)
#define MI_INVALIDATE_BSD (1<<7)
@@ -438,6 +439,7 @@
#define PIPE_CONTROL_TLB_INVALIDATE (1<<18)
#define PIPE_CONTROL_MEDIA_STATE_CLEAR (1<<16)
#define PIPE_CONTROL_QW_WRITE (1<<14)
+#define PIPE_CONTROL_TIMESTAMP_WRITE (3<<14)
#define PIPE_CONTROL_POST_SYNC_OP_MASK (3<<14)
#define PIPE_CONTROL_DEPTH_STALL (1<<13)
#define PIPE_CONTROL_WRITE_FLUSH (1<<12)
@@ -1236,6 +1236,12 @@ enum drm_i915_perf_property_id {
*/
DRM_I915_PERF_SAMPLE_TAG_PROP,
+ /**
+	 * The value of this property set to 1 requests inclusion of a
+	 * timestamp in the perf sample data.
+ */
+ DRM_I915_PERF_SAMPLE_TS_PROP,
+
DRM_I915_PERF_PROP_MAX /* non-ABI */
};
@@ -1287,6 +1293,7 @@ enum drm_i915_perf_record_type {
* { u32 ctx_id; } && DRM_I915_PERF_SAMPLE_CTX_ID_PROP
* { u32 pid; } && DRM_I915_PERF_SAMPLE_PID_PROP
* { u32 tag; } && DRM_I915_PERF_SAMPLE_TAG_PROP
+ * { u64 timestamp; } && DRM_I915_PERF_SAMPLE_TS_PROP
* { u32 oa_report[]; } && DRM_I915_PERF_SAMPLE_OA_PROP
* };
*/