@@ -1791,6 +1791,18 @@ struct i915_perf_stream_ops {
* The stream will always be disabled before this is called.
*/
void (*destroy)(struct i915_perf_stream *stream);
+
+	/*
+	 * Routine to emit the commands that capture perf data into the
+	 * command stream of the GPU engine associated with the stream.
+	 */
+ void (*command_stream_hook)(struct drm_i915_gem_request *req);
+};
+
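+/*
+ * Stream state. The intermediate ENABLE_IN_PROGRESS state lets the command
+ * stream hook, which checks the state without holding the perf lock, skip
+ * emitting report captures until stream->ops->enable() has completed.
+ */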
+enum i915_perf_stream_state {
+ I915_PERF_STREAM_DISABLED,
+ I915_PERF_STREAM_ENABLE_IN_PROGRESS,
+ I915_PERF_STREAM_ENABLED,
};
struct i915_perf_stream {
@@ -1798,11 +1810,15 @@ struct i915_perf_stream {
struct list_head link;
+ enum intel_engine_id engine;
u32 sample_flags;
int sample_size;
struct intel_context *ctx;
- bool enabled;
+ enum i915_perf_stream_state state;
+
+ /* Whether command stream based data collection is enabled */
+ bool cs_mode;
const struct i915_perf_stream_ops *ops;
};
@@ -1818,10 +1834,21 @@ struct i915_oa_ops {
u32 ctx_id);
void (*legacy_ctx_switch_unlocked)(struct drm_i915_gem_request *req);
int (*read)(struct i915_perf_stream *stream,
- struct i915_perf_read_state *read_state);
+ struct i915_perf_read_state *read_state, u32 ts);
bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv);
};
+/*
+ * List element to hold info about the perf sample data associated with a
+ * particular request submitted to a GPU command stream: the request itself,
+ * the offset of its report within the command stream buffer, and the hw ID
+ * of the context the request was submitted with.
+ */
+struct i915_perf_cs_data_node {
+ struct list_head link;
+ struct drm_i915_gem_request *request;
+ u32 offset;
+ u32 ctx_id;
+};
+
struct drm_i915_private {
struct drm_device *dev;
struct kmem_cache *objects;
@@ -2107,6 +2134,8 @@ struct drm_i915_private {
struct ctl_table_header *sysctl_header;
struct mutex lock;
+
+ struct mutex streams_lock;
struct list_head streams;
spinlock_t hook_lock;
@@ -2151,6 +2180,16 @@ struct drm_i915_private {
const struct i915_oa_format *oa_formats;
int n_builtin_sets;
} oa;
+
+ /* Command stream based perf data buffer */
+ struct {
+ struct drm_i915_gem_object *obj;
+ struct i915_vma *vma;
+ u8 *addr;
+ } command_stream_buf;
+
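+		/*
+		 * List of i915_perf_cs_data_node entries, one per outstanding
+		 * command stream report capture, protected by node_list_lock.
+		 */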
+ struct list_head node_list;
+ spinlock_t node_list_lock;
} perf;
/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
@@ -3513,6 +3552,7 @@ void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req);
void i915_oa_update_reg_state(struct intel_engine_cs *engine,
struct intel_context *ctx,
uint32_t *reg_state);
+void i915_perf_command_stream_hook(struct drm_i915_gem_request *req);
/* i915_gem_evict.c */
int __must_check i915_gem_evict_something(struct drm_device *dev,
@@ -1305,12 +1305,16 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
if (exec_len == 0)
exec_len = params->batch_obj->base.size;
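+	/* Emit perf capture commands into the ring before the batch (CS mode) */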
+ i915_perf_command_stream_hook(params->request);
+
ret = engine->dispatch_execbuffer(params->request,
exec_start, exec_len,
params->dispatch_flags);
if (ret)
return ret;
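+	/* Emit a second capture after the batch, bracketing its execution */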
+ i915_perf_command_stream_hook(params->request);
+
trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
i915_gem_execbuffer_move_to_active(vmas, params->request);
@@ -81,6 +81,13 @@ static u32 i915_perf_stream_paranoid = true;
#define GEN8_OAREPORT_REASON_GO_TRANSITION (1<<23)
#define GEN9_OAREPORT_REASON_CLK_RATIO (1<<24)
+/* Data common to periodic and RCS based samples */
+struct oa_sample_data {
+ u32 source;
+ u32 ctx_id;
+ const u8 *report;
+};
+
/* for sysctl proc_dointvec_minmax of i915_oa_min_timer_exponent */
static int zero;
static int oa_exponent_max = OA_EXPONENT_MAX;
@@ -120,8 +127,19 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
[I915_OA_FORMAT_C4_B8] = { 7, 64 },
};
+/* Duplicated from the similar static table in i915_gem_execbuffer.c */
+#define I915_USER_RINGS (4)
+static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
+ [I915_EXEC_DEFAULT] = RCS,
+ [I915_EXEC_RENDER] = RCS,
+ [I915_EXEC_BLT] = BCS,
+ [I915_EXEC_BSD] = VCS,
+ [I915_EXEC_VEBOX] = VECS
+};
+
#define SAMPLE_OA_REPORT (1<<0)
#define SAMPLE_OA_SOURCE_INFO (1<<1)
+#define SAMPLE_CTX_ID (1<<2)
struct perf_open_properties {
u32 sample_flags;
@@ -134,8 +152,231 @@ struct perf_open_properties {
int oa_format;
bool oa_periodic;
int oa_period_exponent;
+
+ /* Command stream mode */
+ bool cs_mode;
+ enum intel_engine_id engine;
};
+/*
+ * Emit the commands to capture metrics into the command stream. This function
+ * can be called concurrently with the stream operations and doesn't require
+ * the perf mutex to be held.
+ */
+void i915_perf_command_stream_hook(struct drm_i915_gem_request *request)
+{
+ struct intel_engine_cs *engine = request->engine;
+ struct drm_i915_private *dev_priv = engine->dev->dev_private;
+ struct i915_perf_stream *stream;
+
+ if (!dev_priv->perf.initialized)
+ return;
+
+ mutex_lock(&dev_priv->perf.streams_lock);
+ list_for_each_entry(stream, &dev_priv->perf.streams, link) {
+ if ((stream->state == I915_PERF_STREAM_ENABLED) &&
+ stream->cs_mode)
+ stream->ops->command_stream_hook(request);
+ }
+ mutex_unlock(&dev_priv->perf.streams_lock);
+}
+
+/*
+ * Release some perf entries to make space for a new entry's data. The
+ * reference on the associated request is dropped before the entry is
+ * deleted. There is no need to check for GPU completion of these commands,
+ * since the entries are going to be replaced by a new entry anyway, and the
+ * GPU will eventually overwrite the buffer contents when the request
+ * associated with the new entry completes.
+ */
+static void release_some_perf_entries(struct drm_i915_private *dev_priv,
+ u32 target_size)
+{
+ struct i915_perf_cs_data_node *entry, *next;
+ u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size;
+ u32 size = 0;
+
+ list_for_each_entry_safe
+ (entry, next, &dev_priv->perf.node_list, link) {
+
+ size += entry_size;
+ i915_gem_request_unreference(entry->request);
+ list_del(&entry->link);
+ kfree(entry);
+
+ if (size >= target_size)
+ break;
+ }
+}
+
+/*
+ * Insert the perf entry at the end of the list. This function never fails:
+ * if buffer space is exhausted, the oldest entries are released in order to
+ * make room for the new one.
+ */
+static void insert_perf_entry(struct drm_i915_private *dev_priv,
+ struct i915_perf_cs_data_node *entry)
+{
+ struct i915_perf_cs_data_node *first_entry, *last_entry;
+ int max_offset = dev_priv->perf.command_stream_buf.obj->base.size;
+ u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size;
+
+ spin_lock(&dev_priv->perf.node_list_lock);
+ if (list_empty(&dev_priv->perf.node_list)) {
+ entry->offset = 0;
+ list_add_tail(&entry->link, &dev_priv->perf.node_list);
+ spin_unlock(&dev_priv->perf.node_list_lock);
+ return;
+ }
+
+ first_entry = list_first_entry(&dev_priv->perf.node_list,
+ typeof(*first_entry), link);
+ last_entry = list_last_entry(&dev_priv->perf.node_list,
+ typeof(*last_entry), link);
+
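+	/*
+	 * Two layouts are possible: either the entries sit in buffer order
+	 * (first_entry->offset <= last_entry->offset), with free space at the
+	 * end and/or the start of the buffer, or they have already wrapped
+	 * around, leaving only the gap between last_entry and first_entry.
+	 */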
+ if (last_entry->offset >= first_entry->offset) {
+ /* Sufficient space available at the end of buffer? */
+ if (last_entry->offset + 2*entry_size < max_offset)
+ entry->offset = last_entry->offset + entry_size;
+ /*
+ * Wraparound condition. Is sufficient space available at
+ * beginning of buffer?
+ */
+ else if (entry_size < first_entry->offset)
+ entry->offset = 0;
+ /* Insufficient space. Overwrite existing old entries */
+ else {
+ u32 target_size = entry_size - first_entry->offset;
+
+ release_some_perf_entries(dev_priv, target_size);
+ entry->offset = 0;
+ }
+ } else {
+ /* Sufficient space available? */
+ if (last_entry->offset + 2*entry_size < first_entry->offset)
+ entry->offset = last_entry->offset + entry_size;
+ /* Insufficient space. Overwrite existing old entries */
+ else {
+ u32 target_size = entry_size -
+ (first_entry->offset - last_entry->offset -
+ entry_size);
+
+ release_some_perf_entries(dev_priv, target_size);
+ entry->offset = last_entry->offset + entry_size;
+ }
+ }
+ list_add_tail(&entry->link, &dev_priv->perf.node_list);
+ spin_unlock(&dev_priv->perf.node_list_lock);
+}
+
+static void i915_perf_command_stream_hook_oa(struct drm_i915_gem_request *req)
+{
+ struct intel_engine_cs *engine = req->engine;
+ struct intel_ringbuffer *ringbuf = req->ringbuf;
+ struct intel_context *ctx = req->ctx;
+ struct drm_i915_private *dev_priv = engine->dev->dev_private;
+ struct i915_perf_cs_data_node *entry;
+ u32 addr = 0;
+ int ret;
+
+ /* OA counters are only supported on the render engine */
+ BUG_ON(engine->id != RCS);
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (entry == NULL) {
+ DRM_ERROR("alloc failed\n");
+ return;
+ }
+
+ ret = intel_ring_begin(req, 4);
+ if (ret) {
+ kfree(entry);
+ return;
+ }
+
+ entry->ctx_id = ctx->hw_id;
+ i915_gem_request_assign(&entry->request, req);
+
+ insert_perf_entry(dev_priv, entry);
+
+ addr = dev_priv->perf.command_stream_buf.vma->node.start +
+ entry->offset;
+
+ /* addr should be 64 byte aligned */
+ BUG_ON(addr & 0x3f);
+
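+	/*
+	 * Emit MI_REPORT_PERF_COUNT so that the GPU writes a snapshot of the
+	 * OA counters into command_stream_buf at this entry's offset when it
+	 * reaches this point in the command stream.
+	 */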
+ if (i915.enable_execlists) {
+ intel_logical_ring_emit(ringbuf, MI_REPORT_PERF_COUNT | (2<<0));
+ intel_logical_ring_emit(ringbuf,
+ addr | MI_REPORT_PERF_COUNT_GGTT);
+ intel_logical_ring_emit(ringbuf, 0);
+ intel_logical_ring_emit(ringbuf,
+ i915_gem_request_get_seqno(req));
+ intel_logical_ring_advance(ringbuf);
+ } else {
+ if (INTEL_INFO(engine->dev)->gen >= 8) {
+ intel_ring_emit(engine, MI_REPORT_PERF_COUNT | (2<<0));
+ intel_ring_emit(engine, addr | MI_REPORT_PERF_COUNT_GGTT);
+ intel_ring_emit(engine, 0);
+ intel_ring_emit(engine,
+ i915_gem_request_get_seqno(req));
+ } else {
+ intel_ring_emit(engine, MI_REPORT_PERF_COUNT | (1<<0));
+ intel_ring_emit(engine, addr | MI_REPORT_PERF_COUNT_GGTT);
+ intel_ring_emit(engine, i915_gem_request_get_seqno(req));
+ intel_ring_emit(engine, MI_NOOP);
+ }
+ intel_ring_advance(engine);
+ }
+ i915_vma_move_to_active(dev_priv->perf.command_stream_buf.vma, req);
+}
+
+static int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
+{
+ struct i915_perf_cs_data_node *last_entry = NULL;
+ struct drm_i915_gem_request *req = NULL;
+ int ret;
+
+	/*
+	 * Wait for the last scheduled request to complete. This implicitly
+	 * waits for the requests submitted before it as well. The refcount
+	 * of the requests is not dropped here.
+	 */
+ spin_lock(&dev_priv->perf.node_list_lock);
+
+ if (!list_empty(&dev_priv->perf.node_list)) {
+ last_entry = list_last_entry(&dev_priv->perf.node_list,
+ struct i915_perf_cs_data_node, link);
+ req = last_entry->request;
+ }
+ spin_unlock(&dev_priv->perf.node_list_lock);
+
+ if (!req)
+ return 0;
+
+ ret = __i915_wait_request(req, true, NULL, NULL);
+ if (ret) {
+ DRM_ERROR("Failed to wait for request\n");
+ return ret;
+ }
+ return 0;
+}
+
+static void i915_oa_rcs_free_requests(struct drm_i915_private *dev_priv)
+{
+ struct i915_perf_cs_data_node *entry, *next;
+
+ list_for_each_entry_safe
+ (entry, next, &dev_priv->perf.node_list, link) {
+ i915_gem_request_unreference(entry->request);
+
+ spin_lock(&dev_priv->perf.node_list_lock);
+ list_del(&entry->link);
+ spin_unlock(&dev_priv->perf.node_list_lock);
+ kfree(entry);
+ }
+}
+
/* NB: This is either called via fops or the poll check hrtimer (atomic ctx)
*
* It's safe to read OA config state here unlocked, assuming that this is only
@@ -205,7 +446,7 @@ static int append_oa_status(struct i915_perf_stream *stream,
*/
static int append_oa_sample(struct i915_perf_stream *stream,
struct i915_perf_read_state *read_state,
- const u8 *report)
+ struct oa_sample_data *data)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -225,6 +466,38 @@ static int append_oa_sample(struct i915_perf_stream *stream,
buf += sizeof(header);
if (sample_flags & SAMPLE_OA_SOURCE_INFO) {
+ if (copy_to_user(buf, &data->source, 4))
+ return -EFAULT;
+ buf += 4;
+ }
+
+ if (sample_flags & SAMPLE_CTX_ID) {
+ if (copy_to_user(buf, &data->ctx_id, 4))
+ return -EFAULT;
+ buf += 4;
+ }
+
+ if (sample_flags & SAMPLE_OA_REPORT) {
+ if (copy_to_user(buf, data->report, report_size))
+ return -EFAULT;
+ buf += report_size;
+ }
+
+ read_state->buf = buf;
+ read_state->read += header.size;
+
+ return 0;
+}
+
+static int append_oa_buffer_sample(struct i915_perf_stream *stream,
+ struct i915_perf_read_state *read_state,
+ const u8 *report)
+{
+ struct drm_i915_private *dev_priv = stream->dev_priv;
+ u32 sample_flags = stream->sample_flags;
+ struct oa_sample_data data = { 0 };
+
+ if (sample_flags & SAMPLE_OA_SOURCE_INFO) {
enum drm_i915_perf_oa_event_source source;
if (INTEL_INFO(dev_priv)->gen >= 8) {
@@ -240,20 +513,16 @@ static int append_oa_sample(struct i915_perf_stream *stream,
} else
source = I915_PERF_OA_EVENT_SOURCE_PERIODIC;
- if (copy_to_user(buf, &source, 4))
- return -EFAULT;
- buf += 4;
- }
-
- if (sample_flags & SAMPLE_OA_REPORT) {
- if (copy_to_user(buf, report, report_size))
- return -EFAULT;
+ data.source = source;
}
+#warning "FIXME: append_oa_buffer_sample: read ctx ID from report and map that to an intel_context::global_id"
+ if (sample_flags & SAMPLE_CTX_ID)
+ data.ctx_id = 0;
- read_state->buf += header.size;
- read_state->read += header.size;
+ if (sample_flags & SAMPLE_OA_REPORT)
+ data.report = report;
- return 0;
+ return append_oa_sample(stream, read_state, &data);
}
/**
@@ -273,7 +542,7 @@ static int append_oa_sample(struct i915_perf_stream *stream,
static int gen8_append_oa_reports(struct i915_perf_stream *stream,
struct i915_perf_read_state *read_state,
u32 *head_ptr,
- u32 tail)
+ u32 tail, u32 ts)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -283,7 +552,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
u32 taken;
int ret = 0;
- BUG_ON(!stream->enabled);
+ BUG_ON(stream->state != I915_PERF_STREAM_ENABLED);
head = *head_ptr - dev_priv->perf.oa.oa_buffer.gtt_offset;
tail -= dev_priv->perf.oa.oa_buffer.gtt_offset;
@@ -313,6 +582,11 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
u8 *report = oa_buf_base + head;
u32 *report32 = (void *)report;
u32 ctx_id = report32[2];
+ u32 report_ts = report32[1];
+
+		/* Report timestamp should not exceed the passed-in ts */
+ if (report_ts > ts)
+ break;
/* All the report sizes factor neatly into the buffer
* size so we never expect to see a report split
@@ -364,7 +638,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
dev_priv->perf.oa.specific_ctx_id != ctx_id)
report32[2] = 0x1fffff;
- ret = append_oa_sample(stream, read_state, report);
+ ret = append_oa_buffer_sample(stream, read_state,
+ report);
if (ret)
break;
@@ -386,7 +661,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
* updated @read_state.
*/
static int gen8_oa_read(struct i915_perf_stream *stream,
- struct i915_perf_read_state *read_state)
+ struct i915_perf_read_state *read_state, u32 ts)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -428,7 +703,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
/* If there is still buffer space */
- ret = gen8_append_oa_reports(stream, read_state, &head, tail);
+ ret = gen8_append_oa_reports(stream, read_state, &head, tail, ts);
/* All the report sizes are a power of two and the
* head should always be incremented by some multiple
@@ -467,7 +742,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
static int gen7_append_oa_reports(struct i915_perf_stream *stream,
struct i915_perf_read_state *read_state,
u32 *head_ptr,
- u32 tail)
+ u32 tail, u32 ts)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -478,7 +753,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
u32 taken;
int ret = 0;
- BUG_ON(!stream->enabled);
+ BUG_ON(stream->state != I915_PERF_STREAM_ENABLED);
head = *head_ptr - dev_priv->perf.oa.oa_buffer.gtt_offset;
tail -= dev_priv->perf.oa.oa_buffer.gtt_offset;
@@ -542,7 +817,11 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
continue;
}
- ret = append_oa_sample(stream, read_state, report);
+		/* Report timestamp should not exceed the passed-in ts */
+ if (report32[1] > ts)
+ break;
+
+ ret = append_oa_buffer_sample(stream, read_state, report);
if (ret)
break;
@@ -569,7 +848,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
* updated @read_state.
*/
static int gen7_oa_read(struct i915_perf_stream *stream,
- struct i915_perf_read_state *read_state)
+ struct i915_perf_read_state *read_state, u32 ts)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -641,7 +920,7 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
GEN7_OASTATUS1_REPORT_LOST;
}
- ret = gen7_append_oa_reports(stream, read_state, &head, tail);
+ ret = gen7_append_oa_reports(stream, read_state, &head, tail, ts);
/* All the report sizes are a power of two and the
* head should always be incremented by some multiple
@@ -665,20 +944,131 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
return ret;
}
-static bool i915_oa_can_read(struct i915_perf_stream *stream)
+/**
+ * Copies a command stream OA report into the userspace read() buffer, after
+ * first forwarding any periodic OA reports with a timestamp lower than that
+ * of the CS report.
+ *
+ * NB: some data may be successfully copied to the userspace buffer
+ * even if an error is returned, and this is reflected in the
+ * updated @read_state.
+ */
+static int append_oa_rcs_sample(struct i915_perf_stream *stream,
+ struct i915_perf_read_state *read_state,
+ struct i915_perf_cs_data_node *node)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
+ struct oa_sample_data data = { 0 };
+ const u8 *report = dev_priv->perf.command_stream_buf.addr +
+ node->offset;
+ u32 sample_flags = stream->sample_flags;
+ u32 report_ts;
+ int ret;
- return !dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv);
+ /* First, append the periodic OA samples having lower timestamps */
+ report_ts = *(u32 *)(report + 4);
+ ret = dev_priv->perf.oa.ops.read(stream, read_state, report_ts);
+ if (ret)
+ return ret;
+
+ if (sample_flags & SAMPLE_OA_SOURCE_INFO)
+ data.source = I915_PERF_OA_EVENT_SOURCE_RCS;
+
+ if (sample_flags & SAMPLE_CTX_ID)
+ data.ctx_id = node->ctx_id;
+
+ if (sample_flags & SAMPLE_OA_REPORT)
+ data.report = report;
+
+ return append_oa_sample(stream, read_state, &data);
}
-static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
+/**
+ * Copies all completed command stream based OA reports into the userspace
+ * read() buffer, interleaving the older periodic OA reports ahead of each
+ * CS report so the output stays in timestamp order.
+ *
+ * NB: some data may be successfully copied to the userspace buffer
+ * even if an error is returned, and this is reflected in the
+ * updated @read_state.
+ */
+static int oa_rcs_append_reports(struct i915_perf_stream *stream,
+ struct i915_perf_read_state *read_state)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
+ struct i915_perf_cs_data_node *entry, *next;
+ LIST_HEAD(free_list);
+ int ret = 0;
- /* We would wait indefinitly if periodic sampling is not enabled */
- if (!dev_priv->perf.oa.periodic)
- return -EIO;
+ spin_lock(&dev_priv->perf.node_list_lock);
+ if (list_empty(&dev_priv->perf.node_list)) {
+ spin_unlock(&dev_priv->perf.node_list_lock);
+ return 0;
+ }
+ list_for_each_entry_safe(entry, next,
+ &dev_priv->perf.node_list, link) {
+ if (!i915_gem_request_completed(entry->request, true))
+ break;
+ list_move_tail(&entry->link, &free_list);
+ }
+ spin_unlock(&dev_priv->perf.node_list_lock);
+
+ if (list_empty(&free_list))
+ return 0;
+
+ list_for_each_entry_safe(entry, next, &free_list, link) {
+ ret = append_oa_rcs_sample(stream, read_state, entry);
+ if (ret)
+ break;
+
+ list_del(&entry->link);
+ i915_gem_request_unreference(entry->request);
+ kfree(entry);
+ }
+
+ /* Don't discard remaining entries, keep them for next read */
+ spin_lock(&dev_priv->perf.node_list_lock);
+ list_splice(&free_list, &dev_priv->perf.node_list);
+ spin_unlock(&dev_priv->perf.node_list_lock);
+
+ return ret;
+}
+
+/*
+ * Checks whether the command stream buffer associated with the stream has
+ * data ready to be forwarded to userspace.
+ * Returns false if at least one request associated with the command stream
+ * has completed (i.e. there is data ready to forward), true otherwise.
+ */
+static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
+{
+ struct drm_i915_private *dev_priv = stream->dev_priv;
+ struct i915_perf_cs_data_node *entry = NULL;
+ struct drm_i915_gem_request *request = NULL;
+
+ spin_lock(&dev_priv->perf.node_list_lock);
+ entry = list_first_entry_or_null(&dev_priv->perf.node_list,
+ struct i915_perf_cs_data_node, link);
+ if (entry)
+ request = entry->request;
+ spin_unlock(&dev_priv->perf.node_list_lock);
+
+ if (!entry)
+ return true;
+ else if (!i915_gem_request_completed(request, true))
+ return true;
+ else
+ return false;
+}
+
+/*
+ * Checks whether the stream has data ready to forward to userspace.
+ * For command stream based streams, this returns false unless the command
+ * stream buffer has at least one sample ready, irrespective of whether the
+ * periodic OA buffer has data or not.
+ */
+static bool stream_have_data__unlocked(struct i915_perf_stream *stream)
+{
+ struct drm_i915_private *dev_priv = stream->dev_priv;
/* Note: the oa_buffer_is_empty() condition is ok to run unlocked as it
* just performs mmio reads of the OA buffer head + tail pointers and
@@ -686,8 +1076,35 @@ static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
* can't be destroyed until completion (such as a read()) that ensures
* the device + OA buffer can't disappear
*/
+ if (stream->cs_mode)
+ return !command_stream_buf_is_empty(stream);
+ else
+ return !dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv);
+}
+
+static bool i915_oa_can_read(struct i915_perf_stream *stream)
+{
+ return stream_have_data__unlocked(stream);
+}
+
+static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
+{
+ struct drm_i915_private *dev_priv = stream->dev_priv;
+ int ret;
+
+	/* We would wait indefinitely if periodic sampling is not enabled */
+ if (!dev_priv->perf.oa.periodic)
+ return -EIO;
+
+ if (stream->cs_mode) {
+ ret = i915_oa_rcs_wait_gpu(dev_priv);
+ if (ret)
+ return ret;
+ }
+
return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
- !dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv));
+ stream_have_data__unlocked(stream));
}
static void i915_oa_poll_wait(struct i915_perf_stream *stream,
@@ -704,7 +1121,27 @@ static int i915_oa_read(struct i915_perf_stream *stream,
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- return dev_priv->perf.oa.ops.read(stream, read_state);
+ if (stream->cs_mode)
+ return oa_rcs_append_reports(stream, read_state);
+ else
+ return dev_priv->perf.oa.ops.read(stream, read_state, U32_MAX);
+}
+
+static void
+free_command_stream_buf(struct drm_i915_private *dev_priv)
+{
+ mutex_lock(&dev_priv->dev->struct_mutex);
+
+ i915_gem_object_unpin_map(dev_priv->perf.command_stream_buf.obj);
+ i915_gem_object_ggtt_unpin(dev_priv->perf.command_stream_buf.obj);
+ drm_gem_object_unreference(
+ &dev_priv->perf.command_stream_buf.obj->base);
+
+ dev_priv->perf.command_stream_buf.obj = NULL;
+ dev_priv->perf.command_stream_buf.vma = NULL;
+ dev_priv->perf.command_stream_buf.addr = NULL;
+
+ mutex_unlock(&dev_priv->dev->struct_mutex);
}
static void
@@ -729,12 +1166,17 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
- dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
+ if (stream->cs_mode)
+ free_command_stream_buf(dev_priv);
- free_oa_buffer(dev_priv);
+ if (dev_priv->perf.oa.oa_buffer.obj) {
+ dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
- intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
- intel_runtime_pm_put(dev_priv);
+ free_oa_buffer(dev_priv);
+
+ intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+ intel_runtime_pm_put(dev_priv);
+ }
dev_priv->perf.oa.exclusive_stream = NULL;
}
@@ -792,16 +1234,17 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
GEN8_OATAILPTR_MASK));
}
-static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
+static int alloc_obj(struct drm_i915_private *dev_priv,
+ struct drm_i915_gem_object **obj, u8 **addr)
{
struct drm_i915_gem_object *bo;
int ret;
- BUG_ON(dev_priv->perf.oa.oa_buffer.obj);
+ intel_runtime_pm_get(dev_priv);
ret = i915_mutex_lock_interruptible(dev_priv->dev);
if (ret)
- return ret;
+ goto out;
bo = i915_gem_object_create(dev_priv->dev, OA_BUFFER_SIZE);
if (IS_ERR(bo)) {
@@ -809,7 +1252,6 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
ret = PTR_ERR(bo);
goto unlock;
}
- dev_priv->perf.oa.oa_buffer.obj = bo;
ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
if (ret)
@@ -820,17 +1262,13 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
if (ret)
goto err_unref;
- dev_priv->perf.oa.oa_buffer.gtt_offset = i915_gem_obj_ggtt_offset(bo);
- dev_priv->perf.oa.oa_buffer.addr = i915_gem_object_pin_map(bo);
- if (dev_priv->perf.oa.oa_buffer.addr == NULL)
+ *addr = i915_gem_object_pin_map(bo);
+ if (*addr == NULL) {
+ ret = -ENOMEM;
goto err_unpin;
+ }
- dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
-
- DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p",
- dev_priv->perf.oa.oa_buffer.gtt_offset,
- dev_priv->perf.oa.oa_buffer.addr);
-
+ *obj = bo;
goto unlock;
err_unpin:
@@ -841,9 +1279,63 @@ err_unref:
unlock:
mutex_unlock(&dev_priv->dev->struct_mutex);
+
+out:
+ intel_runtime_pm_put(dev_priv);
return ret;
}
+static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
+{
+ struct drm_i915_gem_object *bo;
+ u8 *obj_addr;
+ int ret;
+
+ BUG_ON(dev_priv->perf.oa.oa_buffer.obj);
+
+ ret = alloc_obj(dev_priv, &bo, &obj_addr);
+ if (ret)
+ return ret;
+
+ dev_priv->perf.oa.oa_buffer.obj = bo;
+ dev_priv->perf.oa.oa_buffer.addr = obj_addr;
+ dev_priv->perf.oa.oa_buffer.gtt_offset = i915_gem_obj_ggtt_offset(bo);
+
+ dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
+
+ DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p",
+ dev_priv->perf.oa.oa_buffer.gtt_offset,
+ dev_priv->perf.oa.oa_buffer.addr);
+ return 0;
+}
+
+static int alloc_command_stream_buf(struct drm_i915_private *dev_priv)
+{
+ struct drm_i915_gem_object *bo;
+ u8 *obj_addr;
+ int ret;
+
+ BUG_ON(dev_priv->perf.command_stream_buf.obj);
+
+ ret = alloc_obj(dev_priv, &bo, &obj_addr);
+ if (ret)
+ return ret;
+
+ dev_priv->perf.command_stream_buf.obj = bo;
+ dev_priv->perf.command_stream_buf.addr = obj_addr;
+ dev_priv->perf.command_stream_buf.vma = i915_gem_obj_to_ggtt(bo);
+ if (WARN_ON(!list_empty(&dev_priv->perf.node_list)))
+ INIT_LIST_HEAD(&dev_priv->perf.node_list);
+
+ DRM_DEBUG_DRIVER(
+ "command stream buf initialized, gtt offset = 0x%x, vaddr = %p",
+ (unsigned int)
+ dev_priv->perf.command_stream_buf.vma->node.start,
+ dev_priv->perf.command_stream_buf.addr);
+
+ return 0;
+}
+
static void config_oa_regs(struct drm_i915_private *dev_priv,
const struct i915_oa_reg *regs,
int n_regs)
@@ -1087,7 +1579,8 @@ static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
{
assert_spin_locked(&dev_priv->perf.hook_lock);
- if (dev_priv->perf.oa.exclusive_stream->enabled) {
+ if (dev_priv->perf.oa.exclusive_stream->state !=
+ I915_PERF_STREAM_DISABLED) {
unsigned long ctx_id = 0;
if (dev_priv->perf.oa.exclusive_stream->ctx)
@@ -1169,10 +1662,15 @@ static void i915_oa_stream_disable(struct i915_perf_stream *stream)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- dev_priv->perf.oa.ops.oa_disable(dev_priv);
-
if (dev_priv->perf.oa.periodic)
hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
+
+ if (stream->cs_mode) {
+ i915_oa_rcs_wait_gpu(dev_priv);
+ i915_oa_rcs_free_requests(dev_priv);
+ }
+
+ dev_priv->perf.oa.ops.oa_disable(dev_priv);
}
static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
@@ -1189,6 +1687,7 @@ static const struct i915_perf_stream_ops i915_oa_stream_ops = {
.wait_unlocked = i915_oa_wait_unlocked,
.poll_wait = i915_oa_poll_wait,
.read = i915_oa_read,
+ .command_stream_hook = i915_perf_command_stream_hook_oa,
};
static int i915_oa_stream_init(struct i915_perf_stream *stream,
@@ -1196,14 +1695,11 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
struct perf_open_properties *props)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- int format_size;
+ bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT |
+ SAMPLE_OA_SOURCE_INFO);
+ bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT;
int ret;
- if (!dev_priv->perf.oa.ops.init_oa_buffer) {
- DRM_ERROR("OA unit not supported\n");
- return -ENODEV;
- }
-
/* To avoid the complexity of having to accurately filter
* counter reports and marshal to the appropriate client
* we currently only allow exclusive access
@@ -1213,97 +1709,166 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
return -EBUSY;
}
- if (!props->metrics_set) {
- DRM_ERROR("OA metric set not specified\n");
- return -EINVAL;
- }
-
- if (!props->oa_format) {
- DRM_ERROR("OA report format not specified\n");
- return -EINVAL;
+ if ((props->sample_flags & SAMPLE_CTX_ID) && !props->cs_mode) {
+ if (IS_HASWELL(dev_priv->dev)) {
+ DRM_ERROR(
+ "On HSW, context ID sampling only supported via command stream");
+ return -EINVAL;
+ } else if (!i915.enable_execlists) {
+ DRM_ERROR(
+ "On Gen8+ without execlists, context ID sampling only supported via command stream");
+ return -EINVAL;
+ }
}
stream->sample_size = sizeof(struct drm_i915_perf_record_header);
- format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size;
+ if (require_oa_unit) {
+ int format_size;
- if (props->sample_flags & SAMPLE_OA_REPORT) {
- stream->sample_flags |= SAMPLE_OA_REPORT;
- stream->sample_size += format_size;
- }
+ if (!dev_priv->perf.oa.ops.init_oa_buffer) {
+ DRM_ERROR("OA unit not supported\n");
+ return -ENODEV;
+ }
+
+ if (!props->metrics_set) {
+ DRM_ERROR("OA metric set not specified\n");
+ return -EINVAL;
+ }
+
+ if (!props->oa_format) {
+ DRM_ERROR("OA report format not specified\n");
+ return -EINVAL;
+ }
- if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) {
- if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
+		if (props->cs_mode && (props->engine != RCS)) {
DRM_ERROR(
- "OA source type can't be sampled without OA report");
+ "Command stream OA metrics only available via Render CS\n");
return -EINVAL;
}
- stream->sample_flags |= SAMPLE_OA_SOURCE_INFO;
- stream->sample_size += 4;
- }
+		stream->engine = RCS;
+
+ format_size =
+ dev_priv->perf.oa.oa_formats[props->oa_format].size;
+
+ if (props->sample_flags & SAMPLE_OA_REPORT) {
+ stream->sample_flags |= SAMPLE_OA_REPORT;
+ stream->sample_size += format_size;
+ }
+
+ if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) {
+ if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
+ DRM_ERROR(
+ "OA source type can't be sampled without OA report");
+ return -EINVAL;
+ }
+ stream->sample_flags |= SAMPLE_OA_SOURCE_INFO;
+ stream->sample_size += 4;
+ }
+
+ dev_priv->perf.oa.oa_buffer.format_size = format_size;
+ BUG_ON(dev_priv->perf.oa.oa_buffer.format_size == 0);
+
+ dev_priv->perf.oa.oa_buffer.format =
+ dev_priv->perf.oa.oa_formats[props->oa_format].format;
- dev_priv->perf.oa.oa_buffer.format_size = format_size;
- BUG_ON(dev_priv->perf.oa.oa_buffer.format_size == 0);
+ dev_priv->perf.oa.metrics_set = props->metrics_set;
- dev_priv->perf.oa.oa_buffer.format =
- dev_priv->perf.oa.oa_formats[props->oa_format].format;
+ dev_priv->perf.oa.periodic = props->oa_periodic;
+ if (dev_priv->perf.oa.periodic) {
+ u64 period_ns = oa_exponent_to_ns(dev_priv,
+ props->oa_period_exponent);
- dev_priv->perf.oa.metrics_set = props->metrics_set;
+ dev_priv->perf.oa.period_exponent =
+ props->oa_period_exponent;
- dev_priv->perf.oa.periodic = props->oa_periodic;
- if (dev_priv->perf.oa.periodic) {
- u64 period_ns = oa_exponent_to_ns(dev_priv,
- props->oa_period_exponent);
+ /* See comment for OA_TAIL_MARGIN_NSEC for details
+ * about this tail_margin...
+ */
+ dev_priv->perf.oa.tail_margin =
+ ((OA_TAIL_MARGIN_NSEC / period_ns) + 1) *
+ format_size;
+ }
+
+ if (i915.enable_execlists && stream->ctx)
+ dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
- dev_priv->perf.oa.period_exponent = props->oa_period_exponent;
+ ret = alloc_oa_buffer(dev_priv);
+ if (ret)
+ return ret;
- /* See comment for OA_TAIL_MARGIN_NSEC for details
- * about this tail_margin...
+ /* PRM - observability performance counters:
+ *
+ * OACONTROL, performance counter enable, note:
+ *
+ * "When this bit is set, in order to have coherent counts,
+ * RC6 power state and trunk clock gating must be disabled.
+ * This can be achieved by programming MMIO registers as
+ * 0xA094=0 and 0xA090[31]=1"
+ *
+ * In our case we are expecting that taking pm + FORCEWAKE
+ * references will effectively disable RC6.
*/
- dev_priv->perf.oa.tail_margin =
- ((OA_TAIL_MARGIN_NSEC / period_ns) + 1) * format_size;
+ intel_runtime_pm_get(dev_priv);
+ intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+ ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
+ if (ret) {
+ intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+ intel_runtime_pm_put(dev_priv);
+ free_oa_buffer(dev_priv);
+ return ret;
+ }
+
+ /* On Haswell we have to track which OASTATUS1 flags we've already
+ * seen since they can't be cleared while periodic sampling is enabled.
+ */
+ dev_priv->perf.oa.gen7_latched_oastatus1 = 0;
}
- if (i915.enable_execlists && stream->ctx)
- dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
+ if (props->sample_flags & SAMPLE_CTX_ID) {
+ stream->sample_flags |= SAMPLE_CTX_ID;
+ stream->sample_size += 4;
+ }
- ret = alloc_oa_buffer(dev_priv);
- if (ret)
- return ret;
+ if (props->cs_mode) {
+ if (!cs_sample_data) {
+ DRM_ERROR(
+ "Ring given without requesting any CS data to sample");
+ ret = -EINVAL;
+ goto cs_error;
+ }
- /* PRM - observability performance counters:
- *
- * OACONTROL, performance counter enable, note:
- *
- * "When this bit is set, in order to have coherent counts,
- * RC6 power state and trunk clock gating must be disabled.
- * This can be achieved by programming MMIO registers as
- * 0xA094=0 and 0xA090[31]=1"
- *
- * In our case we are expecting that taking pm + FORCEWAKE
- * references will effectively disable RC6.
- */
- intel_runtime_pm_get(dev_priv);
- intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+ if (!(props->sample_flags & SAMPLE_CTX_ID)) {
+ DRM_ERROR(
+ "Ring given without requesting any CS specific property");
+ ret = -EINVAL;
+ goto cs_error;
+ }
- ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
- if (ret) {
- intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
- intel_runtime_pm_put(dev_priv);
- free_oa_buffer(dev_priv);
- return ret;
+ stream->cs_mode = true;
+
+ ret = alloc_command_stream_buf(dev_priv);
+ if (ret)
+ goto cs_error;
}
stream->ops = &i915_oa_stream_ops;
- /* On Haswell we have to track which OASTATUS1 flags we've already
- * seen since they can't be cleared while periodic sampling is enabled.
- */
- dev_priv->perf.oa.gen7_latched_oastatus1 = 0;
-
dev_priv->perf.oa.exclusive_stream = stream;
return 0;
+
+cs_error:
+ if (require_oa_unit) {
+ dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
+
+ intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+ intel_runtime_pm_put(dev_priv);
+
+ free_oa_buffer(dev_priv);
+ }
+ return ret;
}
static void gen7_update_hw_ctx_id_locked(struct drm_i915_private *dev_priv,
@@ -1395,7 +1960,8 @@ void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req)
return;
if (dev_priv->perf.oa.exclusive_stream &&
- dev_priv->perf.oa.exclusive_stream->enabled) {
+ dev_priv->perf.oa.exclusive_stream->state !=
+ I915_PERF_STREAM_DISABLED) {
/* XXX: We don't take a lock here and this may run
* async with respect to stream methods. Notably we
@@ -1505,7 +2071,7 @@ static ssize_t i915_perf_read(struct file *file,
* disabled stream as an error. In particular it might otherwise lead
* to a deadlock for blocking file descriptors...
*/
- if (!stream->enabled)
+ if (stream->state == I915_PERF_STREAM_DISABLED)
return -EIO;
if (!(file->f_flags & O_NONBLOCK)) {
@@ -1537,12 +2103,21 @@ static ssize_t i915_perf_read(struct file *file,
static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
{
+ struct i915_perf_stream *stream;
+
struct drm_i915_private *dev_priv =
container_of(hrtimer, typeof(*dev_priv),
perf.oa.poll_check_timer);
- if (!dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv))
- wake_up(&dev_priv->perf.oa.poll_wq);
+ /* No need to protect the streams list here, since the hrtimer is
+	 * disabled before the stream is removed from the list, and currently a
+ * single exclusive_stream is supported.
+ * XXX: revisit this when multiple concurrent streams are supported.
+ */
+ list_for_each_entry(stream, &dev_priv->perf.streams, link) {
+ if (stream_have_data__unlocked(stream))
+ wake_up(&dev_priv->perf.oa.poll_wq);
+ }
hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
@@ -1578,23 +2153,23 @@ static unsigned int i915_perf_poll(struct file *file, poll_table *wait)
static void i915_perf_enable_locked(struct i915_perf_stream *stream)
{
- if (stream->enabled)
+ if (stream->state != I915_PERF_STREAM_DISABLED)
return;
- /* Allow stream->ops->enable() to refer to this */
- stream->enabled = true;
+ stream->state = I915_PERF_STREAM_ENABLE_IN_PROGRESS;
if (stream->ops->enable)
stream->ops->enable(stream);
+
+ stream->state = I915_PERF_STREAM_ENABLED;
}
static void i915_perf_disable_locked(struct i915_perf_stream *stream)
{
- if (!stream->enabled)
+ if (stream->state != I915_PERF_STREAM_ENABLED)
return;
- /* Allow stream->ops->disable() to refer to this */
- stream->enabled = false;
+ stream->state = I915_PERF_STREAM_DISABLED;
if (stream->ops->disable)
stream->ops->disable(stream);
@@ -1635,14 +2210,16 @@ static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- if (stream->enabled)
+ mutex_lock(&dev_priv->perf.streams_lock);
+ list_del(&stream->link);
+ mutex_unlock(&dev_priv->perf.streams_lock);
+
+ if (stream->state == I915_PERF_STREAM_ENABLED)
i915_perf_disable_locked(stream);
if (stream->ops->destroy)
stream->ops->destroy(stream);
- list_del(&stream->link);
-
if (stream->ctx) {
mutex_lock(&dev_priv->dev->struct_mutex);
i915_gem_context_unreference(stream->ctx);
@@ -1753,7 +2330,9 @@ int i915_perf_open_ioctl_locked(struct drm_device *dev,
*/
BUG_ON(stream->sample_flags != props->sample_flags);
+ mutex_lock(&dev_priv->perf.streams_lock);
list_add(&stream->link, &dev_priv->perf.streams);
+ mutex_unlock(&dev_priv->perf.streams_lock);
if (param->flags & I915_PERF_FLAG_FD_CLOEXEC)
f_flags |= O_CLOEXEC;
@@ -1772,7 +2351,9 @@ int i915_perf_open_ioctl_locked(struct drm_device *dev,
return stream_fd;
err_open:
+ mutex_lock(&dev_priv->perf.streams_lock);
list_del(&stream->link);
+ mutex_unlock(&dev_priv->perf.streams_lock);
if (stream->ops->destroy)
stream->ops->destroy(stream);
err_alloc:
@@ -1881,6 +2462,29 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
case DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE:
props->sample_flags |= SAMPLE_OA_SOURCE_INFO;
break;
+ case DRM_I915_PERF_PROP_ENGINE: {
+ unsigned int user_ring_id =
+ value & I915_EXEC_RING_MASK;
+ enum intel_engine_id engine;
+
+ if (user_ring_id > I915_USER_RINGS)
+ return -EINVAL;
+
+ /* XXX: Currently only RCS is supported.
+ * Remove this check when support for other
+ * engines is added
+ */
+ engine = user_ring_map[user_ring_id];
+ if (engine != RCS)
+ return -EINVAL;
+
+ props->cs_mode = true;
+ props->engine = engine;
+ }
+ break;
+ case DRM_I915_PERF_PROP_SAMPLE_CTX_ID:
+ props->sample_flags |= SAMPLE_CTX_ID;
+ break;
case DRM_I915_PERF_PROP_MAX:
BUG();
}
@@ -1988,8 +2592,11 @@ void i915_perf_init(struct drm_device *dev)
init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
INIT_LIST_HEAD(&dev_priv->perf.streams);
+ INIT_LIST_HEAD(&dev_priv->perf.node_list);
mutex_init(&dev_priv->perf.lock);
+ mutex_init(&dev_priv->perf.streams_lock);
spin_lock_init(&dev_priv->perf.hook_lock);
+ spin_lock_init(&dev_priv->perf.node_list_lock);
if (IS_HASWELL(dev)) {
dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
@@ -869,10 +869,14 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
exec_start = params->batch_obj_vm_offset +
args->batch_start_offset;
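+	/* Emit perf capture commands into the ring before the batch (CS mode) */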
+ i915_perf_command_stream_hook(params->request);
+
ret = engine->emit_bb_start(params->request, exec_start, params->dispatch_flags);
if (ret)
return ret;
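+	/* Emit a second capture after the batch, bracketing its execution */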
+ i915_perf_command_stream_hook(params->request);
+
trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
i915_gem_execbuffer_move_to_active(vmas, params->request);
@@ -1196,6 +1196,7 @@ enum drm_i915_perf_oa_event_source {
I915_PERF_OA_EVENT_SOURCE_UNDEFINED,
I915_PERF_OA_EVENT_SOURCE_PERIODIC,
I915_PERF_OA_EVENT_SOURCE_CONTEXT_SWITCH,
+ I915_PERF_OA_EVENT_SOURCE_RCS,
I915_PERF_OA_EVENT_SOURCE_MAX /* non-ABI */
};
@@ -1241,6 +1242,19 @@ enum drm_i915_perf_property_id {
*/
DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE,
+ /**
+ * The value of this property specifies the GPU engine for which
+ * the samples need to be collected. Specifying this property also
+	 * enables command stream based sample collection.
+ */
+ DRM_I915_PERF_PROP_ENGINE,
+
+ /**
+	 * Setting the value of this property to 1 requests inclusion of the
+	 * context ID in the perf sample data.
+ */
+ DRM_I915_PERF_PROP_SAMPLE_CTX_ID,
+
DRM_I915_PERF_PROP_MAX /* non-ABI */
};
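
For illustration only (not part of this patch): a minimal userspace sketch of
opening a command stream based stream on the render engine with context ID
sampling. It assumes the DRM_IOCTL_I915_PERF_OPEN ioctl and the
drm_i915_perf_open_param layout used elsewhere in this series, and elides the
usual OA metrics-set/format/exponent properties.

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <i915_drm.h>

    static int open_rcs_cs_stream(int drm_fd)
    {
            uint64_t properties[] = {
                    /* Collect samples from the render command stream */
                    DRM_I915_PERF_PROP_ENGINE, I915_EXEC_RENDER,
                    /* Include the hw context ID in each sample record */
                    DRM_I915_PERF_PROP_SAMPLE_CTX_ID, 1,
                    /* ...plus the usual OA sample/metrics-set/format props... */
            };
            struct drm_i915_perf_open_param param = {
                    .flags = I915_PERF_FLAG_FD_CLOEXEC,
                    .num_properties = sizeof(properties) / (2 * sizeof(uint64_t)),
                    .properties_ptr = (uintptr_t)properties,
            };

            /* Returns a new stream fd on success, negative error otherwise */
            return ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
    }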
@@ -1306,6 +1320,7 @@ enum drm_i915_perf_record_type {
* struct drm_i915_perf_record_header header;
*
* { u32 source_info; } && DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE
+ * { u32 ctx_id; } && DRM_I915_PERF_PROP_SAMPLE_CTX_ID
* { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA
* };
*/
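
Again purely for illustration, a sketch of how userspace might walk the
records returned by read() on the stream fd when both SAMPLE_OA_SOURCE and
SAMPLE_CTX_ID were requested, following the sample layout documented above
(the DRM_I915_PERF_RECORD_SAMPLE type and drm_i915_perf_record_header are
assumed from the existing i915 perf uAPI):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>
    #include <i915_drm.h>

    static void for_each_sample(const uint8_t *buf, size_t len)
    {
            while (len >= sizeof(struct drm_i915_perf_record_header)) {
                    const struct drm_i915_perf_record_header *header =
                            (const void *)buf;

                    if (header->size < sizeof(*header) || header->size > len)
                            break;

                    if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
                            const uint8_t *p = buf + sizeof(*header);
                            uint32_t source, ctx_id;

                            memcpy(&source, p, 4);     /* OA report source */
                            memcpy(&ctx_id, p + 4, 4); /* hw context ID */
                            /* The raw OA report, if requested, follows at p + 8 */
                    }

                    buf += header->size;
                    len -= header->size;
            }
    }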