@@ -2032,6 +2032,30 @@ struct i915_perf_stream_ops {
* The stream will always be disabled before this is called.
*/
void (*destroy)(struct i915_perf_stream *stream);
+
+ /*
+ * @emit_sample_capture: Emit the commands in the command streamer
+ * for a particular gpu engine.
+ *
+ * The commands are inserted to capture the perf sample data at
+ * specific points during workload execution, such as before and after
+ * the batch buffer.
+ */
+ void (*emit_sample_capture)(struct i915_perf_stream *stream,
+ struct drm_i915_gem_request *request,
+ bool preallocate);
+
+ /*
+	 * @patch_request: Patch the offsets in commands to capture perf
+ * sample data in the ringbuffer corresponding to request.
+ *
+ * To get the perf samples in the order of execution on the engines, we
+ * emit the commands to capture the data while populating the request in
+ * the ringbuffer but update the location and maintain the ordered list
+ * during __i915_gem_request_submit.
+ */
+ void (*patch_request)(struct i915_perf_stream *stream,
+ struct drm_i915_gem_request *request);
};
/**
@@ -2044,11 +2068,6 @@ struct i915_perf_stream {
struct drm_i915_private *dev_priv;
/**
- * @link: Links the stream into ``&drm_i915_private->streams``
- */
- struct list_head link;
-
- /**
* @sample_flags: Flags representing the `DRM_I915_PERF_PROP_SAMPLE_*`
* properties given when opening a stream, representing the contents
* of a single sample as read() by userspace.
@@ -2085,6 +2104,27 @@ struct i915_perf_stream {
* @oa_config: The OA configuration used by the stream.
*/
struct i915_oa_config *oa_config;
+
+ /**
+ * @cs_mode: Whether command stream based perf sample collection is
+ * enabled for this stream
+ */
+ bool cs_mode;
+
+ /**
+ * @using_oa: Whether OA unit is in use for this particular stream
+ */
+ bool using_oa;
+
+ /* Command stream based perf data buffer */
+ struct {
+ struct i915_vma *vma;
+ u8 *vaddr;
+ } cs_buffer;
+
+ struct list_head cs_samples;
+ struct list_head free_samples;
+ spinlock_t samples_lock;
};
/**
@@ -2159,7 +2199,8 @@ struct i915_oa_ops {
int (*read)(struct i915_perf_stream *stream,
char __user *buf,
size_t count,
- size_t *offset);
+ size_t *offset,
+ u32 ts);
/**
* @oa_hw_tail_read: read the OA tail pointer register
@@ -2171,6 +2212,41 @@ struct i915_oa_ops {
u32 (*oa_hw_tail_read)(struct drm_i915_private *dev_priv);
};
+enum request_sample_id {
+ PRE_REQUEST_SAMPLE_ID,
+ POST_REQUEST_SAMPLE_ID,
+ MAX_REQUEST_SAMPLE_ID
+};
+
+/**
+ * struct i915_perf_cs_sample - Sample element to hold info about a single perf
+ * sample data associated with a particular GPU command stream.
+ */
+struct i915_perf_cs_sample {
+ /**
+ * @link: Links the sample into ``&stream->cs_samples`` or
+ * ``&stream->free_samples``
+ */
+ struct list_head link;
+
+ /**
+ * @request: GEM request associated with the sample. The commands to
+ * capture the perf metrics are inserted into the command streamer in
+ * context of this request.
+ */
+ struct drm_i915_gem_request *request;
+
+ /**
+ * @oa_offset: Offset into ``&stream->cs_buffer``
+ * where the perf metrics will be collected, when the commands inserted
+ * into the command stream are executed by GPU.
+ */
+ u32 oa_offset;
+
+ /* Is this sample prior to request start or post request end */
+ enum request_sample_id id;
+};
+
struct intel_cdclk_state {
unsigned int cdclk, vco, ref;
};
@@ -2513,7 +2589,6 @@ struct drm_i915_private {
* except exclusive_stream.
*/
struct mutex lock;
- struct list_head streams;
struct {
/*
@@ -2523,6 +2598,7 @@ struct drm_i915_private {
* dev_priv->drm.struct_mutex.
*/
struct i915_perf_stream *exclusive_stream;
+ struct srcu_struct srcu;
u32 specific_ctx_id;
@@ -3883,6 +3959,10 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
extern void i915_perf_fini(struct drm_i915_private *dev_priv);
extern void i915_perf_register(struct drm_i915_private *dev_priv);
extern void i915_perf_unregister(struct drm_i915_private *dev_priv);
+void i915_perf_emit_sample_capture(struct drm_i915_gem_request *req,
+ bool preallocate);
+void i915_perf_patch_request(struct drm_i915_gem_request *request);
+void i915_perf_streams_mark_idle(struct drm_i915_private *dev_priv);
/* i915_suspend.c */
extern int i915_save_state(struct drm_i915_private *dev_priv);
@@ -3226,6 +3226,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
intel_engines_mark_idle(dev_priv);
i915_gem_timelines_mark_idle(dev_priv);
+ i915_perf_streams_mark_idle(dev_priv);
GEM_BUG_ON(!dev_priv->gt.awake);
dev_priv->gt.awake = false;
@@ -1115,12 +1115,16 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
if (err)
goto err_request;
+ i915_perf_emit_sample_capture(rq, true);
+
err = eb->engine->emit_bb_start(rq,
batch->node.start, PAGE_SIZE,
cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
if (err)
goto err_request;
+ i915_perf_emit_sample_capture(rq, false);
+
GEM_BUG_ON(!reservation_object_test_signaled_rcu(batch->resv, true));
i915_vma_move_to_active(batch, rq, 0);
reservation_object_lock(batch->resv, NULL);
@@ -1970,6 +1974,8 @@ static int eb_submit(struct i915_execbuffer *eb)
return err;
}
+ i915_perf_emit_sample_capture(eb->request, true);
+
err = eb->engine->emit_bb_start(eb->request,
eb->batch->node.start +
eb->batch_start_offset,
@@ -1978,6 +1984,8 @@ static int eb_submit(struct i915_execbuffer *eb)
if (err)
return err;
+ i915_perf_emit_sample_capture(eb->request, false);
+
return 0;
}
@@ -481,6 +481,8 @@ void __i915_gem_request_submit(struct drm_i915_gem_request *request)
list_move_tail(&request->link, &timeline->requests);
spin_unlock(&request->timeline->lock);
+ i915_perf_patch_request(request);
+
wake_up_all(&request->execute);
}
@@ -195,6 +195,9 @@ struct drm_i915_gem_request {
struct drm_i915_file_private *file_priv;
/** file_priv list entry for this request */
struct list_head client_link;
+
+ u32 *pre_oa_offset;
+ u32 *post_oa_offset;
};
extern const struct dma_fence_ops i915_fence_ops;
@@ -194,6 +194,7 @@
#include <linux/anon_inodes.h>
#include <linux/sizes.h>
#include <linux/uuid.h>
+#include <linux/srcu.h>
#include "i915_drv.h"
#include "i915_oa_hsw.h"
@@ -289,6 +290,11 @@
#define OAREPORT_REASON_CTX_SWITCH (1<<3)
#define OAREPORT_REASON_CLK_RATIO (1<<5)
+/* Data common to periodic and RCS based OA samples */
+struct i915_perf_sample_data {
+ u64 source;
+ const u8 *report;
+};
/* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate
*
@@ -329,6 +335,16 @@
[I915_OA_FORMAT_C4_B8] = { 7, 64 },
};
+/* Duplicated from similar static enum in i915_gem_execbuffer.c */
+#define I915_USER_RINGS (4)
+static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
+ [I915_EXEC_DEFAULT] = RCS,
+ [I915_EXEC_RENDER] = RCS,
+ [I915_EXEC_BLT] = BCS,
+ [I915_EXEC_BSD] = VCS,
+ [I915_EXEC_VEBOX] = VECS
+};
+
#define SAMPLE_OA_REPORT (1<<0)
#define SAMPLE_OA_SOURCE (1<<1)
@@ -341,6 +357,9 @@
* @oa_format: An OA unit HW report format
* @oa_periodic: Whether to enable periodic OA unit sampling
* @oa_period_exponent: The OA unit sampling period is derived from this
+ * @cs_mode: Whether the stream is configured to enable collection of metrics
+ * associated with command stream of a particular GPU engine
+ * @engine: The GPU engine associated with the stream in case cs_mode is enabled
*
* As read_properties_unlocked() enumerates and validates the properties given
* to open a stream of metrics the configuration is built up in the structure
@@ -357,6 +376,10 @@ struct perf_open_properties {
int oa_format;
bool oa_periodic;
int oa_period_exponent;
+
+ /* Command stream mode */
+ bool cs_mode;
+ enum intel_engine_id engine;
};
static void free_oa_config(struct drm_i915_private *dev_priv,
@@ -420,6 +443,236 @@ static u32 gen7_oa_hw_tail_read(struct drm_i915_private *dev_priv)
}
/**
+ * i915_emit_oa_report_capture - Insert the commands to capture OA
+ * reports metrics into the render command stream
+ * @request: request in whose context the metrics are being collected.
+ * @preallocate: allocate space in ring for related sample.
+ */
+static int i915_emit_oa_report_capture(struct drm_i915_gem_request *request,
+ bool preallocate)
+{
+ struct drm_i915_private *dev_priv = request->i915;
+ u32 cmd, len = 4, *cs;
+
+ if (preallocate)
+ request->reserved_space += len;
+ else
+ request->reserved_space -= len;
+
+ cs = intel_ring_begin(request, 4);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ cmd = MI_REPORT_PERF_COUNT | (1<<0);
+ if (INTEL_GEN(dev_priv) >= 8)
+ cmd |= (2<<0);
+
+ *cs++ = cmd;
+ /*
+ * Save the address in the ringbuffer where offset for OA report
+ * capture is to be placed during __i915_gem_request_submit.
+ */
+ if (preallocate)
+ request->pre_oa_offset = cs++;
+ else
+ request->post_oa_offset = cs++;
+
+ *cs++ = request->fence.seqno;
+
+ if (INTEL_GEN(dev_priv) >= 8)
+ *cs++ = 0;
+ else
+ *cs++ = MI_NOOP;
+
+ intel_ring_advance(request, cs);
+
+ return 0;
+}
+
+/**
+ * i915_perf_stream_emit_sample_capture - Insert the commands to capture perf
+ * metrics into the GPU command stream
+ * @stream: Stream to which this request corresponds.
+ * @request: request in whose context the metrics are being collected.
+ * @preallocate: allocate space in ring for related sample.
+ */
+static void i915_perf_stream_emit_sample_capture(
+ struct i915_perf_stream *stream,
+ struct drm_i915_gem_request *request,
+ bool preallocate)
+{
+ struct reservation_object *resv = stream->cs_buffer.vma->resv;
+ int ret;
+
+ if (stream->sample_flags & SAMPLE_OA_REPORT) {
+ ret = i915_emit_oa_report_capture(request, preallocate);
+ if (ret)
+ DRM_ERROR("Emit of OA capture commands failed\n");
+ }
+
+ reservation_object_lock(resv, NULL);
+ if (reservation_object_reserve_shared(resv) == 0)
+ reservation_object_add_shared_fence(resv, &request->fence);
+ reservation_object_unlock(resv);
+
+ i915_vma_move_to_active(stream->cs_buffer.vma, request,
+ EXEC_OBJECT_WRITE);
+}
+
+/**
+ * i915_perf_emit_sample_capture - Insert the commands to capture metrics into
+ * the command stream of a GPU engine.
+ * @request: request in whose context the metrics are being collected.
+ * @preallocate: allocate space in ring for related sample.
+ *
+ * The function provides a hook through which the commands to capture perf
+ * metrics, are inserted into the command stream of a GPU engine.
+ */
+void i915_perf_emit_sample_capture(struct drm_i915_gem_request *request,
+ bool preallocate)
+{
+ struct drm_i915_private *dev_priv = request->i915;
+ struct i915_perf_stream *stream;
+ int idx;
+
+ if (!dev_priv->perf.initialized)
+ return;
+
+ idx = srcu_read_lock(&dev_priv->perf.oa.srcu);
+ stream = srcu_dereference(dev_priv->perf.oa.exclusive_stream,
+ &dev_priv->perf.oa.srcu);
+ if (stream && stream->enabled && stream->cs_mode)
+ stream->ops->emit_sample_capture(stream, request,
+ preallocate);
+ srcu_read_unlock(&dev_priv->perf.oa.srcu, idx);
+}
+
+/**
+ * release_perf_sample - Release old perf sample to make space for new
+ * sample data.
+ * @stream: Stream from which space is to be freed up.
+ *
+ * We also drop the reference to the associated request before marking the
+ * sample freed. There is no need to check whether the commands associated
+ * with old samples have completed, since these entries are anyway going to
+ * be replaced by a new sample, and the GPU will eventually overwrite the
+ * buffer contents when the request associated with the new sample completes.
+ */
+static void release_perf_sample(struct i915_perf_stream *stream)
+{
+ struct i915_perf_cs_sample *sample, *next;
+
+ list_for_each_entry_safe
+ (sample, next, &stream->cs_samples, link) {
+ i915_gem_request_put(sample->request);
+ list_move_tail(&sample->link, &stream->free_samples);
+ break;
+ }
+}
+
+static void i915_perf_stream_patch_sample_oa(struct i915_perf_stream *stream,
+ struct drm_i915_gem_request *request,
+ struct i915_perf_cs_sample *sample)
+{
+ u32 oa_addr = stream->cs_buffer.vma->node.start + sample->oa_offset;
+
+ if (WARN_ON(oa_addr & 0x3f)) {
+ DRM_ERROR("OA buffer address not aligned to 64 byte\n");
+ return;
+ }
+
+ switch (sample->id) {
+ case PRE_REQUEST_SAMPLE_ID:
+ *request->pre_oa_offset = oa_addr |
+ MI_REPORT_PERF_COUNT_GGTT;
+ break;
+ case POST_REQUEST_SAMPLE_ID:
+ *request->post_oa_offset = oa_addr |
+ MI_REPORT_PERF_COUNT_GGTT;
+ break;
+ default:
+ DRM_ERROR("Invalid sample being patched\n");
+ }
+}
+
+/**
+ * i915_perf_stream_patch_request - Assign free sample. If none available,
+ * remove one. Patch offset of the perf sample address with the one from
+ * sample.
+ * @stream: Stream to which this request corresponds.
+ * @request: request in whose context the metrics are being collected.
+ */
+static void i915_perf_stream_patch_request(struct i915_perf_stream *stream,
+ struct drm_i915_gem_request *request)
+{
+ struct i915_perf_cs_sample *sample;
+ unsigned long flags;
+ enum request_sample_id sample_id = PRE_REQUEST_SAMPLE_ID;
+
+ while (sample_id < MAX_REQUEST_SAMPLE_ID) {
+ spin_lock_irqsave(&stream->samples_lock, flags);
+ if (list_empty(&stream->free_samples))
+ release_perf_sample(stream);
+ sample = list_first_entry_or_null(&stream->free_samples,
+ struct i915_perf_cs_sample, link);
+		if (WARN_ON(sample == NULL)) {
+			spin_unlock_irqrestore(&stream->samples_lock, flags); return; }
+ list_move_tail(&sample->link, &stream->cs_samples);
+ sample->request = i915_gem_request_get(request);
+ sample->id = sample_id;
+ if (stream->sample_flags &
+ (SAMPLE_OA_REPORT | SAMPLE_OA_SOURCE))
+ i915_perf_stream_patch_sample_oa(stream, request,
+ sample);
+ spin_unlock_irqrestore(&stream->samples_lock, flags);
+ sample_id++;
+ }
+}
+
+/**
+ * i915_perf_patch_request - Update the commands to capture metrics into
+ * the command stream of a GPU engine with offsets.
+ * @request: request in whose context the metrics are being collected.
+ */
+void i915_perf_patch_request(struct drm_i915_gem_request *request)
+{
+ struct drm_i915_private *dev_priv = request->i915;
+ struct i915_perf_stream *stream;
+ int idx;
+
+ if (!dev_priv->perf.initialized)
+ return;
+
+ idx = srcu_read_lock(&dev_priv->perf.oa.srcu);
+ stream = srcu_dereference(dev_priv->perf.oa.exclusive_stream,
+ &dev_priv->perf.oa.srcu);
+ if (stream && stream->enabled && stream->cs_mode)
+ stream->ops->patch_request(stream, request);
+ srcu_read_unlock(&dev_priv->perf.oa.srcu, idx);
+}
+
+/**
+ * i915_perf_stream_release_samples - Release the perf command stream samples
+ * @stream: Stream from which sample are to be released.
+ *
+ * Note: The associated requests should be completed before releasing the
+ * references here.
+ */
+static void i915_perf_stream_release_samples(struct i915_perf_stream *stream)
+{
+ struct i915_perf_cs_sample *entry, *next;
+ unsigned long flags;
+
+ spin_lock_irqsave(&stream->samples_lock, flags);
+ list_for_each_entry_safe
+ (entry, next, &stream->cs_samples, link) {
+ i915_gem_request_put(entry->request);
+ list_move_tail(&entry->link, &stream->free_samples);
+ }
+ spin_unlock_irqrestore(&stream->samples_lock, flags);
+}
+
+/**
* oa_buffer_check_unlocked - check for data and update tail ptr state
* @dev_priv: i915 device instance
*
@@ -570,12 +823,13 @@ static int append_oa_status(struct i915_perf_stream *stream,
}
/**
- * append_oa_sample - Copies single OA report into userspace read() buffer.
- * @stream: An i915-perf stream opened for OA metrics
+ * append_perf_sample - Copies single perf sample into userspace read() buffer.
+ * @stream: An i915-perf stream opened for perf samples
* @buf: destination buffer given by userspace
* @count: the number of bytes userspace wants to read
* @offset: (inout): the current position for writing into @buf
- * @report: A single OA report to (optionally) include as part of the sample
+ * @data: perf sample data which contains (optionally) metrics configured
+ * earlier when opening a stream
*
* The contents of a sample are configured through `DRM_I915_PERF_PROP_SAMPLE_*`
* properties when opening a stream, tracked as `stream->sample_flags`. This
@@ -586,11 +840,11 @@ static int append_oa_status(struct i915_perf_stream *stream,
*
* Returns: 0 on success, negative error code on failure.
*/
-static int append_oa_sample(struct i915_perf_stream *stream,
- char __user *buf,
- size_t count,
- size_t *offset,
- const u8 *report)
+static int append_perf_sample(struct i915_perf_stream *stream,
+ char __user *buf,
+ size_t count,
+ size_t *offset,
+ const struct i915_perf_sample_data *data)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -618,16 +872,15 @@ static int append_oa_sample(struct i915_perf_stream *stream,
* transition. These are considered as source 'OABUFFER'.
*/
if (sample_flags & SAMPLE_OA_SOURCE) {
- u64 source = I915_PERF_SAMPLE_OA_SOURCE_OABUFFER;
-
- if (copy_to_user(buf, &source, 8))
+ if (copy_to_user(buf, &data->source, 8))
return -EFAULT;
buf += 8;
}
if (sample_flags & SAMPLE_OA_REPORT) {
- if (copy_to_user(buf, report, report_size))
+ if (copy_to_user(buf, data->report, report_size))
return -EFAULT;
+ buf += report_size;
}
(*offset) += header.size;
@@ -636,11 +889,39 @@ static int append_oa_sample(struct i915_perf_stream *stream,
}
/**
+ * append_oa_buffer_sample - Copies single periodic OA report into userspace
+ * read() buffer.
+ * @stream: An i915-perf stream opened for OA metrics
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ * @offset: (inout): the current position for writing into @buf
+ * @report: A single OA report to (optionally) include as part of the sample
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+static int append_oa_buffer_sample(struct i915_perf_stream *stream,
+ char __user *buf, size_t count,
+ size_t *offset, const u8 *report)
+{
+ u32 sample_flags = stream->sample_flags;
+ struct i915_perf_sample_data data = { 0 };
+
+ if (sample_flags & SAMPLE_OA_SOURCE)
+ data.source = I915_PERF_SAMPLE_OA_SOURCE_OABUFFER;
+
+ if (sample_flags & SAMPLE_OA_REPORT)
+ data.report = report;
+
+ return append_perf_sample(stream, buf, count, offset, &data);
+}
+
+/**
* Copies all buffered OA reports into userspace read() buffer.
* @stream: An i915-perf stream opened for OA metrics
* @buf: destination buffer given by userspace
* @count: the number of bytes userspace wants to read
* @offset: (inout): the current position for writing into @buf
+ * @ts: copy OA reports till this timestamp
*
* Notably any error condition resulting in a short read (-%ENOSPC or
* -%EFAULT) will be returned even though one or more records may
@@ -658,7 +939,8 @@ static int append_oa_sample(struct i915_perf_stream *stream,
static int gen8_append_oa_reports(struct i915_perf_stream *stream,
char __user *buf,
size_t count,
- size_t *offset)
+ size_t *offset,
+ u32 ts)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -718,6 +1000,11 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
u32 *report32 = (void *)report;
u32 ctx_id;
u32 reason;
+ u32 report_ts = report32[1];
+
+ /* Report timestamp should not exceed the given ts */
+ if (report_ts > ts)
+ break;
/*
* All the report sizes factor neatly into the buffer
@@ -814,8 +1101,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
report32[2] = INVALID_CTX_ID;
}
- ret = append_oa_sample(stream, buf, count, offset,
- report);
+ ret = append_oa_buffer_sample(stream, buf, count,
+ offset, report);
if (ret)
break;
@@ -856,6 +1143,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
* @buf: destination buffer given by userspace
* @count: the number of bytes userspace wants to read
* @offset: (inout): the current position for writing into @buf
+ * @ts: copy OA reports till this timestamp
*
* Checks OA unit status registers and if necessary appends corresponding
* status records for userspace (such as for a buffer full condition) and then
@@ -873,7 +1161,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
static int gen8_oa_read(struct i915_perf_stream *stream,
char __user *buf,
size_t count,
- size_t *offset)
+ size_t *offset,
+ u32 ts)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
u32 oastatus;
@@ -926,7 +1215,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
oastatus & ~GEN8_OASTATUS_REPORT_LOST);
}
- return gen8_append_oa_reports(stream, buf, count, offset);
+ return gen8_append_oa_reports(stream, buf, count, offset, ts);
}
/**
@@ -935,6 +1224,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
* @buf: destination buffer given by userspace
* @count: the number of bytes userspace wants to read
* @offset: (inout): the current position for writing into @buf
+ * @ts: copy OA reports till this timestamp
*
* Notably any error condition resulting in a short read (-%ENOSPC or
* -%EFAULT) will be returned even though one or more records may
@@ -952,7 +1242,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
static int gen7_append_oa_reports(struct i915_perf_stream *stream,
char __user *buf,
size_t count,
- size_t *offset)
+ size_t *offset,
+ u32 ts)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -1033,7 +1324,12 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
continue;
}
- ret = append_oa_sample(stream, buf, count, offset, report);
+ /* Report timestamp should not exceed the given ts */
+ if (report32[1] > ts)
+ break;
+
+ ret = append_oa_buffer_sample(stream, buf, count, offset,
+ report);
if (ret)
break;
@@ -1071,6 +1367,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
* @buf: destination buffer given by userspace
* @count: the number of bytes userspace wants to read
* @offset: (inout): the current position for writing into @buf
+ * @ts: copy OA reports till this timestamp
*
* Checks Gen 7 specific OA unit status registers and if necessary appends
* corresponding status records for userspace (such as for a buffer full
@@ -1084,7 +1381,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
static int gen7_oa_read(struct i915_perf_stream *stream,
char __user *buf,
size_t count,
- size_t *offset)
+ size_t *offset,
+ u32 ts)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
u32 oastatus1;
@@ -1146,16 +1444,175 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
GEN7_OASTATUS1_REPORT_LOST;
}
- return gen7_append_oa_reports(stream, buf, count, offset);
+ return gen7_append_oa_reports(stream, buf, count, offset, ts);
+}
+
+/**
+ * append_cs_buffer_sample - Copies single perf sample data associated with
+ * GPU command stream, into userspace read() buffer.
+ * @stream: An i915-perf stream opened for perf CS metrics
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ * @offset: (inout): the current position for writing into @buf
+ * @node: Sample data associated with perf metrics
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+static int append_cs_buffer_sample(struct i915_perf_stream *stream,
+ char __user *buf,
+ size_t count,
+ size_t *offset,
+ struct i915_perf_cs_sample *node)
+{
+ struct drm_i915_private *dev_priv = stream->dev_priv;
+ struct i915_perf_sample_data data = { 0 };
+ u32 sample_flags = stream->sample_flags;
+ int ret = 0;
+
+ if (sample_flags & SAMPLE_OA_REPORT) {
+ const u8 *report = stream->cs_buffer.vaddr + node->oa_offset;
+ u32 sample_ts = *(u32 *)(report + 4);
+
+ data.report = report;
+
+ /* First, append the periodic OA samples having lower
+ * timestamp values
+ */
+ ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset,
+ sample_ts);
+ if (ret)
+ return ret;
+ }
+
+ if (sample_flags & SAMPLE_OA_SOURCE)
+ data.source = I915_PERF_SAMPLE_OA_SOURCE_CS;
+
+ return append_perf_sample(stream, buf, count, offset, &data);
}
/**
- * i915_oa_wait_unlocked - handles blocking IO until OA data available
+ * append_cs_buffer_samples: Copies all command stream based perf samples
+ * into userspace read() buffer.
+ * @stream: An i915-perf stream opened for perf CS metrics
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ * @offset: (inout): the current position for writing into @buf
+ *
+ * Notably any error condition resulting in a short read (-%ENOSPC or
+ * -%EFAULT) will be returned even though one or more records may
+ * have been successfully copied. In this case it's up to the caller
+ * to decide if the error should be squashed before returning to
+ * userspace.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+static int append_cs_buffer_samples(struct i915_perf_stream *stream,
+ char __user *buf,
+ size_t count,
+ size_t *offset)
+{
+ struct i915_perf_cs_sample *entry, *next;
+ LIST_HEAD(free_list);
+ int ret = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&stream->samples_lock, flags);
+ if (list_empty(&stream->cs_samples)) {
+ spin_unlock_irqrestore(&stream->samples_lock, flags);
+ return 0;
+ }
+ list_for_each_entry_safe(entry, next,
+ &stream->cs_samples, link) {
+ /*
+ * XXX: Need to check if the request is preempted.
+ * If preempted mark the sample as free and move
+ * to stream->free_samples
+ */
+ if (!i915_gem_request_completed(entry->request))
+ break;
+ list_move_tail(&entry->link, &free_list);
+ }
+ spin_unlock_irqrestore(&stream->samples_lock, flags);
+
+ if (list_empty(&free_list))
+ return 0;
+
+ list_for_each_entry_safe(entry, next, &free_list, link) {
+ ret = append_cs_buffer_sample(stream, buf, count, offset,
+ entry);
+ if (ret)
+ break;
+
+ spin_lock_irqsave(&stream->samples_lock, flags);
+ i915_gem_request_put(entry->request);
+ list_move_tail(&entry->link, &stream->free_samples);
+ spin_unlock_irqrestore(&stream->samples_lock, flags);
+ }
+
+ /* Don't discard remaining entries, keep them for next read */
+ spin_lock_irqsave(&stream->samples_lock, flags);
+ list_splice(&free_list, &stream->cs_samples);
+ spin_unlock_irqrestore(&stream->samples_lock, flags);
+
+ return ret;
+}
+
+/**
+ * cs_buffer_is_empty - Checks whether the command stream buffer
+ * associated with the stream has no data available to read.
+ * @stream: An i915-perf stream opened for OA metrics
+ *
+ * Returns: true if there is no completed request at the head of the sample
+ * list, i.e. no data is ready to be read, else returns false.
+ */
+static bool cs_buffer_is_empty(struct i915_perf_stream *stream)
+
+{
+ struct i915_perf_cs_sample *entry = NULL;
+ struct drm_i915_gem_request *request = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&stream->samples_lock, flags);
+ entry = list_first_entry_or_null(&stream->cs_samples,
+ struct i915_perf_cs_sample, link);
+ if (entry)
+ request = entry->request;
+ spin_unlock_irqrestore(&stream->samples_lock, flags);
+
+ if (!entry)
+ return true;
+ else if (!i915_gem_request_completed(request))
+ return true;
+ else
+ return false;
+}
+
+/**
+ * stream_have_data_unlocked - Checks whether the stream has data available
+ * @stream: An i915-perf stream opened for OA metrics
+ *
+ * For command stream based streams, check if the command stream buffer has
+ * at least one sample available; if not, return false irrespective of
+ * whether the periodic OA buffer has data or not.
+ */
+
+static bool stream_have_data_unlocked(struct i915_perf_stream *stream)
+{
+ struct drm_i915_private *dev_priv = stream->dev_priv;
+
+ if (stream->cs_mode)
+ return !cs_buffer_is_empty(stream);
+ else
+ return oa_buffer_check_unlocked(dev_priv);
+}
+
+/**
+ * i915_perf_stream_wait_unlocked - handles blocking IO until data available
+ * @stream: An i915-perf stream opened for GPU metrics
+ *
* Called when userspace tries to read() from a blocking stream FD opened
- * for OA metrics. It waits until the hrtimer callback finds a non-empty
- * OA buffer and wakes us.
+ * for perf metrics. It waits until the hrtimer callback finds a non-empty
+ * command stream buffer / OA buffer and wakes us.
*
* Note: it's acceptable to have this return with some false positives
* since any subsequent read handling will return -EAGAIN if there isn't
@@ -1163,31 +1620,42 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
*
* Returns: zero on success or a negative error code
*/
-static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
+static int i915_perf_stream_wait_unlocked(struct i915_perf_stream *stream)
{
- struct drm_i915_private *dev_priv = stream->dev_priv;
-
- /* We would wait indefinitely if periodic sampling is not enabled */
- if (!dev_priv->perf.oa.periodic)
- return -EIO;
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+
+	if (stream->cs_mode) {
+		long ret;
+
+		/* Wait for all the sampled requests. */
+		ret = reservation_object_wait_timeout_rcu(
+				stream->cs_buffer.vma->resv, true, true,
+				MAX_SCHEDULE_TIMEOUT);
+		if (unlikely(ret < 0)) {
+			DRM_DEBUG_DRIVER("Failed to wait for sampled requests: %li\n",
+					 ret);
+			return ret;
+		}
+	}
return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
- oa_buffer_check_unlocked(dev_priv));
+ stream_have_data_unlocked(stream));
}
/**
- * i915_oa_poll_wait - call poll_wait() for an OA stream poll()
- * @stream: An i915-perf stream opened for OA metrics
+ * i915_perf_stream_poll_wait - call poll_wait() for a stream poll()
+ * @stream: An i915-perf stream opened for GPU metrics
* @file: An i915 perf stream file
* @wait: poll() state table
*
- * For handling userspace polling on an i915 perf stream opened for OA metrics,
+ * For handling userspace polling on an i915 perf stream opened for metrics,
* this starts a poll_wait with the wait queue that our hrtimer callback wakes
- * when it sees data ready to read in the circular OA buffer.
+ * when it sees data ready to read either in command stream buffer or in the
+ * circular OA buffer.
*/
-static void i915_oa_poll_wait(struct i915_perf_stream *stream,
- struct file *file,
- poll_table *wait)
+static void i915_perf_stream_poll_wait(struct i915_perf_stream *stream,
+ struct file *file,
+ poll_table *wait)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
@@ -1195,8 +1663,9 @@ static void i915_oa_poll_wait(struct i915_perf_stream *stream,
}
/**
- * i915_oa_read - just calls through to &i915_oa_ops->read
- * @stream: An i915-perf stream opened for OA metrics
+ * i915_perf_stream_read - Reads perf metrics available into userspace read
+ * buffer
+ * @stream: An i915-perf stream opened for GPU metrics
* @buf: destination buffer given by userspace
* @count: the number of bytes userspace wants to read
* @offset: (inout): the current position for writing into @buf
@@ -1206,14 +1675,21 @@ static void i915_oa_poll_wait(struct i915_perf_stream *stream,
*
* Returns: zero on success or a negative error code
*/
-static int i915_oa_read(struct i915_perf_stream *stream,
- char __user *buf,
- size_t count,
- size_t *offset)
+static int i915_perf_stream_read(struct i915_perf_stream *stream,
+ char __user *buf,
+ size_t count,
+ size_t *offset)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
+
+ if (stream->cs_mode)
+ return append_cs_buffer_samples(stream, buf, count, offset);
+ else if (stream->sample_flags & SAMPLE_OA_REPORT)
+ return dev_priv->perf.oa.ops.read(stream, buf, count, offset,
+ U32_MAX);
+ else
+ return -EINVAL;
}
/**
@@ -1290,14 +1766,51 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
}
}
+/**
+ * free_perf_samples - Free the perf command stream samples
+ * @stream: Stream from which sample are to be released.
+ */
+static void free_perf_samples(struct i915_perf_stream *stream)
+{
+ struct i915_perf_cs_sample *entry, *next;
+ unsigned long flags;
+
+ WARN_ON(!list_empty(&stream->cs_samples));
+
+ spin_lock_irqsave(&stream->samples_lock, flags);
+ list_for_each_entry_safe
+ (entry, next, &stream->free_samples, link) {
+ list_del(&entry->link);
+ kfree(entry);
+ }
+ spin_unlock_irqrestore(&stream->samples_lock, flags);
+}
+
+static void
+free_cs_buffer(struct i915_perf_stream *stream)
+{
+ struct drm_i915_private *dev_priv = stream->dev_priv;
+
+ free_perf_samples(stream);
+
+ mutex_lock(&dev_priv->drm.struct_mutex);
+
+ i915_gem_object_unpin_map(stream->cs_buffer.vma->obj);
+ i915_vma_unpin_and_release(&stream->cs_buffer.vma);
+
+ stream->cs_buffer.vma = NULL;
+ stream->cs_buffer.vaddr = NULL;
+
+ mutex_unlock(&dev_priv->drm.struct_mutex);
+}
+
static void
free_oa_buffer(struct drm_i915_private *i915)
{
mutex_lock(&i915->drm.struct_mutex);
i915_gem_object_unpin_map(i915->perf.oa.oa_buffer.vma->obj);
- i915_vma_unpin(i915->perf.oa.oa_buffer.vma);
- i915_gem_object_put(i915->perf.oa.oa_buffer.vma->obj);
+ i915_vma_unpin_and_release(&i915->perf.oa.oa_buffer.vma);
i915->perf.oa.oa_buffer.vma = NULL;
i915->perf.oa.oa_buffer.vaddr = NULL;
@@ -1305,36 +1818,50 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
mutex_unlock(&i915->drm.struct_mutex);
}
-static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
+static void i915_perf_stream_destroy(struct i915_perf_stream *stream)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
+ struct i915_perf_stream *engine_stream;
+ int idx;
- BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
+ idx = srcu_read_lock(&dev_priv->perf.oa.srcu);
+ engine_stream = srcu_dereference(dev_priv->perf.oa.exclusive_stream,
+ &dev_priv->perf.oa.srcu);
+ srcu_read_unlock(&dev_priv->perf.oa.srcu, idx);
+ if (WARN_ON(stream != engine_stream))
+ return;
/*
* Unset exclusive_stream first, it will be checked while disabling
* the metric set on gen8+.
*/
mutex_lock(&dev_priv->drm.struct_mutex);
- dev_priv->perf.oa.exclusive_stream = NULL;
+ rcu_assign_pointer(dev_priv->perf.oa.exclusive_stream, NULL);
+ synchronize_srcu(&dev_priv->perf.oa.srcu);
mutex_unlock(&dev_priv->drm.struct_mutex);
- dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
+ if (stream->using_oa) {
+ dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
- free_oa_buffer(dev_priv);
+ free_oa_buffer(dev_priv);
- intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
- intel_runtime_pm_put(dev_priv);
+ intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+ intel_runtime_pm_put(dev_priv);
- if (stream->ctx)
- oa_put_render_ctx_id(stream);
+ if (stream->ctx)
+ oa_put_render_ctx_id(stream);
- put_oa_config(dev_priv, stream->oa_config);
+ put_oa_config(dev_priv, stream->oa_config);
- if (dev_priv->perf.oa.spurious_report_rs.missed) {
- DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n",
- dev_priv->perf.oa.spurious_report_rs.missed);
+ if (dev_priv->perf.oa.spurious_report_rs.missed) {
+ DRM_NOTE("%d spurious OA report notices suppressed due "
+ "to ratelimiting\n",
+ dev_priv->perf.oa.spurious_report_rs.missed);
+ }
}
+
+ if (stream->cs_mode)
+ free_cs_buffer(stream);
}
static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
@@ -1444,25 +1971,24 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
dev_priv->perf.oa.pollin = false;
}
-static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
+static int alloc_obj(struct drm_i915_private *dev_priv,
+ struct i915_vma **vma, u8 **vaddr)
{
struct drm_i915_gem_object *bo;
- struct i915_vma *vma;
int ret;
- if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma))
- return -ENODEV;
+ intel_runtime_pm_get(dev_priv);
ret = i915_mutex_lock_interruptible(&dev_priv->drm);
if (ret)
- return ret;
+ goto out;
BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
bo = i915_gem_object_create(dev_priv, OA_BUFFER_SIZE);
if (IS_ERR(bo)) {
- DRM_ERROR("Failed to allocate OA buffer\n");
+ DRM_ERROR("Failed to allocate i915 perf obj\n");
ret = PTR_ERR(bo);
goto unlock;
}
@@ -1472,39 +1998,116 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
goto err_unref;
/* PreHSW required 512K alignment, HSW requires 16M */
- vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0);
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
+ *vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0);
+ if (IS_ERR(*vma)) {
+ ret = PTR_ERR(*vma);
goto err_unref;
}
- dev_priv->perf.oa.oa_buffer.vma = vma;
- dev_priv->perf.oa.oa_buffer.vaddr =
- i915_gem_object_pin_map(bo, I915_MAP_WB);
- if (IS_ERR(dev_priv->perf.oa.oa_buffer.vaddr)) {
- ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.vaddr);
+ *vaddr = i915_gem_object_pin_map(bo, I915_MAP_WB);
+ if (IS_ERR(*vaddr)) {
+ ret = PTR_ERR(*vaddr);
goto err_unpin;
}
- dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
-
- DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p\n",
- i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
- dev_priv->perf.oa.oa_buffer.vaddr);
-
goto unlock;
err_unpin:
- __i915_vma_unpin(vma);
+ i915_vma_unpin(*vma);
err_unref:
i915_gem_object_put(bo);
- dev_priv->perf.oa.oa_buffer.vaddr = NULL;
- dev_priv->perf.oa.oa_buffer.vma = NULL;
-
unlock:
mutex_unlock(&dev_priv->drm.struct_mutex);
+
+out:
+ intel_runtime_pm_put(dev_priv);
+ return ret;
+}
+
+static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
+{
+ struct i915_vma *vma;
+ u8 *vaddr;
+ int ret;
+
+ if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma))
+ return -ENODEV;
+
+ dev_priv->perf.oa.oa_buffer.vma = NULL;
+ dev_priv->perf.oa.oa_buffer.vaddr = NULL;
+
+ ret = alloc_obj(dev_priv, &vma, &vaddr);
+ if (ret)
+ return ret;
+
+ dev_priv->perf.oa.oa_buffer.vma = vma;
+ dev_priv->perf.oa.oa_buffer.vaddr = vaddr;
+
+ dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
+
+ DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p",
+ i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
+ dev_priv->perf.oa.oa_buffer.vaddr);
+ return 0;
+}
+
+static int init_perf_samples(struct i915_perf_stream *stream)
+{
+ struct i915_perf_cs_sample *sample;
+ u32 sample_size = 0;
+ u32 offset = 0;
+
+ sample_size = stream->dev_priv->perf.oa.oa_buffer.format_size;
+
+ while ((offset + sample_size) < stream->cs_buffer.vma->size) {
+ sample = kzalloc(sizeof(*sample), GFP_KERNEL);
+ if (sample == NULL) {
+ DRM_ERROR("Perf sample alloc failed\n");
+ return -ENOMEM;
+ }
+ sample->oa_offset = offset;
+ list_add_tail(&sample->link, &stream->free_samples);
+ offset += sample_size;
+ }
+
+ return 0;
+}
+
+static int alloc_cs_buffer(struct i915_perf_stream *stream)
+{
+ struct drm_i915_private *dev_priv = stream->dev_priv;
+ struct i915_vma *vma;
+ u8 *vaddr;
+ int ret;
+
+ if (WARN_ON(stream->cs_buffer.vma))
+ return -ENODEV;
+
+ stream->cs_buffer.vma = NULL;
+ stream->cs_buffer.vaddr = NULL;
+ INIT_LIST_HEAD(&stream->cs_samples);
+ INIT_LIST_HEAD(&stream->free_samples);
+
+ ret = alloc_obj(dev_priv, &vma, &vaddr);
+ if (ret)
+ return ret;
+
+ stream->cs_buffer.vma = vma;
+ stream->cs_buffer.vaddr = vaddr;
+
+ DRM_DEBUG_DRIVER("Command stream buf initialized, gtt offset = 0x%x, "
+ "vaddr = %p",
+ i915_ggtt_offset(stream->cs_buffer.vma),
+ stream->cs_buffer.vaddr);
+
+ ret = init_perf_samples(stream);
+ if (ret) {
+ free_perf_samples(stream);
+ free_cs_buffer(stream);
+ }
+
return ret;
}
@@ -1903,6 +2506,9 @@ static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
static void gen7_oa_enable(struct drm_i915_private *dev_priv)
{
+ struct i915_perf_stream *stream;
+ int idx;
+
/*
* Reset buf pointers so we don't forward reports from before now.
*
@@ -1914,11 +2520,13 @@ static void gen7_oa_enable(struct drm_i915_private *dev_priv)
*/
gen7_init_oa_buffer(dev_priv);
- if (dev_priv->perf.oa.exclusive_stream->enabled) {
+ idx = srcu_read_lock(&dev_priv->perf.oa.srcu);
+ stream = srcu_dereference(dev_priv->perf.oa.exclusive_stream,
+ &dev_priv->perf.oa.srcu);
+ if (!stream->enabled) {
struct i915_gem_context *ctx =
dev_priv->perf.oa.exclusive_stream->ctx;
u32 ctx_id = dev_priv->perf.oa.specific_ctx_id;
-
bool periodic = dev_priv->perf.oa.periodic;
u32 period_exponent = dev_priv->perf.oa.period_exponent;
u32 report_format = dev_priv->perf.oa.oa_buffer.format;
@@ -1933,6 +2541,7 @@ static void gen7_oa_enable(struct drm_i915_private *dev_priv)
GEN7_OACONTROL_ENABLE);
} else
I915_WRITE(GEN7_OACONTROL, 0);
+ srcu_read_unlock(&dev_priv->perf.oa.srcu, idx);
}
static void gen8_oa_enable(struct drm_i915_private *dev_priv)
@@ -1961,21 +2570,22 @@ static void gen8_oa_enable(struct drm_i915_private *dev_priv)
}
/**
- * i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream
- * @stream: An i915 perf stream opened for OA metrics
+ * i915_perf_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for perf stream
+ * @stream: An i915 perf stream opened for GPU metrics
*
* [Re]enables hardware periodic sampling according to the period configured
* when opening the stream. This also starts a hrtimer that will periodically
* check for data in the circular OA buffer for notifying userspace (e.g.
* during a read() or poll()).
*/
-static void i915_oa_stream_enable(struct i915_perf_stream *stream)
+static void i915_perf_stream_enable(struct i915_perf_stream *stream)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- dev_priv->perf.oa.ops.oa_enable(dev_priv);
+ if (stream->sample_flags & SAMPLE_OA_REPORT)
+ dev_priv->perf.oa.ops.oa_enable(dev_priv);
- if (dev_priv->perf.oa.periodic)
+ if (stream->cs_mode || dev_priv->perf.oa.periodic)
hrtimer_start(&dev_priv->perf.oa.poll_check_timer,
ns_to_ktime(POLL_PERIOD),
HRTIMER_MODE_REL_PINNED);
@@ -1992,34 +2602,40 @@ static void gen8_oa_disable(struct drm_i915_private *dev_priv)
}
/**
- * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
- * @stream: An i915 perf stream opened for OA metrics
+ * i915_perf_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for perf stream
+ * @stream: An i915 perf stream opened for GPU metrics
*
* Stops the OA unit from periodically writing counter reports into the
* circular OA buffer. This also stops the hrtimer that periodically checks for
* data in the circular OA buffer, for notifying userspace.
*/
-static void i915_oa_stream_disable(struct i915_perf_stream *stream)
+static void i915_perf_stream_disable(struct i915_perf_stream *stream)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- dev_priv->perf.oa.ops.oa_disable(dev_priv);
-
- if (dev_priv->perf.oa.periodic)
+ if (stream->cs_mode || dev_priv->perf.oa.periodic)
hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
+
+ if (stream->cs_mode)
+ i915_perf_stream_release_samples(stream);
+
+ if (stream->sample_flags & SAMPLE_OA_REPORT)
+ dev_priv->perf.oa.ops.oa_disable(dev_priv);
}
-static const struct i915_perf_stream_ops i915_oa_stream_ops = {
- .destroy = i915_oa_stream_destroy,
- .enable = i915_oa_stream_enable,
- .disable = i915_oa_stream_disable,
- .wait_unlocked = i915_oa_wait_unlocked,
- .poll_wait = i915_oa_poll_wait,
- .read = i915_oa_read,
+static const struct i915_perf_stream_ops perf_stream_ops = {
+ .destroy = i915_perf_stream_destroy,
+ .enable = i915_perf_stream_enable,
+ .disable = i915_perf_stream_disable,
+ .wait_unlocked = i915_perf_stream_wait_unlocked,
+ .poll_wait = i915_perf_stream_poll_wait,
+ .read = i915_perf_stream_read,
+ .emit_sample_capture = i915_perf_stream_emit_sample_capture,
+ .patch_request = i915_perf_stream_patch_request,
};
/**
- * i915_oa_stream_init - validate combined props for OA stream and init
+ * i915_perf_stream_init - validate combined props for stream and init
* @stream: An i915 perf stream
* @param: The open parameters passed to `DRM_I915_PERF_OPEN`
* @props: The property state that configures stream (individually validated)
@@ -2028,55 +2644,27 @@ static void i915_oa_stream_disable(struct i915_perf_stream *stream)
* doesn't ensure that the combination necessarily makes sense.
*
* At this point it has been determined that userspace wants a stream of
- * OA metrics, but still we need to further validate the combined
+ * perf metrics, but still we need to further validate the combined
* properties are OK.
*
* If the configuration makes sense then we can allocate memory for
- * a circular OA buffer and apply the requested metric set configuration.
+ * a circular perf buffer and apply the requested metric set configuration.
*
* Returns: zero on success or a negative error code.
*/
-static int i915_oa_stream_init(struct i915_perf_stream *stream,
- struct drm_i915_perf_open_param *param,
- struct perf_open_properties *props)
+static int i915_perf_stream_init(struct i915_perf_stream *stream,
+ struct drm_i915_perf_open_param *param,
+ struct perf_open_properties *props)
{
struct drm_i915_private *dev_priv = stream->dev_priv;
- int format_size;
+ int format_size, idx;
+ bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT |
+ SAMPLE_OA_SOURCE);
+ bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT;
+ struct i915_perf_stream *curr_stream;
+ struct intel_engine_cs *engine = NULL;
int ret;
- /* If the sysfs metrics/ directory wasn't registered for some
- * reason then don't let userspace try their luck with config
- * IDs
- */
- if (!dev_priv->perf.metrics_kobj) {
- DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
- return -EINVAL;
- }
-
- if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
- DRM_DEBUG("Only OA report sampling supported\n");
- return -EINVAL;
- }
-
- if (!dev_priv->perf.oa.ops.init_oa_buffer) {
- DRM_DEBUG("OA unit not supported\n");
- return -ENODEV;
- }
-
- /* To avoid the complexity of having to accurately filter
- * counter reports and marshal to the appropriate client
- * we currently only allow exclusive access
- */
- if (dev_priv->perf.oa.exclusive_stream) {
- DRM_DEBUG("OA unit already in use\n");
- return -EBUSY;
- }
-
- if (!props->oa_format) {
- DRM_DEBUG("OA report format not specified\n");
- return -EINVAL;
- }
-
/* We set up some ratelimit state to potentially throttle any _NOTES
* about spurious, invalid OA reports which we don't forward to
* userspace.
@@ -2099,62 +2687,141 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
stream->sample_size = sizeof(struct drm_i915_perf_record_header);
- format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size;
+ if (require_oa_unit) {
+ /* If the sysfs metrics/ directory wasn't registered for some
+ * reason then don't let userspace try their luck with config
+ * IDs
+ */
+ if (!dev_priv->perf.metrics_kobj) {
+ DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
+ return -EINVAL;
+ }
+
+ if (!dev_priv->perf.oa.ops.init_oa_buffer) {
+ DRM_DEBUG("OA unit not supported\n");
+ return -ENODEV;
+ }
- stream->sample_flags |= SAMPLE_OA_REPORT;
- stream->sample_size += format_size;
+ /*
+ * To avoid the complexity of having to accurately filter
+ * counter reports and marshal to the appropriate client
+ * we currently only allow exclusive access
+ */
+ idx = srcu_read_lock(&dev_priv->perf.oa.srcu);
+ curr_stream = srcu_dereference(
+ dev_priv->perf.oa.exclusive_stream,
+ &dev_priv->perf.oa.srcu);
+ srcu_read_unlock(&dev_priv->perf.oa.srcu, idx);
+ if (curr_stream) {
+ DRM_ERROR("Stream already opened\n");
+ return -EBUSY;
+ }
- if (props->sample_flags & SAMPLE_OA_SOURCE) {
- stream->sample_flags |= SAMPLE_OA_SOURCE;
- stream->sample_size += 8;
- }
+ if (!props->oa_format) {
+ DRM_DEBUG("OA report format not specified\n");
+ return -EINVAL;
+ }
- dev_priv->perf.oa.oa_buffer.format_size = format_size;
- if (WARN_ON(dev_priv->perf.oa.oa_buffer.format_size == 0))
- return -EINVAL;
+ if (props->cs_mode && (props->engine != RCS)) {
+ DRM_DEBUG_DRIVER(
+ "Command stream OA metrics only available "
+ "via Render CS\n");
+ return -EINVAL;
+ }
+
+ engine = dev_priv->engine[RCS];
+ stream->using_oa = true;
+
+ format_size =
+ dev_priv->perf.oa.oa_formats[props->oa_format].size;
+
+ if (props->sample_flags & SAMPLE_OA_REPORT) {
+ stream->sample_flags |= SAMPLE_OA_REPORT;
+ stream->sample_size += format_size;
+ }
+
+ if (props->sample_flags & SAMPLE_OA_SOURCE) {
+ stream->sample_flags |= SAMPLE_OA_SOURCE;
+ stream->sample_size += 8;
+ }
- dev_priv->perf.oa.oa_buffer.format =
- dev_priv->perf.oa.oa_formats[props->oa_format].format;
+ dev_priv->perf.oa.oa_buffer.format_size = format_size;
+ if (WARN_ON(dev_priv->perf.oa.oa_buffer.format_size == 0))
+ return -EINVAL;
- dev_priv->perf.oa.periodic = props->oa_periodic;
- if (dev_priv->perf.oa.periodic)
- dev_priv->perf.oa.period_exponent = props->oa_period_exponent;
+ dev_priv->perf.oa.oa_buffer.format =
+ dev_priv->perf.oa.oa_formats[props->oa_format].format;
- if (stream->ctx) {
- ret = oa_get_render_ctx_id(stream);
+ dev_priv->perf.oa.periodic = props->oa_periodic;
+ if (dev_priv->perf.oa.periodic)
+ dev_priv->perf.oa.period_exponent =
+ props->oa_period_exponent;
+
+ if (stream->ctx) {
+ ret = oa_get_render_ctx_id(stream);
+ if (ret)
+ return ret;
+ }
+
+ ret = get_oa_config(dev_priv, props->metrics_set,
+ &stream->oa_config);
if (ret)
- return ret;
+ goto err_config;
+
+ /* PRM - observability performance counters:
+ *
+ * OACONTROL, performance counter enable, note:
+ *
+ * "When this bit is set, in order to have coherent counts,
+ * RC6 power state and trunk clock gating must be disabled.
+ * This can be achieved by programming MMIO registers as
+ * 0xA094=0 and 0xA090[31]=1"
+ *
+ * In our case we are expecting that taking pm + FORCEWAKE
+ * references will effectively disable RC6.
+ */
+ intel_runtime_pm_get(dev_priv);
+ intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+ ret = alloc_oa_buffer(dev_priv);
+ if (ret)
+ goto err_oa_buf_alloc;
+
+ ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
+ stream->oa_config);
+ if (ret)
+ goto err_enable;
+
}
- ret = get_oa_config(dev_priv, props->metrics_set, &stream->oa_config);
- if (ret)
- goto err_config;
+ if (props->cs_mode) {
+ if (!cs_sample_data) {
+ DRM_DEBUG_DRIVER(
+ "Stream engine given without requesting any "
+ "CS data to sample\n");
+ ret = -EINVAL;
+ goto err_enable;
+ }
- /* PRM - observability performance counters:
- *
- * OACONTROL, performance counter enable, note:
- *
- * "When this bit is set, in order to have coherent counts,
- * RC6 power state and trunk clock gating must be disabled.
- * This can be achieved by programming MMIO registers as
- * 0xA094=0 and 0xA090[31]=1"
- *
- * In our case we are expecting that taking pm + FORCEWAKE
- * references will effectively disable RC6.
- */
- intel_runtime_pm_get(dev_priv);
- intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+ idx = srcu_read_lock(&dev_priv->perf.oa.srcu);
+ curr_stream = srcu_dereference(
+ dev_priv->perf.oa.exclusive_stream,
+ &dev_priv->perf.oa.srcu);
+ srcu_read_unlock(&dev_priv->perf.oa.srcu, idx);
+ if (curr_stream) {
+ DRM_ERROR("Stream already opened\n");
+ ret = -EINVAL;
+ goto err_enable;
+ }
- ret = alloc_oa_buffer(dev_priv);
- if (ret)
- goto err_oa_buf_alloc;
+ ret = alloc_cs_buffer(stream);
+ if (ret)
+ goto err_enable;
- ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
- stream->oa_config);
- if (ret)
- goto err_enable;
+ stream->cs_mode = true;
+ }
- stream->ops = &i915_oa_stream_ops;
+ stream->ops = &perf_stream_ops;
/* Lock device for exclusive_stream access late because
* enable_metric_set() might lock as well on gen8+.
@@ -2162,9 +2829,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
ret = i915_mutex_lock_interruptible(&dev_priv->drm);
if (ret)
goto err_lock;
-
- dev_priv->perf.oa.exclusive_stream = stream;
-
+ rcu_assign_pointer(dev_priv->perf.oa.exclusive_stream, stream);
mutex_unlock(&dev_priv->drm.struct_mutex);
return 0;
@@ -2173,14 +2838,15 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
err_enable:
- free_oa_buffer(dev_priv);
+ if (require_oa_unit)
+ free_oa_buffer(dev_priv);
err_oa_buf_alloc:
- put_oa_config(dev_priv, stream->oa_config);
-
- intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
- intel_runtime_pm_put(dev_priv);
-
+ if (require_oa_unit) {
+ put_oa_config(dev_priv, stream->oa_config);
+ intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+ intel_runtime_pm_put(dev_priv);
+ }
err_config:
if (stream->ctx)
oa_put_render_ctx_id(stream);
@@ -2311,25 +2977,32 @@ static ssize_t i915_perf_read(struct file *file,
* effectively ensures we back off until the next hrtimer callback
* before reporting another POLLIN event.
*/
- if (ret >= 0 || ret == -EAGAIN) {
- /* Maybe make ->pollin per-stream state if we support multiple
- * concurrent streams in the future.
- */
+ if (ret >= 0 || ret == -EAGAIN)
dev_priv->perf.oa.pollin = false;
- }
return ret;
}
-static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
+static enum hrtimer_restart poll_check_timer_cb(struct hrtimer *hrtimer)
{
+ struct i915_perf_stream *stream;
struct drm_i915_private *dev_priv =
container_of(hrtimer, typeof(*dev_priv),
perf.oa.poll_check_timer);
-
- if (oa_buffer_check_unlocked(dev_priv)) {
- dev_priv->perf.oa.pollin = true;
- wake_up(&dev_priv->perf.oa.poll_wq);
+ int idx;
+ struct intel_engine_cs *engine;
+ enum intel_engine_id id;
+
+ for_each_engine(engine, dev_priv, id) {
+ idx = srcu_read_lock(&dev_priv->perf.oa.srcu);
+ stream = srcu_dereference(dev_priv->perf.oa.exclusive_stream,
+ &dev_priv->perf.oa.srcu);
+ if (stream && stream->enabled &&
+ stream_have_data_unlocked(stream)) {
+ dev_priv->perf.oa.pollin = true;
+ wake_up(&dev_priv->perf.oa.poll_wq);
+ }
+ srcu_read_unlock(&dev_priv->perf.oa.srcu, idx);
}
hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
@@ -2416,10 +3089,12 @@ static void i915_perf_enable_locked(struct i915_perf_stream *stream)
return;
/* Allow stream->ops->enable() to refer to this */
- stream->enabled = true;
+ stream->enabled = false;
if (stream->ops->enable)
stream->ops->enable(stream);
+
+ stream->enabled = true;
}
/**
@@ -2520,8 +3195,6 @@ static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
if (stream->ops->destroy)
stream->ops->destroy(stream);
- list_del(&stream->link);
-
if (stream->ctx)
i915_gem_context_put(stream->ctx);
@@ -2581,7 +3254,7 @@ static int i915_perf_release(struct inode *inode, struct file *file)
*
* In the case where userspace is interested in OA unit metrics then further
* config validation and stream initialization details will be handled by
- * i915_oa_stream_init(). The code here should only validate config state that
+ * i915_perf_stream_init(). The code here should only validate config state that
* will be relevant to all stream types / backends.
*
* Returns: zero on success or a negative error code.
@@ -2650,7 +3323,7 @@ static int i915_perf_release(struct inode *inode, struct file *file)
stream->dev_priv = dev_priv;
stream->ctx = specific_ctx;
- ret = i915_oa_stream_init(stream, param, props);
+ ret = i915_perf_stream_init(stream, param, props);
if (ret)
goto err_alloc;
@@ -2663,8 +3336,6 @@ static int i915_perf_release(struct inode *inode, struct file *file)
goto err_flags;
}
- list_add(&stream->link, &dev_priv->perf.streams);
-
if (param->flags & I915_PERF_FLAG_FD_CLOEXEC)
f_flags |= O_CLOEXEC;
if (param->flags & I915_PERF_FLAG_FD_NONBLOCK)
@@ -2682,7 +3353,6 @@ static int i915_perf_release(struct inode *inode, struct file *file)
return stream_fd;
err_open:
- list_del(&stream->link);
err_flags:
if (stream->ops->destroy)
stream->ops->destroy(stream);
@@ -2830,6 +3500,26 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
case DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE:
props->sample_flags |= SAMPLE_OA_SOURCE;
break;
+ case DRM_I915_PERF_PROP_ENGINE: {
+ unsigned int user_ring_id =
+ value & I915_EXEC_RING_MASK;
+ enum intel_engine_id engine;
+
+ if (user_ring_id > I915_USER_RINGS)
+ return -EINVAL;
+
+ /* XXX: Currently only RCS is supported.
+ * Remove this check when support for other
+ * engines is added
+ */
+ engine = user_ring_map[user_ring_id];
+ if (engine != RCS)
+ return -EINVAL;
+
+ props->cs_mode = true;
+ props->engine = engine;
+ }
+ break;
case DRM_I915_PERF_PROP_MAX:
MISSING_CASE(id);
return -EINVAL;
@@ -3399,6 +4089,25 @@ int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
{}
};
+void i915_perf_streams_mark_idle(struct drm_i915_private *dev_priv)
+{
+ struct i915_perf_stream *stream;
+ int idx;
+
+ idx = srcu_read_lock(&dev_priv->perf.oa.srcu);
+ stream = srcu_dereference(dev_priv->perf.oa.exclusive_stream,
+ &dev_priv->perf.oa.srcu);
+ if (stream && stream->enabled && stream->cs_mode) {
+ struct reservation_object *resv =
+ stream->cs_buffer.vma->resv;
+
+ reservation_object_lock(resv, NULL);
+ reservation_object_add_excl_fence(resv, NULL);
+ reservation_object_unlock(resv);
+ }
+ srcu_read_unlock(&dev_priv->perf.oa.srcu, idx);
+}
+
/**
* i915_perf_init - initialize i915-perf state on module load
* @dev_priv: i915 device instance
@@ -3492,13 +4201,17 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
if (dev_priv->perf.oa.timestamp_frequency) {
hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
+ dev_priv->perf.oa.poll_check_timer.function =
+ poll_check_timer_cb;
init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
- INIT_LIST_HEAD(&dev_priv->perf.streams);
mutex_init(&dev_priv->perf.lock);
spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
+ /* Perf stream related initialization for Engine */
+ rcu_assign_pointer(dev_priv->perf.oa.exclusive_stream, NULL);
+ init_srcu_struct(&dev_priv->perf.oa.srcu);
+
oa_sample_rate_hard_limit =
dev_priv->perf.oa.timestamp_frequency / 2;
dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
@@ -3536,5 +4249,7 @@ void i915_perf_fini(struct drm_i915_private *dev_priv)
memset(&dev_priv->perf.oa.ops, 0, sizeof(dev_priv->perf.oa.ops));
+ cleanup_srcu_struct(&dev_priv->perf.oa.srcu);
+
dev_priv->perf.initialized = false;
}
@@ -1381,6 +1381,7 @@ enum drm_i915_oa_format {
enum drm_i915_perf_sample_oa_source {
I915_PERF_SAMPLE_OA_SOURCE_OABUFFER,
+ I915_PERF_SAMPLE_OA_SOURCE_CS,
I915_PERF_SAMPLE_OA_SOURCE_MAX /* non-ABI */
};
@@ -1425,6 +1426,13 @@ enum drm_i915_perf_property_id {
*/
DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE,
+ /**
+ * The value of this property specifies the GPU engine for which
+ * the samples need to be collected. Specifying this property also
+ * implies the command stream based sample collection.
+ */
+ DRM_I915_PERF_PROP_ENGINE,
+
DRM_I915_PERF_PROP_MAX /* non-ABI */
};