@@ -2104,6 +2104,14 @@ static int eb_oa_config(struct i915_execbuffer *eb)
if (err)
return err;
+ /*
+ * If the perf stream was opened with hold preemption, flag the
+ * request so that its priority is bumped once it reaches the
+ * execlist ports.
+ */
+ if (eb->i915->perf.oa.exclusive_stream->hold_preemption)
+ eb->request->flags |= I915_REQUEST_FLAGS_PERF;
+
/*
* If the config hasn't changed, skip reconfiguring the HW (this is
* subject to a delay we want to avoid as much as possible).
@@ -256,7 +256,12 @@ static inline int rq_prio(const struct i915_request *rq)
static int effective_prio(const struct i915_request *rq)
{
- int prio = rq_prio(rq);
+ int prio;
+
+ if (i915_request_has_perf(rq))
+ prio = I915_USER_PRIORITY(I915_PRIORITY_PERF);
+ else
+ prio = rq_prio(rq);
/*
* On unwinding the active request, we give it a priority bump
@@ -1232,6 +1232,14 @@ struct i915_perf_stream {
*/
bool enabled;
+ /**
+ * @hold_preemption: Whether preemption is put on hold for command
+ * submissions done on the @ctx. This is useful for drivers that
+ * cannot easily post-process the OA buffer to subtract the deltas
+ * of performance counters not associated with @ctx.
+ */
+ bool hold_preemption;
+
/**
* @ops: The callbacks providing the implementation of this specific
* type of configured stream.
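To make the @hold_preemption use case concrete: with preemption held on @ctx, a driver can take two MI_REPORT_PERF_COUNT snapshots around a workload and subtract them directly, instead of walking the OA buffer to filter out reports from other contexts. A minimal userspace-side sketch; the counter count and uniform 32-bit width are simplifying assumptions, not the real report layout:

	/* Hypothetical helper: delta of two raw reports written by
	 * MI_REPORT_PERF_COUNT. Assumes N_COUNTERS 32-bit counters per
	 * report; the cast back to 32 bits handles counter wraparound.
	 */
	#define N_COUNTERS 36

	static void oa_report_delta(const uint32_t *begin, const uint32_t *end,
				    uint64_t *delta)
	{
		int i;

		for (i = 0; i < N_COUNTERS; i++)
			delta[i] = (uint32_t)(end[i] - begin[i]);
	}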
@@ -344,6 +344,8 @@ static const struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
* struct perf_open_properties - for validated properties given to open a stream
* @sample_flags: `DRM_I915_PERF_PROP_SAMPLE_*` properties are tracked as flags
* @single_context: Whether a single or all gpu contexts should be monitored
+ * @hold_preemption: Whether preemption is disabled for the filtered
+ * context
* @ctx_handle: A gem ctx handle for use with @single_context
* @metrics_set: An ID for an OA unit metric set advertised via sysfs
* @oa_format: An OA unit HW report format
@@ -358,6 +360,7 @@ struct perf_open_properties {
u32 sample_flags;
u64 single_context:1;
+ u64 hold_preemption:1;
u64 ctx_handle;
/* OA sampling state */
@@ -2401,6 +2404,8 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
stream->sample_flags |= SAMPLE_OA_REPORT;
stream->sample_size += format_size;
+ stream->hold_preemption = props->hold_preemption;
+
dev_priv->perf.oa.oa_buffer.format_size = format_size;
if (WARN_ON(dev_priv->perf.oa.oa_buffer.format_size == 0))
return -EINVAL;
@@ -2942,6 +2947,15 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
}
}
+ if (props->hold_preemption) {
+ if (!props->single_context) {
+ DRM_DEBUG("preemption disable with no context\n");
+ ret = -EINVAL;
+ goto err;
+ }
+ privileged_op = true;
+ }
+
/*
* On Haswell the OA unit supports clock gating off for a specific
* context and in this mode there's no visibility of metrics for the
@@ -2956,8 +2970,9 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
* MI_REPORT_PERF_COUNT commands and so consider it a privileged op to
* enable the OA unit by default.
*/
- if (IS_HASWELL(dev_priv) && specific_ctx)
+ if (IS_HASWELL(dev_priv) && specific_ctx && !props->hold_preemption)
privileged_op = false;
/* Similar to perf's kernel.perf_paranoid_cpu sysctl option
* we check a dev.i915.perf_stream_paranoid sysctl option
@@ -2966,7 +2981,7 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
*/
if (privileged_op &&
i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
- DRM_DEBUG("Insufficient privileges to open system-wide i915 perf stream\n");
+ DRM_DEBUG("Insufficient privileges to open i915 perf stream\n");
ret = -EACCES;
goto err_ctx;
}
@@ -3163,6 +3178,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
props->oa_periodic = true;
props->oa_period_exponent = value;
break;
+ case DRM_I915_PERF_PROP_HOLD_PREEMPTION:
+ props->hold_preemption = !!value;
+ break;
case DRM_I915_PERF_PROP_MAX:
MISSING_CASE(id);
return -EINVAL;
@@ -3918,5 +3936,12 @@ void i915_perf_fini(struct drm_i915_private *dev_priv)
*/
int i915_perf_ioctl_version(void)
{
- return 1;
+ /* 1: Initial version
+ *
+ * 2: Add DRM_I915_PERF_PROP_HOLD_PREEMPTION parameter to hold
+ * preemption on a particular context so that performance data is
+ * accessible from a delta of MI_RPC reports without looking at the
+ * OA buffer.
+ */
+ return 2;
}
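Userspace can gate its use of the new property on this revision. A hedged sketch, assuming the I915_PARAM_PERF_REVISION getparam plumbing added alongside i915_perf_ioctl_version():

	/* Illustrative only: enable DRM_I915_PERF_PROP_HOLD_PREEMPTION
	 * only when the kernel reports an i915-perf revision >= 2.
	 */
	int perf_revision = 0;
	struct drm_i915_getparam gp = {
		.param = I915_PARAM_PERF_REVISION,
		.value = &perf_revision,
	};
	bool use_hold_preemption = false;

	if (ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp) == 0 &&
	    perf_revision >= 2)
		use_hold_preemption = true;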
@@ -17,6 +17,13 @@ enum {
I915_PRIORITY_NORMAL = I915_CONTEXT_DEFAULT_PRIORITY,
I915_PRIORITY_MAX = I915_CONTEXT_MAX_USER_PRIORITY + 1,
+ /* Requests containing performance queries must not be preempted by
+ * another context. They get scheduled with their default priority and
+ * once they reach the execlist ports we bump them to
+ * I915_PRIORITY_PERF so that they stick to the HW until they finish.
+ */
+ I915_PRIORITY_PERF,
+
I915_PRIORITY_INVALID = INT_MIN
};
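For reference, where the new enumerator lands on the priority scale, assuming the existing uAPI bounds (I915_CONTEXT_DEFAULT_PRIORITY == 0, I915_CONTEXT_MAX_USER_PRIORITY == 1023):

	/* Sketch, not part of the patch:
	 *
	 *   I915_PRIORITY_NORMAL == 0
	 *   I915_PRIORITY_MAX    == 1024  (first value userspace cannot request)
	 *   I915_PRIORITY_PERF   == 1025  (above any user priority)
	 *
	 * effective_prio() additionally applies the I915_USER_PRIORITY()
	 * shift before the scheduler compares priorities.
	 */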
@@ -292,7 +292,7 @@ static bool i915_request_retire(struct i915_request *rq)
dma_fence_signal_locked(&rq->fence);
if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags))
i915_request_cancel_breadcrumb(rq);
- if (rq->waitboost) {
+ if (i915_request_has_waitboost(rq)) {
GEM_BUG_ON(!atomic_read(&rq->i915->gt_pm.rps.num_waiters));
atomic_dec(&rq->i915->gt_pm.rps.num_waiters);
}
@@ -684,7 +684,7 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp)
rq->file_priv = NULL;
rq->batch = NULL;
rq->capture_list = NULL;
- rq->waitboost = false;
+ rq->flags = 0;
rq->execution_mask = ALL_ENGINES;
INIT_LIST_HEAD(&rq->active_list);
@@ -216,7 +216,9 @@ struct i915_request {
/** Time at which this request was emitted, in jiffies. */
unsigned long emitted_jiffies;
- bool waitboost;
+#define I915_REQUEST_FLAGS_WAITBOOST BIT(0)
+#define I915_REQUEST_FLAGS_PERF BIT(1)
+ u32 flags;
/** timeline->request entry for this request */
struct list_head link;
@@ -430,6 +432,16 @@ static inline void i915_request_mark_complete(struct i915_request *rq)
rq->hwsp_seqno = (u32 *)&rq->fence.seqno; /* decouple from HWSP */
}
+static inline bool i915_request_has_waitboost(const struct i915_request *rq)
+{
+ return rq->flags & I915_REQUEST_FLAGS_WAITBOOST;
+}
+
+static inline bool i915_request_has_perf(const struct i915_request *rq)
+{
+ return rq->flags & I915_REQUEST_FLAGS_PERF;
+}
+
bool i915_retire_requests(struct drm_i915_private *i915);
#endif /* I915_REQUEST_H */
@@ -707,6 +707,14 @@ static inline int rq_prio(const struct i915_request *rq)
return rq->sched.attr.priority | __NO_PREEMPTION;
}
+static inline int effective_prio(const struct i915_request *rq)
+{
+ if (i915_request_has_perf(rq))
+ return I915_USER_PRIORITY(I915_PRIORITY_PERF) | __NO_PREEMPTION;
+
+ return rq_prio(rq);
+}
+
static struct i915_request *schedule_in(struct i915_request *rq, int idx)
{
trace_i915_request_in(rq, idx);
@@ -747,7 +755,7 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
&engine->i915->guc.preempt_work[engine->id];
int prio = execlists->queue_priority_hint;
- if (i915_scheduler_need_preempt(prio, rq_prio(last))) {
+ if (i915_scheduler_need_preempt(prio, effective_prio(last))) {
intel_write_status_page(engine,
I915_GEM_HWS_PREEMPT,
GUC_PREEMPT_INPROGRESS);
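The effect on the preemption decision, sketched with a simplified, hypothetical comparator rather than the kernel's exact i915_scheduler_need_preempt() helper:

	/* Simplified sketch: preempt only when the queued priority
	 * strictly outranks the running request. effective_prio()
	 * reports I915_USER_PRIORITY(I915_PRIORITY_PERF) for a flagged
	 * request, above any user-requested priority, so this check
	 * never fires while the perf workload runs.
	 */
	static bool need_preempt_sketch(int queued_prio, int running_prio)
	{
		return queued_prio > running_prio;
	}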
@@ -6876,9 +6876,10 @@ void gen6_rps_boost(struct i915_request *rq)
/* Serializes with i915_request_retire() */
boost = false;
spin_lock_irqsave(&rq->lock, flags);
- if (!rq->waitboost && !dma_fence_is_signaled_locked(&rq->fence)) {
+ if (!i915_request_has_waitboost(rq) &&
+ !dma_fence_is_signaled_locked(&rq->fence)) {
boost = !atomic_fetch_inc(&rps->num_waiters);
- rq->waitboost = true;
+ rq->flags |= I915_REQUEST_FLAGS_WAITBOOST;
}
spin_unlock_irqrestore(&rq->lock, flags);
if (!boost)
@@ -1984,6 +1984,17 @@ enum drm_i915_perf_property_id {
*/
DRM_I915_PERF_PROP_OA_EXPONENT,
+ /**
+ * Specifying this property is only valid when specifying a context to
+ * filter with DRM_I915_PERF_PROP_CTX_HANDLE. It holds preemption on
+ * the particular context we want to gather performance data about.
+ * The execbuf2 submissions must include a
+ * drm_i915_gem_execbuffer_ext_perf parameter for this to apply.
+ *
+ * This property is available in perf revision 2.
+ */
+ DRM_I915_PERF_PROP_HOLD_PREEMPTION,
+
DRM_I915_PERF_PROP_MAX /* non-ABI */
};
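Putting the uAPI pieces together, a hedged end-to-end sketch of opening a filtered OA stream with preemption held. The OA format choice and the SAMPLE_OA flag are illustrative, and per the comment above, execbuf2 submissions additionally need the drm_i915_gem_execbuffer_ext_perf extension for their requests to be flagged:

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <drm/i915_drm.h>

	/* Illustrative only: open a filtered OA stream that holds
	 * preemption on ctx_handle. metrics_set_id comes from sysfs
	 * (/sys/class/drm/card0/metrics/<uuid>/id).
	 */
	static int open_hold_preemption_stream(int drm_fd, uint32_t ctx_handle,
					       uint64_t metrics_set_id)
	{
		uint64_t properties[] = {
			DRM_I915_PERF_PROP_CTX_HANDLE, ctx_handle,
			DRM_I915_PERF_PROP_SAMPLE_OA, 1,
			DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
			DRM_I915_PERF_PROP_OA_FORMAT,
				I915_OA_FORMAT_A32u40_A4u32_B8_C8,
			DRM_I915_PERF_PROP_HOLD_PREEMPTION, 1,
		};
		struct drm_i915_perf_open_param param = {
			.flags = I915_PERF_FLAG_FD_CLOEXEC,
			.num_properties = sizeof(properties) /
					  (2 * sizeof(uint64_t)),
			.properties_ptr = (uintptr_t)properties,
		};

		/* Fails with -EACCES for unprivileged users while
		 * dev.i915.perf_stream_paranoid is set, since holding
		 * preemption is a privileged operation.
		 */
		return ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
	}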