@@ -1653,6 +1653,13 @@ struct i915_oa_reg {
u32 value;
};
+struct i915_oa_rcs_node {
+ struct list_head head;
+ struct drm_i915_gem_request *req;
+ u32 offset;
+ u32 ctx_id;
+};
+
extern const struct i915_oa_reg i915_oa_3d_mux_config_hsw[];
extern const int i915_oa_3d_mux_config_hsw_len;
extern const struct i915_oa_reg i915_oa_3d_b_counter_config_hsw[];
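For orientation, here is a minimal sketch of an emit-side helper that would populate these nodes (that path is not in these hunks); the name i915_oa_rcs_queue_node, the GFP_KERNEL allocation and the caller-supplied buffer offset are assumptions, not the series' actual code:

/*
 * Hypothetical sketch: queue one node per RCS-commanded OA report, recording
 * which request it belongs to and where in oa_rcs_buffer the report will land.
 */
static int i915_oa_rcs_queue_node(struct drm_i915_private *dev_priv,
				  struct drm_i915_gem_request *req,
				  u32 ctx_id, u32 offset)
{
	struct i915_oa_rcs_node *node;
	unsigned long flags;

	node = kzalloc(sizeof(*node), GFP_KERNEL);
	if (node == NULL)
		return -ENOMEM;

	i915_gem_request_assign(&node->req, req);
	node->ctx_id = ctx_id;
	node->offset = offset;

	spin_lock_irqsave(&dev_priv->oa_pmu.lock, flags);
	list_add_tail(&node->head, &dev_priv->oa_pmu.node_list);
	spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, flags);

	return 0;
}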
@@ -1952,7 +1959,11 @@ struct drm_i915_private {
u8 *addr;
int format;
int format_size;
+ u32 node_size;
+ u32 node_count;
} oa_rcs_buffer;
+ struct list_head node_list;
+ struct work_struct work_timer;
} oa_pmu;
#endif
@@ -58,6 +58,14 @@ static u32 forward_oa_snapshots(struct drm_i915_private *dev_priv,
u8 *snapshot;
u32 taken;
+ /*
+ * Schedule a worker to forward the RCS based OA reports collected.
+ * A worker is needed since forwarding them requires the device mutex,
+ * which can't be taken here from atomic context.
+ */
+ if (dev_priv->oa_pmu.multiple_ctx_mode)
+ schedule_work(&dev_priv->oa_pmu.work_timer);
+
head -= dev_priv->oa_pmu.oa_buffer.gtt_offset;
tail -= dev_priv->oa_pmu.oa_buffer.gtt_offset;
@@ -165,6 +173,103 @@ static void flush_oa_snapshots(struct drm_i915_private *dev_priv,
spin_unlock_irqrestore(&dev_priv->oa_pmu.oa_buffer.flush_lock, flags);
}
+int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
+{
+ struct i915_oa_rcs_node *last_entry;
+ unsigned long lock_flags;
+ int ret;
+
+ /*
+ * Wait for the last scheduled request to complete. This implicitly
+ * waits for the previously submitted requests as well. The request
+ * refcounts are not decremented here.
+ */
+ spin_lock_irqsave(&dev_priv->oa_pmu.lock, lock_flags);
+
+ if (list_empty(&dev_priv->oa_pmu.node_list)) {
+ spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags);
+ return 0;
+ }
+ last_entry = list_last_entry(&dev_priv->oa_pmu.node_list,
+ struct i915_oa_rcs_node, head);
+ spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags);
+
+ if (last_entry && last_entry->req) {
+ ret = __i915_wait_request(last_entry->req,
+ atomic_read(&dev_priv->gpu_error.reset_counter),
+ dev_priv->mm.interruptible, NULL, NULL);
+ if (ret) {
+ DRM_ERROR("Failed to wait for OA request: %d\n", ret);
+ return ret;
+ }
+ }
+ return 0;
+}
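As wired up further down in this patch, i915_oa_event_flush() calls this in multiple-context mode before flush_oa_snapshots(), so a flush only proceeds once the GPU has finished the requests carrying the RCS-commanded reports.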
+
+static void forward_one_oa_rcs_sample(struct drm_i915_private *dev_priv,
+ struct i915_oa_rcs_node *node)
+{
+ struct perf_sample_data data;
+ struct perf_event *event = dev_priv->oa_pmu.exclusive_event;
+ int format_size, snapshot_size;
+ u8 *snapshot;
+ struct drm_i915_oa_node_ctx_id *ctx_info;
+ struct perf_raw_record raw;
+
+ format_size = dev_priv->oa_pmu.oa_rcs_buffer.format_size;
+ snapshot_size = format_size + sizeof(*ctx_info);
+ snapshot = dev_priv->oa_pmu.oa_rcs_buffer.addr + node->offset;
+
+ ctx_info = (struct drm_i915_oa_node_ctx_id *)(snapshot + format_size);
+ ctx_info->ctx_id = node->ctx_id;
+
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ /* Note: the combined u32 raw->size member + raw data itself must be 8
+ * byte aligned. (See note in init_oa_buffer for more details) */
+ raw.size = snapshot_size + 4;
+ raw.data = snapshot;
+
+ data.raw = &raw;
+
+ perf_event_overflow(event, &data, &dev_priv->oa_pmu.dummy_regs);
+}
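For reference, the raw record built here reaches userspace as: a u32 size, the OA report (format_size bytes), the struct drm_i915_oa_node_ctx_id footer filled in just above, plus 4 trailing bytes implied by raw.size = snapshot_size + 4 to keep the combined length 8-byte aligned.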
+
+void forward_oa_rcs_snapshots_work(struct work_struct *__work)
+{
+ struct drm_i915_private *dev_priv =
+ container_of(__work, typeof(*dev_priv), oa_pmu.work_timer);
+ struct i915_oa_rcs_node *entry, *next;
+ struct drm_i915_gem_request *req;
+ unsigned long lock_flags;
+ int ret;
+
+ list_for_each_entry_safe(entry, next,
+ &dev_priv->oa_pmu.node_list, head) {
+ req = entry->req;
+ if (!req || !i915_gem_request_completed(req, true))
+ break;
+
+ ret = i915_mutex_lock_interruptible(dev_priv->dev);
+ if (ret)
+ break;
+ forward_one_oa_rcs_sample(dev_priv, entry);
+ i915_gem_request_assign(&entry->req, NULL);
+ mutex_unlock(&dev_priv->dev->struct_mutex);
+
+ /*
+ * Do we instead need to protect the whole loop? If so, we would
+ * need to *list_move_tail* to a deferred list, from where the
+ * i915 device mutex could be taken to unreference the requests
+ * and free the nodes.
+ */
+ spin_lock_irqsave(&dev_priv->oa_pmu.lock, lock_flags);
+ list_del(&entry->head);
+ spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, lock_flags);
+ kfree(entry);
+ }
+}
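Illustrative only: one possible shape of the deferred-list variant the comment above speculates about. Completed nodes are forwarded and moved to a private list entirely under oa_pmu.lock; the requests are then unreferenced and the nodes freed outside the spinlock, under struct_mutex. The helper name is made up.

static void forward_oa_rcs_snapshots_deferred(struct drm_i915_private *dev_priv)
{
	struct i915_oa_rcs_node *entry, *next;
	unsigned long flags;
	LIST_HEAD(deferred);

	spin_lock_irqsave(&dev_priv->oa_pmu.lock, flags);
	list_for_each_entry_safe(entry, next,
				 &dev_priv->oa_pmu.node_list, head) {
		if (!entry->req ||
		    !i915_gem_request_completed(entry->req, true))
			break;
		forward_one_oa_rcs_sample(dev_priv, entry);
		list_move_tail(&entry->head, &deferred);
	}
	spin_unlock_irqrestore(&dev_priv->oa_pmu.lock, flags);

	if (list_empty(&deferred))
		return;

	mutex_lock(&dev_priv->dev->struct_mutex);
	list_for_each_entry_safe(entry, next, &deferred, head) {
		i915_gem_request_assign(&entry->req, NULL);
		list_del(&entry->head);
		kfree(entry);
	}
	mutex_unlock(&dev_priv->dev->struct_mutex);
}

Whether holding oa_pmu.lock across perf_event_overflow() is an acceptable trade-off is exactly the open question raised in the comment.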
+
static void
oa_rcs_buffer_destroy(struct drm_i915_private *i915)
{
@@ -364,7 +469,7 @@ static int init_oa_rcs_buffer(struct perf_event *event)
struct drm_i915_private *dev_priv =
container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu);
struct drm_i915_gem_object *bo;
- int ret;
+ int ret, node_size;
BUG_ON(dev_priv->oa_pmu.oa_rcs_buffer.obj);
@@ -375,6 +480,16 @@ static int init_oa_rcs_buffer(struct perf_event *event)
dev_priv->oa_pmu.oa_rcs_buffer.obj = bo;
dev_priv->oa_pmu.oa_rcs_buffer.addr = vmap_oa_buffer(bo);
+ INIT_LIST_HEAD(&dev_priv->oa_pmu.node_list);
+
+ node_size = dev_priv->oa_pmu.oa_rcs_buffer.format_size +
+ sizeof(struct drm_i915_oa_node_ctx_id);
+
+ /* node size has to be aligned to 64 bytes, since only 64 byte aligned
+ * addresses can be given to the OA unit for dumping OA reports */
+ node_size = ALIGN(node_size, 64);
+ dev_priv->oa_pmu.oa_rcs_buffer.node_size = node_size;
+ dev_priv->oa_pmu.oa_rcs_buffer.node_count = bo->base.size / node_size;
DRM_DEBUG_DRIVER("OA RCS Buffer initialized, vaddr = %p",
dev_priv->oa_pmu.oa_rcs_buffer.addr);
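As a worked example with assumed numbers: for a 256 byte OA report format, node_size = ALIGN(256 + 8, 64) = 320 bytes, so a 2 MiB oa_rcs_buffer holds 2097152 / 320 = 6553 nodes.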
@@ -849,7 +964,13 @@ static int i915_oa_event_flush(struct perf_event *event)
if (event->attr.sample_period) {
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), oa_pmu.pmu);
+ int ret;
+ if (i915->oa_pmu.multiple_ctx_mode) {
+ ret = i915_oa_rcs_wait_gpu(i915);
+ if (ret)
+ return ret;
+ }
flush_oa_snapshots(i915, true);
}
@@ -945,6 +1066,8 @@ void i915_oa_pmu_register(struct drm_device *dev)
hrtimer_init(&i915->oa_pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
i915->oa_pmu.timer.function = hrtimer_sample;
+ INIT_WORK(&i915->oa_pmu.work_timer, forward_oa_rcs_snapshots_work);
+
spin_lock_init(&i915->oa_pmu.lock);
i915->oa_pmu.pmu.capabilities = PERF_PMU_CAP_IS_DEVICE;
@@ -974,6 +1097,9 @@ void i915_oa_pmu_unregister(struct drm_device *dev)
if (i915->oa_pmu.pmu.event_init == NULL)
return;
+ if (i915->oa_pmu.multiple_ctx_mode)
+ cancel_work_sync(&i915->oa_pmu.work_timer);
+
unregister_sysctl_table(i915->oa_pmu.sysctl_header);
perf_pmu_unregister(&i915->oa_pmu.pmu);
@@ -123,6 +123,11 @@ enum drm_i915_oa_event_type {
I915_OA_RECORD_MAX, /* non-ABI */
};
+struct drm_i915_oa_node_ctx_id {
+ __u32 ctx_id;
+ __u32 pad;
+};
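A hedged sketch of how a userspace consumer might read the context ID back out of one of these raw samples; it assumes the record layout built in forward_one_oa_rcs_sample() above (OA report of format_size bytes followed by this footer), and the helper name is illustrative. The struct is redeclared under a local name so the snippet stands alone; a real tool would include the i915 uapi header instead.

#include <stdint.h>

struct i915_oa_node_ctx_id_footer {
	uint32_t ctx_id;
	uint32_t pad;
};

/* 'raw' points just past the u32 size field of a PERF_SAMPLE_RAW record;
 * 'format_size' must match the OA format the event was opened with. */
static uint32_t oa_rcs_sample_ctx_id(const uint8_t *raw, uint32_t format_size)
{
	const struct i915_oa_node_ctx_id_footer *footer =
		(const struct i915_oa_node_ctx_id_footer *)(raw + format_size);

	return footer->ctx_id;
}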
+
/* Each region is a minimum of 16k, and there are at most 255 of them.
*/
#define I915_NR_TEX_REGIONS 255 /* table size 2k - maximum due to use