Message ID | 20190709123351.5645-7-lionel.g.landwerlin@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/i915: Vulkan performance query support | expand |
On Tue, Jul 09, 2019 at 03:33:44PM +0300, Lionel Landwerlin wrote: >NOA configuration take some amount of time to apply. That amount of >time depends on the size of the GT. There is no documented time for >this. For example, past experimentations with powergating >configuration changes seem to indicate a 60~70us delay. We go with >500us as default for now which should be over the required amount of >time (according to HW architects). > >v2: Don't forget to save/restore registers used for the wait (Chris) > >v3: Name used CS_GPR registers (Chris) > Fix compile issue due to rebase (Lionel) > >Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> >Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> >--- > drivers/gpu/drm/i915/gt/intel_gpu_commands.h | 24 ++ > drivers/gpu/drm/i915/gt/intel_gt_types.h | 5 + > drivers/gpu/drm/i915/i915_debugfs.c | 31 +++ > drivers/gpu/drm/i915/i915_drv.h | 8 + > drivers/gpu/drm/i915/i915_perf.c | 226 ++++++++++++++++++- > drivers/gpu/drm/i915/i915_reg.h | 4 +- > 6 files changed, 295 insertions(+), 3 deletions(-) > >diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h >index e7eff9db343e..4a66af38c87b 100644 >--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h >+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h >@@ -151,6 +151,7 @@ > #define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */ > #define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1) > #define MI_BATCH_RESOURCE_STREAMER (1<<10) >+#define MI_BATCH_PREDICATE (1 << 15) /* HSW+ on RCS only*/ > > /* > * 3D instructions used by the kernel >@@ -226,6 +227,29 @@ > #define PIPE_CONTROL_DEPTH_CACHE_FLUSH (1<<0) > #define PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */ > >+#define MI_MATH(x) MI_INSTR(0x1a, (x)-1) >+#define MI_ALU_OP(op, src1, src2) (((op) << 20) | ((src1) << 10) | (src2)) >+/* operands */ >+#define MI_ALU_OP_NOOP 0 >+#define MI_ALU_OP_LOAD 128 >+#define MI_ALU_OP_LOADINV 1152 >+#define MI_ALU_OP_LOAD0 
129 >+#define MI_ALU_OP_LOAD1 1153 >+#define MI_ALU_OP_ADD 256 >+#define MI_ALU_OP_SUB 257 >+#define MI_ALU_OP_AND 258 >+#define MI_ALU_OP_OR 259 >+#define MI_ALU_OP_XOR 260 >+#define MI_ALU_OP_STORE 384 >+#define MI_ALU_OP_STOREINV 1408 >+/* sources */ >+#define MI_ALU_SRC_REG(x) (x) /* 0 -> 15 */ >+#define MI_ALU_SRC_SRCA 32 >+#define MI_ALU_SRC_SRCB 33 >+#define MI_ALU_SRC_ACCU 49 >+#define MI_ALU_SRC_ZF 50 >+#define MI_ALU_SRC_CF 51 >+ > /* > * Commands used only by the command parser > */ >diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h >index 3563ce970102..a3141b79d344 100644 >--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h >+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h >@@ -73,6 +73,11 @@ enum intel_gt_scratch_field { > /* 8 bytes */ > INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256, > >+ /* 6 * 8 bytes */ >+ INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048, >+ >+ /* 4 bytes */ >+ INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096, > }; > > #endif /* __INTEL_GT_TYPES_H__ */ >diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c >index 3e4f58f19362..46fca53dfbda 100644 >--- a/drivers/gpu/drm/i915/i915_debugfs.c >+++ b/drivers/gpu/drm/i915/i915_debugfs.c >@@ -3653,6 +3653,36 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops, > i915_wedged_get, i915_wedged_set, > "%llu\n"); > >+static int >+i915_perf_noa_delay_set(void *data, u64 val) >+{ >+ struct drm_i915_private *i915 = data; >+ >+ /* This would lead to infinite waits as we're doing timestamp >+ * difference on the CS with only 32bits. 
>+ */ >+ if (val > ((1ul << 32) - 1) * RUNTIME_INFO(i915)->cs_timestamp_frequency_khz) >+ return -EINVAL; >+ >+ atomic64_set(&i915->perf.oa.noa_programming_delay, val); >+ return 0; >+} >+ >+static int >+i915_perf_noa_delay_get(void *data, u64 *val) >+{ >+ struct drm_i915_private *i915 = data; >+ >+ *val = atomic64_read(&i915->perf.oa.noa_programming_delay); >+ return 0; >+} >+ >+DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops, >+ i915_perf_noa_delay_get, >+ i915_perf_noa_delay_set, >+ "%llu\n"); >+ >+ > #define DROP_UNBOUND BIT(0) > #define DROP_BOUND BIT(1) > #define DROP_RETIRE BIT(2) >@@ -4418,6 +4448,7 @@ static const struct i915_debugfs_files { > const char *name; > const struct file_operations *fops; > } i915_debugfs_files[] = { >+ {"i915_perf_noa_delay", &i915_perf_noa_delay_fops}, > {"i915_wedged", &i915_wedged_fops}, > {"i915_cache_sharing", &i915_cache_sharing_fops}, > {"i915_gem_drop_caches", &i915_drop_caches_fops}, >diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h >index 0419dfd0dea3..b3c6dd72c7a1 100644 >--- a/drivers/gpu/drm/i915/i915_drv.h >+++ b/drivers/gpu/drm/i915/i915_drv.h >@@ -1834,6 +1834,14 @@ struct drm_i915_private { > > struct i915_oa_ops ops; > const struct i915_oa_format *oa_formats; >+ >+ /** >+ * A batch buffer doing a wait on the GPU for the NOA >+ * logic to be reprogrammed. 
>+ */ >+ struct i915_vma *noa_wait; >+ >+ atomic64_t noa_programming_delay; > } oa; > } perf; > >diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c >index 882d7056aec3..abfa437a95b7 100644 >--- a/drivers/gpu/drm/i915/i915_perf.c >+++ b/drivers/gpu/drm/i915/i915_perf.c >@@ -197,6 +197,7 @@ > > #include "gem/i915_gem_context.h" > #include "gem/i915_gem_pm.h" >+#include "gt/intel_gt.h" > #include "gt/intel_lrc_reg.h" > > #include "i915_drv.h" >@@ -429,7 +430,7 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915, > MI_LOAD_REGISTER_IMM_MAX_REGS) * 4; > config_length += oa_config->flex_regs_len * 8; > } >- config_length += 4; /* MI_BATCH_BUFFER_END */ >+ config_length += 12; /* MI_BATCH_BUFFER_START into noa_wait loop */ > config_length = ALIGN(config_length, I915_GTT_PAGE_SIZE); > > bo = i915_gem_object_create_shmem(i915, config_length); >@@ -446,7 +447,12 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915, > cs = write_cs_mi_lri(cs, oa_config->b_counter_regs, oa_config->b_counter_regs_len); > cs = write_cs_mi_lri(cs, oa_config->flex_regs, oa_config->flex_regs_len); > >- *cs++ = MI_BATCH_BUFFER_END; >+ >+ /* Jump into the NOA wait busy loop. */ >+ *cs++ = (INTEL_GEN(i915) < 8 ? 
>+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8); >+ *cs++ = i915_ggtt_offset(i915->perf.oa.noa_wait); >+ *cs++ = 0; > > i915_gem_object_flush_map(bo); > i915_gem_object_unpin_map(bo); >@@ -1467,6 +1473,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream) > mutex_lock(&dev_priv->drm.struct_mutex); > dev_priv->perf.oa.exclusive_stream = NULL; > dev_priv->perf.oa.ops.disable_metric_set(dev_priv); >+ i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0); > mutex_unlock(&dev_priv->drm.struct_mutex); > > free_oa_buffer(dev_priv); >@@ -1653,6 +1660,205 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv) > return ret; > } > >+static u32 *save_register(struct drm_i915_private *i915, u32 *cs, >+ i915_reg_t reg, u32 offset, u32 dword_count) >+{ >+ uint32_t d; >+ >+ for (d = 0; d < dword_count; d++) { >+ *cs++ = INTEL_GEN(i915) >= 8 ? >+ MI_STORE_REGISTER_MEM_GEN8 : MI_STORE_REGISTER_MEM; >+ *cs++ = i915_mmio_reg_offset(reg) + 4 * d; >+ *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d; >+ *cs++ = 0; >+ } >+ >+ return cs; >+} >+ >+static u32 *restore_register(struct drm_i915_private *i915, u32 *cs, >+ i915_reg_t reg, u32 offset, u32 dword_count) >+{ >+ uint32_t d; >+ >+ for (d = 0; d < dword_count; d++) { >+ *cs++ = INTEL_GEN(i915) >= 8 ? >+ MI_LOAD_REGISTER_MEM_GEN8 : MI_LOAD_REGISTER_MEM; >+ *cs++ = i915_mmio_reg_offset(reg); >+ *cs++ = intel_gt_scratch_offset(&i915->gt, offset); are you missing + 4 * d in the above 2 lines? 
Regards, Umesh >+ *cs++ = 0; >+ } >+ >+ return cs; >+} >+ >+static int alloc_noa_wait(struct drm_i915_private *i915) >+{ >+ struct drm_i915_gem_object *bo; >+ struct i915_vma *vma; >+ const u64 delay_ticks = 0xffffffffffffffff - >+ DIV64_U64_ROUND_UP( >+ atomic64_read(&i915->perf.oa.noa_programming_delay) * >+ RUNTIME_INFO(i915)->cs_timestamp_frequency_khz, >+ 1000000ull); >+ u32 *batch, *ts0, *cs, *jump; >+ int ret, i; >+ enum { START_TS, NOW_TS, DELTA_TS, JUMP_PREDICATE, DELTA_TARGET, N_CS_GPR }; >+ >+ bo = i915_gem_object_create_internal(i915, 4096); >+ if (IS_ERR(bo)) { >+ DRM_ERROR("Failed to allocate NOA wait batchbuffer\n"); >+ return PTR_ERR(bo); >+ } >+ >+ /* >+ * We pin in GGTT because we jump into this buffer now because >+ * multiple OA config BOs will have a jump to this address and it >+ * needs to be fixed during the lifetime of the i915/perf stream. >+ */ >+ vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0); >+ if (IS_ERR(vma)) { >+ ret = PTR_ERR(vma); >+ goto err_unref; >+ } >+ >+ batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB); >+ if (IS_ERR(batch)) { >+ ret = PTR_ERR(batch); >+ goto err_unpin; >+ } >+ >+ /* Save registers. */ >+ for (i = 0; i < N_CS_GPR; i++) { >+ cs = save_register(i915, cs, HSW_CS_GPR(i), >+ INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2); >+ } >+ cs = save_register(i915, cs, MI_PREDICATE_RESULT_1, >+ INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1); >+ >+ /* First timestamp snapshot location. */ >+ ts0 = cs; >+ >+ /* >+ * Initial snapshot of the timestamp register to implement the wait. >+ * We work with 32b values, so clear out the top 32b bits of the >+ * register because the ALU works 64bits. 
>+ */ >+ *cs++ = MI_LOAD_REGISTER_IMM(1); >+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS)) + 4; >+ *cs++ = 0; >+ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); >+ *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE)); >+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS)); >+ >+ /* >+ * This is the location we're going to jump back into until the >+ * required amount of time has passed. >+ */ >+ jump = cs; >+ >+ /* >+ * Take another snapshot of the timestamp register. Take care to clear >+ * up the top 32bits of CS_GPR(1) as we're using it for other >+ * operations below. >+ */ >+ *cs++ = MI_LOAD_REGISTER_IMM(1); >+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS)) + 4; >+ *cs++ = 0; >+ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); >+ *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE)); >+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS)); >+ >+ /* >+ * Do a diff between the 2 timestamps and store the result back into >+ * CS_GPR(1). >+ */ >+ *cs++ = MI_MATH(5); >+ *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(NOW_TS)); >+ *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(START_TS)); >+ *cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0); >+ *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(DELTA_TS), MI_ALU_SRC_ACCU); >+ *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF); >+ >+ /* >+ * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the >+ * timestamp have rolled over the 32bits) into the predicate register >+ * to be used for the predicated jump. >+ */ >+ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); >+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE)); >+ *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1); >+ >+ /* Restart from the beginning if we had timestamps roll over. */ >+ *cs++ = (INTEL_GEN(i915) < 8 ? 
>+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) | >+ MI_BATCH_PREDICATE; >+ *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4; >+ *cs++ = 0; >+ >+ /* >+ * Now add the diff between the two previous timestamps and add it to : >+ * (((1 << 64) - 1) - delay_ns) >+ * >+ * When the Carry Flag contains 1 this means the elapsed time is >+ * longer than the expected delay, and we can exit the wait loop. >+ */ >+ *cs++ = MI_LOAD_REGISTER_IMM(2); >+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET)); >+ *cs++ = lower_32_bits(delay_ticks); >+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET)) + 4; >+ *cs++ = upper_32_bits(delay_ticks); >+ >+ *cs++ = MI_MATH(4); >+ *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(DELTA_TS)); >+ *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(DELTA_TARGET)); >+ *cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0); >+ *cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF); >+ >+ /* >+ * Transfer the result into the predicate register to be used for the >+ * predicated jump. >+ */ >+ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); >+ *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE)); >+ *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1); >+ >+ /* Predicate the jump. */ >+ *cs++ = (INTEL_GEN(i915) < 8 ? >+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) | >+ MI_BATCH_PREDICATE; >+ *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4; >+ *cs++ = 0; >+ >+ /* Restore registers. */ >+ for (i = 0; i < N_CS_GPR; i++) { >+ cs = restore_register(i915, cs, HSW_CS_GPR(i), >+ INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2); >+ } >+ cs = restore_register(i915, cs, MI_PREDICATE_RESULT_1, >+ INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1); >+ >+ /* And return to the ring. 
*/ >+ *cs++ = MI_BATCH_BUFFER_END; >+ >+ GEM_BUG_ON((cs - batch) > (PAGE_SIZE / sizeof(*batch))); >+ >+ i915_gem_object_flush_map(bo); >+ i915_gem_object_unpin_map(bo); >+ >+ i915->perf.oa.noa_wait = vma; >+ >+ return 0; >+ >+err_unpin: >+ __i915_vma_unpin(vma); >+ >+err_unref: >+ i915_gem_object_put(bo); >+ >+ return ret; >+} >+ > static void config_oa_regs(struct drm_i915_private *dev_priv, > const struct i915_oa_reg *regs, > u32 n_regs) >@@ -2221,6 +2427,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, > goto err_config; > } > >+ ret = alloc_noa_wait(dev_priv); >+ if (ret) { >+ DRM_DEBUG("Unable to allocate NOA wait batch buffer\n"); >+ goto err_noa_wait_alloc; >+ } >+ > /* PRM - observability performance counters: > * > * OACONTROL, performance counter enable, note: >@@ -2273,6 +2485,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, > intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL); > intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref); > >+ mutex_lock(&dev_priv->drm.struct_mutex); >+ i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0); >+ mutex_unlock(&dev_priv->drm.struct_mutex); >+ >+err_noa_wait_alloc: >+ put_oa_config(stream->oa_config); >+ > err_config: > if (stream->ctx) > oa_put_render_ctx_id(stream); >@@ -3657,6 +3876,9 @@ void i915_perf_init(struct drm_i915_private *dev_priv) > mutex_init(&dev_priv->perf.metrics_lock); > idr_init(&dev_priv->perf.metrics_idr); > >+ atomic64_set(&dev_priv->perf.oa.noa_programming_delay, >+ 500 * 1000 /* 500us */); >+ > dev_priv->perf.initialized = true; > } > } >diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h >index 5898f59e3dd7..a73464dd5e91 100644 >--- a/drivers/gpu/drm/i915/i915_reg.h >+++ b/drivers/gpu/drm/i915/i915_reg.h >@@ -567,7 +567,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) > #define MI_PREDICATE_SRC0_UDW _MMIO(0x2400 + 4) > #define MI_PREDICATE_SRC1 _MMIO(0x2408) > #define 
MI_PREDICATE_SRC1_UDW _MMIO(0x2408 + 4) >- >+#define MI_PREDICATE_DATA _MMIO(0x2410) >+#define MI_PREDICATE_RESULT _MMIO(0x2418) >+#define MI_PREDICATE_RESULT_1 _MMIO(0x241c) > #define MI_PREDICATE_RESULT_2 _MMIO(0x2214) > #define LOWER_SLICE_ENABLED (1 << 0) > #define LOWER_SLICE_DISABLED (0 << 0) >-- >2.22.0 > >_______________________________________________ >Intel-gfx mailing list >Intel-gfx@lists.freedesktop.org >https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Quoting Umesh Nerlige Ramappa (2019-07-11 00:43:21) > On Tue, Jul 09, 2019 at 03:33:44PM +0300, Lionel Landwerlin wrote: > >+static u32 *save_register(struct drm_i915_private *i915, u32 *cs, > >+ i915_reg_t reg, u32 offset, u32 dword_count) > >+{ > >+ uint32_t d; > >+ > >+ for (d = 0; d < dword_count; d++) { > >+ *cs++ = INTEL_GEN(i915) >= 8 ? > >+ MI_STORE_REGISTER_MEM_GEN8 : MI_STORE_REGISTER_MEM; > >+ *cs++ = i915_mmio_reg_offset(reg) + 4 * d; > >+ *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d; > >+ *cs++ = 0; > >+ } > >+ > >+ return cs; > >+} > >+ > >+static u32 *restore_register(struct drm_i915_private *i915, u32 *cs, > >+ i915_reg_t reg, u32 offset, u32 dword_count) > >+{ > >+ uint32_t d; > >+ > >+ for (d = 0; d < dword_count; d++) { > >+ *cs++ = INTEL_GEN(i915) >= 8 ? > >+ MI_LOAD_REGISTER_MEM_GEN8 : MI_LOAD_REGISTER_MEM; > >+ *cs++ = i915_mmio_reg_offset(reg); > >+ *cs++ = intel_gt_scratch_offset(&i915->gt, offset); > > are you missing + 4 * d in the above 2 lines? Whoops bad reviewer. Since these are the same two loops just with a different cmd... -Chris
On 11/07/2019 09:29, Chris Wilson wrote: > Quoting Umesh Nerlige Ramappa (2019-07-11 00:43:21) >> On Tue, Jul 09, 2019 at 03:33:44PM +0300, Lionel Landwerlin wrote: >>> +static u32 *save_register(struct drm_i915_private *i915, u32 *cs, >>> + i915_reg_t reg, u32 offset, u32 dword_count) >>> +{ >>> + uint32_t d; >>> + >>> + for (d = 0; d < dword_count; d++) { >>> + *cs++ = INTEL_GEN(i915) >= 8 ? >>> + MI_STORE_REGISTER_MEM_GEN8 : MI_STORE_REGISTER_MEM; >>> + *cs++ = i915_mmio_reg_offset(reg) + 4 * d; >>> + *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d; >>> + *cs++ = 0; >>> + } >>> + >>> + return cs; >>> +} >>> + >>> +static u32 *restore_register(struct drm_i915_private *i915, u32 *cs, >>> + i915_reg_t reg, u32 offset, u32 dword_count) >>> +{ >>> + uint32_t d; >>> + >>> + for (d = 0; d < dword_count; d++) { >>> + *cs++ = INTEL_GEN(i915) >= 8 ? >>> + MI_LOAD_REGISTER_MEM_GEN8 : MI_LOAD_REGISTER_MEM; >>> + *cs++ = i915_mmio_reg_offset(reg); >>> + *cs++ = intel_gt_scratch_offset(&i915->gt, offset); >> are you missing + 4 * d in the above 2 lines? > Whoops bad reviewer. Since these are the same two loops just with a > different cmd... > -Chris > Thanks Umesh! I've merged these 2 function locally. I'm about to resend. -Lionel
diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h index e7eff9db343e..4a66af38c87b 100644 --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h @@ -151,6 +151,7 @@ #define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */ #define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1) #define MI_BATCH_RESOURCE_STREAMER (1<<10) +#define MI_BATCH_PREDICATE (1 << 15) /* HSW+ on RCS only*/ /* * 3D instructions used by the kernel @@ -226,6 +227,29 @@ #define PIPE_CONTROL_DEPTH_CACHE_FLUSH (1<<0) #define PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */ +#define MI_MATH(x) MI_INSTR(0x1a, (x)-1) +#define MI_ALU_OP(op, src1, src2) (((op) << 20) | ((src1) << 10) | (src2)) +/* operands */ +#define MI_ALU_OP_NOOP 0 +#define MI_ALU_OP_LOAD 128 +#define MI_ALU_OP_LOADINV 1152 +#define MI_ALU_OP_LOAD0 129 +#define MI_ALU_OP_LOAD1 1153 +#define MI_ALU_OP_ADD 256 +#define MI_ALU_OP_SUB 257 +#define MI_ALU_OP_AND 258 +#define MI_ALU_OP_OR 259 +#define MI_ALU_OP_XOR 260 +#define MI_ALU_OP_STORE 384 +#define MI_ALU_OP_STOREINV 1408 +/* sources */ +#define MI_ALU_SRC_REG(x) (x) /* 0 -> 15 */ +#define MI_ALU_SRC_SRCA 32 +#define MI_ALU_SRC_SRCB 33 +#define MI_ALU_SRC_ACCU 49 +#define MI_ALU_SRC_ZF 50 +#define MI_ALU_SRC_CF 51 + /* * Commands used only by the command parser */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h index 3563ce970102..a3141b79d344 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h @@ -73,6 +73,11 @@ enum intel_gt_scratch_field { /* 8 bytes */ INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256, + /* 6 * 8 bytes */ + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048, + + /* 4 bytes */ + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096, }; #endif /* __INTEL_GT_TYPES_H__ */ diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 
3e4f58f19362..46fca53dfbda 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -3653,6 +3653,36 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops, i915_wedged_get, i915_wedged_set, "%llu\n"); +static int +i915_perf_noa_delay_set(void *data, u64 val) +{ + struct drm_i915_private *i915 = data; + + /* This would lead to infinite waits as we're doing timestamp + * difference on the CS with only 32bits. + */ + if (val > ((1ul << 32) - 1) * RUNTIME_INFO(i915)->cs_timestamp_frequency_khz) + return -EINVAL; + + atomic64_set(&i915->perf.oa.noa_programming_delay, val); + return 0; +} + +static int +i915_perf_noa_delay_get(void *data, u64 *val) +{ + struct drm_i915_private *i915 = data; + + *val = atomic64_read(&i915->perf.oa.noa_programming_delay); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops, + i915_perf_noa_delay_get, + i915_perf_noa_delay_set, + "%llu\n"); + + #define DROP_UNBOUND BIT(0) #define DROP_BOUND BIT(1) #define DROP_RETIRE BIT(2) @@ -4418,6 +4448,7 @@ static const struct i915_debugfs_files { const char *name; const struct file_operations *fops; } i915_debugfs_files[] = { + {"i915_perf_noa_delay", &i915_perf_noa_delay_fops}, {"i915_wedged", &i915_wedged_fops}, {"i915_cache_sharing", &i915_cache_sharing_fops}, {"i915_gem_drop_caches", &i915_drop_caches_fops}, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0419dfd0dea3..b3c6dd72c7a1 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1834,6 +1834,14 @@ struct drm_i915_private { struct i915_oa_ops ops; const struct i915_oa_format *oa_formats; + + /** + * A batch buffer doing a wait on the GPU for the NOA + * logic to be reprogrammed. 
+ */ + struct i915_vma *noa_wait; + + atomic64_t noa_programming_delay; } oa; } perf; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 882d7056aec3..abfa437a95b7 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -197,6 +197,7 @@ #include "gem/i915_gem_context.h" #include "gem/i915_gem_pm.h" +#include "gt/intel_gt.h" #include "gt/intel_lrc_reg.h" #include "i915_drv.h" @@ -429,7 +430,7 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915, MI_LOAD_REGISTER_IMM_MAX_REGS) * 4; config_length += oa_config->flex_regs_len * 8; } - config_length += 4; /* MI_BATCH_BUFFER_END */ + config_length += 12; /* MI_BATCH_BUFFER_START into noa_wait loop */ config_length = ALIGN(config_length, I915_GTT_PAGE_SIZE); bo = i915_gem_object_create_shmem(i915, config_length); @@ -446,7 +447,12 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915, cs = write_cs_mi_lri(cs, oa_config->b_counter_regs, oa_config->b_counter_regs_len); cs = write_cs_mi_lri(cs, oa_config->flex_regs, oa_config->flex_regs_len); - *cs++ = MI_BATCH_BUFFER_END; + + /* Jump into the NOA wait busy loop. */ + *cs++ = (INTEL_GEN(i915) < 8 ? 
+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8); + *cs++ = i915_ggtt_offset(i915->perf.oa.noa_wait); + *cs++ = 0; i915_gem_object_flush_map(bo); i915_gem_object_unpin_map(bo); @@ -1467,6 +1473,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream) mutex_lock(&dev_priv->drm.struct_mutex); dev_priv->perf.oa.exclusive_stream = NULL; dev_priv->perf.oa.ops.disable_metric_set(dev_priv); + i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0); mutex_unlock(&dev_priv->drm.struct_mutex); free_oa_buffer(dev_priv); @@ -1653,6 +1660,205 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv) return ret; } +static u32 *save_register(struct drm_i915_private *i915, u32 *cs, + i915_reg_t reg, u32 offset, u32 dword_count) +{ + uint32_t d; + + for (d = 0; d < dword_count; d++) { + *cs++ = INTEL_GEN(i915) >= 8 ? + MI_STORE_REGISTER_MEM_GEN8 : MI_STORE_REGISTER_MEM; + *cs++ = i915_mmio_reg_offset(reg) + 4 * d; + *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d; + *cs++ = 0; + } + + return cs; +} + +static u32 *restore_register(struct drm_i915_private *i915, u32 *cs, + i915_reg_t reg, u32 offset, u32 dword_count) +{ + uint32_t d; + + for (d = 0; d < dword_count; d++) { + *cs++ = INTEL_GEN(i915) >= 8 ? 
+ MI_LOAD_REGISTER_MEM_GEN8 : MI_LOAD_REGISTER_MEM; + *cs++ = i915_mmio_reg_offset(reg); + *cs++ = intel_gt_scratch_offset(&i915->gt, offset); + *cs++ = 0; + } + + return cs; +} + +static int alloc_noa_wait(struct drm_i915_private *i915) +{ + struct drm_i915_gem_object *bo; + struct i915_vma *vma; + const u64 delay_ticks = 0xffffffffffffffff - + DIV64_U64_ROUND_UP( + atomic64_read(&i915->perf.oa.noa_programming_delay) * + RUNTIME_INFO(i915)->cs_timestamp_frequency_khz, + 1000000ull); + u32 *batch, *ts0, *cs, *jump; + int ret, i; + enum { START_TS, NOW_TS, DELTA_TS, JUMP_PREDICATE, DELTA_TARGET, N_CS_GPR }; + + bo = i915_gem_object_create_internal(i915, 4096); + if (IS_ERR(bo)) { + DRM_ERROR("Failed to allocate NOA wait batchbuffer\n"); + return PTR_ERR(bo); + } + + /* + * We pin in GGTT because we jump into this buffer now because + * multiple OA config BOs will have a jump to this address and it + * needs to be fixed during the lifetime of the i915/perf stream. + */ + vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err_unref; + } + + batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB); + if (IS_ERR(batch)) { + ret = PTR_ERR(batch); + goto err_unpin; + } + + /* Save registers. */ + for (i = 0; i < N_CS_GPR; i++) { + cs = save_register(i915, cs, HSW_CS_GPR(i), + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2); + } + cs = save_register(i915, cs, MI_PREDICATE_RESULT_1, + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1); + + /* First timestamp snapshot location. */ + ts0 = cs; + + /* + * Initial snapshot of the timestamp register to implement the wait. + * We work with 32b values, so clear out the top 32b bits of the + * register because the ALU works 64bits. 
+ */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS)) + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE)); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS)); + + /* + * This is the location we're going to jump back into until the + * required amount of time has passed. + */ + jump = cs; + + /* + * Take another snapshot of the timestamp register. Take care to clear + * up the top 32bits of CS_GPR(1) as we're using it for other + * operations below. + */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS)) + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE)); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS)); + + /* + * Do a diff between the 2 timestamps and store the result back into + * CS_GPR(1). + */ + *cs++ = MI_MATH(5); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(NOW_TS)); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(START_TS)); + *cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0); + *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(DELTA_TS), MI_ALU_SRC_ACCU); + *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF); + + /* + * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the + * timestamp have rolled over the 32bits) into the predicate register + * to be used for the predicated jump. + */ + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE)); + *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1); + + /* Restart from the beginning if we had timestamps roll over. */ + *cs++ = (INTEL_GEN(i915) < 8 ? 
+ MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) | + MI_BATCH_PREDICATE; + *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4; + *cs++ = 0; + + /* + * Now add the diff between the two previous timestamps and add it to : + * (((1 << 64) - 1) - delay_ns) + * + * When the Carry Flag contains 1 this means the elapsed time is + * longer than the expected delay, and we can exit the wait loop. + */ + *cs++ = MI_LOAD_REGISTER_IMM(2); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET)); + *cs++ = lower_32_bits(delay_ticks); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET)) + 4; + *cs++ = upper_32_bits(delay_ticks); + + *cs++ = MI_MATH(4); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(DELTA_TS)); + *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(DELTA_TARGET)); + *cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0); + *cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF); + + /* + * Transfer the result into the predicate register to be used for the + * predicated jump. + */ + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE)); + *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1); + + /* Predicate the jump. */ + *cs++ = (INTEL_GEN(i915) < 8 ? + MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) | + MI_BATCH_PREDICATE; + *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4; + *cs++ = 0; + + /* Restore registers. */ + for (i = 0; i < N_CS_GPR; i++) { + cs = restore_register(i915, cs, HSW_CS_GPR(i), + INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2); + } + cs = restore_register(i915, cs, MI_PREDICATE_RESULT_1, + INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1); + + /* And return to the ring. 
*/ + *cs++ = MI_BATCH_BUFFER_END; + + GEM_BUG_ON((cs - batch) > (PAGE_SIZE / sizeof(*batch))); + + i915_gem_object_flush_map(bo); + i915_gem_object_unpin_map(bo); + + i915->perf.oa.noa_wait = vma; + + return 0; + +err_unpin: + __i915_vma_unpin(vma); + +err_unref: + i915_gem_object_put(bo); + + return ret; +} + static void config_oa_regs(struct drm_i915_private *dev_priv, const struct i915_oa_reg *regs, u32 n_regs) @@ -2221,6 +2427,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, goto err_config; } + ret = alloc_noa_wait(dev_priv); + if (ret) { + DRM_DEBUG("Unable to allocate NOA wait batch buffer\n"); + goto err_noa_wait_alloc; + } + /* PRM - observability performance counters: * * OACONTROL, performance counter enable, note: @@ -2273,6 +2485,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL); intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref); + mutex_lock(&dev_priv->drm.struct_mutex); + i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0); + mutex_unlock(&dev_priv->drm.struct_mutex); + +err_noa_wait_alloc: + put_oa_config(stream->oa_config); + err_config: if (stream->ctx) oa_put_render_ctx_id(stream); @@ -3657,6 +3876,9 @@ void i915_perf_init(struct drm_i915_private *dev_priv) mutex_init(&dev_priv->perf.metrics_lock); idr_init(&dev_priv->perf.metrics_idr); + atomic64_set(&dev_priv->perf.oa.noa_programming_delay, + 500 * 1000 /* 500us */); + dev_priv->perf.initialized = true; } } diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 5898f59e3dd7..a73464dd5e91 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -567,7 +567,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define MI_PREDICATE_SRC0_UDW _MMIO(0x2400 + 4) #define MI_PREDICATE_SRC1 _MMIO(0x2408) #define MI_PREDICATE_SRC1_UDW _MMIO(0x2408 + 4) - +#define MI_PREDICATE_DATA _MMIO(0x2410) +#define MI_PREDICATE_RESULT 
_MMIO(0x2418) +#define MI_PREDICATE_RESULT_1 _MMIO(0x241c) #define MI_PREDICATE_RESULT_2 _MMIO(0x2214) #define LOWER_SLICE_ENABLED (1 << 0) #define LOWER_SLICE_DISABLED (0 << 0)