diff mbox series

[v6,05/11] drm/i915/perf: implement active wait for noa configurations

Message ID 20190701113437.4043-6-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915: Vulkan performance query support | expand

Commit Message

Lionel Landwerlin July 1, 2019, 11:34 a.m. UTC
NOA configuration takes some amount of time to apply. That amount of
time depends on the size of the GT. There is no documented time for
this. For example, past experiments with powergating
configuration changes seem to indicate a 60~70us delay. We go with
500us as default for now which should be over the required amount of
time (according to HW architects).

v2: Don't forget to save/restore registers used for the wait (Chris)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  24 ++
 drivers/gpu/drm/i915/gt/intel_gt_types.h     |   5 +
 drivers/gpu/drm/i915/i915_debugfs.c          |  25 +++
 drivers/gpu/drm/i915/i915_drv.h              |   8 +
 drivers/gpu/drm/i915/i915_perf.c             | 225 ++++++++++++++++++-
 drivers/gpu/drm/i915/i915_reg.h              |   4 +-
 6 files changed, 288 insertions(+), 3 deletions(-)

Comments

Chris Wilson July 1, 2019, 12:43 p.m. UTC | #1
Quoting Lionel Landwerlin (2019-07-01 12:34:31)
> NOA configuration take some amount of time to apply. That amount of
> time depends on the size of the GT. There is no documented time for
> this. For example, past experimentations with powergating
> configuration changes seem to indicate a 60~70us delay. We go with
> 500us as default for now which should be over the required amount of
> time (according to HW architects).
> 
> v2: Don't forget to save/restore registers used for the wait (Chris)
> 
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  24 ++
>  drivers/gpu/drm/i915/gt/intel_gt_types.h     |   5 +
>  drivers/gpu/drm/i915/i915_debugfs.c          |  25 +++
>  drivers/gpu/drm/i915/i915_drv.h              |   8 +
>  drivers/gpu/drm/i915/i915_perf.c             | 225 ++++++++++++++++++-
>  drivers/gpu/drm/i915/i915_reg.h              |   4 +-
>  6 files changed, 288 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> index e7eff9db343e..4a66af38c87b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> @@ -151,6 +151,7 @@
>  #define   MI_BATCH_GTT             (2<<6) /* aliased with (1<<7) on gen4 */
>  #define MI_BATCH_BUFFER_START_GEN8     MI_INSTR(0x31, 1)
>  #define   MI_BATCH_RESOURCE_STREAMER (1<<10)
> +#define   MI_BATCH_PREDICATE         (1 << 15) /* HSW+ on RCS only*/
>  
>  /*
>   * 3D instructions used by the kernel
> @@ -226,6 +227,29 @@
>  #define   PIPE_CONTROL_DEPTH_CACHE_FLUSH               (1<<0)
>  #define   PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */
>  
> +#define MI_MATH(x) MI_INSTR(0x1a, (x)-1)
> +#define   MI_ALU_OP(op, src1, src2) (((op) << 20) | ((src1) << 10) | (src2))
> +/* operands */
> +#define   MI_ALU_OP_NOOP     0
> +#define   MI_ALU_OP_LOAD     128
> +#define   MI_ALU_OP_LOADINV  1152
> +#define   MI_ALU_OP_LOAD0    129
> +#define   MI_ALU_OP_LOAD1    1153
> +#define   MI_ALU_OP_ADD      256
> +#define   MI_ALU_OP_SUB      257
> +#define   MI_ALU_OP_AND      258
> +#define   MI_ALU_OP_OR       259
> +#define   MI_ALU_OP_XOR      260
> +#define   MI_ALU_OP_STORE    384
> +#define   MI_ALU_OP_STOREINV 1408
> +/* sources */
> +#define   MI_ALU_SRC_REG(x)  (x) /* 0 -> 15 */
> +#define   MI_ALU_SRC_SRCA    32
> +#define   MI_ALU_SRC_SRCB    33
> +#define   MI_ALU_SRC_ACCU    49
> +#define   MI_ALU_SRC_ZF      50
> +#define   MI_ALU_SRC_CF      51
> +
>  /*
>   * Commands used only by the command parser
>   */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> index e625a5e320d3..0750ac49a05b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> @@ -70,6 +70,11 @@ enum intel_gt_scratch_field {
>         /* 8 bytes */
>         INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256,
>  
> +       /* 6 * 8 bytes */
> +       INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048,
> +
> +       /* 4 bytes */
> +       INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096,
>  };
>  
>  #endif /* __INTEL_GT_TYPES_H__ */
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index eeecdad0e3ca..6b49fda145e7 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3646,6 +3646,30 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
>                         i915_wedged_get, i915_wedged_set,
>                         "%llu\n");
>  
> +static int
> +i915_perf_noa_delay_set(void *data, u64 val)
> +{
> +       struct drm_i915_private *i915 = data;
> +
> +       atomic64_set(&i915->perf.oa.noa_programming_delay, val);
> +       return 0;
> +}
> +
> +static int
> +i915_perf_noa_delay_get(void *data, u64 *val)
> +{
> +       struct drm_i915_private *i915 = data;
> +
> +       *val = atomic64_read(&i915->perf.oa.noa_programming_delay);
> +       return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
> +                       i915_perf_noa_delay_get,
> +                       i915_perf_noa_delay_set,
> +                       "%llu\n");
> +
> +
>  #define DROP_UNBOUND   BIT(0)
>  #define DROP_BOUND     BIT(1)
>  #define DROP_RETIRE    BIT(2)
> @@ -4411,6 +4435,7 @@ static const struct i915_debugfs_files {
>         const char *name;
>         const struct file_operations *fops;
>  } i915_debugfs_files[] = {
> +       {"i915_perf_noa_delay", &i915_perf_noa_delay_fops},
>         {"i915_wedged", &i915_wedged_fops},
>         {"i915_cache_sharing", &i915_cache_sharing_fops},
>         {"i915_gem_drop_caches", &i915_drop_caches_fops},
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index df39b2ee6bd9..fe93a260bd28 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1837,6 +1837,14 @@ struct drm_i915_private {
>  
>                         struct i915_oa_ops ops;
>                         const struct i915_oa_format *oa_formats;
> +
> +                       /**
> +                        * A batch buffer doing a wait on the GPU for the NOA
> +                        * logic to be reprogrammed.
> +                        */
> +                       struct i915_vma *noa_wait;
> +
> +                       atomic64_t noa_programming_delay;
>                 } oa;
>         } perf;
>  
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 5ba771468078..03e6908282e3 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -197,6 +197,7 @@
>  
>  #include "gem/i915_gem_context.h"
>  #include "gem/i915_gem_pm.h"
> +#include "gt/intel_gt.h"
>  #include "gt/intel_lrc_reg.h"
>  
>  #include "i915_drv.h"
> @@ -429,7 +430,7 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915,
>                                               MI_LOAD_REGISTER_IMM_MAX_REGS) * 4;
>                 config_length += oa_config->flex_regs_len * 8;
>         }
> -       config_length += 4; /* MI_BATCH_BUFFER_END */
> +       config_length += 12; /* MI_BATCH_BUFFER_START into noa_wait loop */
>         config_length = ALIGN(config_length, I915_GTT_PAGE_SIZE);
>  
>         bo = i915_gem_object_create_shmem(i915, config_length);
> @@ -446,7 +447,12 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915,
>         cs = write_cs_mi_lri(cs, oa_config->b_counter_regs, oa_config->b_counter_regs_len);
>         cs = write_cs_mi_lri(cs, oa_config->flex_regs, oa_config->flex_regs_len);
>  
> -       *cs++ = MI_BATCH_BUFFER_END;
> +
> +       /* Jump into the NOA wait busy loop. */
> +       *cs++ = (INTEL_GEN(i915) < 8 ?
> +                MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8);
> +       *cs++ = i915_ggtt_offset(i915->perf.oa.noa_wait);
> +       *cs++ = 0;
>  
>         i915_gem_object_flush_map(bo);
>         i915_gem_object_unpin_map(bo);
> @@ -1467,6 +1473,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
>         mutex_lock(&dev_priv->drm.struct_mutex);
>         dev_priv->perf.oa.exclusive_stream = NULL;
>         dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
> +       i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0);
>         mutex_unlock(&dev_priv->drm.struct_mutex);
>  
>         free_oa_buffer(dev_priv);
> @@ -1653,6 +1660,204 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
>         return ret;
>  }
>  
> +static u32 *save_register(struct drm_i915_private *i915, u32 *cs,
> +                         i915_reg_t reg, u32 offset, u32 dword_count)
> +{
> +       uint32_t d;
> +
> +       for (d = 0; d < dword_count; d++) {
> +               *cs++ = INTEL_GEN(i915) >= 8 ?
> +                       MI_STORE_REGISTER_MEM_GEN8 : MI_STORE_REGISTER_MEM;
> +               *cs++ = i915_mmio_reg_offset(reg) + 4 * d;
> +               *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d;
> +               if (INTEL_GEN(i915) >= 8)
> +                       *cs++ = 0;

Will anyone care about the extra nop on hsw? :)

> +static int alloc_noa_wait(struct drm_i915_private *i915)
> +{
> +       struct drm_i915_gem_object *bo;
> +       struct i915_vma *vma;
> +       u64 delay_ns = atomic64_read(&i915->perf.oa.noa_programming_delay), delay_ticks;
> +       u32 *batch, *ts0, *cs, *jump;
> +       int ret, i;
> +
> +       bo = i915_gem_object_create_shmem(i915, 4096);

So swappable. At 4k, we almost consume as much in our bookkeeping as we
allocate for the backing store. Yes, it's atrocious.

Hang on. This can never be unpinned as it stores absolute addresses. So
this can be i915_gem_object_create_internal().

> +       if (IS_ERR(bo)) {
> +               DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
> +               return PTR_ERR(bo);
> +       }
> +
> +       /*
> +        * We pin in GGTT because we jump into this buffer now because
> +        * multiple OA config BOs will have a jump to this address and it
> +        * needs to be fixed during the lifetime of the i915/perf stream.
> +        */
> +       vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0);
> +       if (IS_ERR(vma)) {
> +               ret = PTR_ERR(vma);
> +               goto err_unref;
> +       }
> +
> +       batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
> +       if (IS_ERR(batch)) {
> +               ret = PTR_ERR(batch);
> +               goto err_unpin;
> +       }
> +
> +       /* Save registers. */
> +       for (i = 0; i <= 5; i++) {
> +               cs = save_register(i915, cs, HSW_CS_GPR(i),
> +                                  INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
> +       }
> +       cs = save_register(i915, cs, MI_PREDICATE_RESULT_1,
> +                          INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
> +
> +       /* First timestamp snapshot location. */
> +       ts0 = cs;

Did this ever get used?

> +       /*
> +        * Initial snapshot of the timestamp register to implement the wait.
> +        * We work with 32b values, so clear out the top 32b bits of the
> +        * register because the ALU works 64bits.
> +        */
> +       *cs++ = MI_LOAD_REGISTER_IMM(1);
> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(0)) + 4;
> +       *cs++ = 0;
> +       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
> +       *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(0));
> +
> +       /*
> +        * This is the location we're going to jump back into until the
> +        * required amount of time has passed.
> +        */
> +       jump = cs;
> +
> +       /*
> +        * Take another snapshot of the timestamp register. Take care to clear
> +        * up the top 32bits of CS_GPR(1) as we're using it for other
> +        * operations below.
> +        */
> +       *cs++ = MI_LOAD_REGISTER_IMM(1);
> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(1)) + 4;
> +       *cs++ = 0;
> +       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
> +       *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(1));

cs = get_timestamp(cs, 1);

enum { START, NOW, DELTA, RESULT, TARGET } ?

That would also help with save/restore registers, as all CS_GPR should
then be named.

> +       /*
> +        * Do a diff between the 2 timestamps and store the result back into
> +        * CS_GPR(1).
> +        */
> +       *cs++ = MI_MATH(5);
> +       *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(1));
> +       *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(0));
> +       *cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0);
> +       *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(2), MI_ALU_SRC_ACCU);
> +       *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(3), MI_ALU_SRC_CF);
> +
> +       /*
> +        * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
> +        * timestamp have rolled over the 32bits) into the predicate register
> +        * to be used for the predicated jump.
> +        */
> +       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(3));
> +       *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
> +
> +       /* Restart from the beginning if we had timestamps roll over. */
> +       *cs++ = (INTEL_GEN(i915) < 8 ?
> +                MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
> +               MI_BATCH_PREDICATE;
> +       *cs++ = vma->node.start;
> +       *cs++ = 0;
> +
> +       /*
> +        * Now add the diff between to previous timestamps and add it to :
> +        *      (((1 * << 64) - 1) - delay_ns)
> +        *
> +        * When the Carry Flag contains 1 this means the elapsed time is
> +        * longer than the expected delay, and we can exit the wait loop.
> +        */
> +       delay_ticks = 0xffffffffffffffff -
> +               DIV64_U64_ROUND_UP(delay_ns *
> +                                  RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
> +                                  1000000ull);
> +       *cs++ = MI_LOAD_REGISTER_IMM(2);
> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(4));
> +       *cs++ = lower_32_bits(delay_ticks);
> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(4)) + 4;
> +       *cs++ = upper_32_bits(delay_ticks);

Now, I was expecting to compute the 32b end timestamp and compare now to
that using the carry-flag to indicate the completion.

Why the detour? (I'm sure I am missing something here.)

Quick question request for >32b delays are rejected in the user debug api?

> +       *cs++ = MI_MATH(4);
> +       *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(2));
> +       *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(4));
> +       *cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0);
> +       *cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(5), MI_ALU_SRC_CF);

Comparing delta against target delay. Inverted store to give 
	(delay - target) >= 0

> +       /*
> +        * Transfer the result into the predicate register to be used for the
> +        * predicated jump.
> +        */
> +       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(5));
> +       *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
> +
> +       /* Predicate the jump.  */
> +       *cs++ = (INTEL_GEN(i915) < 8 ?
> +                MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
> +               MI_BATCH_PREDICATE;

The joy of being on rcs.

> +       *cs++ = vma->node.start + (jump - batch) * 4;

Mutters i915_ggtt_offset(vma)

> +       *cs++ = 0;
> +
> +       /* Restore registers. */
> +       for (i = 0; i <= 5; i++) {
> +               cs = restore_register(i915, cs, HSW_CS_GPR(i),
> +                                     INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
> +       }
> +       cs = restore_register(i915, cs, MI_PREDICATE_RESULT_1,
> +                             INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
> +
> +       /* And return to the ring. */
> +       *cs++ = MI_BATCH_BUFFER_END;

GEM_BUG_ON(cs - batch > PAGE_SIZE/sizeof(*batch));

> +
> +       i915_gem_object_flush_map(bo);
> +       i915_gem_object_unpin_map(bo);
> +
> +       i915->perf.oa.noa_wait = vma;
> +
> +       return 0;
> +
> +err_unpin:
> +       __i915_vma_unpin(vma);
> +
> +err_unref:
> +       i915_gem_object_put(bo);
> +
> +       return ret;
> +}

Certainly seems to do what you say on the tin.
-Chris
Lionel Landwerlin July 1, 2019, 1:10 p.m. UTC | #2
On 01/07/2019 15:43, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2019-07-01 12:34:31)
>> NOA configuration take some amount of time to apply. That amount of
>> time depends on the size of the GT. There is no documented time for
>> this. For example, past experimentations with powergating
>> configuration changes seem to indicate a 60~70us delay. We go with
>> 500us as default for now which should be over the required amount of
>> time (according to HW architects).
>>
>> v2: Don't forget to save/restore registers used for the wait (Chris)
>>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>> ---
>>   drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  24 ++
>>   drivers/gpu/drm/i915/gt/intel_gt_types.h     |   5 +
>>   drivers/gpu/drm/i915/i915_debugfs.c          |  25 +++
>>   drivers/gpu/drm/i915/i915_drv.h              |   8 +
>>   drivers/gpu/drm/i915/i915_perf.c             | 225 ++++++++++++++++++-
>>   drivers/gpu/drm/i915/i915_reg.h              |   4 +-
>>   6 files changed, 288 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> index e7eff9db343e..4a66af38c87b 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
>> @@ -151,6 +151,7 @@
>>   #define   MI_BATCH_GTT             (2<<6) /* aliased with (1<<7) on gen4 */
>>   #define MI_BATCH_BUFFER_START_GEN8     MI_INSTR(0x31, 1)
>>   #define   MI_BATCH_RESOURCE_STREAMER (1<<10)
>> +#define   MI_BATCH_PREDICATE         (1 << 15) /* HSW+ on RCS only*/
>>   
>>   /*
>>    * 3D instructions used by the kernel
>> @@ -226,6 +227,29 @@
>>   #define   PIPE_CONTROL_DEPTH_CACHE_FLUSH               (1<<0)
>>   #define   PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */
>>   
>> +#define MI_MATH(x) MI_INSTR(0x1a, (x)-1)
>> +#define   MI_ALU_OP(op, src1, src2) (((op) << 20) | ((src1) << 10) | (src2))
>> +/* operands */
>> +#define   MI_ALU_OP_NOOP     0
>> +#define   MI_ALU_OP_LOAD     128
>> +#define   MI_ALU_OP_LOADINV  1152
>> +#define   MI_ALU_OP_LOAD0    129
>> +#define   MI_ALU_OP_LOAD1    1153
>> +#define   MI_ALU_OP_ADD      256
>> +#define   MI_ALU_OP_SUB      257
>> +#define   MI_ALU_OP_AND      258
>> +#define   MI_ALU_OP_OR       259
>> +#define   MI_ALU_OP_XOR      260
>> +#define   MI_ALU_OP_STORE    384
>> +#define   MI_ALU_OP_STOREINV 1408
>> +/* sources */
>> +#define   MI_ALU_SRC_REG(x)  (x) /* 0 -> 15 */
>> +#define   MI_ALU_SRC_SRCA    32
>> +#define   MI_ALU_SRC_SRCB    33
>> +#define   MI_ALU_SRC_ACCU    49
>> +#define   MI_ALU_SRC_ZF      50
>> +#define   MI_ALU_SRC_CF      51
>> +
>>   /*
>>    * Commands used only by the command parser
>>    */
>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
>> index e625a5e320d3..0750ac49a05b 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
>> @@ -70,6 +70,11 @@ enum intel_gt_scratch_field {
>>          /* 8 bytes */
>>          INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256,
>>   
>> +       /* 6 * 8 bytes */
>> +       INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048,
>> +
>> +       /* 4 bytes */
>> +       INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096,
>>   };
>>   
>>   #endif /* __INTEL_GT_TYPES_H__ */
>> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>> index eeecdad0e3ca..6b49fda145e7 100644
>> --- a/drivers/gpu/drm/i915/i915_debugfs.c
>> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>> @@ -3646,6 +3646,30 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
>>                          i915_wedged_get, i915_wedged_set,
>>                          "%llu\n");
>>   
>> +static int
>> +i915_perf_noa_delay_set(void *data, u64 val)
>> +{
>> +       struct drm_i915_private *i915 = data;
>> +
>> +       atomic64_set(&i915->perf.oa.noa_programming_delay, val);
>> +       return 0;
>> +}
>> +
>> +static int
>> +i915_perf_noa_delay_get(void *data, u64 *val)
>> +{
>> +       struct drm_i915_private *i915 = data;
>> +
>> +       *val = atomic64_read(&i915->perf.oa.noa_programming_delay);
>> +       return 0;
>> +}
>> +
>> +DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
>> +                       i915_perf_noa_delay_get,
>> +                       i915_perf_noa_delay_set,
>> +                       "%llu\n");
>> +
>> +
>>   #define DROP_UNBOUND   BIT(0)
>>   #define DROP_BOUND     BIT(1)
>>   #define DROP_RETIRE    BIT(2)
>> @@ -4411,6 +4435,7 @@ static const struct i915_debugfs_files {
>>          const char *name;
>>          const struct file_operations *fops;
>>   } i915_debugfs_files[] = {
>> +       {"i915_perf_noa_delay", &i915_perf_noa_delay_fops},
>>          {"i915_wedged", &i915_wedged_fops},
>>          {"i915_cache_sharing", &i915_cache_sharing_fops},
>>          {"i915_gem_drop_caches", &i915_drop_caches_fops},
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index df39b2ee6bd9..fe93a260bd28 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -1837,6 +1837,14 @@ struct drm_i915_private {
>>   
>>                          struct i915_oa_ops ops;
>>                          const struct i915_oa_format *oa_formats;
>> +
>> +                       /**
>> +                        * A batch buffer doing a wait on the GPU for the NOA
>> +                        * logic to be reprogrammed.
>> +                        */
>> +                       struct i915_vma *noa_wait;
>> +
>> +                       atomic64_t noa_programming_delay;
>>                  } oa;
>>          } perf;
>>   
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index 5ba771468078..03e6908282e3 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -197,6 +197,7 @@
>>   
>>   #include "gem/i915_gem_context.h"
>>   #include "gem/i915_gem_pm.h"
>> +#include "gt/intel_gt.h"
>>   #include "gt/intel_lrc_reg.h"
>>   
>>   #include "i915_drv.h"
>> @@ -429,7 +430,7 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915,
>>                                                MI_LOAD_REGISTER_IMM_MAX_REGS) * 4;
>>                  config_length += oa_config->flex_regs_len * 8;
>>          }
>> -       config_length += 4; /* MI_BATCH_BUFFER_END */
>> +       config_length += 12; /* MI_BATCH_BUFFER_START into noa_wait loop */
>>          config_length = ALIGN(config_length, I915_GTT_PAGE_SIZE);
>>   
>>          bo = i915_gem_object_create_shmem(i915, config_length);
>> @@ -446,7 +447,12 @@ static int alloc_oa_config_buffer(struct drm_i915_private *i915,
>>          cs = write_cs_mi_lri(cs, oa_config->b_counter_regs, oa_config->b_counter_regs_len);
>>          cs = write_cs_mi_lri(cs, oa_config->flex_regs, oa_config->flex_regs_len);
>>   
>> -       *cs++ = MI_BATCH_BUFFER_END;
>> +
>> +       /* Jump into the NOA wait busy loop. */
>> +       *cs++ = (INTEL_GEN(i915) < 8 ?
>> +                MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8);
>> +       *cs++ = i915_ggtt_offset(i915->perf.oa.noa_wait);
>> +       *cs++ = 0;
>>   
>>          i915_gem_object_flush_map(bo);
>>          i915_gem_object_unpin_map(bo);
>> @@ -1467,6 +1473,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
>>          mutex_lock(&dev_priv->drm.struct_mutex);
>>          dev_priv->perf.oa.exclusive_stream = NULL;
>>          dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
>> +       i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0);
>>          mutex_unlock(&dev_priv->drm.struct_mutex);
>>   
>>          free_oa_buffer(dev_priv);
>> @@ -1653,6 +1660,204 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
>>          return ret;
>>   }
>>   
>> +static u32 *save_register(struct drm_i915_private *i915, u32 *cs,
>> +                         i915_reg_t reg, u32 offset, u32 dword_count)
>> +{
>> +       uint32_t d;
>> +
>> +       for (d = 0; d < dword_count; d++) {
>> +               *cs++ = INTEL_GEN(i915) >= 8 ?
>> +                       MI_STORE_REGISTER_MEM_GEN8 : MI_STORE_REGISTER_MEM;
>> +               *cs++ = i915_mmio_reg_offset(reg) + 4 * d;
>> +               *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d;
>> +               if (INTEL_GEN(i915) >= 8)
>> +                       *cs++ = 0;
> Will anyone care about the extra nop on hsw? :)


Alright :)

I didn't notice I left a NOOP below too.


>
>> +static int alloc_noa_wait(struct drm_i915_private *i915)
>> +{
>> +       struct drm_i915_gem_object *bo;
>> +       struct i915_vma *vma;
>> +       u64 delay_ns = atomic64_read(&i915->perf.oa.noa_programming_delay), delay_ticks;
>> +       u32 *batch, *ts0, *cs, *jump;
>> +       int ret, i;
>> +
>> +       bo = i915_gem_object_create_shmem(i915, 4096);
> So swappable. At 4k, we almost consume as much in our bookkeeping as we
> allocate for the backing store. Yes, it's atrocious.
>
> Hang on. This can never be unpinned as it stores absolute addresses. So
> this can be i915_gem_object_create_internal().
>
>> +       if (IS_ERR(bo)) {
>> +               DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
>> +               return PTR_ERR(bo);
>> +       }
>> +
>> +       /*
>> +        * We pin in GGTT because we jump into this buffer now because
>> +        * multiple OA config BOs will have a jump to this address and it
>> +        * needs to be fixed during the lifetime of the i915/perf stream.
>> +        */
>> +       vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0);
>> +       if (IS_ERR(vma)) {
>> +               ret = PTR_ERR(vma);
>> +               goto err_unref;
>> +       }
>> +
>> +       batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
>> +       if (IS_ERR(batch)) {
>> +               ret = PTR_ERR(batch);
>> +               goto err_unpin;
>> +       }
>> +
>> +       /* Save registers. */
>> +       for (i = 0; i <= 5; i++) {
>> +               cs = save_register(i915, cs, HSW_CS_GPR(i),
>> +                                  INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
>> +       }
>> +       cs = save_register(i915, cs, MI_PREDICATE_RESULT_1,
>> +                          INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
>> +
>> +       /* First timestamp snapshot location. */
>> +       ts0 = cs;
> Did this ever get used?


Oops, my mistake. This is buggy, the jump back to the beginning should 
come back here.


>
>> +       /*
>> +        * Initial snapshot of the timestamp register to implement the wait.
>> +        * We work with 32b values, so clear out the top 32b bits of the
>> +        * register because the ALU works 64bits.
>> +        */
>> +       *cs++ = MI_LOAD_REGISTER_IMM(1);
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(0)) + 4;
>> +       *cs++ = 0;
>> +       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>> +       *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(0));
>> +
>> +       /*
>> +        * This is the location we're going to jump back into until the
>> +        * required amount of time has passed.
>> +        */
>> +       jump = cs;
>> +
>> +       /*
>> +        * Take another snapshot of the timestamp register. Take care to clear
>> +        * up the top 32bits of CS_GPR(1) as we're using it for other
>> +        * operations below.
>> +        */
>> +       *cs++ = MI_LOAD_REGISTER_IMM(1);
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(1)) + 4;
>> +       *cs++ = 0;
>> +       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>> +       *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(1));
> cs = get_timestamp(cs, 1);
>
> enum { START, NOW, DELTA, RESULT, TARGET } ?
>
> That would also help with save/restore registers, as all CS_GPR should
> then be named.


Very sensible :)

If only you had 
https://gitlab.freedesktop.org/mesa/mesa/blob/master/src/intel/common/gen_mi_builder.h


>
>> +       /*
>> +        * Do a diff between the 2 timestamps and store the result back into
>> +        * CS_GPR(1).
>> +        */
>> +       *cs++ = MI_MATH(5);
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(1));
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(0));
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0);
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(2), MI_ALU_SRC_ACCU);
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(3), MI_ALU_SRC_CF);
>> +
>> +       /*
>> +        * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
>> +        * timestamp have rolled over the 32bits) into the predicate register
>> +        * to be used for the predicated jump.
>> +        */
>> +       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(3));
>> +       *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
>> +
>> +       /* Restart from the beginning if we had timestamps roll over. */
>> +       *cs++ = (INTEL_GEN(i915) < 8 ?
>> +                MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
>> +               MI_BATCH_PREDICATE;
>> +       *cs++ = vma->node.start;
>> +       *cs++ = 0;
>> +
>> +       /*
>> +        * Now add the diff between to previous timestamps and add it to :
>> +        *      (((1 * << 64) - 1) - delay_ns)
>> +        *
>> +        * When the Carry Flag contains 1 this means the elapsed time is
>> +        * longer than the expected delay, and we can exit the wait loop.
>> +        */
>> +       delay_ticks = 0xffffffffffffffff -
>> +               DIV64_U64_ROUND_UP(delay_ns *
>> +                                  RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
>> +                                  1000000ull);
>> +       *cs++ = MI_LOAD_REGISTER_IMM(2);
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(4));
>> +       *cs++ = lower_32_bits(delay_ticks);
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(4)) + 4;
>> +       *cs++ = upper_32_bits(delay_ticks);
> Now, I was expecting to compute the 32b end timestamp and compare now to
> that using the carry-flag to indicate the completion.
>
> Why the detour? (I'm sure I am missing something here.)


Probably trying to save a register... My thinking is not that deep either ;)


>
> Quick question request for >32b delays are rejected in the user debug api?


Nope :( Will add.


>
>> +       *cs++ = MI_MATH(4);
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(2));
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(4));
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0);
>> +       *cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(5), MI_ALU_SRC_CF);
> Comparing delta against target delay. Inverted store to give
> 	(delay - target) >= 0


I don't trust myself too much with ALU changes ;)


>
>> +       /*
>> +        * Transfer the result into the predicate register to be used for the
>> +        * predicated jump.
>> +        */
>> +       *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
>> +       *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(5));
>> +       *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
>> +
>> +       /* Predicate the jump.  */
>> +       *cs++ = (INTEL_GEN(i915) < 8 ?
>> +                MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
>> +               MI_BATCH_PREDICATE;
> The joy of being on rcs.


You'll be delighted to know about this new TGL generation where all CS 
(afaict) have graduated from predication.


>
>> +       *cs++ = vma->node.start + (jump - batch) * 4;
> Mutters i915_ggtt_offset(vma)
>
>> +       *cs++ = 0;
>> +
>> +       /* Restore registers. */
>> +       for (i = 0; i <= 5; i++) {
>> +               cs = restore_register(i915, cs, HSW_CS_GPR(i),
>> +                                     INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
>> +       }
>> +       cs = restore_register(i915, cs, MI_PREDICATE_RESULT_1,
>> +                             INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
>> +
>> +       /* And return to the ring. */
>> +       *cs++ = MI_BATCH_BUFFER_END;
> GEM_BUG_ON(cs - batch > PAGE_SIZE/sizeof(*batch));


Done!


>
>> +
>> +       i915_gem_object_flush_map(bo);
>> +       i915_gem_object_unpin_map(bo);
>> +
>> +       i915->perf.oa.noa_wait = vma;
>> +
>> +       return 0;
>> +
>> +err_unpin:
>> +       __i915_vma_unpin(vma);
>> +
>> +err_unref:
>> +       i915_gem_object_put(bo);
>> +
>> +       return ret;
>> +}
> Certainly seems to do what you say on the tin.
> -Chris
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index e7eff9db343e..4a66af38c87b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -151,6 +151,7 @@ 
 #define   MI_BATCH_GTT		    (2<<6) /* aliased with (1<<7) on gen4 */
 #define MI_BATCH_BUFFER_START_GEN8	MI_INSTR(0x31, 1)
 #define   MI_BATCH_RESOURCE_STREAMER (1<<10)
+#define   MI_BATCH_PREDICATE         (1 << 15) /* HSW+ on RCS only */
 
 /*
  * 3D instructions used by the kernel
@@ -226,6 +227,29 @@ 
 #define   PIPE_CONTROL_DEPTH_CACHE_FLUSH		(1<<0)
 #define   PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */
 
+#define MI_MATH(x) MI_INSTR(0x1a, (x)-1)
+#define   MI_ALU_OP(op, src1, src2) (((op) << 20) | ((src1) << 10) | (src2))
+/* operands */
+#define   MI_ALU_OP_NOOP     0
+#define   MI_ALU_OP_LOAD     128
+#define   MI_ALU_OP_LOADINV  1152
+#define   MI_ALU_OP_LOAD0    129
+#define   MI_ALU_OP_LOAD1    1153
+#define   MI_ALU_OP_ADD      256
+#define   MI_ALU_OP_SUB      257
+#define   MI_ALU_OP_AND      258
+#define   MI_ALU_OP_OR       259
+#define   MI_ALU_OP_XOR      260
+#define   MI_ALU_OP_STORE    384
+#define   MI_ALU_OP_STOREINV 1408
+/* sources */
+#define   MI_ALU_SRC_REG(x)  (x) /* 0 -> 15 */
+#define   MI_ALU_SRC_SRCA    32
+#define   MI_ALU_SRC_SRCB    33
+#define   MI_ALU_SRC_ACCU    49
+#define   MI_ALU_SRC_ZF      50
+#define   MI_ALU_SRC_CF      51
+
 /*
  * Commands used only by the command parser
  */
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
index e625a5e320d3..0750ac49a05b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
@@ -70,6 +70,11 @@  enum intel_gt_scratch_field {
 	/* 8 bytes */
 	INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256,
 
+	/* 6 * 8 bytes */
+	INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048,
+
+	/* 4 bytes */
+	INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096,
 };
 
 #endif /* __INTEL_GT_TYPES_H__ */
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index eeecdad0e3ca..6b49fda145e7 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3646,6 +3646,30 @@  DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
 			i915_wedged_get, i915_wedged_set,
 			"%llu\n");
 
+static int
+i915_perf_noa_delay_set(void *data, u64 val)
+{
+	struct drm_i915_private *i915 = data;
+
+	atomic64_set(&i915->perf.oa.noa_programming_delay, val);
+	return 0;
+}
+
+static int
+i915_perf_noa_delay_get(void *data, u64 *val)
+{
+	struct drm_i915_private *i915 = data;
+
+	*val = atomic64_read(&i915->perf.oa.noa_programming_delay);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
+			i915_perf_noa_delay_get,
+			i915_perf_noa_delay_set,
+			"%llu\n");
+
+
 #define DROP_UNBOUND	BIT(0)
 #define DROP_BOUND	BIT(1)
 #define DROP_RETIRE	BIT(2)
@@ -4411,6 +4435,7 @@  static const struct i915_debugfs_files {
 	const char *name;
 	const struct file_operations *fops;
 } i915_debugfs_files[] = {
+	{"i915_perf_noa_delay", &i915_perf_noa_delay_fops},
 	{"i915_wedged", &i915_wedged_fops},
 	{"i915_cache_sharing", &i915_cache_sharing_fops},
 	{"i915_gem_drop_caches", &i915_drop_caches_fops},
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index df39b2ee6bd9..fe93a260bd28 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1837,6 +1837,14 @@  struct drm_i915_private {
 
 			struct i915_oa_ops ops;
 			const struct i915_oa_format *oa_formats;
+
+			/**
+			 * A batch buffer doing a wait on the GPU for the NOA
+			 * logic to be reprogrammed.
+			 */
+			struct i915_vma *noa_wait;
+
+			atomic64_t noa_programming_delay;
 		} oa;
 	} perf;
 
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 5ba771468078..03e6908282e3 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -197,6 +197,7 @@ 
 
 #include "gem/i915_gem_context.h"
 #include "gem/i915_gem_pm.h"
+#include "gt/intel_gt.h"
 #include "gt/intel_lrc_reg.h"
 
 #include "i915_drv.h"
@@ -429,7 +430,7 @@  static int alloc_oa_config_buffer(struct drm_i915_private *i915,
 					      MI_LOAD_REGISTER_IMM_MAX_REGS) * 4;
 		config_length += oa_config->flex_regs_len * 8;
 	}
-	config_length += 4; /* MI_BATCH_BUFFER_END */
+	config_length += 12; /* MI_BATCH_BUFFER_START into noa_wait loop */
 	config_length = ALIGN(config_length, I915_GTT_PAGE_SIZE);
 
 	bo = i915_gem_object_create_shmem(i915, config_length);
@@ -446,7 +447,12 @@  static int alloc_oa_config_buffer(struct drm_i915_private *i915,
 	cs = write_cs_mi_lri(cs, oa_config->b_counter_regs, oa_config->b_counter_regs_len);
 	cs = write_cs_mi_lri(cs, oa_config->flex_regs, oa_config->flex_regs_len);
 
-	*cs++ = MI_BATCH_BUFFER_END;
+
+	/* Jump into the NOA wait busy loop. */
+	*cs++ = (INTEL_GEN(i915) < 8 ?
+		 MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8);
+	*cs++ = i915_ggtt_offset(i915->perf.oa.noa_wait);
+	*cs++ = 0;
 
 	i915_gem_object_flush_map(bo);
 	i915_gem_object_unpin_map(bo);
@@ -1467,6 +1473,7 @@  static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 	mutex_lock(&dev_priv->drm.struct_mutex);
 	dev_priv->perf.oa.exclusive_stream = NULL;
 	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
+	i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
 	free_oa_buffer(dev_priv);
@@ -1653,6 +1660,204 @@  static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 	return ret;
 }
 
+static u32 *save_register(struct drm_i915_private *i915, u32 *cs,
+			  i915_reg_t reg, u32 offset, u32 dword_count)
+{
+	uint32_t d;
+
+	for (d = 0; d < dword_count; d++) {
+		*cs++ = INTEL_GEN(i915) >= 8 ?
+			MI_STORE_REGISTER_MEM_GEN8 : MI_STORE_REGISTER_MEM;
+		*cs++ = i915_mmio_reg_offset(reg) + 4 * d;
+		*cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d;
+		if (INTEL_GEN(i915) >= 8)
+			*cs++ = 0;
+	}
+
+	return cs;
+}
+
+static u32 *restore_register(struct drm_i915_private *i915, u32 *cs,
+			     i915_reg_t reg, u32 offset, u32 dword_count)
+{
+	uint32_t d;
+
+	for (d = 0; d < dword_count; d++) {
+		*cs++ = INTEL_GEN(i915) >= 8 ?
+			MI_LOAD_REGISTER_MEM_GEN8 : MI_LOAD_REGISTER_MEM;
+		*cs++ = i915_mmio_reg_offset(reg);
+		*cs++ = intel_gt_scratch_offset(&i915->gt, offset);
+		if (INTEL_GEN(i915) >= 8)
+			*cs++ = 0;
+	}
+
+	return cs;
+}
+
+static int alloc_noa_wait(struct drm_i915_private *i915)
+{
+	struct drm_i915_gem_object *bo;
+	struct i915_vma *vma;
+	u64 delay_ns = atomic64_read(&i915->perf.oa.noa_programming_delay), delay_ticks;
+	u32 *batch, *ts0, *cs, *jump;
+	int ret, i;
+
+	bo = i915_gem_object_create_shmem(i915, 4096);
+	if (IS_ERR(bo)) {
+		DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
+		return PTR_ERR(bo);
+	}
+
+	/*
+	 * We pin the vma in GGTT because multiple OA config BOs will jump
+	 * into this buffer, so its address needs to stay fixed for the
+	 * lifetime of the i915/perf stream.
+	 */
+	vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0);
+	if (IS_ERR(vma)) {
+		ret = PTR_ERR(vma);
+		goto err_unref;
+	}
+
+	batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
+	if (IS_ERR(batch)) {
+		ret = PTR_ERR(batch);
+		goto err_unpin;
+	}
+
+	/* Save registers. */
+	for (i = 0; i <= 5; i++) {
+		cs = save_register(i915, cs, HSW_CS_GPR(i),
+				   INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
+	}
+	cs = save_register(i915, cs, MI_PREDICATE_RESULT_1,
+			   INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
+
+	/* First timestamp snapshot location. */
+	ts0 = cs;
+
+	/*
+	 * Initial snapshot of the timestamp register to implement the wait.
+	 * We work with 32b values, so clear out the top 32 bits of the
+	 * register because the ALU operates on 64 bits.
+	 */
+	*cs++ = MI_LOAD_REGISTER_IMM(1);
+	*cs++ = i915_mmio_reg_offset(HSW_CS_GPR(0)) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
+	*cs++ = i915_mmio_reg_offset(HSW_CS_GPR(0));
+
+	/*
+	 * This is the location we're going to jump back into until the
+	 * required amount of time has passed.
+	 */
+	jump = cs;
+
+	/*
+	 * Take another snapshot of the timestamp register. Take care to clear
+	 * up the top 32bits of CS_GPR(1) as we're using it for other
+	 * operations below.
+	 */
+	*cs++ = MI_LOAD_REGISTER_IMM(1);
+	*cs++ = i915_mmio_reg_offset(HSW_CS_GPR(1)) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
+	*cs++ = i915_mmio_reg_offset(HSW_CS_GPR(1));
+
+	/*
+	 * Do a diff between the 2 timestamps and store the result back into
+	 * CS_GPR(1).
+	 */
+	*cs++ = MI_MATH(5);
+	*cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(1));
+	*cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(0));
+	*cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0);
+	*cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(2), MI_ALU_SRC_ACCU);
+	*cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(3), MI_ALU_SRC_CF);
+
+	/*
+	 * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
+	 * timestamp has rolled over the 32 bits) into the predicate
+	 * register to be used for the predicated jump.
+	 */
+	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+	*cs++ = i915_mmio_reg_offset(HSW_CS_GPR(3));
+	*cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
+
+	/* Restart from the beginning if we had timestamps roll over. */
+	*cs++ = (INTEL_GEN(i915) < 8 ?
+		 MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
+		MI_BATCH_PREDICATE;
+	*cs++ = vma->node.start;
+	*cs++ = 0;
+
+	/*
+	 * Now take the diff between the two previous timestamps and add it
+	 * to :
+	 *      ((1 << 64) - 1) - delay (in timestamp ticks)
+	 *
+	 * When the Carry Flag contains 1 this means the elapsed time is
+	 * longer than the expected delay, and we can exit the wait loop.
+	 */
+	delay_ticks = 0xffffffffffffffff -
+		DIV64_U64_ROUND_UP(delay_ns *
+				   RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
+				   1000000ull);
+	*cs++ = MI_LOAD_REGISTER_IMM(2);
+	*cs++ = i915_mmio_reg_offset(HSW_CS_GPR(4));
+	*cs++ = lower_32_bits(delay_ticks);
+	*cs++ = i915_mmio_reg_offset(HSW_CS_GPR(4)) + 4;
+	*cs++ = upper_32_bits(delay_ticks);
+
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(2));
+	*cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(4));
+	*cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0);
+	*cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(5), MI_ALU_SRC_CF);
+
+	/*
+	 * Transfer the result into the predicate register to be used for the
+	 * predicated jump.
+	 */
+	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+	*cs++ = i915_mmio_reg_offset(HSW_CS_GPR(5));
+	*cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
+
+	/* Predicate the jump.  */
+	*cs++ = (INTEL_GEN(i915) < 8 ?
+		 MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
+		MI_BATCH_PREDICATE;
+	*cs++ = vma->node.start + (jump - batch) * 4;
+	*cs++ = 0;
+
+	/* Restore registers. */
+	for (i = 0; i <= 5; i++) {
+		cs = restore_register(i915, cs, HSW_CS_GPR(i),
+				      INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
+	}
+	cs = restore_register(i915, cs, MI_PREDICATE_RESULT_1,
+			      INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
+
+	/* And return to the ring. */
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	i915_gem_object_flush_map(bo);
+	i915_gem_object_unpin_map(bo);
+
+	i915->perf.oa.noa_wait = vma;
+
+	return 0;
+
+err_unpin:
+	__i915_vma_unpin(vma);
+
+err_unref:
+	i915_gem_object_put(bo);
+
+	return ret;
+}
+
 static void config_oa_regs(struct drm_i915_private *dev_priv,
 			   const struct i915_oa_reg *regs,
 			   u32 n_regs)
@@ -2221,6 +2426,12 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		goto err_config;
 	}
 
+	ret = alloc_noa_wait(dev_priv);
+	if (ret) {
+		DRM_DEBUG("Unable to allocate NOA wait batch buffer\n");
+		goto err_noa_wait_alloc;
+	}
+
 	/* PRM - observability performance counters:
 	 *
 	 *   OACONTROL, performance counter enable, note:
@@ -2273,6 +2484,13 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
 	intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref);
 
+	mutex_lock(&dev_priv->drm.struct_mutex);
+	i915_vma_unpin_and_release(&dev_priv->perf.oa.noa_wait, 0);
+	mutex_unlock(&dev_priv->drm.struct_mutex);
+
+err_noa_wait_alloc:
+	i915_oa_config_put(stream->oa_config);
+
 err_config:
 	if (stream->ctx)
 		oa_put_render_ctx_id(stream);
@@ -3649,6 +3867,9 @@  void i915_perf_init(struct drm_i915_private *dev_priv)
 		mutex_init(&dev_priv->perf.metrics_lock);
 		idr_init(&dev_priv->perf.metrics_idr);
 
+		atomic64_set(&dev_priv->perf.oa.noa_programming_delay,
+			     500 * 1000 /* 500us */);
+
 		dev_priv->perf.initialized = true;
 	}
 }
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 7e6009cefb18..e12b2fccef70 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -567,7 +567,9 @@  static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define MI_PREDICATE_SRC0_UDW	_MMIO(0x2400 + 4)
 #define MI_PREDICATE_SRC1	_MMIO(0x2408)
 #define MI_PREDICATE_SRC1_UDW	_MMIO(0x2408 + 4)
-
+#define MI_PREDICATE_DATA       _MMIO(0x2410)
+#define MI_PREDICATE_RESULT     _MMIO(0x2418)
+#define MI_PREDICATE_RESULT_1   _MMIO(0x241c)
 #define MI_PREDICATE_RESULT_2	_MMIO(0x2214)
 #define  LOWER_SLICE_ENABLED	(1 << 0)
 #define  LOWER_SLICE_DISABLED	(0 << 0)