Message ID | 1434735435-14728-7-git-send-email-arun.siluvery@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 19/06/2015 18:37, Arun Siluvery wrote: > In Per context w/a batch buffer, > WaRsRestoreWithPerCtxtBb > > This WA performs writes to the scratch page, so it must be valid; this check > is performed before initializing the batch with this WA. > > v2: This patch modifies the definitions of MI_LOAD_REGISTER_MEM and > MI_LOAD_REGISTER_REG; add GEN8-specific defines for these instructions > so as to not break any future users of the existing definitions (Michel) > > v3: The length defined in the current definitions of the LRM and LRR instructions was specified > as 0. It seems this is a common convention for instructions whose length varies between > platforms. This has not been an issue so far because they are not used anywhere except > the command parser; now that we use them in this patch, update them with the correct length > and also move them out of the command parser placeholder to an appropriate place. > Remove unnecessary padding and follow the WA programming sequence exactly > as mentioned in the spec, which is essential for this WA (Dave). > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Dave Gordon <david.s.gordon@intel.com> > Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com> > Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com> > --- > drivers/gpu/drm/i915/i915_reg.h | 29 +++++++++++++++++++-- > drivers/gpu/drm/i915/intel_lrc.c | 54 ++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 81 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > index 7637e64..208620d 100644 > --- a/drivers/gpu/drm/i915/i915_reg.h > +++ b/drivers/gpu/drm/i915/i915_reg.h > @@ -347,6 +347,31 @@ > #define MI_INVALIDATE_BSD (1<<7) > #define MI_FLUSH_DW_USE_GTT (1<<2) > #define MI_FLUSH_DW_USE_PPGTT (0<<2) > +#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1) > +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2) > +#define MI_LRM_USE_GLOBAL_GTT (1<<22) > +#define MI_LRM_ASYNC_MODE_ENABLE (1<<21) > +#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1) > +#define 
MI_ATOMIC(len) MI_INSTR(0x2F, (len-2)) > +#define MI_ATOMIC_MEMORY_TYPE_GGTT (1<<22) > +#define MI_ATOMIC_INLINE_DATA (1<<18) > +#define MI_ATOMIC_CS_STALL (1<<17) > +#define MI_ATOMIC_RETURN_DATA_CTL (1<<16) > +#define MI_ATOMIC_OP_MASK(op) ((op) << 8) > +#define MI_ATOMIC_AND MI_ATOMIC_OP_MASK(0x01) > +#define MI_ATOMIC_OR MI_ATOMIC_OP_MASK(0x02) > +#define MI_ATOMIC_XOR MI_ATOMIC_OP_MASK(0x03) > +#define MI_ATOMIC_MOVE MI_ATOMIC_OP_MASK(0x04) > +#define MI_ATOMIC_INC MI_ATOMIC_OP_MASK(0x05) > +#define MI_ATOMIC_DEC MI_ATOMIC_OP_MASK(0x06) > +#define MI_ATOMIC_ADD MI_ATOMIC_OP_MASK(0x07) > +#define MI_ATOMIC_SUB MI_ATOMIC_OP_MASK(0x08) > +#define MI_ATOMIC_RSUB MI_ATOMIC_OP_MASK(0x09) > +#define MI_ATOMIC_IMAX MI_ATOMIC_OP_MASK(0x0A) > +#define MI_ATOMIC_IMIN MI_ATOMIC_OP_MASK(0x0B) > +#define MI_ATOMIC_UMAX MI_ATOMIC_OP_MASK(0x0C) > +#define MI_ATOMIC_UMIN MI_ATOMIC_OP_MASK(0x0D) > + > #define MI_BATCH_BUFFER MI_INSTR(0x30, 1) > #define MI_BATCH_NON_SECURE (1) > /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. 
*/ > @@ -451,8 +476,6 @@ > #define MI_CLFLUSH MI_INSTR(0x27, 0) > #define MI_REPORT_PERF_COUNT MI_INSTR(0x28, 0) > #define MI_REPORT_PERF_COUNT_GGTT (1<<0) > -#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 0) > -#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 0) > #define MI_RS_STORE_DATA_IMM MI_INSTR(0x2B, 0) > #define MI_LOAD_URB_MEM MI_INSTR(0x2C, 0) > #define MI_STORE_URB_MEM MI_INSTR(0x2D, 0) > @@ -1799,6 +1822,8 @@ enum skl_disp_power_wells { > #define GEN8_RC_SEMA_IDLE_MSG_DISABLE (1 << 12) > #define GEN8_FF_DOP_CLOCK_GATE_DISABLE (1<<10) > > +#define GEN8_RS_PREEMPT_STATUS 0x215C > + > /* Fuse readout registers for GT */ > #define CHV_FUSE_GT (VLV_DISPLAY_BASE + 0x2168) > #define CHV_FGT_DISABLE_SS0 (1 << 10) > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index 664455c..28198c4 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -1215,11 +1215,65 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring, > uint32_t *const batch, > uint32_t *offset) > { > + uint32_t scratch_addr; > uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); > > + /* Actual scratch location is at 128 bytes offset */ > + scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES; > + scratch_addr |= PIPE_CONTROL_GLOBAL_GTT; > + Daniel, could you please remove the following line when applying this patch? Sorry for the additional work. > + scratch_addr |= PIPE_CONTROL_GLOBAL_GTT; Regards, Arun > /* WaDisableCtxRestoreArbitration:bdw,chv */ > wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE); > > + /* > + * As per Bspec, to workaround a known HW issue, SW must perform the > + * below programming sequence prior to programming MI_BATCH_BUFFER_END. > + * > + * This is only applicable for Gen8. 
> + */ > + > + /* WaRsRestoreWithPerCtxtBb:bdw,chv */ > + wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1)); > + wa_ctx_emit(batch, INSTPM); > + wa_ctx_emit(batch, _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING)); > + > + wa_ctx_emit(batch, (MI_ATOMIC(5) | > + MI_ATOMIC_MEMORY_TYPE_GGTT | > + MI_ATOMIC_INLINE_DATA | > + MI_ATOMIC_CS_STALL | > + MI_ATOMIC_RETURN_DATA_CTL | > + MI_ATOMIC_MOVE)); > + wa_ctx_emit(batch, scratch_addr); > + wa_ctx_emit(batch, 0); > + wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); > + wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); > + > + /* > + * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and > + * MI_BATCH_BUFFER_END instructions in this sequence need to be > + * in the same cacheline. To satisfy this case even if more WA are > + * added in future, pad current cacheline and start remaining sequence > + * in new cacheline. > + */ > + while (index % CACHELINE_DWORDS) > + wa_ctx_emit(batch, MI_NOOP); > + > + wa_ctx_emit(batch, (MI_LOAD_REGISTER_MEM_GEN8 | > + MI_LRM_USE_GLOBAL_GTT | > + MI_LRM_ASYNC_MODE_ENABLE)); > + wa_ctx_emit(batch, INSTPM); > + wa_ctx_emit(batch, scratch_addr); > + wa_ctx_emit(batch, 0); > + > + /* > + * BSpec says there should not be any commands programmed > + * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so > + * do not add any new commands > + */ > + wa_ctx_emit(batch, MI_LOAD_REGISTER_REG); > + wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS); > + wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS); > + > wa_ctx_emit(batch, MI_BATCH_BUFFER_END); > > return wa_ctx_end(wa_ctx, *offset = index, 1); >
On Fri, Jun 19, 2015 at 06:37:15PM +0100, Arun Siluvery wrote: > In Per context w/a batch buffer, > WaRsRestoreWithPerCtxtBb > > This WA performs writes to the scratch page, so it must be valid; this check > is performed before initializing the batch with this WA. > > v2: This patch modifies the definitions of MI_LOAD_REGISTER_MEM and > MI_LOAD_REGISTER_REG; add GEN8-specific defines for these instructions > so as to not break any future users of the existing definitions (Michel) > > v3: The length defined in the current definitions of the LRM and LRR instructions was specified > as 0. It seems this is a common convention for instructions whose length varies between > platforms. This has not been an issue so far because they are not used anywhere except > the command parser; now that we use them in this patch, update them with the correct length > and also move them out of the command parser placeholder to an appropriate place. > Remove unnecessary padding and follow the WA programming sequence exactly > as mentioned in the spec, which is essential for this WA (Dave). 
> > Cc: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Dave Gordon <david.s.gordon@intel.com> > Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com> > Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com> > --- > drivers/gpu/drm/i915/i915_reg.h | 29 +++++++++++++++++++-- > drivers/gpu/drm/i915/intel_lrc.c | 54 ++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 81 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > index 7637e64..208620d 100644 > --- a/drivers/gpu/drm/i915/i915_reg.h > +++ b/drivers/gpu/drm/i915/i915_reg.h > @@ -347,6 +347,31 @@ > #define MI_INVALIDATE_BSD (1<<7) > #define MI_FLUSH_DW_USE_GTT (1<<2) > #define MI_FLUSH_DW_USE_PPGTT (0<<2) > +#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1) > +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2) > +#define MI_LRM_USE_GLOBAL_GTT (1<<22) > +#define MI_LRM_ASYNC_MODE_ENABLE (1<<21) > +#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1) > +#define MI_ATOMIC(len) MI_INSTR(0x2F, (len-2)) > +#define MI_ATOMIC_MEMORY_TYPE_GGTT (1<<22) > +#define MI_ATOMIC_INLINE_DATA (1<<18) > +#define MI_ATOMIC_CS_STALL (1<<17) > +#define MI_ATOMIC_RETURN_DATA_CTL (1<<16) > +#define MI_ATOMIC_OP_MASK(op) ((op) << 8) > +#define MI_ATOMIC_AND MI_ATOMIC_OP_MASK(0x01) > +#define MI_ATOMIC_OR MI_ATOMIC_OP_MASK(0x02) > +#define MI_ATOMIC_XOR MI_ATOMIC_OP_MASK(0x03) > +#define MI_ATOMIC_MOVE MI_ATOMIC_OP_MASK(0x04) > +#define MI_ATOMIC_INC MI_ATOMIC_OP_MASK(0x05) > +#define MI_ATOMIC_DEC MI_ATOMIC_OP_MASK(0x06) > +#define MI_ATOMIC_ADD MI_ATOMIC_OP_MASK(0x07) > +#define MI_ATOMIC_SUB MI_ATOMIC_OP_MASK(0x08) > +#define MI_ATOMIC_RSUB MI_ATOMIC_OP_MASK(0x09) > +#define MI_ATOMIC_IMAX MI_ATOMIC_OP_MASK(0x0A) > +#define MI_ATOMIC_IMIN MI_ATOMIC_OP_MASK(0x0B) > +#define MI_ATOMIC_UMAX MI_ATOMIC_OP_MASK(0x0C) > +#define MI_ATOMIC_UMIN MI_ATOMIC_OP_MASK(0x0D) > + > #define MI_BATCH_BUFFER MI_INSTR(0x30, 1) > #define MI_BATCH_NON_SECURE (1) > /* for snb/ivb/vlv 
this also means "batch in ppgtt" when ppgtt is enabled. */ > @@ -451,8 +476,6 @@ > #define MI_CLFLUSH MI_INSTR(0x27, 0) > #define MI_REPORT_PERF_COUNT MI_INSTR(0x28, 0) > #define MI_REPORT_PERF_COUNT_GGTT (1<<0) > -#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 0) > -#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 0) > #define MI_RS_STORE_DATA_IMM MI_INSTR(0x2B, 0) > #define MI_LOAD_URB_MEM MI_INSTR(0x2C, 0) > #define MI_STORE_URB_MEM MI_INSTR(0x2D, 0) > @@ -1799,6 +1822,8 @@ enum skl_disp_power_wells { > #define GEN8_RC_SEMA_IDLE_MSG_DISABLE (1 << 12) > #define GEN8_FF_DOP_CLOCK_GATE_DISABLE (1<<10) > > +#define GEN8_RS_PREEMPT_STATUS 0x215C > + > /* Fuse readout registers for GT */ > #define CHV_FUSE_GT (VLV_DISPLAY_BASE + 0x2168) > #define CHV_FGT_DISABLE_SS0 (1 << 10) > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index 664455c..28198c4 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -1215,11 +1215,65 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring, > uint32_t *const batch, > uint32_t *offset) > { > + uint32_t scratch_addr; > uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); > > + /* Actual scratch location is at 128 bytes offset */ > + scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES; > + scratch_addr |= PIPE_CONTROL_GLOBAL_GTT; > + > /* WaDisableCtxRestoreArbitration:bdw,chv */ > wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE); > > + /* > + * As per Bspec, to workaround a known HW issue, SW must perform the > + * below programming sequence prior to programming MI_BATCH_BUFFER_END. > + * > + * This is only applicable for Gen8. > + */ > + > + /* WaRsRestoreWithPerCtxtBb:bdw,chv */ This w/a doesn't seem to be needed for CHV. Also BDW seems to have gained a chicken bit in H0 (FF_SLICE_CS_CHICKEN3[5]) that supposedly means we shouldn't need this w/a on BDW either. 
> + wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1)); > + wa_ctx_emit(batch, INSTPM); > + wa_ctx_emit(batch, _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING)); > + > + wa_ctx_emit(batch, (MI_ATOMIC(5) | > + MI_ATOMIC_MEMORY_TYPE_GGTT | > + MI_ATOMIC_INLINE_DATA | > + MI_ATOMIC_CS_STALL | > + MI_ATOMIC_RETURN_DATA_CTL | > + MI_ATOMIC_MOVE)); > + wa_ctx_emit(batch, scratch_addr); > + wa_ctx_emit(batch, 0); > + wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); > + wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); > + > + /* > + * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and > + * MI_BATCH_BUFFER_END instructions in this sequence need to be > + * in the same cacheline. To satisfy this case even if more WA are > + * added in future, pad current cacheline and start remaining sequence > + * in new cacheline. > + */ > + while (index % CACHELINE_DWORDS) > + wa_ctx_emit(batch, MI_NOOP); > + > + wa_ctx_emit(batch, (MI_LOAD_REGISTER_MEM_GEN8 | > + MI_LRM_USE_GLOBAL_GTT | > + MI_LRM_ASYNC_MODE_ENABLE)); > + wa_ctx_emit(batch, INSTPM); > + wa_ctx_emit(batch, scratch_addr); > + wa_ctx_emit(batch, 0); > + > + /* > + * BSpec says there should not be any commands programmed > + * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so > + * do not add any new commands > + */ > + wa_ctx_emit(batch, MI_LOAD_REGISTER_REG); > + wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS); > + wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS); > + > wa_ctx_emit(batch, MI_BATCH_BUFFER_END); > > return wa_ctx_end(wa_ctx, *offset = index, 1); > -- > 2.3.0 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
On 22/06/2015 17:21, Ville Syrjälä wrote: > On Fri, Jun 19, 2015 at 06:37:15PM +0100, Arun Siluvery wrote: >> In Per context w/a batch buffer, >> WaRsRestoreWithPerCtxtBb >> >> This WA performs writes to the scratch page, so it must be valid; this check >> is performed before initializing the batch with this WA. >> >> v2: This patch modifies the definitions of MI_LOAD_REGISTER_MEM and >> MI_LOAD_REGISTER_REG; add GEN8-specific defines for these instructions >> so as to not break any future users of the existing definitions (Michel) >> >> v3: The length defined in the current definitions of the LRM and LRR instructions was specified >> as 0. It seems this is a common convention for instructions whose length varies between >> platforms. This has not been an issue so far because they are not used anywhere except >> the command parser; now that we use them in this patch, update them with the correct length >> and also move them out of the command parser placeholder to an appropriate place. >> Remove unnecessary padding and follow the WA programming sequence exactly >> as mentioned in the spec, which is essential for this WA (Dave). 
>> >> Cc: Chris Wilson <chris@chris-wilson.co.uk> >> Cc: Dave Gordon <david.s.gordon@intel.com> >> Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com> >> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com> >> --- >> drivers/gpu/drm/i915/i915_reg.h | 29 +++++++++++++++++++-- >> drivers/gpu/drm/i915/intel_lrc.c | 54 ++++++++++++++++++++++++++++++++++++++++ >> 2 files changed, 81 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h >> index 7637e64..208620d 100644 >> --- a/drivers/gpu/drm/i915/i915_reg.h >> +++ b/drivers/gpu/drm/i915/i915_reg.h >> @@ -347,6 +347,31 @@ >> #define MI_INVALIDATE_BSD (1<<7) >> #define MI_FLUSH_DW_USE_GTT (1<<2) >> #define MI_FLUSH_DW_USE_PPGTT (0<<2) >> +#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1) >> +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2) >> +#define MI_LRM_USE_GLOBAL_GTT (1<<22) >> +#define MI_LRM_ASYNC_MODE_ENABLE (1<<21) >> +#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1) >> +#define MI_ATOMIC(len) MI_INSTR(0x2F, (len-2)) >> +#define MI_ATOMIC_MEMORY_TYPE_GGTT (1<<22) >> +#define MI_ATOMIC_INLINE_DATA (1<<18) >> +#define MI_ATOMIC_CS_STALL (1<<17) >> +#define MI_ATOMIC_RETURN_DATA_CTL (1<<16) >> +#define MI_ATOMIC_OP_MASK(op) ((op) << 8) >> +#define MI_ATOMIC_AND MI_ATOMIC_OP_MASK(0x01) >> +#define MI_ATOMIC_OR MI_ATOMIC_OP_MASK(0x02) >> +#define MI_ATOMIC_XOR MI_ATOMIC_OP_MASK(0x03) >> +#define MI_ATOMIC_MOVE MI_ATOMIC_OP_MASK(0x04) >> +#define MI_ATOMIC_INC MI_ATOMIC_OP_MASK(0x05) >> +#define MI_ATOMIC_DEC MI_ATOMIC_OP_MASK(0x06) >> +#define MI_ATOMIC_ADD MI_ATOMIC_OP_MASK(0x07) >> +#define MI_ATOMIC_SUB MI_ATOMIC_OP_MASK(0x08) >> +#define MI_ATOMIC_RSUB MI_ATOMIC_OP_MASK(0x09) >> +#define MI_ATOMIC_IMAX MI_ATOMIC_OP_MASK(0x0A) >> +#define MI_ATOMIC_IMIN MI_ATOMIC_OP_MASK(0x0B) >> +#define MI_ATOMIC_UMAX MI_ATOMIC_OP_MASK(0x0C) >> +#define MI_ATOMIC_UMIN MI_ATOMIC_OP_MASK(0x0D) >> + >> #define MI_BATCH_BUFFER MI_INSTR(0x30, 1) >> #define 
MI_BATCH_NON_SECURE (1) >> /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */ >> @@ -451,8 +476,6 @@ >> #define MI_CLFLUSH MI_INSTR(0x27, 0) >> #define MI_REPORT_PERF_COUNT MI_INSTR(0x28, 0) >> #define MI_REPORT_PERF_COUNT_GGTT (1<<0) >> -#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 0) >> -#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 0) >> #define MI_RS_STORE_DATA_IMM MI_INSTR(0x2B, 0) >> #define MI_LOAD_URB_MEM MI_INSTR(0x2C, 0) >> #define MI_STORE_URB_MEM MI_INSTR(0x2D, 0) >> @@ -1799,6 +1822,8 @@ enum skl_disp_power_wells { >> #define GEN8_RC_SEMA_IDLE_MSG_DISABLE (1 << 12) >> #define GEN8_FF_DOP_CLOCK_GATE_DISABLE (1<<10) >> >> +#define GEN8_RS_PREEMPT_STATUS 0x215C >> + >> /* Fuse readout registers for GT */ >> #define CHV_FUSE_GT (VLV_DISPLAY_BASE + 0x2168) >> #define CHV_FGT_DISABLE_SS0 (1 << 10) >> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c >> index 664455c..28198c4 100644 >> --- a/drivers/gpu/drm/i915/intel_lrc.c >> +++ b/drivers/gpu/drm/i915/intel_lrc.c >> @@ -1215,11 +1215,65 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring, >> uint32_t *const batch, >> uint32_t *offset) >> { >> + uint32_t scratch_addr; >> uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); >> >> + /* Actual scratch location is at 128 bytes offset */ >> + scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES; >> + scratch_addr |= PIPE_CONTROL_GLOBAL_GTT; >> + >> /* WaDisableCtxRestoreArbitration:bdw,chv */ >> wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE); >> >> + /* >> + * As per Bspec, to workaround a known HW issue, SW must perform the >> + * below programming sequence prior to programming MI_BATCH_BUFFER_END. >> + * >> + * This is only applicable for Gen8. >> + */ >> + >> + /* WaRsRestoreWithPerCtxtBb:bdw,chv */ > > This w/a doesn't seem to be needed for CHV. 
Also BDW seems to have > gained a chicken bit in H0 (FF_SLICE_CS_CHICKEN3[5]) that supposedly > means we shouldn't need this w/a on BDW either. > It looks like this chicken bit applies this WA; if it works as expected, we can ignore this patch. I will try to get some confirmation on this. Regards, Arun >> + wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1)); >> + wa_ctx_emit(batch, INSTPM); >> + wa_ctx_emit(batch, _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING)); >> + >> + wa_ctx_emit(batch, (MI_ATOMIC(5) | >> + MI_ATOMIC_MEMORY_TYPE_GGTT | >> + MI_ATOMIC_INLINE_DATA | >> + MI_ATOMIC_CS_STALL | >> + MI_ATOMIC_RETURN_DATA_CTL | >> + MI_ATOMIC_MOVE)); >> + wa_ctx_emit(batch, scratch_addr); >> + wa_ctx_emit(batch, 0); >> + wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); >> + wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); >> + >> + /* >> + * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and >> + * MI_BATCH_BUFFER_END instructions in this sequence need to be >> + * in the same cacheline. To satisfy this case even if more WA are >> + * added in future, pad current cacheline and start remaining sequence >> + * in new cacheline. 
>> + */ >> + while (index % CACHELINE_DWORDS) >> + wa_ctx_emit(batch, MI_NOOP); >> + >> + wa_ctx_emit(batch, (MI_LOAD_REGISTER_MEM_GEN8 | >> + MI_LRM_USE_GLOBAL_GTT | >> + MI_LRM_ASYNC_MODE_ENABLE)); >> + wa_ctx_emit(batch, INSTPM); >> + wa_ctx_emit(batch, scratch_addr); >> + wa_ctx_emit(batch, 0); >> + >> + /* >> + * BSpec says there should not be any commands programmed >> + * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so >> + * do not add any new commands >> + */ >> + wa_ctx_emit(batch, MI_LOAD_REGISTER_REG); >> + wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS); >> + wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS); >> + >> wa_ctx_emit(batch, MI_BATCH_BUFFER_END); >> >> return wa_ctx_end(wa_ctx, *offset = index, 1); >> -- >> 2.3.0 >> >> _______________________________________________ >> Intel-gfx mailing list >> Intel-gfx@lists.freedesktop.org >> http://lists.freedesktop.org/mailman/listinfo/intel-gfx >
On 22/06/2015 17:59, Siluvery, Arun wrote: > On 22/06/2015 17:21, Ville Syrjälä wrote: >> On Fri, Jun 19, 2015 at 06:37:15PM +0100, Arun Siluvery wrote: >>> In Per context w/a batch buffer, >>> WaRsRestoreWithPerCtxtBb >>> >>> This WA performs writes to the scratch page, so it must be valid; this check >>> is performed before initializing the batch with this WA. >>> >>> v2: This patch modifies the definitions of MI_LOAD_REGISTER_MEM and >>> MI_LOAD_REGISTER_REG; add GEN8-specific defines for these instructions >>> so as to not break any future users of the existing definitions (Michel) >>> >>> v3: The length defined in the current definitions of the LRM and LRR instructions was specified >>> as 0. It seems this is a common convention for instructions whose length varies between >>> platforms. This has not been an issue so far because they are not used anywhere except >>> the command parser; now that we use them in this patch, update them with the correct length >>> and also move them out of the command parser placeholder to an appropriate place. >>> Remove unnecessary padding and follow the WA programming sequence exactly >>> as mentioned in the spec, which is essential for this WA (Dave). 
>>> >>> Cc: Chris Wilson <chris@chris-wilson.co.uk> >>> Cc: Dave Gordon <david.s.gordon@intel.com> >>> Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com> >>> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com> >>> --- >>> drivers/gpu/drm/i915/i915_reg.h | 29 +++++++++++++++++++-- >>> drivers/gpu/drm/i915/intel_lrc.c | 54 ++++++++++++++++++++++++++++++++++++++++ >>> 2 files changed, 81 insertions(+), 2 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h >>> index 7637e64..208620d 100644 >>> --- a/drivers/gpu/drm/i915/i915_reg.h >>> +++ b/drivers/gpu/drm/i915/i915_reg.h >>> @@ -347,6 +347,31 @@ >>> #define MI_INVALIDATE_BSD (1<<7) >>> #define MI_FLUSH_DW_USE_GTT (1<<2) >>> #define MI_FLUSH_DW_USE_PPGTT (0<<2) >>> +#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1) >>> +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2) >>> +#define MI_LRM_USE_GLOBAL_GTT (1<<22) >>> +#define MI_LRM_ASYNC_MODE_ENABLE (1<<21) >>> +#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1) >>> +#define MI_ATOMIC(len) MI_INSTR(0x2F, (len-2)) >>> +#define MI_ATOMIC_MEMORY_TYPE_GGTT (1<<22) >>> +#define MI_ATOMIC_INLINE_DATA (1<<18) >>> +#define MI_ATOMIC_CS_STALL (1<<17) >>> +#define MI_ATOMIC_RETURN_DATA_CTL (1<<16) >>> +#define MI_ATOMIC_OP_MASK(op) ((op) << 8) >>> +#define MI_ATOMIC_AND MI_ATOMIC_OP_MASK(0x01) >>> +#define MI_ATOMIC_OR MI_ATOMIC_OP_MASK(0x02) >>> +#define MI_ATOMIC_XOR MI_ATOMIC_OP_MASK(0x03) >>> +#define MI_ATOMIC_MOVE MI_ATOMIC_OP_MASK(0x04) >>> +#define MI_ATOMIC_INC MI_ATOMIC_OP_MASK(0x05) >>> +#define MI_ATOMIC_DEC MI_ATOMIC_OP_MASK(0x06) >>> +#define MI_ATOMIC_ADD MI_ATOMIC_OP_MASK(0x07) >>> +#define MI_ATOMIC_SUB MI_ATOMIC_OP_MASK(0x08) >>> +#define MI_ATOMIC_RSUB MI_ATOMIC_OP_MASK(0x09) >>> +#define MI_ATOMIC_IMAX MI_ATOMIC_OP_MASK(0x0A) >>> +#define MI_ATOMIC_IMIN MI_ATOMIC_OP_MASK(0x0B) >>> +#define MI_ATOMIC_UMAX MI_ATOMIC_OP_MASK(0x0C) >>> +#define MI_ATOMIC_UMIN MI_ATOMIC_OP_MASK(0x0D) >>> + >>> #define 
MI_BATCH_BUFFER MI_INSTR(0x30, 1) >>> #define MI_BATCH_NON_SECURE (1) >>> /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */ >>> @@ -451,8 +476,6 @@ >>> #define MI_CLFLUSH MI_INSTR(0x27, 0) >>> #define MI_REPORT_PERF_COUNT MI_INSTR(0x28, 0) >>> #define MI_REPORT_PERF_COUNT_GGTT (1<<0) >>> -#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 0) >>> -#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 0) >>> #define MI_RS_STORE_DATA_IMM MI_INSTR(0x2B, 0) >>> #define MI_LOAD_URB_MEM MI_INSTR(0x2C, 0) >>> #define MI_STORE_URB_MEM MI_INSTR(0x2D, 0) >>> @@ -1799,6 +1822,8 @@ enum skl_disp_power_wells { >>> #define GEN8_RC_SEMA_IDLE_MSG_DISABLE (1 << 12) >>> #define GEN8_FF_DOP_CLOCK_GATE_DISABLE (1<<10) >>> >>> +#define GEN8_RS_PREEMPT_STATUS 0x215C >>> + >>> /* Fuse readout registers for GT */ >>> #define CHV_FUSE_GT (VLV_DISPLAY_BASE + 0x2168) >>> #define CHV_FGT_DISABLE_SS0 (1 << 10) >>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c >>> index 664455c..28198c4 100644 >>> --- a/drivers/gpu/drm/i915/intel_lrc.c >>> +++ b/drivers/gpu/drm/i915/intel_lrc.c >>> @@ -1215,11 +1215,65 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring, >>> uint32_t *const batch, >>> uint32_t *offset) >>> { >>> + uint32_t scratch_addr; >>> uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); >>> >>> + /* Actual scratch location is at 128 bytes offset */ >>> + scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES; >>> + scratch_addr |= PIPE_CONTROL_GLOBAL_GTT; >>> + >>> /* WaDisableCtxRestoreArbitration:bdw,chv */ >>> wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE); >>> >>> + /* >>> + * As per Bspec, to workaround a known HW issue, SW must perform the >>> + * below programming sequence prior to programming MI_BATCH_BUFFER_END. >>> + * >>> + * This is only applicable for Gen8. >>> + */ >>> + >>> + /* WaRsRestoreWithPerCtxtBb:bdw,chv */ >> >> This w/a doesn't seem to be needed for CHV. 
Also BDW seems to have >> gained a chicken bit in H0 (FF_SLICE_CS_CHICKEN3[5]) that supposedly >> means we shouldn't need this w/a on BDW either. >> > looks like this chicken bit is applying this WA, if this is working as > expected then we can ignore this patch, I will try to get some > confirmation on this. I got confirmation from HW that the chicken bit is enough, so this patch can be ignored. Regards, Arun > > regards > Arun > >>> + wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1)); >>> + wa_ctx_emit(batch, INSTPM); >>> + wa_ctx_emit(batch, _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING)); >>> + >>> + wa_ctx_emit(batch, (MI_ATOMIC(5) | >>> + MI_ATOMIC_MEMORY_TYPE_GGTT | >>> + MI_ATOMIC_INLINE_DATA | >>> + MI_ATOMIC_CS_STALL | >>> + MI_ATOMIC_RETURN_DATA_CTL | >>> + MI_ATOMIC_MOVE)); >>> + wa_ctx_emit(batch, scratch_addr); >>> + wa_ctx_emit(batch, 0); >>> + wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); >>> + wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); >>> + >>> + /* >>> + * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and >>> + * MI_BATCH_BUFFER_END instructions in this sequence need to be >>> + * in the same cacheline. To satisfy this case even if more WA are >>> + * added in future, pad current cacheline and start remaining sequence >>> + * in new cacheline. 
>>> + */ >>> + while (index % CACHELINE_DWORDS) >>> + wa_ctx_emit(batch, MI_NOOP); >>> + >>> + wa_ctx_emit(batch, (MI_LOAD_REGISTER_MEM_GEN8 | >>> + MI_LRM_USE_GLOBAL_GTT | >>> + MI_LRM_ASYNC_MODE_ENABLE)); >>> + wa_ctx_emit(batch, INSTPM); >>> + wa_ctx_emit(batch, scratch_addr); >>> + wa_ctx_emit(batch, 0); >>> + >>> + /* >>> + * BSpec says there should not be any commands programmed >>> + * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so >>> + * do not add any new commands >>> + */ >>> + wa_ctx_emit(batch, MI_LOAD_REGISTER_REG); >>> + wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS); >>> + wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS); >>> + >>> wa_ctx_emit(batch, MI_BATCH_BUFFER_END); >>> >>> return wa_ctx_end(wa_ctx, *offset = index, 1); >>> -- >>> 2.3.0 >>> >>> _______________________________________________ >>> Intel-gfx mailing list >>> Intel-gfx@lists.freedesktop.org >>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx >> > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx >
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 7637e64..208620d 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -347,6 +347,31 @@ #define MI_INVALIDATE_BSD (1<<7) #define MI_FLUSH_DW_USE_GTT (1<<2) #define MI_FLUSH_DW_USE_PPGTT (0<<2) +#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1) +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2) +#define MI_LRM_USE_GLOBAL_GTT (1<<22) +#define MI_LRM_ASYNC_MODE_ENABLE (1<<21) +#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1) +#define MI_ATOMIC(len) MI_INSTR(0x2F, (len-2)) +#define MI_ATOMIC_MEMORY_TYPE_GGTT (1<<22) +#define MI_ATOMIC_INLINE_DATA (1<<18) +#define MI_ATOMIC_CS_STALL (1<<17) +#define MI_ATOMIC_RETURN_DATA_CTL (1<<16) +#define MI_ATOMIC_OP_MASK(op) ((op) << 8) +#define MI_ATOMIC_AND MI_ATOMIC_OP_MASK(0x01) +#define MI_ATOMIC_OR MI_ATOMIC_OP_MASK(0x02) +#define MI_ATOMIC_XOR MI_ATOMIC_OP_MASK(0x03) +#define MI_ATOMIC_MOVE MI_ATOMIC_OP_MASK(0x04) +#define MI_ATOMIC_INC MI_ATOMIC_OP_MASK(0x05) +#define MI_ATOMIC_DEC MI_ATOMIC_OP_MASK(0x06) +#define MI_ATOMIC_ADD MI_ATOMIC_OP_MASK(0x07) +#define MI_ATOMIC_SUB MI_ATOMIC_OP_MASK(0x08) +#define MI_ATOMIC_RSUB MI_ATOMIC_OP_MASK(0x09) +#define MI_ATOMIC_IMAX MI_ATOMIC_OP_MASK(0x0A) +#define MI_ATOMIC_IMIN MI_ATOMIC_OP_MASK(0x0B) +#define MI_ATOMIC_UMAX MI_ATOMIC_OP_MASK(0x0C) +#define MI_ATOMIC_UMIN MI_ATOMIC_OP_MASK(0x0D) + #define MI_BATCH_BUFFER MI_INSTR(0x30, 1) #define MI_BATCH_NON_SECURE (1) /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. 
*/ @@ -451,8 +476,6 @@ #define MI_CLFLUSH MI_INSTR(0x27, 0) #define MI_REPORT_PERF_COUNT MI_INSTR(0x28, 0) #define MI_REPORT_PERF_COUNT_GGTT (1<<0) -#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 0) -#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 0) #define MI_RS_STORE_DATA_IMM MI_INSTR(0x2B, 0) #define MI_LOAD_URB_MEM MI_INSTR(0x2C, 0) #define MI_STORE_URB_MEM MI_INSTR(0x2D, 0) @@ -1799,6 +1822,8 @@ enum skl_disp_power_wells { #define GEN8_RC_SEMA_IDLE_MSG_DISABLE (1 << 12) #define GEN8_FF_DOP_CLOCK_GATE_DISABLE (1<<10) +#define GEN8_RS_PREEMPT_STATUS 0x215C + /* Fuse readout registers for GT */ #define CHV_FUSE_GT (VLV_DISPLAY_BASE + 0x2168) #define CHV_FGT_DISABLE_SS0 (1 << 10) diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 664455c..28198c4 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1215,11 +1215,65 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring, uint32_t *const batch, uint32_t *offset) { + uint32_t scratch_addr; uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); + /* Actual scratch location is at 128 bytes offset */ + scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES; + scratch_addr |= PIPE_CONTROL_GLOBAL_GTT; + /* WaDisableCtxRestoreArbitration:bdw,chv */ wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE); + /* + * As per Bspec, to workaround a known HW issue, SW must perform the + * below programming sequence prior to programming MI_BATCH_BUFFER_END. + * + * This is only applicable for Gen8. 
+ */ + + /* WaRsRestoreWithPerCtxtBb:bdw,chv */ + wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1)); + wa_ctx_emit(batch, INSTPM); + wa_ctx_emit(batch, _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING)); + + wa_ctx_emit(batch, (MI_ATOMIC(5) | + MI_ATOMIC_MEMORY_TYPE_GGTT | + MI_ATOMIC_INLINE_DATA | + MI_ATOMIC_CS_STALL | + MI_ATOMIC_RETURN_DATA_CTL | + MI_ATOMIC_MOVE)); + wa_ctx_emit(batch, scratch_addr); + wa_ctx_emit(batch, 0); + wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); + wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); + + /* + * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and + * MI_BATCH_BUFFER_END instructions in this sequence need to be + * in the same cacheline. To satisfy this case even if more WA are + * added in future, pad current cacheline and start remaining sequence + * in new cacheline. + */ + while (index % CACHELINE_DWORDS) + wa_ctx_emit(batch, MI_NOOP); + + wa_ctx_emit(batch, (MI_LOAD_REGISTER_MEM_GEN8 | + MI_LRM_USE_GLOBAL_GTT | + MI_LRM_ASYNC_MODE_ENABLE)); + wa_ctx_emit(batch, INSTPM); + wa_ctx_emit(batch, scratch_addr); + wa_ctx_emit(batch, 0); + + /* + * BSpec says there should not be any commands programmed + * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so + * do not add any new commands + */ + wa_ctx_emit(batch, MI_LOAD_REGISTER_REG); + wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS); + wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS); + wa_ctx_emit(batch, MI_BATCH_BUFFER_END); return wa_ctx_end(wa_ctx, *offset = index, 1);