Message ID | 1424886882-8117-3-git-send-email-arun.siluvery@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Tested-By: PRC QA PRTS (Patch Regression Test System Contact: shuang.he@intel.com)
Task id: 5828
-------------------------------------Summary-------------------------------------
Platform Delta drm-intel-nightly Series Applied
PNV -1 281/281 280/281
ILK 308/308 308/308
SNB 326/326 326/326
IVB 380/380 380/380
BYT 294/294 294/294
HSW 387/421 387/421
BDW -1 316/316 315/316
-------------------------------------Detailed-------------------------------------
Platform Test drm-intel-nightly Series Applied
*PNV igt_gen3_render_mixed_blits PASS(7) CRASH(1) PASS(1)
*BDW igt_gem_gtt_hog PASS(12) DMESG_WARN(1) PASS(1)
Note: You need to pay more attention to lines starting with '*'.
On 25/02/15 17:54, Arun Siluvery wrote:
> Some of the workarounds are to be applied during context save but before
> restore and some at the end of context save/restore but before executing
> the instructions in the ring. Workaround batch buffers are created for
> this purpose as they cannot be applied using normal means. HW executes
> them at specific stages during context save/restore.
>
> In this method we initialize batch buffer with w/a commands and its address
> is supplied using context offset pointers when a context is initialized.
>
> This patch introduces indirect and per-context batch buffers using which
> following workarounds are applied. These are required to fix issues
> observed with preemption related workloads.
>
> In Indirect context w/a batch buffer,
> +WaDisableCtxRestoreArbitration
> +WaFlushCoherentL3CacheLinesAtContextSwitch
> +WaClearSlmSpaceAtContextSwitch
>
> In Per context w/a batch buffer,
> +WaDisableCtxRestoreArbitration
> +WaRsRestoreWithPerCtxtBb
>
> v2: Use GTT address type for all privileged instructions, update as
> per dynamic pinning changes, minor simplifications, rename variables
> as follows to keep lines under 80 chars and rebase.
> s/indirect_ctx_wa_ringbuf/indirect_ctx_wa_bb
> s/per_ctx_wa_ringbuf/per_ctx_wa_bb
>
> v3: Modify WA BB initialization to Gen specific.
>
> Change-Id: I0cedb536b7f6d9f10ba9e81ba625848e7bab603c
> Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |   3 +
>  drivers/gpu/drm/i915/i915_reg.h         |  30 +++-
>  drivers/gpu/drm/i915/intel_lrc.c        | 302 +++++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/intel_ringbuffer.h |   3 +
>  4 files changed, 297 insertions(+), 41 deletions(-)

[snip]

> +#define PIPE_CONTROL_FLUSH_RO_CACHES	(1<<27)

I think the consensus is to rename this to PIPE_CONTROL_FLUSH_L3, isn't it?

[snip]

> -#define MI_LOAD_REGISTER_MEM	MI_INSTR(0x29, 0)
> -#define MI_LOAD_REGISTER_REG	MI_INSTR(0x2A, 0)
> +#define MI_LOAD_REGISTER_MEM	MI_INSTR(0x29, 2)

Should this change be GEN8 only?

[snip]
On 02/03/2015 10:10, Michel Thierry wrote:
> On 25/02/15 17:54, Arun Siluvery wrote:
[snip]
>> +#define PIPE_CONTROL_FLUSH_RO_CACHES	(1<<27)
> I think the consensus is to rename this to PIPE_CONTROL_FLUSH_L3, isn't it?

Yes, it will be renamed to PIPE_CONTROL_FLUSH_L3 in v2.

[snip]
>> -#define MI_LOAD_REGISTER_MEM	MI_INSTR(0x29, 0)
>> -#define MI_LOAD_REGISTER_REG	MI_INSTR(0x2A, 0)
>> +#define MI_LOAD_REGISTER_MEM	MI_INSTR(0x29, 2)
> Should this change be GEN8 only?

At this point no one is using MI_LOAD_REGISTER_MEM but to avoid breaking any
future users I will keep the previous definition as it is and use a different
define for Gen8 alone.

regards
Arun

[snip remainder of quoted patch]
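The two resolutions above imply a small follow-up to the i915_reg.h hunk. A minimal sketch of that direction is below; the GEN8-specific macro name is an assumption (Arun does not name it in this thread), while the opcodes and dword counts are taken from the patch:

	/* Sketch only -- not the actual v2 hunk. */

	/* Agreed rename of the new PIPE_CONTROL bit: */
	#define PIPE_CONTROL_FLUSH_L3		(1<<27)

	/* Keep the legacy definition untouched for any future (pre-gen8) users: */
	#define MI_LOAD_REGISTER_MEM		MI_INSTR(0x29, 0)
	/* ...and add a gen8-only variant with the longer length field
	 * (the name MI_LOAD_REGISTER_MEM_GEN8 is hypothetical here): */
	#define MI_LOAD_REGISTER_MEM_GEN8	MI_INSTR(0x29, 2)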
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d42040f..86cdb52 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -774,6 +774,9 @@ struct intel_context {
 
 	/* Execlists */
 	bool rcs_initialized;
+	struct intel_ringbuffer *indirect_ctx_wa_bb;
+	struct intel_ringbuffer *per_ctx_wa_bb;
+
 	struct {
 		struct drm_i915_gem_object *state;
 		struct intel_ringbuffer *ringbuf;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 55143cb..eb41d7f 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -347,6 +347,26 @@
 #define   MI_INVALIDATE_BSD		(1<<7)
 #define   MI_FLUSH_DW_USE_GTT		(1<<2)
 #define   MI_FLUSH_DW_USE_PPGTT		(0<<2)
+#define MI_ATOMIC(len)	MI_INSTR(0x2F, (len-2))
+#define   MI_ATOMIC_MEMORY_TYPE_GGTT	(1<<22)
+#define   MI_ATOMIC_INLINE_DATA		(1<<18)
+#define   MI_ATOMIC_CS_STALL		(1<<17)
+#define   MI_ATOMIC_RETURN_DATA_CTL	(1<<16)
+#define   MI_ATOMIC_OP_MASK(op)	((op) << 8)
+#define   MI_ATOMIC_AND		MI_ATOMIC_OP_MASK(0x01)
+#define   MI_ATOMIC_OR		MI_ATOMIC_OP_MASK(0x02)
+#define   MI_ATOMIC_XOR		MI_ATOMIC_OP_MASK(0x03)
+#define   MI_ATOMIC_MOVE	MI_ATOMIC_OP_MASK(0x04)
+#define   MI_ATOMIC_INC		MI_ATOMIC_OP_MASK(0x05)
+#define   MI_ATOMIC_DEC		MI_ATOMIC_OP_MASK(0x06)
+#define   MI_ATOMIC_ADD		MI_ATOMIC_OP_MASK(0x07)
+#define   MI_ATOMIC_SUB		MI_ATOMIC_OP_MASK(0x08)
+#define   MI_ATOMIC_RSUB	MI_ATOMIC_OP_MASK(0x09)
+#define   MI_ATOMIC_IMAX	MI_ATOMIC_OP_MASK(0x0A)
+#define   MI_ATOMIC_IMIN	MI_ATOMIC_OP_MASK(0x0B)
+#define   MI_ATOMIC_UMAX	MI_ATOMIC_OP_MASK(0x0C)
+#define   MI_ATOMIC_UMIN	MI_ATOMIC_OP_MASK(0x0D)
+
 #define MI_BATCH_BUFFER		MI_INSTR(0x30, 1)
 #define   MI_BATCH_NON_SECURE		(1)
 /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */
@@ -410,6 +430,7 @@
 #define   DISPLAY_PLANE_A		(0<<20)
 #define   DISPLAY_PLANE_B		(1<<20)
 #define GFX_OP_PIPE_CONTROL(len)	((0x3<<29)|(0x3<<27)|(0x2<<24)|(len-2))
+#define   PIPE_CONTROL_FLUSH_RO_CACHES			(1<<27)
 #define   PIPE_CONTROL_GLOBAL_GTT_IVB			(1<<24) /* gen7+ */
 #define   PIPE_CONTROL_MMIO_WRITE			(1<<23)
 #define   PIPE_CONTROL_STORE_DATA_INDEX			(1<<21)
@@ -426,6 +447,7 @@
 #define   PIPE_CONTROL_INDIRECT_STATE_DISABLE		(1<<9)
 #define   PIPE_CONTROL_NOTIFY				(1<<8)
 #define   PIPE_CONTROL_FLUSH_ENABLE			(1<<7) /* gen7+ */
+#define   PIPE_CONTROL_DC_FLUSH_ENABLE			(1<<5)
 #define   PIPE_CONTROL_VF_CACHE_INVALIDATE		(1<<4)
 #define   PIPE_CONTROL_CONST_CACHE_INVALIDATE		(1<<3)
 #define   PIPE_CONTROL_STATE_CACHE_INVALIDATE		(1<<2)
@@ -449,8 +471,10 @@
 #define MI_CLFLUSH		MI_INSTR(0x27, 0)
 #define MI_REPORT_PERF_COUNT	MI_INSTR(0x28, 0)
 #define   MI_REPORT_PERF_COUNT_GGTT (1<<0)
-#define MI_LOAD_REGISTER_MEM	MI_INSTR(0x29, 0)
-#define MI_LOAD_REGISTER_REG	MI_INSTR(0x2A, 0)
+#define MI_LOAD_REGISTER_MEM	MI_INSTR(0x29, 2)
+#define   MI_LRM_USE_GLOBAL_GTT		(1<<22)
+#define   MI_LRM_ASYNC_MODE_ENABLE	(1<<21)
+#define MI_LOAD_REGISTER_REG	MI_INSTR(0x2A, 1)
 #define MI_RS_STORE_DATA_IMM	MI_INSTR(0x2B, 0)
 #define MI_LOAD_URB_MEM		MI_INSTR(0x2C, 0)
 #define MI_STORE_URB_MEM	MI_INSTR(0x2D, 0)
@@ -1520,6 +1544,8 @@ enum skl_disp_power_wells {
 #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE	(1 << 12)
 #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE	(1<<10)
 
+#define GEN8_RS_PREEMPT_STATUS		0x215C
+
 /* Fuse readout registers for GT */
 #define CHV_FUSE_GT			(VLV_DISPLAY_BASE + 0x2168)
 #define   CHV_FGT_EU_DIS_SS0_R0_SHIFT	16
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 47473e5..b34ef48 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -202,6 +202,7 @@ enum {
 	FAULT_AND_CONTINUE /* Unsupported */
 };
 #define GEN8_CTX_ID_SHIFT 32
+#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17
 
 static int intel_lr_context_pin(struct intel_engine_cs *ring,
 		struct intel_context *ctx);
@@ -1107,6 +1108,208 @@ static int intel_logical_ring_workarounds_emit(struct intel_engine_cs *ring,
 	return 0;
 }
 
+static struct intel_ringbuffer *
+create_wa_bb(struct intel_engine_cs *ring, uint32_t bb_size)
+{
+	struct drm_device *dev = ring->dev;
+	struct intel_ringbuffer *ringbuf;
+	int ret;
+
+	ringbuf = kzalloc(sizeof(*ringbuf), GFP_KERNEL);
+	if (!ringbuf)
+		return NULL;
+
+	ringbuf->ring = ring;
+
+	ringbuf->size = roundup(bb_size, PAGE_SIZE);
+	ringbuf->effective_size = ringbuf->size;
+	ringbuf->head = 0;
+	ringbuf->tail = 0;
+	ringbuf->space = ringbuf->size;
+	ringbuf->last_retired_head = -1;
+
+	ret = intel_alloc_ringbuffer_obj(dev, ringbuf);
+	if (ret) {
+		DRM_DEBUG_DRIVER("Failed to allocate ringbuffer obj %s: %d\n",
+				 ring->name, ret);
+		kfree(ringbuf);
+		return NULL;
+	}
+
+	ret = intel_pin_and_map_ringbuffer_obj(dev, ringbuf);
+	if (ret) {
+		DRM_ERROR("Failed to pin and map %s w/a batch: %d\n",
+			  ring->name, ret);
+		intel_destroy_ringbuffer_obj(ringbuf);
+		kfree(ringbuf);
+		return NULL;
+	}
+
+	return ringbuf;
+}
+
+static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
+				    struct intel_context *ctx)
+{
+	unsigned long flags = 0;
+	u32 scratch_addr;
+	struct intel_ringbuffer *ringbuf = NULL;
+
+	if (ring->scratch.obj == NULL) {
+		DRM_ERROR("scratch page not allocated for %s\n", ring->name);
+		return -EINVAL;
+	}
+
+	ringbuf = create_wa_bb(ring, PAGE_SIZE);
+	if (!ringbuf)
+		return -ENOMEM;
+
+	ctx->indirect_ctx_wa_bb = ringbuf;
+
+	/* WaDisableCtxRestoreArbitration:bdw,chv */
+	intel_logical_ring_emit(ringbuf, MI_ARB_ON_OFF | MI_ARB_DISABLE);
+
+	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw,chv */
+	intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
+	intel_logical_ring_emit(ringbuf, PIPE_CONTROL_DC_FLUSH_ENABLE);
+	intel_logical_ring_emit(ringbuf, 0);
+	intel_logical_ring_emit(ringbuf, 0);
+	intel_logical_ring_emit(ringbuf, 0);
+	intel_logical_ring_emit(ringbuf, 0);
+
+	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
+	flags = PIPE_CONTROL_FLUSH_RO_CACHES |
+		PIPE_CONTROL_GLOBAL_GTT_IVB |
+		PIPE_CONTROL_CS_STALL |
+		PIPE_CONTROL_QW_WRITE;
+
+	/* Actual scratch location is at 128 bytes offset */
+	scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
+	scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
+
+	intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
+	intel_logical_ring_emit(ringbuf, flags);
+	intel_logical_ring_emit(ringbuf, scratch_addr);
+	intel_logical_ring_emit(ringbuf, 0);
+	intel_logical_ring_emit(ringbuf, 0);
+	intel_logical_ring_emit(ringbuf, 0);
+
+	/* Padding to align with cache line */
+	intel_logical_ring_emit(ringbuf, 0);
+	intel_logical_ring_emit(ringbuf, 0);
+	intel_logical_ring_emit(ringbuf, 0);
+
+	/*
+	 * No MI_BATCH_BUFFER_END is required in Indirect ctx BB because
+	 * execution depends on the size defined in CTX_RCS_INDIRECT_CTX
+	 */
+
+	return 0;
+}
+
+static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
+			       struct intel_context *ctx)
+{
+	unsigned long flags = 0;
+	u32 scratch_addr;
+	struct intel_ringbuffer *ringbuf = NULL;
+
+	if (ring->scratch.obj == NULL) {
+		DRM_ERROR("scratch page not allocated for %s\n", ring->name);
+		return -EINVAL;
+	}
+
+	ringbuf = create_wa_bb(ring, PAGE_SIZE);
+	if (!ringbuf)
+		return -ENOMEM;
+
+	ctx->per_ctx_wa_bb = ringbuf;
+
+	/* Actual scratch location is at 128 bytes offset */
+	scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
+	scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
+
+	/* WaDisableCtxRestoreArbitration:bdw,chv */
+	intel_logical_ring_emit(ringbuf, MI_ARB_ON_OFF | MI_ARB_ENABLE);
+
+	/*
+	 * As per Bspec, to workaround a known HW issue, SW must perform the
+	 * below programming sequence prior to programming MI_BATCH_BUFFER_END.
+	 *
+	 * This is only applicable for Gen8.
+	 */
+
+	/* WaRsRestoreWithPerCtxtBb:bdw,chv */
+	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
+	intel_logical_ring_emit(ringbuf, INSTPM);
+	intel_logical_ring_emit(ringbuf,
+				_MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING));
+
+	flags = MI_ATOMIC_MEMORY_TYPE_GGTT |
+		MI_ATOMIC_INLINE_DATA |
+		MI_ATOMIC_CS_STALL |
+		MI_ATOMIC_RETURN_DATA_CTL |
+		MI_ATOMIC_MOVE;
+
+	intel_logical_ring_emit(ringbuf, MI_ATOMIC(5) | flags);
+	intel_logical_ring_emit(ringbuf, scratch_addr);
+	intel_logical_ring_emit(ringbuf, 0);
+	intel_logical_ring_emit(ringbuf,
+				_MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
+	intel_logical_ring_emit(ringbuf,
+				_MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
+
+	/*
+	 * Bspec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and
+	 * MI_BATCH_BUFFER_END need to be in the same cacheline.
+	 */
+	while (((unsigned long) ringbuf->tail % CACHELINE_BYTES) != 0)
+		intel_logical_ring_emit(ringbuf, MI_NOOP);
+
+	intel_logical_ring_emit(ringbuf,
+				MI_LOAD_REGISTER_MEM |
+				MI_LRM_USE_GLOBAL_GTT |
+				MI_LRM_ASYNC_MODE_ENABLE);
+	intel_logical_ring_emit(ringbuf, INSTPM);
+	intel_logical_ring_emit(ringbuf, 0);
+	intel_logical_ring_emit(ringbuf, scratch_addr);
+
+	/*
+	 * Bspec says there should not be any commands programmed
+	 * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so
+	 * do not add any new commands
+	 */
+	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_REG);
+	intel_logical_ring_emit(ringbuf, GEN8_RS_PREEMPT_STATUS);
+	intel_logical_ring_emit(ringbuf, GEN8_RS_PREEMPT_STATUS);
+
+	intel_logical_ring_emit(ringbuf, MI_BATCH_BUFFER_END);
+
+	return 0;
+}
+
+static int intel_init_workaround_bb(struct intel_engine_cs *ring,
+				    struct intel_context *ctx)
+{
+	int ret;
+	struct drm_device *dev = ring->dev;
+
+	WARN_ON(ring->id != RCS);
+
+	if (IS_GEN8(dev)) {
+		ret = gen8_init_indirectctx_bb(ring, ctx);
+		if (ret)
+			return ret;
+
+		ret = gen8_init_perctx_bb(ring, ctx);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+
+}
+
 static int gen8_init_common_ring(struct intel_engine_cs *ring)
 {
 	struct drm_device *dev = ring->dev;
@@ -1337,38 +1540,6 @@ static int gen8_emit_request(struct intel_ringbuffer *ringbuf,
 	return 0;
 }
 
-static struct intel_ringbuffer *
-create_wa_bb(struct intel_engine_cs *ring, uint32_t bb_size)
-{
-	struct drm_device *dev = ring->dev;
-	struct intel_ringbuffer *ringbuf;
-	int ret;
-
-	ringbuf = kzalloc(sizeof(*ringbuf), GFP_KERNEL);
-	if (!ringbuf)
-		return NULL;
-
-	ringbuf->ring = ring;
-
-	ringbuf->size = roundup(bb_size, PAGE_SIZE);
-	ringbuf->effective_size = ringbuf->size;
-	ringbuf->head = 0;
-	ringbuf->tail = 0;
-	ringbuf->space = ringbuf->size;
-	ringbuf->last_retired_head = -1;
-
-	ret = intel_alloc_ringbuffer_obj(dev, ringbuf);
-	if (ret) {
-		DRM_DEBUG_DRIVER(
-			"Failed to allocate ringbuf obj for wa_bb%s: %d\n",
-			ring->name, ret);
-		kfree(ringbuf);
-		return NULL;
-	}
-
-	return ringbuf;
-}
-
 static int intel_lr_context_render_state_init(struct intel_engine_cs *ring,
 					      struct intel_context *ctx)
 {
@@ -1490,6 +1661,7 @@ static int logical_render_ring_init(struct drm_device *dev)
 	else
 		ring->init_hw = gen8_init_render_ring;
 	ring->init_context = gen8_init_rcs_context;
+	ring->init_context_bb = intel_init_workaround_bb;
 	ring->cleanup = intel_fini_pipe_control;
 	ring->get_seqno = gen8_get_seqno;
 	ring->set_seqno = gen8_set_seqno;
@@ -1500,11 +1672,16 @@ static int logical_render_ring_init(struct drm_device *dev)
 	ring->emit_bb_start = gen8_emit_bb_start;
 
 	ring->dev = dev;
+
+	ret = intel_init_pipe_control(ring);
+	if (ret)
+		return ret;
+
 	ret = logical_ring_init(dev, ring);
 	if (ret)
 		return ret;
 
-	return intel_init_pipe_control(ring);
+	return 0;
 }
 
 static int logical_bsd_ring_init(struct drm_device *dev)
@@ -1784,15 +1961,29 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 	reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118;
 	reg_state[CTX_SECOND_BB_STATE+1] = 0;
 	if (ring->id == RCS) {
-		/* TODO: according to BSpec, the register state context
-		 * for CHV does not have these. OTOH, these registers do
-		 * exist in CHV. I'm waiting for a clarification */
 		reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0;
-		reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
+
+		if (ctx->per_ctx_wa_bb)
+			reg_state[CTX_BB_PER_CTX_PTR + 1] =
+				i915_gem_obj_ggtt_offset(
+					ctx->per_ctx_wa_bb->obj) | 0x01;
+		else
+			reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
+
 		reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4;
-		reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
 		reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8;
-		reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
+
+		if (ctx->indirect_ctx_wa_bb) {
+			reg_state[CTX_RCS_INDIRECT_CTX + 1] =
+				i915_gem_obj_ggtt_offset(
+					ctx->indirect_ctx_wa_bb->obj) | 0x01;
+
+			reg_state[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
+				CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
+		} else {
+			reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
+			reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
+		}
 	}
 	reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9);
 	reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;
@@ -1859,6 +2050,18 @@ void intel_lr_context_free(struct intel_context *ctx)
 			drm_gem_object_unreference(&ctx_obj->base);
 		}
 	}
+
+	if (ctx->indirect_ctx_wa_bb) {
+		intel_unpin_ringbuffer_obj(ctx->indirect_ctx_wa_bb);
+		intel_destroy_ringbuffer_obj(ctx->indirect_ctx_wa_bb);
+		kfree(ctx->indirect_ctx_wa_bb);
+	}
+
+	if (ctx->per_ctx_wa_bb) {
+		intel_unpin_ringbuffer_obj(ctx->per_ctx_wa_bb);
+		intel_destroy_ringbuffer_obj(ctx->per_ctx_wa_bb);
+		kfree(ctx->per_ctx_wa_bb);
+	}
 }
 
 static uint32_t get_lr_context_size(struct intel_engine_cs *ring)
@@ -1985,6 +2188,16 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
 
 	}
 
+	if (ring->id == RCS && !ctx->rcs_initialized) {
+		if (ring->init_context_bb) {
+			ret = ring->init_context_bb(ring, ctx);
+			if (ret) {
+				DRM_ERROR("ring init context bb: %d\n", ret);
+				goto error;
+			}
+		}
+	}
+
 	ret = populate_lr_context(ctx, ctx_obj, ring, ringbuf);
 	if (ret) {
 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
@@ -2013,6 +2226,17 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
 	return 0;
 
 error:
+	if (ctx->indirect_ctx_wa_bb) {
+		intel_unpin_ringbuffer_obj(ctx->indirect_ctx_wa_bb);
+		intel_destroy_ringbuffer_obj(ctx->indirect_ctx_wa_bb);
+		kfree(ctx->indirect_ctx_wa_bb);
+	}
+	if (ctx->per_ctx_wa_bb) {
+		intel_unpin_ringbuffer_obj(ctx->per_ctx_wa_bb);
+		intel_destroy_ringbuffer_obj(ctx->per_ctx_wa_bb);
+		kfree(ctx->per_ctx_wa_bb);
+	}
+
 	if (is_global_default_ctx)
 		intel_unpin_ringbuffer_obj(ringbuf);
 error_destroy_rbuf:
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 39183fc..839d698 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -146,6 +146,9 @@ struct intel_engine_cs {
 	int		(*init_context)(struct intel_engine_cs *ring,
 					struct intel_context *ctx);
 
+	int		(*init_context_bb)(struct intel_engine_cs *ring,
+					   struct intel_context *ctx);
+
 	void		(*write_tail)(struct intel_engine_cs *ring,
 				      u32 value);
 	int __must_check (*flush)(struct intel_engine_cs *ring,
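For readers skimming the populate_lr_context() hunk above, the context-image encodings the patch programs can be condensed as follows. This is only an illustration of what the patch itself writes; the helper names are invented, and the precise hardware meaning of the low bit and the <<6 shift is whatever Bspec assigns to those register fields:

	/* Condensed illustration of the values written into the context
	 * image by the patch. Helper names are hypothetical. */
	#include <stdint.h>

	#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17 /* as in the patch */

	/* BB_PER_CTX_PTR: GGTT address of the per-context w/a BB, low bit set
	 * when a buffer is present (0 otherwise). */
	static uint32_t bb_per_ctx_ptr(uint32_t wa_bb_ggtt)
	{
		return wa_bb_ggtt | 0x01;
	}

	/* INDIRECT_CTX: GGTT address of the indirect-ctx w/a BB, low bit set. */
	static uint32_t indirect_ctx(uint32_t wa_bb_ggtt)
	{
		return wa_bb_ggtt | 0x01;
	}

	/* INDIRECT_CTX_OFFSET: the default execution offset, shifted into
	 * the register's field. */
	static uint32_t indirect_ctx_offset(void)
	{
		return CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
	}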