@@ -211,6 +211,7 @@ enum {
FAULT_AND_CONTINUE /* Unsupported */
};
#define GEN8_CTX_ID_SHIFT 32
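+/*
+ * Default RCS indirect ctx offset value; it is shifted into the
+ * INDIRECT_CTX_OFFSET register field when the register state context is
+ * populated (see populate_lr_context()).
+ */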
+#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17
static int intel_lr_context_pin(struct intel_engine_cs *ring,
struct intel_context *ctx);
@@ -1077,6 +1078,173 @@ static int intel_logical_ring_workarounds_emit(struct intel_engine_cs *ring,
return 0;
}
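+
+/*
+ * Emit one dword into the WA batch page. The macro relies on a local
+ * 'index' variable in the caller and returns -ENOSPC from the calling
+ * function if the batch would overflow the single backing page.
+ */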
+#define wa_ctx_emit(batch, cmd) do { \
+ if (WARN_ON(index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
+ return -ENOSPC; \
+ } \
+ batch[index++] = (cmd); \
+ } while (0)
+
+/**
+ * gen8_init_indirectctx_bb() - initialize indirect ctx batch with WA
+ *
+ * @ring: only applicable for RCS
+ * @wa_ctx_batch: page in which the WAs are loaded
+ * @offset: offset within the page at which this batch starts. Kept as a
+ * parameter in case we later want multiple batches at different offsets,
+ * selected on some criteria.
+ * @num_dwords: returns the number of DWORDs written. This batch does not
+ * end with MI_BATCH_BUFFER_END, so it is padded out to a cacheline
+ * boundary; MI_BATCH_BUFFER_END is added by the per-ctx batch and the two
+ * together form a complete batch buffer.
+ *
+ * Return: 0 on success, or a negative error code if we exceed the
+ * PAGE_SIZE limit.
+ */
+static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
+ uint32_t **wa_ctx_batch,
+ uint32_t offset,
+ uint32_t *num_dwords)
+{
+ uint32_t index;
+ uint32_t *batch = *wa_ctx_batch;
+
+ index = offset;
+
+ /*
+ * FIXME: fill one cacheline (16 dwords) with MI_NOOPs for now; these
+ * placeholders will be replaced with the actual workarounds.
+ */
+ while (index < (offset + 16))
+ wa_ctx_emit(batch, MI_NOOP);
+
+ /*
+ * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
+ * the HW executes exactly the length, in cachelines, programmed into
+ * the CTX_RCS_INDIRECT_CTX register.
+ */
+
+ *num_dwords = index - offset;
+
+ return 0;
+}
+
+/*
+ * gen8_init_perctx_bb() - initialize per-ctx batch with WA
+ *
+ * This batch is terminated with MI_BATCH_BUFFER_END, so no padding is
+ * added after it.
+ */
+static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
+ uint32_t **wa_ctx_batch,
+ uint32_t offset,
+ uint32_t *num_dwords)
+{
+ uint32_t index;
+ uint32_t *batch = *wa_ctx_batch;
+
+ index = offset;
+
+ /*
+ * FIXME: fill one cacheline (16 dwords) with MI_NOOPs for now; these
+ * placeholders will be replaced with the actual workarounds.
+ */
+ while (index < (offset + 16))
+ wa_ctx_emit(batch, MI_NOOP);
+
+ /* Turn the last NOOP of the padding into MI_BATCH_BUFFER_END */
+ batch[index - 1] = MI_BATCH_BUFFER_END;
+
+ *num_dwords = index - offset;
+
+ return 0;
+}
+
+static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size)
+{
+ int ret;
+
+ WARN_ON(ring->id != RCS);
+
+ ring->wa_ctx.obj = i915_gem_alloc_object(ring->dev, PAGE_ALIGN(size));
+ if (!ring->wa_ctx.obj) {
+ DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n");
+ return -ENOMEM;
+ }
+
+ ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, PAGE_SIZE, 0);
+ if (ret) {
+ DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n",
+ ret);
+ drm_gem_object_unreference(&ring->wa_ctx.obj->base);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring)
+{
+ WARN_ON(ring->id != RCS);
+
+ i915_gem_object_ggtt_unpin(ring->wa_ctx.obj);
+ drm_gem_object_unreference(&ring->wa_ctx.obj->base);
+ ring->wa_ctx.obj = NULL;
+}
+
+static int intel_init_workaround_bb(struct intel_engine_cs *ring)
+{
+ int ret = 0;
+ uint32_t *batch;
+ uint32_t num_dwords;
+ struct page *page;
+ struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
+
+ WARN_ON(ring->id != RCS);
+
+ if (ring->scratch.obj == NULL) {
+ DRM_ERROR("scratch page not allocated for %s\n", ring->name);
+ return -EINVAL;
+ }
+
+ ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE);
+ if (ret) {
+ DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
+ return ret;
+ }
+
+ page = i915_gem_object_get_page(wa_ctx->obj, 0);
+ batch = kmap_atomic(page);
+
+ if (INTEL_INFO(ring->dev)->gen == 8) {
+ wa_ctx->indctx_batch_offset = 0;
+
+ ret = gen8_init_indirectctx_bb(ring,
+ &batch,
+ wa_ctx->indctx_batch_offset,
+ &num_dwords);
+ if (ret)
+ goto out;
+
+ wa_ctx->indctx_batch_size = round_up(num_dwords, CACHELINE_DWORDS);
+ wa_ctx->perctx_batch_offset = wa_ctx->indctx_batch_size;
+
+ ret = gen8_init_perctx_bb(ring,
+ &batch,
+ wa_ctx->perctx_batch_offset,
+ &num_dwords);
+ if (ret)
+ goto out;
+ } else {
+ WARN(INTEL_INFO(ring->dev)->gen >= 8,
+ "WA batch buffer is not initialized for Gen%d\n",
+ INTEL_INFO(ring->dev)->gen);
+ /* Don't leave an unused obj around for populate_lr_context() to find */
+ kunmap_atomic(batch);
+ lrc_destroy_wa_ctx_obj(ring);
+ return 0;
+ }
+
+out:
+ kunmap_atomic(batch);
+ if (ret)
+ lrc_destroy_wa_ctx_obj(ring);
+
+ return ret;
+}
+
static int gen8_init_common_ring(struct intel_engine_cs *ring)
{
struct drm_device *dev = ring->dev;
@@ -1411,6 +1579,9 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
kunmap(sg_page(ring->status_page.obj->pages->sgl));
ring->status_page.obj = NULL;
}
+
+ if (ring->wa_ctx.obj)
+ lrc_destroy_wa_ctx_obj(ring);
}
static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *ring)
@@ -1474,7 +1645,21 @@ static int logical_render_ring_init(struct drm_device *dev)
if (ret)
return ret;
- return intel_init_pipe_control(ring);
+ if (INTEL_INFO(ring->dev)->gen >= 8) {
+ ret = intel_init_workaround_bb(ring);
+ if (ret) {
+ /*
+ * Failure to set up the WA batch is not treated as fatal:
+ * we continue without it and only log the error.
+ */
+ DRM_ERROR("WA batch buffer initialization failed: %d\n",
+ ret);
+ }
+ }
+
+ ret = intel_init_pipe_control(ring);
+ if (ret && ring->wa_ctx.obj)
+ lrc_destroy_wa_ctx_obj(ring);
+
+ return ret;
}
static int logical_bsd_ring_init(struct drm_device *dev)
@@ -1754,15 +1939,26 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118;
reg_state[CTX_SECOND_BB_STATE+1] = 0;
if (ring->id == RCS) {
- /* TODO: according to BSpec, the register state context
- * for CHV does not have these. OTOH, these registers do
- * exist in CHV. I'm waiting for a clarification */
reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0;
reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4;
reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8;
reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
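+ /*
+ * If a WA ctx object exists, point the HW at it: the batch size in
+ * cachelines is OR'ed into the low bits of the cacheline-aligned GGTT
+ * address of the indirect ctx batch, the default indirect ctx offset is
+ * shifted into the register field starting at bit 6, and bit 0 of
+ * BB_PER_CTX_PTR marks the per-ctx batch pointer as valid.
+ */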
+ if (ring->wa_ctx.obj) {
+ reg_state[CTX_RCS_INDIRECT_CTX+1] =
+ (i915_gem_obj_ggtt_offset(ring->wa_ctx.obj) +
+ ring->wa_ctx.indctx_batch_offset * sizeof(uint32_t)) |
+ (ring->wa_ctx.indctx_batch_size / CACHELINE_DWORDS);
+
+ reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
+ CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
+
+ reg_state[CTX_BB_PER_CTX_PTR+1] =
+ (i915_gem_obj_ggtt_offset(ring->wa_ctx.obj) +
+ ring->wa_ctx.perctx_batch_offset * sizeof(uint32_t)) |
+ 0x01;
+ }
}
reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9);
reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;
@@ -12,6 +12,7 @@
* workarounds!
*/
#define CACHELINE_BYTES 64
+#define CACHELINE_DWORDS (CACHELINE_BYTES / sizeof(uint32_t))
/*
* Gen2 BSpec "1. Programming Environment" / 1.4.4.6 "Ring Buffer Use"
@@ -119,6 +120,22 @@ struct intel_ringbuffer {
struct intel_context;
+/*
+ * We use a single page to load the context workarounds, so all of these
+ * offsets and sizes are expressed in dwords.
+ *
+ * The offset fields allow multiple batches to live at different offsets
+ * within the page, selected on some condition. This is not required at
+ * the moment but keeps the option open for future use.
+ *
+ * indctx_batch_size is stored in dwords here; the HW register expects the
+ * size in cachelines (see CACHELINE_DWORDS).
+ */
+struct i915_ctx_workarounds {
+ u32 indctx_batch_offset;
+ u32 indctx_batch_size;
+ u32 perctx_batch_offset;
+ struct drm_i915_gem_object *obj;
+};
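+
+/*
+ * Example: with 64-byte cachelines (CACHELINE_DWORDS == 16), an indirect
+ * ctx batch of 20 dwords is rounded up to indctx_batch_size = 32 dwords,
+ * i.e. 2 cachelines programmed into the INDIRECT_CTX register, and the
+ * per-ctx batch then starts at perctx_batch_offset = 32 dwords into the
+ * page.
+ */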
+
struct intel_engine_cs {
const char *name;
enum intel_ring_id {
@@ -142,6 +159,7 @@ struct intel_engine_cs {
struct i915_gem_batch_pool batch_pool;
struct intel_hw_status_page status_page;
+ struct i915_ctx_workarounds wa_ctx;
unsigned irq_refcount; /* protected by dev_priv->irq_lock */
u32 irq_enable_mask; /* bitmask to enable ring interrupt */