@@ -3606,4 +3606,26 @@ typedef enum {
#define GEN7_ROW_CHICKEN2_GT2 0xf4f4
#define DOP_CLOCK_GATING_DISABLE (1<<0)
+#define HALF_SLICE_CHICKEN3 0xe184
+#define GEN8_CENTROID_PIXEL_OPT_DIS (1<<8)
+#define GEN8_SAMPLER_POWER_BYPASS_DIS (1<<1)
+
+#define GEN7_HALF_SLICE_CHICKEN1 0xe100
+#define GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE (1<<10)
+
+#define COMMON_SLICE_CHICKEN2 0x7014
+# define GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE (1<<0)
+
+/* GEN8 chicken */
+#define HDC_CHICKEN0 0x7300
+#define HDC_FORCE_NON_COHERENT (1<<4)
+
+#define GEN7_CACHE_MODE_1 0x7004 /* IVB+ */
+#define GEN8_4x4_STC_OPTIMIZATION_DISABLE (1<<6)
+
+#define GEN7_GT_MODE 0x7008
+#define GEN6_WIZ_HASHING(hi, lo) (((hi) << 9) | ((lo) << 7))
+#define GEN6_WIZ_HASHING_16x4 GEN6_WIZ_HASHING(1, 0)
+#define GEN6_WIZ_HASHING_MASK (GEN6_WIZ_HASHING(1, 1) << 16)
+
#endif /* _I810_REG_H */
@@ -703,6 +703,43 @@ static void gen8_emit_workarounds(struct intel_batchbuffer *batch)
/* WaDisableDopClockGating:bdw May not be needed for production */
gen8_emit_write(batch, GEN7_ROW_CHICKEN2, _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
+
+ /*
+ * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
+ * pre-production hardware
+ */
+ gen8_emit_write(batch, HALF_SLICE_CHICKEN3,
+ _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS
+ | GEN8_SAMPLER_POWER_BYPASS_DIS));
+
+ gen8_emit_write(batch, GEN7_HALF_SLICE_CHICKEN1,
+ _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
+
+ gen8_emit_write(batch, COMMON_SLICE_CHICKEN2,
+ _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
+
+ /* Use Force Non-Coherent whenever executing a 3D context. This is a
+ * workaround for for a possible hang in the unlikely event a TLB
+ * invalidation occurs during a PSD flush.
+ */
+ gen8_emit_write(batch, HDC_CHICKEN0,
+ _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
+
+ /* Wa4x4STCOptimizationDisable:bdw */
+ gen8_emit_write(batch, GEN7_CACHE_MODE_1,
+ _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
+
+ /*
+ * BSpec recommends 8x4 when MSAA is used,
+ * however in practice 16x4 seems fastest.
+ *
+ * Note that PS/WM thread counts depend on the WIZ hashing
+ * disable bit, which we don't touch here, but it's good
+ * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
+ */
+ gen8_emit_write(batch, GEN7_GT_MODE,
+ GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
+
}
int gen8_setup_null_render_state(struct intel_batchbuffer *batch)