diff mbox

[1/2] rendercopy/bdw: Enable hw-generated binding tables

Message ID 1399405683-27409-2-git-send-email-abdiel.janulgue@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Abdiel Janulgue May 6, 2014, 7:48 p.m. UTC
Use on-chip hw-binding table generator to generate binding
tables when the test emits SURFACE_STATES packets. The hw generates
these binding table packets internally so we don't have to
allocate space on the batchbuffer.

Signed-off-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
---
 lib/gen8_render.h       |   13 +++++++
 lib/intel_batchbuffer.c |   12 ++++++
 lib/intel_batchbuffer.h |    3 ++
 lib/intel_reg.h         |    3 ++
 lib/rendercopy_gen8.c   |   97 ++++++++++++++++++++++++++++++++++++++++++++---
 5 files changed, 123 insertions(+), 5 deletions(-)

Comments

Ville Syrjala May 6, 2014, 9:38 p.m. UTC | #1
On Tue, May 06, 2014 at 10:48:02PM +0300, Abdiel Janulgue wrote:
> Use on-chip hw-binding table generator to generate binding
> tables when the test emits SURFACE_STATES packets. The hw generates
> these binding table packets internally so we don't have to
> allocate space on the batchbuffer.
> 
> Signed-off-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
> ---
>  lib/gen8_render.h       |   13 +++++++
>  lib/intel_batchbuffer.c |   12 ++++++
>  lib/intel_batchbuffer.h |    3 ++
>  lib/intel_reg.h         |    3 ++
>  lib/rendercopy_gen8.c   |   97 ++++++++++++++++++++++++++++++++++++++++++++---
>  5 files changed, 123 insertions(+), 5 deletions(-)
> 
> diff --git a/lib/gen8_render.h b/lib/gen8_render.h
> index fffc100..4d5f7ba 100644
> --- a/lib/gen8_render.h
> +++ b/lib/gen8_render.h
> @@ -83,6 +83,19 @@
>  /* Random shifts */
>  #define GEN8_3DSTATE_PS_MAX_THREADS_SHIFT 23
>  
> +/* GEN8+ resource streamer */
> +#define GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC       (0x7919 << 16) /* GEN8 */
> +# define BINDING_TABLE_POOL_ENABLE              0x0800
> +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_VS          (0x7843 << 16) /* GEN8 */
> +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_GS          (0x7844 << 16) /* GEN8 */
> +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_HS          (0x7845 << 16) /* GEN8 */
> +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_DS          (0x7846 << 16) /* GEN8 */
> +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_PS          (0x7847 << 16) /* GEN8 */
> +
> +/* Toggling RS needs PIPE_CONTROL SC invalidate */
> +#define _3DSTATE_PIPE_CONTROL		((0x3 << 29) | (3 << 27) | (2 << 24))
> +#define PIPE_CONTROL_STATE_CACHE_INVALIDATE	(1 << 2)
> +
>  /* Shamelessly ripped from mesa */
>  struct gen8_surface_state
>  {
> diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
> index e868922..26c9b89 100644
> --- a/lib/intel_batchbuffer.c
> +++ b/lib/intel_batchbuffer.c
> @@ -81,6 +81,14 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch)
>  	memset(batch->buffer, 0, sizeof(batch->buffer));
>  
>  	batch->ptr = batch->buffer;
> +
> +	if (batch->hw_bt_pool_bo != NULL) {
> +		drm_intel_bo_unreference(batch->hw_bt_pool_bo);
> +		batch->hw_bt_pool_bo = NULL;
> +	}
> +
> +	batch->hw_bt_pool_bo = drm_intel_bo_alloc(batch->bufmgr, "hw_bt",
> +						  131072, 4096);
>  }
>  
>  /**
> @@ -116,6 +124,10 @@ intel_batchbuffer_free(struct intel_batchbuffer *batch)
>  {
>  	drm_intel_bo_unreference(batch->bo);
>  	batch->bo = NULL;
> +
> +	drm_intel_bo_unreference(batch->hw_bt_pool_bo);
> +	batch->hw_bt_pool_bo = NULL;
> +
>  	free(batch);
>  }
>  
> diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
> index 49dbcf0..2a516bc 100644
> --- a/lib/intel_batchbuffer.h
> +++ b/lib/intel_batchbuffer.h
> @@ -18,6 +18,9 @@ struct intel_batchbuffer {
>  	uint8_t buffer[BATCH_SZ];
>  	uint8_t *ptr;
>  	uint8_t *state;
> +
> +	drm_intel_bo *hw_bt_pool_bo;
> +	int use_resource_streamer;
>  };
>  
>  struct intel_batchbuffer *intel_batchbuffer_alloc(drm_intel_bufmgr *bufmgr,
> diff --git a/lib/intel_reg.h b/lib/intel_reg.h
> index 4b3a102..1cb9a29 100644
> --- a/lib/intel_reg.h
> +++ b/lib/intel_reg.h
> @@ -2560,6 +2560,9 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
>  #define MI_BATCH_NON_SECURE		(1)
>  #define MI_BATCH_NON_SECURE_I965	(1 << 8)
>  
> +/* Resource Streamer */
> +#define MI_RS_CONTROL           (0x6 << 23)
> +
>  #define MAX_DISPLAY_PIPES	2
>  
>  typedef enum {
> diff --git a/lib/rendercopy_gen8.c b/lib/rendercopy_gen8.c
> index 6f5a698..b0dd0f3 100644
> --- a/lib/rendercopy_gen8.c
> +++ b/lib/rendercopy_gen8.c
> @@ -169,12 +169,14 @@ static void
>  gen6_render_flush(struct intel_batchbuffer *batch,
>  		  drm_intel_context *context, uint32_t batch_end)
>  {
> -	int ret;
> +	int ret = 0;
> +	uint32_t flags = 0;
>  
>  	ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
> +	flags = batch->use_resource_streamer ? I915_EXEC_RESOURCE_STREAMER : 0;
>  	if (ret == 0)
>  		ret = drm_intel_gem_bo_context_exec(batch->bo, context,
> -						    batch_end, 0);
> +						    batch_end, flags | I915_EXEC_RENDER);
>  	assert(ret == 0);
>  }
>  
> @@ -228,18 +230,83 @@ gen8_bind_buf(struct intel_batchbuffer *batch, struct igt_buf *buf,
>  	return offset;
>  }
>  
> +static void
> +gen8_hw_binding_table(struct intel_batchbuffer *batch, bool enable)
> +{
> +	if (!enable) {
> +		OUT_BATCH(MI_RS_CONTROL | 0x0);
> +
> +		OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC | (4 - 2));
> +		/* binding table pool base address */
> +		OUT_BATCH(0);
> +		OUT_BATCH(0);
> +		/* Upper bound */
> +		OUT_BATCH(0);
> +

WaCsStallBeforeStateCacheInvalidate says you need to emit another
PIPE_CONTROL here with CS stall bit set. And thanks to the other
restrictions that PIPE_CONTORL needs one of the other magic bits also
set. Stall at scoreboard perhaps? I think that's what we set in the
kernel usually.

> +		OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
> +		OUT_BATCH(PIPE_CONTROL_STATE_CACHE_INVALIDATE);
> +		OUT_BATCH(0);
> +		OUT_BATCH(0);
> +		OUT_BATCH(0);
> +		OUT_BATCH(0);
> +
> +		return;
> +        }
> +	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC | (4 - 2));
> +
> +	/* binding table pool base address */
> +	OUT_RELOC(batch->hw_bt_pool_bo, I915_GEM_DOMAIN_SAMPLER, 0,
> +                  BINDING_TABLE_POOL_ENABLE);
> +	OUT_BATCH(0);

I suppose it doesn't matter much which domain we use here. But the
hardware writes to the BT pool so maybe you should set write domain
as well?

> +
> +	/* Upper bound */
> +	OUT_RELOC(batch->hw_bt_pool_bo, I915_GEM_DOMAIN_SAMPLER, 0,
> +                  batch->hw_bt_pool_bo->size);

AFAICS on bdw this should be just the bt pool size and not a reloc.

> +

Again need another CS stall PIPE_CONTROL.

> +	OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
> +	OUT_BATCH(PIPE_CONTROL_STATE_CACHE_INVALIDATE);
> +	OUT_BATCH(0);
> +	OUT_BATCH(0);
> +	OUT_BATCH(0);
> +	OUT_BATCH(0);
> +}
> +
> +static uint32_t
> +gen8_rs_bind_surfaces(struct intel_batchbuffer *batch,
> +		      struct igt_buf *src,
> +		      struct igt_buf *dst,
> +		      uint32_t *surf0, uint32_t *surf1)
> +{
> +	*surf0 = gen8_bind_buf(batch, dst, GEN6_SURFACEFORMAT_B8G8R8A8_UNORM, 1);
> +	*surf1 = gen8_bind_buf(batch, src, GEN6_SURFACEFORMAT_B8G8R8A8_UNORM, 0);
> +
> +	return 0;
> +}
> +
> +static void
> +gen8_rs_edit_surfaces(struct intel_batchbuffer *batch,
> +		      uint32_t surf0, uint32_t surf1)
> +{
> +	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_EDIT_PS | (4 - 2));
> +	OUT_BATCH(0x3);

So I take it there's no need to clear the entries first?

> +	{
> +		OUT_BATCH(0 << 16 | surf0 >> 6);
> +		OUT_BATCH(1 << 16 | surf1 >> 6);
> +	}
> +}
> +
>  static uint32_t
>  gen8_bind_surfaces(struct intel_batchbuffer *batch,
>  		   struct igt_buf *src,
>  		   struct igt_buf *dst)
>  {
> -	uint32_t *binding_table, offset;
> +	uint32_t offset = 0;
> +	uint32_t *binding_table;
>  
>  	binding_table = batch_alloc(batch, 8, 32);
>  	offset = batch_offset(batch, binding_table);
>  	annotation_add_state(&aub_annotations, AUB_TRACE_BINDING_TABLE,
>  			     offset, 8);
> -
>  	binding_table[0] =
>  		gen8_bind_buf(batch, dst, GEN6_SURFACEFORMAT_B8G8R8A8_UNORM, 1);
>  	binding_table[1] =
> @@ -512,6 +579,9 @@ gen7_emit_push_constants(struct intel_batchbuffer *batch) {
>  
>  static void
>  gen8_emit_state_base_address(struct intel_batchbuffer *batch) {
> +	if (batch->use_resource_streamer)
> +		OUT_BATCH(MI_RS_CONTROL | 0x0);
> +
>  	OUT_BATCH(GEN6_STATE_BASE_ADDRESS | (16 - 2));
>  
>  	/* general */
> @@ -546,6 +616,9 @@ gen8_emit_state_base_address(struct intel_batchbuffer *batch) {
>  	OUT_BATCH(0xfffff000 | 1);
>  	/* intruction buffer size */
>  	OUT_BATCH(1 << 12 | 1);
> +
> +	if (batch->use_resource_streamer)
> +		OUT_BATCH(MI_RS_CONTROL | 0x1);
>  }
>  
>  static void
> @@ -918,6 +991,7 @@ void gen8_render_copyfunc(struct intel_batchbuffer *batch,
>  	uint32_t scissor_state;
>  	uint32_t vertex_buffer;
>  	uint32_t batch_end;
> +	uint32_t surf0 = 0, surf1 = 1;
>  
>  	intel_batchbuffer_flush_with_context(batch, context);
>  
> @@ -927,7 +1001,11 @@ void gen8_render_copyfunc(struct intel_batchbuffer *batch,
>  
>  	annotation_init(&aub_annotations);
>  
> -	ps_binding_table  = gen8_bind_surfaces(batch, src, dst);
> +	if (batch->use_resource_streamer)
> +		ps_binding_table = gen8_rs_bind_surfaces(batch, src, dst,
> +							 &surf0, &surf1);
> +	else
> +		ps_binding_table  = gen8_bind_surfaces(batch, src, dst);
>  	ps_sampler_state  = gen8_create_sampler(batch);
>  	ps_kernel_off = gen8_fill_ps(batch, ps_kernel, sizeof(ps_kernel));
>  	vertex_buffer = gen7_fill_vertex_buffer_data(batch, src,
> @@ -955,6 +1033,9 @@ void gen8_render_copyfunc(struct intel_batchbuffer *batch,
>  
>  	gen8_emit_state_base_address(batch);
>  
> +	if (batch->use_resource_streamer)
> +               gen8_hw_binding_table(batch, true);
> +
>  	OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC);
>  	OUT_BATCH(viewport.cc_state);
>  	OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
> @@ -978,6 +1059,9 @@ void gen8_render_copyfunc(struct intel_batchbuffer *batch,
>  
>  	gen8_emit_sf(batch);
>  
> +	if (batch->use_resource_streamer)
> +		gen8_rs_edit_surfaces(batch, surf0, surf1);
> +
>  	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS);
>  	OUT_BATCH(ps_binding_table);
>  
> @@ -1001,6 +1085,9 @@ void gen8_render_copyfunc(struct intel_batchbuffer *batch,
>  	gen8_emit_vf_topology(batch);
>  	gen8_emit_primitive(batch, vertex_buffer);
>  
> +	if (batch->use_resource_streamer)
> +		gen8_hw_binding_table(batch, false);
> +
>  	OUT_BATCH(MI_BATCH_BUFFER_END);
>  
>  	batch_end = batch_align(batch, 8);
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Abdiel Janulgue May 6, 2014, 10:09 p.m. UTC | #2
On Wednesday, May 07, 2014 12:38:04 AM Ville Syrjälä wrote:
> On Tue, May 06, 2014 at 10:48:02PM +0300, Abdiel Janulgue wrote:
> > Use on-chip hw-binding table generator to generate binding
> > tables when the test emits SURFACE_STATES packets. The hw generates
> > these binding table packets internally so we don't have to
> > allocate space on the batchbuffer.
> > 
> > Signed-off-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
> > ---
> > 
> >  lib/gen8_render.h       |   13 +++++++
> >  lib/intel_batchbuffer.c |   12 ++++++
> >  lib/intel_batchbuffer.h |    3 ++
> >  lib/intel_reg.h         |    3 ++
> >  lib/rendercopy_gen8.c   |   97
> >  ++++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 123
> >  insertions(+), 5 deletions(-)
> > 
> > diff --git a/lib/gen8_render.h b/lib/gen8_render.h
> > index fffc100..4d5f7ba 100644
> > --- a/lib/gen8_render.h
> > +++ b/lib/gen8_render.h
> > @@ -83,6 +83,19 @@
> > 
> >  /* Random shifts */
> >  #define GEN8_3DSTATE_PS_MAX_THREADS_SHIFT 23
> > 
> > +/* GEN8+ resource streamer */
> > +#define GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC       (0x7919 << 16) /*
> > GEN8 */ +# define BINDING_TABLE_POOL_ENABLE              0x0800
> > +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_VS          (0x7843 << 16) /*
> > GEN8 */ +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_GS          (0x7844 <<
> > 16) /* GEN8 */ +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_HS         
> > (0x7845 << 16) /* GEN8 */ +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_DS    
> >      (0x7846 << 16) /* GEN8 */ +#define
> > GEN8_3DSTATE_BINDING_TABLE_EDIT_PS          (0x7847 << 16) /* GEN8 */ +
> > +/* Toggling RS needs PIPE_CONTROL SC invalidate */
> > +#define _3DSTATE_PIPE_CONTROL		((0x3 << 29) | (3 << 27) | (2 << 24))
> > +#define PIPE_CONTROL_STATE_CACHE_INVALIDATE	(1 << 2)
> > +
> > 
> >  /* Shamelessly ripped from mesa */
> >  struct gen8_surface_state
> >  {
> > 
> > diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
> > index e868922..26c9b89 100644
> > --- a/lib/intel_batchbuffer.c
> > +++ b/lib/intel_batchbuffer.c
> > @@ -81,6 +81,14 @@ intel_batchbuffer_reset(struct intel_batchbuffer
> > *batch)
> > 
> >  	memset(batch->buffer, 0, sizeof(batch->buffer));
> >  	
> >  	batch->ptr = batch->buffer;
> > 
> > +
> > +	if (batch->hw_bt_pool_bo != NULL) {
> > +		drm_intel_bo_unreference(batch->hw_bt_pool_bo);
> > +		batch->hw_bt_pool_bo = NULL;
> > +	}
> > +
> > +	batch->hw_bt_pool_bo = drm_intel_bo_alloc(batch->bufmgr, "hw_bt",
> > +						  131072, 4096);
> > 
> >  }
> >  
> >  /**
> > 
> > @@ -116,6 +124,10 @@ intel_batchbuffer_free(struct intel_batchbuffer
> > *batch)> 
> >  {
> >  
> >  	drm_intel_bo_unreference(batch->bo);
> >  	batch->bo = NULL;
> > 
> > +
> > +	drm_intel_bo_unreference(batch->hw_bt_pool_bo);
> > +	batch->hw_bt_pool_bo = NULL;
> > +
> > 
> >  	free(batch);
> >  
> >  }
> > 
> > diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
> > index 49dbcf0..2a516bc 100644
> > --- a/lib/intel_batchbuffer.h
> > +++ b/lib/intel_batchbuffer.h
> > @@ -18,6 +18,9 @@ struct intel_batchbuffer {
> > 
> >  	uint8_t buffer[BATCH_SZ];
> >  	uint8_t *ptr;
> >  	uint8_t *state;
> > 
> > +
> > +	drm_intel_bo *hw_bt_pool_bo;
> > +	int use_resource_streamer;
> > 
> >  };
> >  
> >  struct intel_batchbuffer *intel_batchbuffer_alloc(drm_intel_bufmgr
> >  *bufmgr,
> > 
> > diff --git a/lib/intel_reg.h b/lib/intel_reg.h
> > index 4b3a102..1cb9a29 100644
> > --- a/lib/intel_reg.h
> > +++ b/lib/intel_reg.h
> > @@ -2560,6 +2560,9 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > SOFTWARE.> 
> >  #define MI_BATCH_NON_SECURE		(1)
> >  #define MI_BATCH_NON_SECURE_I965	(1 << 8)
> > 
> > +/* Resource Streamer */
> > +#define MI_RS_CONTROL           (0x6 << 23)
> > +
> > 
> >  #define MAX_DISPLAY_PIPES	2
> >  
> >  typedef enum {
> > 
> > diff --git a/lib/rendercopy_gen8.c b/lib/rendercopy_gen8.c
> > index 6f5a698..b0dd0f3 100644
> > --- a/lib/rendercopy_gen8.c
> > +++ b/lib/rendercopy_gen8.c
> > @@ -169,12 +169,14 @@ static void
> > 
> >  gen6_render_flush(struct intel_batchbuffer *batch,
> >  
> >  		  drm_intel_context *context, uint32_t batch_end)
> >  
> >  {
> > 
> > -	int ret;
> > +	int ret = 0;
> > +	uint32_t flags = 0;
> > 
> >  	ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
> > 
> > +	flags = batch->use_resource_streamer ? I915_EXEC_RESOURCE_STREAMER : 0;
> > 
> >  	if (ret == 0)
> >  	
> >  		ret = drm_intel_gem_bo_context_exec(batch->bo, context,
> > 
> > -						    batch_end, 0);
> > +						    batch_end, flags | I915_EXEC_RENDER);
> > 
> >  	assert(ret == 0);
> >  
> >  }
> > 
> > @@ -228,18 +230,83 @@ gen8_bind_buf(struct intel_batchbuffer *batch,
> > struct igt_buf *buf,> 
> >  	return offset;
> >  
> >  }
> > 
> > +static void
> > +gen8_hw_binding_table(struct intel_batchbuffer *batch, bool enable)
> > +{
> > +	if (!enable) {
> > +		OUT_BATCH(MI_RS_CONTROL | 0x0);
> > +
> > +		OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC | (4 - 2));
> > +		/* binding table pool base address */
> > +		OUT_BATCH(0);
> > +		OUT_BATCH(0);
> > +		/* Upper bound */
> > +		OUT_BATCH(0);
> > +
> 
> WaCsStallBeforeStateCacheInvalidate says you need to emit another
> PIPE_CONTROL here with CS stall bit set. And thanks to the other
> restrictions that PIPE_CONTORL needs one of the other magic bits also
> set. Stall at scoreboard perhaps? I think that's what we set in the
> kernel usually.

It used to have the CS stall bit. But it seems to work without it. In any
case, you might be right. It's a good idea to put in back in.
> 
> > +		OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
> > +		OUT_BATCH(PIPE_CONTROL_STATE_CACHE_INVALIDATE);
> > +		OUT_BATCH(0);
> > +		OUT_BATCH(0);
> > +		OUT_BATCH(0);
> > +		OUT_BATCH(0);
> > +
> > +		return;
> > +        }
> > +	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC | (4 - 2));
> > +
> > +	/* binding table pool base address */
> > +	OUT_RELOC(batch->hw_bt_pool_bo, I915_GEM_DOMAIN_SAMPLER, 0,
> > +                  BINDING_TABLE_POOL_ENABLE);
> > +	OUT_BATCH(0);
> 
> I suppose it doesn't matter much which domain we use here. But the
> hardware writes to the BT pool so maybe you should set write domain
> as well?
> 
> > +
> > +	/* Upper bound */
> > +	OUT_RELOC(batch->hw_bt_pool_bo, I915_GEM_DOMAIN_SAMPLER, 0,
> > +                  batch->hw_bt_pool_bo->size);
> 
> AFAICS on bdw this should be just the bt pool size and not a reloc.
> 
> > +
> 
> Again need another CS stall PIPE_CONTROL.
> 
> > +	OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
> > +	OUT_BATCH(PIPE_CONTROL_STATE_CACHE_INVALIDATE);
> > +	OUT_BATCH(0);
> > +	OUT_BATCH(0);
> > +	OUT_BATCH(0);
> > +	OUT_BATCH(0);
> > +}
> > +
> > +static uint32_t
> > +gen8_rs_bind_surfaces(struct intel_batchbuffer *batch,
> > +		      struct igt_buf *src,
> > +		      struct igt_buf *dst,
> > +		      uint32_t *surf0, uint32_t *surf1)
> > +{
> > +	*surf0 = gen8_bind_buf(batch, dst, GEN6_SURFACEFORMAT_B8G8R8A8_UNORM,
> > 1);
> > +	*surf1 = gen8_bind_buf(batch, src, GEN6_SURFACEFORMAT_B8G8R8A8_UNORM,
> > 0);
> > +
> > +	return 0;
> > +}
> > +
> > +static void
> > +gen8_rs_edit_surfaces(struct intel_batchbuffer *batch,
> > +		      uint32_t surf0, uint32_t surf1)
> > +{
> > +	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_EDIT_PS | (4 - 2));
> > +	OUT_BATCH(0x3);
> 
> So I take it there's no need to clear the entries first?

I don't think it's necessary on this case. The clear is only
neded if the binding table pool is re-used from the previous
batch - which might contain stale entries. At least it causes
hungs in Mesa. In this test, we are allocating the pool same
time as the batch is allocated.


> 
> > +	{
> > +		OUT_BATCH(0 << 16 | surf0 >> 6);
> > +		OUT_BATCH(1 << 16 | surf1 >> 6);
> > +	}
> > +}
> > +
> > 
> >  static uint32_t
> >  gen8_bind_surfaces(struct intel_batchbuffer *batch,
> >  
> >  		   struct igt_buf *src,
> >  		   struct igt_buf *dst)
> >  
> >  {
> > 
> > -	uint32_t *binding_table, offset;
> > +	uint32_t offset = 0;
> > +	uint32_t *binding_table;
> > 
> >  	binding_table = batch_alloc(batch, 8, 32);
> >  	offset = batch_offset(batch, binding_table);
> >  	annotation_add_state(&aub_annotations, AUB_TRACE_BINDING_TABLE,
> >  	
> >  			     offset, 8);
> > 
> > -
> > 
> >  	binding_table[0] =
> >  	
> >  		gen8_bind_buf(batch, dst, GEN6_SURFACEFORMAT_B8G8R8A8_UNORM, 1);
> >  	
> >  	binding_table[1] =
> > 
> > @@ -512,6 +579,9 @@ gen7_emit_push_constants(struct intel_batchbuffer
> > *batch) {> 
> >  static void
> >  gen8_emit_state_base_address(struct intel_batchbuffer *batch) {
> > 
> > +	if (batch->use_resource_streamer)
> > +		OUT_BATCH(MI_RS_CONTROL | 0x0);
> > +
> > 
> >  	OUT_BATCH(GEN6_STATE_BASE_ADDRESS | (16 - 2));
> >  	
> >  	/* general */
> > 
> > @@ -546,6 +616,9 @@ gen8_emit_state_base_address(struct intel_batchbuffer
> > *batch) {> 
> >  	OUT_BATCH(0xfffff000 | 1);
> >  	/* intruction buffer size */
> >  	OUT_BATCH(1 << 12 | 1);
> > 
> > +
> > +	if (batch->use_resource_streamer)
> > +		OUT_BATCH(MI_RS_CONTROL | 0x1);
> > 
> >  }
> >  
> >  static void
> > 
> > @@ -918,6 +991,7 @@ void gen8_render_copyfunc(struct intel_batchbuffer
> > *batch,> 
> >  	uint32_t scissor_state;
> >  	uint32_t vertex_buffer;
> >  	uint32_t batch_end;
> > 
> > +	uint32_t surf0 = 0, surf1 = 1;
> > 
> >  	intel_batchbuffer_flush_with_context(batch, context);
> > 
> > @@ -927,7 +1001,11 @@ void gen8_render_copyfunc(struct intel_batchbuffer
> > *batch,> 
> >  	annotation_init(&aub_annotations);
> > 
> > -	ps_binding_table  = gen8_bind_surfaces(batch, src, dst);
> > +	if (batch->use_resource_streamer)
> > +		ps_binding_table = gen8_rs_bind_surfaces(batch, src, dst,
> > +							 &surf0, &surf1);
> > +	else
> > +		ps_binding_table  = gen8_bind_surfaces(batch, src, dst);
> > 
> >  	ps_sampler_state  = gen8_create_sampler(batch);
> >  	ps_kernel_off = gen8_fill_ps(batch, ps_kernel, sizeof(ps_kernel));
> >  	vertex_buffer = gen7_fill_vertex_buffer_data(batch, src,
> > 
> > @@ -955,6 +1033,9 @@ void gen8_render_copyfunc(struct intel_batchbuffer
> > *batch,> 
> >  	gen8_emit_state_base_address(batch);
> > 
> > +	if (batch->use_resource_streamer)
> > +               gen8_hw_binding_table(batch, true);
> > +
> > 
> >  	OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC);
> >  	OUT_BATCH(viewport.cc_state);
> >  	OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
> > 
> > @@ -978,6 +1059,9 @@ void gen8_render_copyfunc(struct intel_batchbuffer
> > *batch,> 
> >  	gen8_emit_sf(batch);
> > 
> > +	if (batch->use_resource_streamer)
> > +		gen8_rs_edit_surfaces(batch, surf0, surf1);
> > +
> > 
> >  	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS);
> >  	OUT_BATCH(ps_binding_table);
> > 
> > @@ -1001,6 +1085,9 @@ void gen8_render_copyfunc(struct intel_batchbuffer
> > *batch,> 
> >  	gen8_emit_vf_topology(batch);
> >  	gen8_emit_primitive(batch, vertex_buffer);
> > 
> > +	if (batch->use_resource_streamer)
> > +		gen8_hw_binding_table(batch, false);
> > +
> > 
> >  	OUT_BATCH(MI_BATCH_BUFFER_END);
> >  	
> >  	batch_end = batch_align(batch, 8);


Thanks for the review. I'll update v2 soon to address the comments

-abdiel
diff mbox

Patch

diff --git a/lib/gen8_render.h b/lib/gen8_render.h
index fffc100..4d5f7ba 100644
--- a/lib/gen8_render.h
+++ b/lib/gen8_render.h
@@ -83,6 +83,19 @@ 
 /* Random shifts */
 #define GEN8_3DSTATE_PS_MAX_THREADS_SHIFT 23
 
+/* GEN8+ resource streamer */
+#define GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC       (0x7919 << 16) /* GEN8 */
+# define BINDING_TABLE_POOL_ENABLE              0x0800
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_VS          (0x7843 << 16) /* GEN8 */
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_GS          (0x7844 << 16) /* GEN8 */
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_HS          (0x7845 << 16) /* GEN8 */
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_DS          (0x7846 << 16) /* GEN8 */
+#define GEN8_3DSTATE_BINDING_TABLE_EDIT_PS          (0x7847 << 16) /* GEN8 */
+
+/* Toggling RS needs PIPE_CONTROL SC invalidate */
+#define _3DSTATE_PIPE_CONTROL		((0x3 << 29) | (3 << 27) | (2 << 24))
+#define PIPE_CONTROL_STATE_CACHE_INVALIDATE	(1 << 2)
+
 /* Shamelessly ripped from mesa */
 struct gen8_surface_state
 {
diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
index e868922..26c9b89 100644
--- a/lib/intel_batchbuffer.c
+++ b/lib/intel_batchbuffer.c
@@ -81,6 +81,14 @@  intel_batchbuffer_reset(struct intel_batchbuffer *batch)
 	memset(batch->buffer, 0, sizeof(batch->buffer));
 
 	batch->ptr = batch->buffer;
+
+	if (batch->hw_bt_pool_bo != NULL) {
+		drm_intel_bo_unreference(batch->hw_bt_pool_bo);
+		batch->hw_bt_pool_bo = NULL;
+	}
+
+	batch->hw_bt_pool_bo = drm_intel_bo_alloc(batch->bufmgr, "hw_bt",
+						  131072, 4096);
 }
 
 /**
@@ -116,6 +124,10 @@  intel_batchbuffer_free(struct intel_batchbuffer *batch)
 {
 	drm_intel_bo_unreference(batch->bo);
 	batch->bo = NULL;
+
+	drm_intel_bo_unreference(batch->hw_bt_pool_bo);
+	batch->hw_bt_pool_bo = NULL;
+
 	free(batch);
 }
 
diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
index 49dbcf0..2a516bc 100644
--- a/lib/intel_batchbuffer.h
+++ b/lib/intel_batchbuffer.h
@@ -18,6 +18,9 @@  struct intel_batchbuffer {
 	uint8_t buffer[BATCH_SZ];
 	uint8_t *ptr;
 	uint8_t *state;
+
+	drm_intel_bo *hw_bt_pool_bo;
+	int use_resource_streamer;
 };
 
 struct intel_batchbuffer *intel_batchbuffer_alloc(drm_intel_bufmgr *bufmgr,
diff --git a/lib/intel_reg.h b/lib/intel_reg.h
index 4b3a102..1cb9a29 100644
--- a/lib/intel_reg.h
+++ b/lib/intel_reg.h
@@ -2560,6 +2560,9 @@  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define MI_BATCH_NON_SECURE		(1)
 #define MI_BATCH_NON_SECURE_I965	(1 << 8)
 
+/* Resource Streamer */
+#define MI_RS_CONTROL           (0x6 << 23)
+
 #define MAX_DISPLAY_PIPES	2
 
 typedef enum {
diff --git a/lib/rendercopy_gen8.c b/lib/rendercopy_gen8.c
index 6f5a698..b0dd0f3 100644
--- a/lib/rendercopy_gen8.c
+++ b/lib/rendercopy_gen8.c
@@ -169,12 +169,14 @@  static void
 gen6_render_flush(struct intel_batchbuffer *batch,
 		  drm_intel_context *context, uint32_t batch_end)
 {
-	int ret;
+	int ret = 0;
+	uint32_t flags = 0;
 
 	ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
+	flags = batch->use_resource_streamer ? I915_EXEC_RESOURCE_STREAMER : 0;
 	if (ret == 0)
 		ret = drm_intel_gem_bo_context_exec(batch->bo, context,
-						    batch_end, 0);
+						    batch_end, flags | I915_EXEC_RENDER);
 	assert(ret == 0);
 }
 
@@ -228,18 +230,83 @@  gen8_bind_buf(struct intel_batchbuffer *batch, struct igt_buf *buf,
 	return offset;
 }
 
+static void
+gen8_hw_binding_table(struct intel_batchbuffer *batch, bool enable)
+{
+	if (!enable) {
+		OUT_BATCH(MI_RS_CONTROL | 0x0);
+
+		OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC | (4 - 2));
+		/* binding table pool base address */
+		OUT_BATCH(0);
+		OUT_BATCH(0);
+		/* Upper bound */
+		OUT_BATCH(0);
+
+		OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
+		OUT_BATCH(PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+		OUT_BATCH(0);
+		OUT_BATCH(0);
+		OUT_BATCH(0);
+		OUT_BATCH(0);
+
+		return;
+        }
+	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC | (4 - 2));
+
+	/* binding table pool base address */
+	OUT_RELOC(batch->hw_bt_pool_bo, I915_GEM_DOMAIN_SAMPLER, 0,
+                  BINDING_TABLE_POOL_ENABLE);
+	OUT_BATCH(0);
+
+	/* Upper bound */
+	OUT_RELOC(batch->hw_bt_pool_bo, I915_GEM_DOMAIN_SAMPLER, 0,
+                  batch->hw_bt_pool_bo->size);
+
+	OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
+	OUT_BATCH(PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+}
+
+static uint32_t
+gen8_rs_bind_surfaces(struct intel_batchbuffer *batch,
+		      struct igt_buf *src,
+		      struct igt_buf *dst,
+		      uint32_t *surf0, uint32_t *surf1)
+{
+	*surf0 = gen8_bind_buf(batch, dst, GEN6_SURFACEFORMAT_B8G8R8A8_UNORM, 1);
+	*surf1 = gen8_bind_buf(batch, src, GEN6_SURFACEFORMAT_B8G8R8A8_UNORM, 0);
+
+	return 0;
+}
+
+static void
+gen8_rs_edit_surfaces(struct intel_batchbuffer *batch,
+		      uint32_t surf0, uint32_t surf1)
+{
+	OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_EDIT_PS | (4 - 2));
+	OUT_BATCH(0x3);
+	{
+		OUT_BATCH(0 << 16 | surf0 >> 6);
+		OUT_BATCH(1 << 16 | surf1 >> 6);
+	}
+}
+
 static uint32_t
 gen8_bind_surfaces(struct intel_batchbuffer *batch,
 		   struct igt_buf *src,
 		   struct igt_buf *dst)
 {
-	uint32_t *binding_table, offset;
+	uint32_t offset = 0;
+	uint32_t *binding_table;
 
 	binding_table = batch_alloc(batch, 8, 32);
 	offset = batch_offset(batch, binding_table);
 	annotation_add_state(&aub_annotations, AUB_TRACE_BINDING_TABLE,
 			     offset, 8);
-
 	binding_table[0] =
 		gen8_bind_buf(batch, dst, GEN6_SURFACEFORMAT_B8G8R8A8_UNORM, 1);
 	binding_table[1] =
@@ -512,6 +579,9 @@  gen7_emit_push_constants(struct intel_batchbuffer *batch) {
 
 static void
 gen8_emit_state_base_address(struct intel_batchbuffer *batch) {
+	if (batch->use_resource_streamer)
+		OUT_BATCH(MI_RS_CONTROL | 0x0);
+
 	OUT_BATCH(GEN6_STATE_BASE_ADDRESS | (16 - 2));
 
 	/* general */
@@ -546,6 +616,9 @@  gen8_emit_state_base_address(struct intel_batchbuffer *batch) {
 	OUT_BATCH(0xfffff000 | 1);
 	/* intruction buffer size */
 	OUT_BATCH(1 << 12 | 1);
+
+	if (batch->use_resource_streamer)
+		OUT_BATCH(MI_RS_CONTROL | 0x1);
 }
 
 static void
@@ -918,6 +991,7 @@  void gen8_render_copyfunc(struct intel_batchbuffer *batch,
 	uint32_t scissor_state;
 	uint32_t vertex_buffer;
 	uint32_t batch_end;
+	uint32_t surf0 = 0, surf1 = 1;
 
 	intel_batchbuffer_flush_with_context(batch, context);
 
@@ -927,7 +1001,11 @@  void gen8_render_copyfunc(struct intel_batchbuffer *batch,
 
 	annotation_init(&aub_annotations);
 
-	ps_binding_table  = gen8_bind_surfaces(batch, src, dst);
+	if (batch->use_resource_streamer)
+		ps_binding_table = gen8_rs_bind_surfaces(batch, src, dst,
+							 &surf0, &surf1);
+	else
+		ps_binding_table  = gen8_bind_surfaces(batch, src, dst);
 	ps_sampler_state  = gen8_create_sampler(batch);
 	ps_kernel_off = gen8_fill_ps(batch, ps_kernel, sizeof(ps_kernel));
 	vertex_buffer = gen7_fill_vertex_buffer_data(batch, src,
@@ -955,6 +1033,9 @@  void gen8_render_copyfunc(struct intel_batchbuffer *batch,
 
 	gen8_emit_state_base_address(batch);
 
+	if (batch->use_resource_streamer)
+               gen8_hw_binding_table(batch, true);
+
 	OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC);
 	OUT_BATCH(viewport.cc_state);
 	OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
@@ -978,6 +1059,9 @@  void gen8_render_copyfunc(struct intel_batchbuffer *batch,
 
 	gen8_emit_sf(batch);
 
+	if (batch->use_resource_streamer)
+		gen8_rs_edit_surfaces(batch, surf0, surf1);
+
 	OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS);
 	OUT_BATCH(ps_binding_table);
 
@@ -1001,6 +1085,9 @@  void gen8_render_copyfunc(struct intel_batchbuffer *batch,
 	gen8_emit_vf_topology(batch);
 	gen8_emit_primitive(batch, vertex_buffer);
 
+	if (batch->use_resource_streamer)
+		gen8_hw_binding_table(batch, false);
+
 	OUT_BATCH(MI_BATCH_BUFFER_END);
 
 	batch_end = batch_align(batch, 8);