[2/3] lib: Add GPGPU fill

Message ID	1417605079-10913-2-git-send-email-zhenyuw@linux.intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Zhenyu Wang <zhenyuw@linux.intel.com> To: intel-gfx@lists.freedesktop.org Date: Wed, 3 Dec 2014 19:11:18 +0800 Message-Id: <1417605079-10913-2-git-send-email-zhenyuw@linux.intel.com> In-Reply-To: <1417605079-10913-1-git-send-email-zhenyuw@linux.intel.com> References: <1417605079-10913-1-git-send-email-zhenyuw@linux.intel.com> Subject: [Intel-gfx] [PATCH 2/3] lib: Add GPGPU fill Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

diff --git a/lib/gen7_media.h b/lib/gen7_media.h index d5f9921..91294d2 100644 --- a/lib/gen7_media.h +++ b/lib/gen7_media.h @@ -179,6 +179,7 @@ #define GEN7_PIPELINE_SELECT GFXPIPE(1, 1, 4) # define PIPELINE_SELECT_3D (0 << 0) # define PIPELINE_SELECT_MEDIA (1 << 0) +# define PIPELINE_SELECT_GPGPU (2 << 0) #define GEN7_STATE_BASE_ADDRESS GFXPIPE(0, 1, 1) # define BASE_ADDRESS_MODIFY (1 << 0) @@ -187,6 +188,7 @@ #define GEN7_MEDIA_CURBE_LOAD GFXPIPE(2, 0, 1) #define GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD GFXPIPE(2, 0, 2) #define GEN7_MEDIA_OBJECT GFXPIPE(2, 1, 0) +#define GEN7_GPGPU_WALKER GFXPIPE(2, 1, 5) struct gen7_interface_descriptor_data { diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c index 4b3a5b8..c70f6d8 100644 --- a/lib/intel_batchbuffer.c +++ b/lib/intel_batchbuffer.c @@ -511,3 +511,22 @@ igt_fillfunc_t igt_get_media_fillfunc(int devid) return fill; } + +/** + * igt_get_gpgpu_fillfunc: + * @devid: pci device id + * + * Returns: + * + * The platform-specific gpgpu fill function pointer for the device specified + * with @devid. Will return NULL when no gpgpu fill function is implemented. + */ +igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid) +{ + igt_fillfunc_t fill = NULL; + + if (IS_GEN7(devid)) + fill = gen7_gpgpu_fillfunc; + + return fill; +} diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h index f0e21ea..12f7be1 100644 --- a/lib/intel_batchbuffer.h +++ b/lib/intel_batchbuffer.h @@ -250,11 +250,11 @@ igt_render_copyfunc_t igt_get_render_copyfunc(int devid); * @color: fill color to use * * This is the type of the per-platform fill functions using media - * pipeline. The platform-specific implementation can be obtained - * by calling igt_get_media_fillfunc(). + * or gpgpu pipeline. The platform-specific implementation can be obtained + * by calling igt_get_media_fillfunc() or igt_get_gpgpu_fillfunc(). 
* * A fill function will emit a batchbuffer to the kernel which executes - * the specified blit fill operation using the media engine. + * the specified blit fill operation using the media/gpgpu engine. */ typedef void (*igt_fillfunc_t)(struct intel_batchbuffer *batch, struct igt_buf *dst, @@ -263,5 +263,6 @@ typedef void (*igt_fillfunc_t)(struct intel_batchbuffer *batch, uint8_t color); igt_fillfunc_t igt_get_media_fillfunc(int devid); +igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid); #endif diff --git a/lib/media_fill.h b/lib/media_fill.h index 226489c..2a30055 100644 --- a/lib/media_fill.h +++ b/lib/media_fill.h @@ -32,4 +32,11 @@ gen9_media_fillfunc(struct intel_batchbuffer *batch, unsigned width, unsigned height, uint8_t color); +void +gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch, + struct igt_buf *dst, + unsigned x, unsigned y, + unsigned width, unsigned height, + uint8_t color); + #endif /* RENDE_MEDIA_FILL_H */ diff --git a/lib/media_fill_gen7.c b/lib/media_fill_gen7.c index 5a23b7d..7113fda 100644 --- a/lib/media_fill_gen7.c +++ b/lib/media_fill_gen7.c @@ -8,7 +8,6 @@ #include <assert.h> - static const uint32_t media_kernel[][4] = { { 0x00400001, 0x20200231, 0x00000020, 0x00000000 }, { 0x00600001, 0x20800021, 0x008d0000, 0x00000000 }, @@ -23,6 +22,23 @@ static const uint32_t media_kernel[][4] = { { 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 }, }; +/* shaders/gpgpu/gpgpu_fill.gxa */ +static const uint32_t gpgpu_kernel[][4] = { + { 0x00400001, 0x20200231, 0x00000020, 0x00000000 }, + { 0x00000041, 0x20400c21, 0x00000004, 0x00000010 }, + { 0x00000001, 0x20440021, 0x00000018, 0x00000000 }, + { 0x00600001, 0x20800021, 0x008d0000, 0x00000000 }, + { 0x00200001, 0x20800021, 0x00450040, 0x00000000 }, + { 0x00000001, 0x20880061, 0x00000000, 0x0000000f }, + { 0x00800001, 0x20a00021, 0x00000020, 0x00000000 }, + { 0x00800001, 0x20e00021, 0x00000020, 0x00000000 }, + { 0x00800001, 0x21200021, 0x00000020, 0x00000000 }, + { 0x00800001, 0x21600021, 0x00000020, 
0x00000000 }, + { 0x05800031, 0x24001ca8, 0x00000080, 0x120a8000 }, + { 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 }, + { 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 }, +}; + static uint32_t batch_used(struct intel_batchbuffer *batch) { @@ -160,14 +176,15 @@ gen7_fill_media_kernel(struct intel_batchbuffer *batch, } static uint32_t -gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst) +gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst, + const uint32_t kernel[][4], size_t size) { struct gen7_interface_descriptor_data *idd; uint32_t offset; uint32_t binding_table_offset, kernel_offset; binding_table_offset = gen7_fill_binding_table(batch, dst); - kernel_offset = gen7_fill_media_kernel(batch, media_kernel, sizeof(media_kernel)); + kernel_offset = gen7_fill_media_kernel(batch, kernel, size); idd = batch_alloc(batch, sizeof(*idd), 64); offset = batch_offset(batch, idd); @@ -329,7 +346,9 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch, batch->ptr = &batch->buffer[BATCH_STATE_SPLIT]; curbe_buffer = gen7_fill_curbe_buffer_data(batch, color); - interface_descriptor = gen7_fill_interface_descriptor(batch, dst); + interface_descriptor = gen7_fill_interface_descriptor(batch, dst, + media_kernel, + sizeof(media_kernel)); igt_assert(batch->ptr < &batch->buffer[4095]); /* media pipeline */ @@ -353,3 +372,137 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch, gen7_render_flush(batch, batch_end); intel_batchbuffer_reset(batch); } + +static void +gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch) +{ + OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2)); + + /* scratch buffer */ + OUT_BATCH(0); + + /* number of threads & urb entries */ + OUT_BATCH(1 << 16 | /* max num of threads */ + 0 << 8 | /* num of URB entry */ + 1 << 2); /* GPGPU mode */ + + OUT_BATCH(0); + + /* urb entry size & curbe size */ + OUT_BATCH(0 << 16 | /* URB entry size in 256 bits unit */ + 1); /* CURBE entry size in 256 bits unit */ 
+ + /* scoreboard */ + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); +} + +static void +gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch, + unsigned x, unsigned y, + unsigned width, unsigned height) +{ + uint32_t x_dim, y_dim, tmp, right_mask; + + /* + * Simply do SIMD16 based dispatch, so every thread uses + * SIMD16 channels. + * + * Define our own thread group size, e.g 16x1 for every group, then + * will have 1 thread each group in SIMD16 dispatch. So thread + * width/height/depth are all 1. + * + * Then thread group X = width / 16 (aligned to 16) + * thread group Y = height; + */ + x_dim = (width + 15) / 16; + y_dim = height; + + tmp = width & 15; + if (tmp == 0) + right_mask = (1 << 16) - 1; + else + right_mask = (1 << tmp) - 1; + + OUT_BATCH(GEN7_GPGPU_WALKER | 9); + + /* interface descriptor offset */ + OUT_BATCH(0); + + /* SIMD size, thread w/h/d */ + OUT_BATCH(1 << 30 | /* SIMD16 */ + 0 << 16 | /* depth:1 */ + 0 << 8 | /* height:1 */ + 0); /* width:1 */ + + /* thread group X */ + OUT_BATCH(0); + OUT_BATCH(x_dim); + + /* thread group Y */ + OUT_BATCH(0); + OUT_BATCH(y_dim); + + /* thread group Z */ + OUT_BATCH(0); + OUT_BATCH(1); + + /* right mask */ + OUT_BATCH(right_mask); + + /* bottom mask, height 1, always 0xffffffff */ + OUT_BATCH(0xffffffff); +} + +void +gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch, + struct igt_buf *dst, + unsigned x, unsigned y, + unsigned width, unsigned height, + uint8_t color) +{ + uint32_t curbe_buffer, interface_descriptor; + uint32_t batch_end; + + intel_batchbuffer_flush(batch); + + /* setup states */ + batch->ptr = &batch->buffer[BATCH_STATE_SPLIT]; + + /* + * const buffer needs to fill for every thread, but as we have just 1 thread + * per every group, so need only one curbe data. + * + * For each thread, just use thread group ID for buffer offset. 
+ */ + curbe_buffer = gen7_fill_curbe_buffer_data(batch, color); + + interface_descriptor = gen7_fill_interface_descriptor(batch, dst, + gpgpu_kernel, + sizeof(gpgpu_kernel)); + igt_assert(batch->ptr < &batch->buffer[4095]); + + batch->ptr = batch->buffer; + + /* GPGPU pipeline */ + OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU); + + gen7_emit_state_base_address(batch); + + gen7_emit_vfe_state_gpgpu(batch); + + gen7_emit_curbe_load(batch, curbe_buffer); + + gen7_emit_interface_descriptor_load(batch, interface_descriptor); + + gen7_emit_gpgpu_walk(batch, x, y, width, height); + + OUT_BATCH(MI_BATCH_BUFFER_END); + + batch_end = batch_align(batch, 8); + igt_assert(batch_end < BATCH_STATE_SPLIT); + + gen7_render_flush(batch, batch_end); + intel_batchbuffer_reset(batch); +} diff --git a/shaders/gpgpu/README b/shaders/gpgpu/README new file mode 100644 index 0000000..3bf328a --- /dev/null +++ b/shaders/gpgpu/README @@ -0,0 +1,4 @@ + +Commands used to generate the shader on gen7 +$> m4 gpgpu_fill.gxa > gpgpu_fill.gxm +$> intel-gen4asm -g 7 -o <output> gpgpu_fill.gxm diff --git a/shaders/gpgpu/gpgpu_fill.gxa b/shaders/gpgpu/gpgpu_fill.gxa new file mode 100644 index 0000000..fc309f3 --- /dev/null +++ b/shaders/gpgpu/gpgpu_fill.gxa @@ -0,0 +1,51 @@ +/* + * Registers + * g0 -- header + * g1 -- constant + * g2 -- calculate X/Y offset + * g4-g12 payload for write message + */ +define(`ORIG', `g2.0<2,2,1>UD') +define(`ORIG_X', `g2.0<1>UD') +define(`ORIG_Y', `g2.4<1>UD') +define(`COLOR', `g1.0') +define(`COLORUB', `COLOR<0,1,0>UB') +define(`COLORUD', `COLOR<0,1,0>UD') +define(`X', `g0.4<0,1,0>UD') +define(`Y', `g0.24<0,1,0>UD') + +mov(4) COLOR<1>UB COLORUB {align1}; + +/* WRITE */ +/* count thread group ID for X/Y offset */ +mul(1) ORIG_X X 0x10UD {align1}; +mov(1) ORIG_Y Y {align1}; +mov(8) g4.0<1>UD g0.0<8,8,1>UD {align1}; +mov(2) g4.0<1>UD ORIG {align1}; +/* Normal mode: for block height 1 row and block width 16 bytes */ +mov(1) g4.8<1>UD 0x0000000fUD {align1}; + 
+mov(16) g5.0<1>UD COLORUD {align1 compr}; +mov(16) g7.0<1>UD COLORUD {align1 compr}; +mov(16) g9.0<1>UD COLORUD {align1 compr}; +mov(16) g11.0<1>UD COLORUD {align1 compr}; + +/* + * comment out the following instruction on Gen7 + * write(0, 0, 10, 12) + * 10: media_block_write + * 12: data cache data port 1 + */ +send(16) 4 acc0<1>UW null write(0, 0, 10, 12) mlen 9 rlen 0 {align1}; + +/* + * uncomment the following instruction on Gen7 + * write(0, 0, 10, 0) + * 10: media_block_write + * 0: render cache data port + */ +/* send(16) 4 acc0<1>UW null write(0, 0, 10, 0) mlen 9 rlen 0 {align1}; */ + +/* EOT */ +mov(8) g112.0<1>UD g0.0<8,8,1>UD {align1}; +send(16) 112 null<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};

[2/3] lib: Add GPGPU fill

Commit Message

Patch