diff mbox

[i-g-t] intel_aubdump: Add ability to simulate execlist submission

Message ID 20171128235119.17774-1-scott.d.phillips@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Scott D Phillips Nov. 28, 2017, 11:51 p.m. UTC
Newer devices do not have the legacy ring buffer submission model,
so aub files generated using that model cannot be handled by some
internal tools. The execlist submission modeled by this change is
pretty simplistic, using GGTT only and synchronizing after every
batch.
---
 tools/aubdump.c | 422 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 394 insertions(+), 28 deletions(-)

Comments

Jordan Justen Dec. 5, 2017, 8:47 a.m. UTC | #1
On 2017-11-28 15:51:19, Scott D Phillips wrote:
> Newer devices do not have the legacy ring buffer submission model,
> so aub files generated using that model cannot be handled by some
> internal tools. The execlist submission modeled by this change is
> pretty simplistic, using GGTT only and synchronizing after every
> batch.
> ---
>  tools/aubdump.c | 422 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 394 insertions(+), 28 deletions(-)
> 
> diff --git a/tools/aubdump.c b/tools/aubdump.c
> index 6ba3cb66..232371d5 100644
> --- a/tools/aubdump.c
> +++ b/tools/aubdump.c
> @@ -46,6 +46,175 @@
>  #define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
>  #endif
>  
> +#ifndef ALIGN
> +#define ALIGN(x, y) (((x) + (y)-1) & ~((y)-1))
> +#endif
> +
> +#ifndef MIN
> +#define MIN(x, y) ((x) < (y) ? (x) : (y))

I notice 'min' is used in a few other places in i-g-t. Mesa uses MIN2.

> +#endif
> +
> +#define HWS_PGA_RCSUNIT                0x02080
> +#define HWS_PGA_VCSUNIT0       0x12080
> +#define HWS_PGA_BCSUNIT                0x22080
> +
> +#define GFX_MODE_RCSUNIT       0x0229c
> +#define GFX_MODE_VCSUNIT0      0x1229c
> +#define GFX_MODE_BCSUNIT       0x2229c
> +
> +#define EXECLIST_SUBMITPORT_RCSUNIT    0x02230
> +#define EXECLIST_SUBMITPORT_VCSUNIT0   0x12230
> +#define EXECLIST_SUBMITPORT_BCSUNIT    0x22230
> +
> +#define EXECLIST_STATUS_RCSUNIT                0x02234
> +#define EXECLIST_STATUS_VCSUNIT0       0x12234
> +#define EXECLIST_STATUS_BCSUNIT                0x22234
> +
> +#define MEMORY_MAP_SIZE (64 /* MiB */ * 1024 * 1024)
> +
> +#define PTE_SIZE 4
> +#define GEN8_PTE_SIZE 8
> +
> +#define NUM_PT_ENTRIES (ALIGN(MEMORY_MAP_SIZE, 4096) / 4096)
> +#define PT_SIZE ALIGN(NUM_PT_ENTRIES * GEN8_PTE_SIZE, 4096)
> +
> +#define RING_SIZE                      ( 1 * 4096)
> +#define PPHWSP_SIZE                    ( 1 * 4096)
> +#define GEN10_LR_CONTEXT_RENDER_SIZE   (19 * 4096)
> +#define GEN8_LR_CONTEXT_OTHER_SIZE     ( 2 * 4096)
> +
> +#define STATIC_GGTT_MAP_START 0
> +
> +#define RENDER_RING_ADDR STATIC_GGTT_MAP_START
> +#define RENDER_CONTEXT_ADDR (RENDER_RING_ADDR + RING_SIZE)
> +
> +#define BLITTER_RING_ADDR (RENDER_CONTEXT_ADDR + PPHWSP_SIZE + GEN10_LR_CONTEXT_RENDER_SIZE)
> +#define BLITTER_CONTEXT_ADDR (BLITTER_RING_ADDR + RING_SIZE)
> +
> +#define VIDEO_RING_ADDR (BLITTER_CONTEXT_ADDR + PPHWSP_SIZE + GEN8_LR_CONTEXT_OTHER_SIZE)
> +#define VIDEO_CONTEXT_ADDR (VIDEO_RING_ADDR + RING_SIZE)
> +
> +#define STATIC_GGTT_MAP_END (VIDEO_CONTEXT_ADDR + PPHWSP_SIZE + GEN8_LR_CONTEXT_OTHER_SIZE)
> +#define STATIC_GGTT_MAP_SIZE (STATIC_GGTT_MAP_END - STATIC_GGTT_MAP_START)
> +
> +#define CONTEXT_FLAGS (0x229)  /* Normal Priority | L3-LLC Coherency |
> +       Legacy Context with no 64 bit VA support | Valid */
> +
> +#define RENDER_CONTEXT_DESCRIPTOR  ((uint64_t)1 << 32 | RENDER_CONTEXT_ADDR  | CONTEXT_FLAGS)
> +#define BLITTER_CONTEXT_DESCRIPTOR ((uint64_t)2 << 32 | BLITTER_CONTEXT_ADDR | CONTEXT_FLAGS)
> +#define VIDEO_CONTEXT_DESCRIPTOR   ((uint64_t)3 << 32 | VIDEO_CONTEXT_ADDR   | CONTEXT_FLAGS)
> +
> +static const uint32_t render_context_init[GEN10_LR_CONTEXT_RENDER_SIZE /
> +                                         sizeof(uint32_t)] = {
> +       0 /* MI_NOOP */,
> +       0x1100101B /* MI_LOAD_REGISTER_IMM */,
> +       0x2244 /* CONTEXT_CONTROL */,           0x90009 /* Inhibit Synchronous Context Switch | Engine Context Restore Inhibit */,
> +       0x2034 /* RING_HEAD */,                 0,
> +       0x2030 /* RING_TAIL */,                 0,
> +       0x2038 /* RING_BUFFER_START */,         RENDER_RING_ADDR,
> +       0x203C /* RING_BUFFER_CONTROL */,       (RING_SIZE - 4096) | 1 /* Buffer Length | Ring Buffer Enable */,
> +       0x2168 /* BB_HEAD_U */,                 0,
> +       0x2140 /* BB_HEAD_L */,                 0,
> +       0x2110 /* BB_STATE */,                  0,
> +       0x211C /* SECOND_BB_HEAD_U */,          0,
> +       0x2114 /* SECOND_BB_HEAD_L */,          0,
> +       0x2118 /* SECOND_BB_STATE */,           0,
> +       0x21C0 /* BB_PER_CTX_PTR */,            0,
> +       0x21C4 /* RCS_INDIRECT_CTX */,          0,
> +       0x21C8 /* RCS_INDIRECT_CTX_OFFSET */,   0,
> +       /* MI_NOOP */
> +       0, 0,
> +
> +       0 /* MI_NOOP */,
> +       0x11001011 /* MI_LOAD_REGISTER_IMM */,
> +       0x23A8 /* CTX_TIMESTAMP */,     0,
> +       0x228C /* PDP3_UDW */,          0,
> +       0x2288 /* PDP3_LDW */,          0,
> +       0x2284 /* PDP2_UDW */,          0,
> +       0x2280 /* PDP2_LDW */,          0,
> +       0x227C /* PDP1_UDW */,          0,
> +       0x2278 /* PDP1_LDW */,          0,
> +       0x2274 /* PDP0_UDW */,          0,
> +       0x2270 /* PDP0_LDW */,          0,
> +       /* MI_NOOP */
> +       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +
> +       0 /* MI_NOOP */,
> +       0x11000001 /* MI_LOAD_REGISTER_IMM */,
> +       0x20C8 /* R_PWR_CLK_STATE */, 0x7FFFFFFF,
> +       0x05000001 /* MI_BATCH_BUFFER_END */
> +};
> +
> +static const uint32_t blitter_context_init[GEN8_LR_CONTEXT_OTHER_SIZE /
> +                                          sizeof(uint32_t)] = {
> +       0 /* MI_NOOP */,
> +       0x11001015 /* MI_LOAD_REGISTER_IMM */,
> +       0x22244 /* CONTEXT_CONTROL */,          0x90009 /* Inhibit Synchronous Context Switch | Engine Context Restore Inhibit */,
> +       0x22034 /* RING_HEAD */,                0,
> +       0x22030 /* RING_TAIL */,                0,
> +       0x22038 /* RING_BUFFER_START */,        BLITTER_RING_ADDR,
> +       0x2203C /* RING_BUFFER_CONTROL */,      (RING_SIZE - 4096) | 1 /* Buffer Length | Ring Buffer Enable */,
> +       0x22168 /* BB_HEAD_U */,                0,
> +       0x22140 /* BB_HEAD_L */,                0,
> +       0x22110 /* BB_STATE */,                 0,
> +       0x2211C /* SECOND_BB_HEAD_U */,         0,
> +       0x22114 /* SECOND_BB_HEAD_L */,         0,
> +       0x22118 /* SECOND_BB_STATE */,          0,
> +       /* MI_NOOP */
> +       0, 0, 0, 0, 0, 0, 0, 0,
> +
> +       0 /* MI_NOOP */,
> +       0x11001011,
> +       0x223A8 /* CTX_TIMESTAMP */,    0,
> +       0x2228C /* PDP3_UDW */,         0,
> +       0x22288 /* PDP3_LDW */,         0,
> +       0x22284 /* PDP2_UDW */,         0,
> +       0x22280 /* PDP2_LDW */,         0,
> +       0x2227C /* PDP1_UDW */,         0,
> +       0x22278 /* PDP1_LDW */,         0,
> +       0x22274 /* PDP0_UDW */,         0,
> +       0x22270 /* PDP0_LDW */,         0,
> +       /* MI_NOOP */
> +       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +
> +       0x05000001 /* MI_BATCH_BUFFER_END */
> +};
> +
> +static const uint32_t video_context_init[GEN8_LR_CONTEXT_OTHER_SIZE /
> +                                        sizeof(uint32_t)] = {
> +       0 /* MI_NOOP */,
> +       0x11001015 /* MI_LOAD_REGISTER_IMM */,
> +       0x1C244 /* CONTEXT_CONTROL */,          0x90009 /* Inhibit Synchronous Context Switch | Engine Context Restore Inhibit */,
> +       0x1C034 /* RING_HEAD */,                0,
> +       0x1C030 /* RING_TAIL */,                0,
> +       0x1C038 /* RING_BUFFER_START */,        VIDEO_RING_ADDR,
> +       0x1C03C /* RING_BUFFER_CONTROL */,      (RING_SIZE - 4096) | 1 /* Buffer Length | Ring Buffer Enable */,
> +       0x1C168 /* BB_HEAD_U */,                0,
> +       0x1C140 /* BB_HEAD_L */,                0,
> +       0x1C110 /* BB_STATE */,                 0,
> +       0x1C11C /* SECOND_BB_HEAD_U */,         0,
> +       0x1C114 /* SECOND_BB_HEAD_L */,         0,
> +       0x1C118 /* SECOND_BB_STATE */,          0,
> +       /* MI_NOOP */
> +       0, 0, 0, 0, 0, 0, 0, 0,
> +
> +       0 /* MI_NOOP */,
> +       0x11001011,
> +       0x1C3A8 /* CTX_TIMESTAMP */,    0,
> +       0x1C28C /* PDP3_UDW */,         0,
> +       0x1C288 /* PDP3_LDW */,         0,
> +       0x1C284 /* PDP2_UDW */,         0,
> +       0x1C280 /* PDP2_LDW */,         0,
> +       0x1C27C /* PDP1_UDW */,         0,
> +       0x1C278 /* PDP1_LDW */,         0,
> +       0x1C274 /* PDP0_UDW */,         0,
> +       0x1C270 /* PDP0_LDW */,         0,
> +       /* MI_NOOP */
> +       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +
> +       0x05000001 /* MI_BATCH_BUFFER_END */
> +};
> +
>  static int close_init_helper(int fd);
>  static int ioctl_init_helper(int fd, unsigned long request, ...);
>  
> @@ -171,26 +340,142 @@ data_out(const void *data, size_t size)
>  }
>  
>  static uint32_t
> -gtt_entry_size(void)
> +gtt_size(void)
>  {
> -       return addr_bits > 32 ? 8 : 4;
> +       return NUM_PT_ENTRIES * (addr_bits > 32 ? GEN8_PTE_SIZE : PTE_SIZE);
>  }
>  
> -static uint32_t
> -gtt_size(void)
> +static void
> +mem_trace_memory_write_header_out(uint64_t addr, uint32_t len,
> +                                 uint32_t addr_space)
>  {
> -       /* Enough for 64MB assuming 4kB pages. */
> -       const unsigned entries = 0x4000;
> -       return entries * gtt_entry_size();
> +       uint32_t dwords = ALIGN(len, sizeof(uint32_t)) / sizeof(uint32_t);
> +
> +       dword_out(CMD_MEM_TRACE_MEMORY_WRITE | (5 + dwords - 1));
> +       dword_out(addr & 0xFFFFFFFF);   /* addr lo */
> +       dword_out(addr >> 32);  /* addr hi */
> +       dword_out(addr_space);  /* gtt */
> +       dword_out(len);
>  }
>  
>  static void
> -write_header(void)
> +register_write_out(uint32_t addr, uint32_t value)
> +{
> +       uint32_t dwords = 1;
> +
> +       dword_out(CMD_MEM_TRACE_REGISTER_WRITE | (5 + dwords - 1));
> +       dword_out(addr);
> +       dword_out(AUB_MEM_TRACE_REGISTER_SIZE_DWORD |
> +                 AUB_MEM_TRACE_REGISTER_SPACE_MMIO);
> +       dword_out(0xFFFFFFFF);  /* mask lo */
> +       dword_out(0x00000000);  /* mask hi */
> +       dword_out(value);
> +}
> +
> +static void
> +gen10_write_header(void)
> +{
> +       char app_name[8 * 4];
> +       int app_name_len, dwords;
> +       uint32_t entry = 0x3;   /* read/write | present */
> +
> +       app_name_len =
> +           snprintf(app_name, sizeof(app_name), "PCI-ID=0x%X %s", device,
> +                    program_invocation_short_name);
> +       app_name_len = ALIGN(app_name_len, sizeof(uint32_t));
> +
> +       dwords = 5 + app_name_len / sizeof(uint32_t);
> +       dword_out(CMD_MEM_TRACE_VERSION | (dwords - 1));
> +       dword_out(AUB_MEM_TRACE_VERSION_FILE_VERSION);
> +       dword_out(AUB_MEM_TRACE_VERSION_DEVICE_CNL |
> +                 AUB_MEM_TRACE_VERSION_METHOD_PHY);
> +       dword_out(0);           /* version */
> +       dword_out(0);           /* version */
> +       data_out(app_name, app_name_len);
> +
> +       /* GGTT PT */
> +       for (uint32_t page = 0; page < ALIGN(PT_SIZE, 4096) / 4096; page++) {
> +               uint32_t to_write = MIN(PT_SIZE - page * 4096, 4096);
> +               mem_trace_memory_write_header_out(page << 12, to_write,
> +                                                 AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT_ENTRY);
> +               for (uint32_t i = 0; i < to_write / GEN8_PTE_SIZE; i++) {
> +                       dword_out(entry + 0x1000 * i + 0x200000 * page);
> +                       dword_out(0);
> +               }
> +       }
> +
> +       /* RENDER_RING */
> +       mem_trace_memory_write_header_out(RENDER_RING_ADDR, RING_SIZE,
> +                                         AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
> +       for (uint32_t i = 0; i < RING_SIZE; i += sizeof(uint32_t)) {
> +               dword_out(0);
> +       }
> +
> +       /* RENDER_PPHWSP */
> +       mem_trace_memory_write_header_out(RENDER_CONTEXT_ADDR,
> +                                         PPHWSP_SIZE +
> +                                         sizeof(render_context_init),
> +                                         AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
> +       for (uint32_t i = 0; i < PPHWSP_SIZE; i += sizeof(uint32_t)) {
> +               dword_out(0);
> +       }
> +
> +       /* RENDER_CONTEXT */
> +       data_out(render_context_init, sizeof(render_context_init));
> +
> +       /* BLITTER_RING */
> +       mem_trace_memory_write_header_out(BLITTER_RING_ADDR, RING_SIZE,
> +                                         AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
> +       for (uint32_t i = 0; i < RING_SIZE; i += sizeof(uint32_t)) {
> +               dword_out(0);
> +       }
> +
> +       /* BLITTER_PPHWSP */
> +       mem_trace_memory_write_header_out(BLITTER_CONTEXT_ADDR,
> +                                         PPHWSP_SIZE +
> +                                         sizeof(blitter_context_init),
> +                                         AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
> +       for (uint32_t i = 0; i < PPHWSP_SIZE; i += sizeof(uint32_t)) {
> +               dword_out(0);
> +       }
> +
> +       /* BLITTER_CONTEXT */
> +       data_out(blitter_context_init, sizeof(blitter_context_init));
> +
> +       /* VIDEO_RING */
> +       mem_trace_memory_write_header_out(VIDEO_RING_ADDR, RING_SIZE,
> +                                         AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
> +       for (uint32_t i = 0; i < RING_SIZE; i += sizeof(uint32_t)) {
> +               dword_out(0);
> +       }
> +
> +       /* VIDEO_PPHWSP */
> +       mem_trace_memory_write_header_out(VIDEO_CONTEXT_ADDR,
> +                                         PPHWSP_SIZE +
> +                                         sizeof(video_context_init),
> +                                         AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
> +       for (uint32_t i = 0; i < PPHWSP_SIZE; i += sizeof(uint32_t)) {
> +               dword_out(0);
> +       }
> +
> +       /* VIDEO_CONTEXT */
> +       data_out(video_context_init, sizeof(video_context_init));
> +
> +       register_write_out(HWS_PGA_RCSUNIT, RENDER_CONTEXT_ADDR);
> +       register_write_out(HWS_PGA_VCSUNIT0, VIDEO_CONTEXT_ADDR);
> +       register_write_out(HWS_PGA_BCSUNIT, BLITTER_CONTEXT_ADDR);
> +
> +       register_write_out(GFX_MODE_RCSUNIT, 0x80008000 /* execlist enable */);
> +       register_write_out(GFX_MODE_VCSUNIT0, 0x80008000 /* execlist enable */);
> +       register_write_out(GFX_MODE_BCSUNIT, 0x80008000 /* execlist enable */);
> +}
> +
> +static void write_header(void)
>  {
>         char app_name[8 * 4];
>         char comment[16];
>         int comment_len, comment_dwords, dwords;
> -       uint32_t entry = 0x200003;
> +       uint32_t entry = 0x3;   /* read/write | present */

Why this change for < 10? Should it be a separate patch?

>  
>         comment_len = snprintf(comment, sizeof(comment), "PCI-ID=0x%x", device);
>         comment_dwords = ((comment_len + 3) / 4);
> @@ -220,7 +505,7 @@ write_header(void)
>         dword_out(gtt_size()); /* size */
>         if (addr_bits > 32)
>                 dword_out(0);
> -       for (uint32_t i = 0; i * gtt_entry_size() < gtt_size(); i++) {
> +       for (uint32_t i = 0; i < NUM_PT_ENTRIES; i++) {
>                 dword_out(entry + 0x1000 * i);
>                 if (addr_bits > 32)
>                         dword_out(0);
> @@ -245,15 +530,21 @@ aub_write_trace_block(uint32_t type, void *virtual, uint32_t size, uint64_t gtt_
>                 if (block_size > 8 * 4096)
>                         block_size = 8 * 4096;
>  
> -               dword_out(CMD_AUB_TRACE_HEADER_BLOCK |
> -                         ((addr_bits > 32 ? 6 : 5) - 2));
> -               dword_out(AUB_TRACE_MEMTYPE_GTT |
> -                         type | AUB_TRACE_OP_DATA_WRITE);
> -               dword_out(subtype);
> -               dword_out(gtt_offset + offset);
> -               dword_out(align_u32(block_size, 4));
> -               if (addr_bits > 32)
> -                       dword_out((gtt_offset + offset) >> 32);
> +               if (gen >= 10) {
> +                       mem_trace_memory_write_header_out(gtt_offset + offset,
> +                                                         block_size,
> +                                                         AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
> +               } else {
> +                       dword_out(CMD_AUB_TRACE_HEADER_BLOCK |
> +                                 ((addr_bits > 32 ? 6 : 5) - 2));
> +                       dword_out(AUB_TRACE_MEMTYPE_GTT |
> +                                 type | AUB_TRACE_OP_DATA_WRITE);
> +                       dword_out(subtype);
> +                       dword_out(gtt_offset + offset);
> +                       dword_out(align_u32(block_size, 4));
> +                       if (addr_bits > 32)
> +                               dword_out((gtt_offset + offset) >> 32);
> +               }
>  
>                 if (virtual)
>                         data_out(((char *) GET_PTR(virtual)) + offset, block_size);
> @@ -291,6 +582,64 @@ write_reloc(void *p, uint64_t v)
>         }
>  }
>  
> +static void
> +aub_dump_execlist(uint64_t batch_offset, int ring_flag)
> +{
> +       uint32_t ring_addr;
> +       uint64_t descriptor;
> +       uint32_t elsp_reg;
> +       uint32_t status_reg;
> +
> +       switch (ring_flag) {
> +       case I915_EXEC_DEFAULT:
> +       case I915_EXEC_RENDER:
> +               ring_addr = RENDER_RING_ADDR;
> +               descriptor = RENDER_CONTEXT_DESCRIPTOR;
> +               elsp_reg = EXECLIST_SUBMITPORT_RCSUNIT;
> +               status_reg = EXECLIST_STATUS_RCSUNIT;
> +               break;
> +       case I915_EXEC_BSD:
> +               ring_addr = VIDEO_RING_ADDR;
> +               descriptor = VIDEO_CONTEXT_DESCRIPTOR;
> +               elsp_reg = EXECLIST_SUBMITPORT_VCSUNIT0;
> +               status_reg = EXECLIST_STATUS_VCSUNIT0;
> +               break;
> +       case I915_EXEC_BLT:
> +               ring_addr = BLITTER_RING_ADDR;
> +               descriptor = BLITTER_CONTEXT_DESCRIPTOR;
> +               elsp_reg = EXECLIST_SUBMITPORT_BCSUNIT;
> +               status_reg = EXECLIST_STATUS_BCSUNIT;
> +               break;
> +       }
> +
> +       mem_trace_memory_write_header_out(ring_addr, 16,
> +                                         AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
> +       dword_out(AUB_MI_BATCH_BUFFER_START | (3 - 2));
> +       dword_out(batch_offset & 0xFFFFFFFF);
> +       dword_out(batch_offset >> 32);
> +       dword_out(0 /* MI_NOOP */);
> +
> +       mem_trace_memory_write_header_out(ring_addr + 8192 + 20, 4,
> +                                         AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
> +       dword_out(0); /* RING_BUFFER_HEAD */
> +       mem_trace_memory_write_header_out(ring_addr + 8192 + 28, 4,
> +                                         AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
> +       dword_out(16); /* RING_BUFFER_TAIL */
> +
> +       register_write_out(elsp_reg, 0);
> +       register_write_out(elsp_reg, 8);
> +       register_write_out(elsp_reg, descriptor >> 32);
> +       register_write_out(elsp_reg, descriptor & 0xFFFFFFFF);
> +
> +       dword_out(CMD_MEM_TRACE_REGISTER_POLL | (5 + 1 - 1));
> +       dword_out(status_reg);
> +       dword_out(AUB_MEM_TRACE_REGISTER_SIZE_DWORD |
> +                 AUB_MEM_TRACE_REGISTER_SPACE_MMIO);
> +       dword_out(0x00000010);  /* mask lo */
> +       dword_out(0x00000000);  /* mask hi */
> +       dword_out(0x00000000);
> +}
> +
>  static void
>  aub_dump_ringbuffer(uint64_t batch_offset, uint64_t offset, int ring_flag)
>  {
> @@ -404,7 +753,7 @@ dump_execbuffer2(int fd, struct drm_i915_gem_execbuffer2 *execbuffer2)
>         struct drm_i915_gem_exec_object2 *exec_objects =
>                 (struct drm_i915_gem_exec_object2 *) (uintptr_t) execbuffer2->buffers_ptr;
>         uint32_t ring_flag = execbuffer2->flags & I915_EXEC_RING_MASK;
> -       uint32_t offset = gtt_size();
> +       uint32_t offset;
>         struct drm_i915_gem_exec_object2 *obj;
>         struct bo *bo, *batch_bo;
>         int batch_index;
> @@ -417,7 +766,18 @@ dump_execbuffer2(int fd, struct drm_i915_gem_execbuffer2 *execbuffer2)
>         }
>         if (gen == 0) {
>                 gen = intel_gen(device);
> -               write_header();
> +               if (gen == 0)
> +                       gen = 99;
> +
> +               /* If we don't know the device gen, then it probably is a
> +                * newer device which uses 48-bit addresses.
> +                */
> +               addr_bits = (gen >= 8) ? 48 : 32;

The comment here is now sort of out of date. Maybe it now belongs with
the gen = 99 above in a slightly reworded format?

Also, maybe the gen = 99 change and addr_bits move should be a
separate patch? I think it's okay, but it seems best to not let it get
lost in this patch.

I'll say these changes are

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>

but, really it's about 50% r-b, and 50% acked-by.

You can keep my r-b for patches if you separate them out as I
mentioned.

-Jordan

> +
> +               if (gen >= 10)
> +                       gen10_write_header();
> +               else
> +                       write_header();
>  
>                 if (verbose)
>                         printf("[intel_aubdump: running, "
> @@ -425,10 +785,10 @@ dump_execbuffer2(int fd, struct drm_i915_gem_execbuffer2 *execbuffer2)
>                                filename, device, gen);
>         }
>  
> -       /* If we don't know the device gen, then it probably is a
> -        * newer device which uses 48-bit addresses.
> -        */
> -       addr_bits = (gen >= 8 || gen == 0) ? 48 : 32;
> +       if (gen >= 10)
> +               offset = STATIC_GGTT_MAP_END;
> +       else
> +               offset = gtt_size();
>  
>         if (verbose)
>                 printf("Dumping execbuffer2:\n");
> @@ -484,9 +844,15 @@ dump_execbuffer2(int fd, struct drm_i915_gem_execbuffer2 *execbuffer2)
>                         free(data);
>         }
>  
> -       /* Dump ring buffer */
> -       aub_dump_ringbuffer(batch_bo->offset + execbuffer2->batch_start_offset,
> -                           offset, ring_flag);
> +       if (gen >= 10) {
> +               aub_dump_execlist(batch_bo->offset +
> +                                 execbuffer2->batch_start_offset, ring_flag);
> +       } else {
> +               /* Dump ring buffer */
> +               aub_dump_ringbuffer(batch_bo->offset +
> +                                   execbuffer2->batch_start_offset, offset,
> +                                   ring_flag);
> +       }
>  
>         for (int i = 0; i < ARRAY_SIZE(files); i++) {
>                 if (files[i] != NULL)
> -- 
> 2.14.3
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
diff mbox

Patch

diff --git a/tools/aubdump.c b/tools/aubdump.c
index 6ba3cb66..232371d5 100644
--- a/tools/aubdump.c
+++ b/tools/aubdump.c
@@ -46,6 +46,175 @@ 
 #define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
 #endif
 
+#ifndef ALIGN
+#define ALIGN(x, y) (((x) + (y)-1) & ~((y)-1))
+#endif
+
+#ifndef MIN
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+#endif
+
+#define HWS_PGA_RCSUNIT		0x02080
+#define HWS_PGA_VCSUNIT0	0x12080
+#define HWS_PGA_BCSUNIT		0x22080
+
+#define GFX_MODE_RCSUNIT	0x0229c
+#define GFX_MODE_VCSUNIT0	0x1229c
+#define GFX_MODE_BCSUNIT	0x2229c
+
+#define EXECLIST_SUBMITPORT_RCSUNIT	0x02230
+#define EXECLIST_SUBMITPORT_VCSUNIT0	0x12230
+#define EXECLIST_SUBMITPORT_BCSUNIT	0x22230
+
+#define EXECLIST_STATUS_RCSUNIT		0x02234
+#define EXECLIST_STATUS_VCSUNIT0	0x12234
+#define EXECLIST_STATUS_BCSUNIT		0x22234
+
+#define MEMORY_MAP_SIZE (64 /* MiB */ * 1024 * 1024)
+
+#define PTE_SIZE 4
+#define GEN8_PTE_SIZE 8
+
+#define NUM_PT_ENTRIES (ALIGN(MEMORY_MAP_SIZE, 4096) / 4096)
+#define PT_SIZE ALIGN(NUM_PT_ENTRIES * GEN8_PTE_SIZE, 4096)
+
+#define RING_SIZE			( 1 * 4096)
+#define PPHWSP_SIZE			( 1 * 4096)
+#define GEN10_LR_CONTEXT_RENDER_SIZE	(19 * 4096)
+#define GEN8_LR_CONTEXT_OTHER_SIZE	( 2 * 4096)
+
+#define STATIC_GGTT_MAP_START 0
+
+#define RENDER_RING_ADDR STATIC_GGTT_MAP_START
+#define RENDER_CONTEXT_ADDR (RENDER_RING_ADDR + RING_SIZE)
+
+#define BLITTER_RING_ADDR (RENDER_CONTEXT_ADDR + PPHWSP_SIZE + GEN10_LR_CONTEXT_RENDER_SIZE)
+#define BLITTER_CONTEXT_ADDR (BLITTER_RING_ADDR + RING_SIZE)
+
+#define VIDEO_RING_ADDR (BLITTER_CONTEXT_ADDR + PPHWSP_SIZE + GEN8_LR_CONTEXT_OTHER_SIZE)
+#define VIDEO_CONTEXT_ADDR (VIDEO_RING_ADDR + RING_SIZE)
+
+#define STATIC_GGTT_MAP_END (VIDEO_CONTEXT_ADDR + PPHWSP_SIZE + GEN8_LR_CONTEXT_OTHER_SIZE)
+#define STATIC_GGTT_MAP_SIZE (STATIC_GGTT_MAP_END - STATIC_GGTT_MAP_START)
+
+#define CONTEXT_FLAGS (0x229)	/* Normal Priority | L3-LLC Coherency |
+	Legacy Context with no 64 bit VA support | Valid */
+
+#define RENDER_CONTEXT_DESCRIPTOR  ((uint64_t)1 << 32 | RENDER_CONTEXT_ADDR  | CONTEXT_FLAGS)
+#define BLITTER_CONTEXT_DESCRIPTOR ((uint64_t)2 << 32 | BLITTER_CONTEXT_ADDR | CONTEXT_FLAGS)
+#define VIDEO_CONTEXT_DESCRIPTOR   ((uint64_t)3 << 32 | VIDEO_CONTEXT_ADDR   | CONTEXT_FLAGS)
+
+static const uint32_t render_context_init[GEN10_LR_CONTEXT_RENDER_SIZE /
+					  sizeof(uint32_t)] = {
+	0 /* MI_NOOP */,
+	0x1100101B /* MI_LOAD_REGISTER_IMM */,
+	0x2244 /* CONTEXT_CONTROL */,		0x90009 /* Inhibit Synchronous Context Switch | Engine Context Restore Inhibit */,
+	0x2034 /* RING_HEAD */,			0,
+	0x2030 /* RING_TAIL */,			0,
+	0x2038 /* RING_BUFFER_START */,		RENDER_RING_ADDR,
+	0x203C /* RING_BUFFER_CONTROL */,	(RING_SIZE - 4096) | 1 /* Buffer Length | Ring Buffer Enable */,
+	0x2168 /* BB_HEAD_U */,			0,
+	0x2140 /* BB_HEAD_L */,			0,
+	0x2110 /* BB_STATE */,			0,
+	0x211C /* SECOND_BB_HEAD_U */,		0,
+	0x2114 /* SECOND_BB_HEAD_L */,		0,
+	0x2118 /* SECOND_BB_STATE */,		0,
+	0x21C0 /* BB_PER_CTX_PTR */,		0,
+	0x21C4 /* RCS_INDIRECT_CTX */,		0,
+	0x21C8 /* RCS_INDIRECT_CTX_OFFSET */,	0,
+	/* MI_NOOP */
+	0, 0,
+
+	0 /* MI_NOOP */,
+	0x11001011 /* MI_LOAD_REGISTER_IMM */,
+	0x23A8 /* CTX_TIMESTAMP */,	0,
+	0x228C /* PDP3_UDW */,		0,
+	0x2288 /* PDP3_LDW */,		0,
+	0x2284 /* PDP2_UDW */,		0,
+	0x2280 /* PDP2_LDW */,		0,
+	0x227C /* PDP1_UDW */,		0,
+	0x2278 /* PDP1_LDW */,		0,
+	0x2274 /* PDP0_UDW */,		0,
+	0x2270 /* PDP0_LDW */,		0,
+	/* MI_NOOP */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+	0 /* MI_NOOP */,
+	0x11000001 /* MI_LOAD_REGISTER_IMM */,
+	0x20C8 /* R_PWR_CLK_STATE */, 0x7FFFFFFF,
+	0x05000001 /* MI_BATCH_BUFFER_END */
+};
+
+static const uint32_t blitter_context_init[GEN8_LR_CONTEXT_OTHER_SIZE /
+					   sizeof(uint32_t)] = {
+	0 /* MI_NOOP */,
+	0x11001015 /* MI_LOAD_REGISTER_IMM */,
+	0x22244 /* CONTEXT_CONTROL */,		0x90009 /* Inhibit Synchronous Context Switch | Engine Context Restore Inhibit */,
+	0x22034 /* RING_HEAD */,		0,
+	0x22030 /* RING_TAIL */,		0,
+	0x22038 /* RING_BUFFER_START */,	BLITTER_RING_ADDR,
+	0x2203C /* RING_BUFFER_CONTROL */,	(RING_SIZE - 4096) | 1 /* Buffer Length | Ring Buffer Enable */,
+	0x22168 /* BB_HEAD_U */,		0,
+	0x22140 /* BB_HEAD_L */,		0,
+	0x22110 /* BB_STATE */,			0,
+	0x2211C /* SECOND_BB_HEAD_U */,		0,
+	0x22114 /* SECOND_BB_HEAD_L */,		0,
+	0x22118 /* SECOND_BB_STATE */,		0,
+	/* MI_NOOP */
+	0, 0, 0, 0, 0, 0, 0, 0,
+
+	0 /* MI_NOOP */,
+	0x11001011,
+	0x223A8 /* CTX_TIMESTAMP */,	0,
+	0x2228C /* PDP3_UDW */,		0,
+	0x22288 /* PDP3_LDW */,		0,
+	0x22284 /* PDP2_UDW */,		0,
+	0x22280 /* PDP2_LDW */,		0,
+	0x2227C /* PDP1_UDW */,		0,
+	0x22278 /* PDP1_LDW */,		0,
+	0x22274 /* PDP0_UDW */,		0,
+	0x22270 /* PDP0_LDW */,		0,
+	/* MI_NOOP */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+	0x05000001 /* MI_BATCH_BUFFER_END */
+};
+
+static const uint32_t video_context_init[GEN8_LR_CONTEXT_OTHER_SIZE /
+					 sizeof(uint32_t)] = {
+	0 /* MI_NOOP */,
+	0x11001015 /* MI_LOAD_REGISTER_IMM */,
+	0x1C244 /* CONTEXT_CONTROL */,		0x90009 /* Inhibit Synchronous Context Switch | Engine Context Restore Inhibit */,
+	0x1C034 /* RING_HEAD */,		0,
+	0x1C030 /* RING_TAIL */,		0,
+	0x1C038 /* RING_BUFFER_START */,	VIDEO_RING_ADDR,
+	0x1C03C /* RING_BUFFER_CONTROL */,	(RING_SIZE - 4096) | 1 /* Buffer Length | Ring Buffer Enable */,
+	0x1C168 /* BB_HEAD_U */,		0,
+	0x1C140 /* BB_HEAD_L */,		0,
+	0x1C110 /* BB_STATE */,			0,
+	0x1C11C /* SECOND_BB_HEAD_U */,		0,
+	0x1C114 /* SECOND_BB_HEAD_L */,		0,
+	0x1C118 /* SECOND_BB_STATE */,		0,
+	/* MI_NOOP */
+	0, 0, 0, 0, 0, 0, 0, 0,
+
+	0 /* MI_NOOP */,
+	0x11001011,
+	0x1C3A8 /* CTX_TIMESTAMP */,	0,
+	0x1C28C /* PDP3_UDW */,		0,
+	0x1C288 /* PDP3_LDW */,		0,
+	0x1C284 /* PDP2_UDW */,		0,
+	0x1C280 /* PDP2_LDW */,		0,
+	0x1C27C /* PDP1_UDW */,		0,
+	0x1C278 /* PDP1_LDW */,		0,
+	0x1C274 /* PDP0_UDW */,		0,
+	0x1C270 /* PDP0_LDW */,		0,
+	/* MI_NOOP */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+	0x05000001 /* MI_BATCH_BUFFER_END */
+};
+
 static int close_init_helper(int fd);
 static int ioctl_init_helper(int fd, unsigned long request, ...);
 
@@ -171,26 +340,142 @@  data_out(const void *data, size_t size)
 }
 
 static uint32_t
-gtt_entry_size(void)
+gtt_size(void)
 {
-	return addr_bits > 32 ? 8 : 4;
+	return NUM_PT_ENTRIES * (addr_bits > 32 ? GEN8_PTE_SIZE : PTE_SIZE);
 }
 
-static uint32_t
-gtt_size(void)
+static void
+mem_trace_memory_write_header_out(uint64_t addr, uint32_t len,
+				  uint32_t addr_space)
 {
-	/* Enough for 64MB assuming 4kB pages. */
-	const unsigned entries = 0x4000;
-	return entries * gtt_entry_size();
+	uint32_t dwords = ALIGN(len, sizeof(uint32_t)) / sizeof(uint32_t);
+
+	dword_out(CMD_MEM_TRACE_MEMORY_WRITE | (5 + dwords - 1));
+	dword_out(addr & 0xFFFFFFFF);	/* addr lo */
+	dword_out(addr >> 32);	/* addr hi */
+	dword_out(addr_space);	/* gtt */
+	dword_out(len);
 }
 
 static void
-write_header(void)
+register_write_out(uint32_t addr, uint32_t value)
+{
+	uint32_t dwords = 1;
+
+	dword_out(CMD_MEM_TRACE_REGISTER_WRITE | (5 + dwords - 1));
+	dword_out(addr);
+	dword_out(AUB_MEM_TRACE_REGISTER_SIZE_DWORD |
+		  AUB_MEM_TRACE_REGISTER_SPACE_MMIO);
+	dword_out(0xFFFFFFFF);	/* mask lo */
+	dword_out(0x00000000);	/* mask hi */
+	dword_out(value);
+}
+
+static void
+gen10_write_header(void)
+{
+	char app_name[8 * 4];
+	int app_name_len, dwords;
+	uint32_t entry = 0x3;	/* read/write | present */
+
+	app_name_len =
+	    snprintf(app_name, sizeof(app_name), "PCI-ID=0x%X %s", device,
+		     program_invocation_short_name);
+	app_name_len = ALIGN(app_name_len, sizeof(uint32_t));
+
+	dwords = 5 + app_name_len / sizeof(uint32_t);
+	dword_out(CMD_MEM_TRACE_VERSION | (dwords - 1));
+	dword_out(AUB_MEM_TRACE_VERSION_FILE_VERSION);
+	dword_out(AUB_MEM_TRACE_VERSION_DEVICE_CNL |
+		  AUB_MEM_TRACE_VERSION_METHOD_PHY);
+	dword_out(0);		/* version */
+	dword_out(0);		/* version */
+	data_out(app_name, app_name_len);
+
+	/* GGTT PT */
+	for (uint32_t page = 0; page < ALIGN(PT_SIZE, 4096) / 4096; page++) {
+		uint32_t to_write = MIN(PT_SIZE - page * 4096, 4096);
+		mem_trace_memory_write_header_out(page << 12, to_write,
+						  AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT_ENTRY);
+		for (uint32_t i = 0; i < to_write / GEN8_PTE_SIZE; i++) {
+			dword_out(entry + 0x1000 * i + 0x200000 * page);
+			dword_out(0);
+		}
+	}
+
+	/* RENDER_RING */
+	mem_trace_memory_write_header_out(RENDER_RING_ADDR, RING_SIZE,
+					  AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
+	for (uint32_t i = 0; i < RING_SIZE; i += sizeof(uint32_t)) {
+		dword_out(0);
+	}
+
+	/* RENDER_PPHWSP */
+	mem_trace_memory_write_header_out(RENDER_CONTEXT_ADDR,
+					  PPHWSP_SIZE +
+					  sizeof(render_context_init),
+					  AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
+	for (uint32_t i = 0; i < PPHWSP_SIZE; i += sizeof(uint32_t)) {
+		dword_out(0);
+	}
+
+	/* RENDER_CONTEXT */
+	data_out(render_context_init, sizeof(render_context_init));
+
+	/* BLITTER_RING */
+	mem_trace_memory_write_header_out(BLITTER_RING_ADDR, RING_SIZE,
+					  AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
+	for (uint32_t i = 0; i < RING_SIZE; i += sizeof(uint32_t)) {
+		dword_out(0);
+	}
+
+	/* BLITTER_PPHWSP */
+	mem_trace_memory_write_header_out(BLITTER_CONTEXT_ADDR,
+					  PPHWSP_SIZE +
+					  sizeof(blitter_context_init),
+					  AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
+	for (uint32_t i = 0; i < PPHWSP_SIZE; i += sizeof(uint32_t)) {
+		dword_out(0);
+	}
+
+	/* BLITTER_CONTEXT */
+	data_out(blitter_context_init, sizeof(blitter_context_init));
+
+	/* VIDEO_RING */
+	mem_trace_memory_write_header_out(VIDEO_RING_ADDR, RING_SIZE,
+					  AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
+	for (uint32_t i = 0; i < RING_SIZE; i += sizeof(uint32_t)) {
+		dword_out(0);
+	}
+
+	/* VIDEO_PPHWSP */
+	mem_trace_memory_write_header_out(VIDEO_CONTEXT_ADDR,
+					  PPHWSP_SIZE +
+					  sizeof(video_context_init),
+					  AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
+	for (uint32_t i = 0; i < PPHWSP_SIZE; i += sizeof(uint32_t)) {
+		dword_out(0);
+	}
+
+	/* VIDEO_CONTEXT */
+	data_out(video_context_init, sizeof(video_context_init));
+
+	register_write_out(HWS_PGA_RCSUNIT, RENDER_CONTEXT_ADDR);
+	register_write_out(HWS_PGA_VCSUNIT0, VIDEO_CONTEXT_ADDR);
+	register_write_out(HWS_PGA_BCSUNIT, BLITTER_CONTEXT_ADDR);
+
+	register_write_out(GFX_MODE_RCSUNIT, 0x80008000 /* execlist enable */);
+	register_write_out(GFX_MODE_VCSUNIT0, 0x80008000 /* execlist enable */);
+	register_write_out(GFX_MODE_BCSUNIT, 0x80008000 /* execlist enable */);
+}
+
+static void write_header(void)
 {
 	char app_name[8 * 4];
 	char comment[16];
 	int comment_len, comment_dwords, dwords;
-	uint32_t entry = 0x200003;
+	uint32_t entry = 0x3;	/* read/write | present */
 
 	comment_len = snprintf(comment, sizeof(comment), "PCI-ID=0x%x", device);
 	comment_dwords = ((comment_len + 3) / 4);
@@ -220,7 +505,7 @@  write_header(void)
 	dword_out(gtt_size()); /* size */
 	if (addr_bits > 32)
 		dword_out(0);
-	for (uint32_t i = 0; i * gtt_entry_size() < gtt_size(); i++) {
+	for (uint32_t i = 0; i < NUM_PT_ENTRIES; i++) {
 		dword_out(entry + 0x1000 * i);
 		if (addr_bits > 32)
 			dword_out(0);
@@ -245,15 +530,21 @@  aub_write_trace_block(uint32_t type, void *virtual, uint32_t size, uint64_t gtt_
 		if (block_size > 8 * 4096)
 			block_size = 8 * 4096;
 
-		dword_out(CMD_AUB_TRACE_HEADER_BLOCK |
-			  ((addr_bits > 32 ? 6 : 5) - 2));
-		dword_out(AUB_TRACE_MEMTYPE_GTT |
-			  type | AUB_TRACE_OP_DATA_WRITE);
-		dword_out(subtype);
-		dword_out(gtt_offset + offset);
-		dword_out(align_u32(block_size, 4));
-		if (addr_bits > 32)
-			dword_out((gtt_offset + offset) >> 32);
+		if (gen >= 10) {
+			mem_trace_memory_write_header_out(gtt_offset + offset,
+							  block_size,
+							  AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
+		} else {
+			dword_out(CMD_AUB_TRACE_HEADER_BLOCK |
+				  ((addr_bits > 32 ? 6 : 5) - 2));
+			dword_out(AUB_TRACE_MEMTYPE_GTT |
+				  type | AUB_TRACE_OP_DATA_WRITE);
+			dword_out(subtype);
+			dword_out(gtt_offset + offset);
+			dword_out(align_u32(block_size, 4));
+			if (addr_bits > 32)
+				dword_out((gtt_offset + offset) >> 32);
+		}
 
 		if (virtual)
 			data_out(((char *) GET_PTR(virtual)) + offset, block_size);
@@ -291,6 +582,64 @@  write_reloc(void *p, uint64_t v)
 	}
 }
 
+static void
+aub_dump_execlist(uint64_t batch_offset, int ring_flag)
+{
+	uint32_t ring_addr;
+	uint64_t descriptor;
+	uint32_t elsp_reg;
+	uint32_t status_reg;
+
+	switch (ring_flag) {
+	case I915_EXEC_DEFAULT:
+	case I915_EXEC_RENDER:
+		ring_addr = RENDER_RING_ADDR;
+		descriptor = RENDER_CONTEXT_DESCRIPTOR;
+		elsp_reg = EXECLIST_SUBMITPORT_RCSUNIT;
+		status_reg = EXECLIST_STATUS_RCSUNIT;
+		break;
+	case I915_EXEC_BSD:
+		ring_addr = VIDEO_RING_ADDR;
+		descriptor = VIDEO_CONTEXT_DESCRIPTOR;
+		elsp_reg = EXECLIST_SUBMITPORT_VCSUNIT0;
+		status_reg = EXECLIST_STATUS_VCSUNIT0;
+		break;
+	case I915_EXEC_BLT:
+		ring_addr = BLITTER_RING_ADDR;
+		descriptor = BLITTER_CONTEXT_DESCRIPTOR;
+		elsp_reg = EXECLIST_SUBMITPORT_BCSUNIT;
+		status_reg = EXECLIST_STATUS_BCSUNIT;
+		break;
+	}
+
+	mem_trace_memory_write_header_out(ring_addr, 16,
+					  AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
+	dword_out(AUB_MI_BATCH_BUFFER_START | (3 - 2));
+	dword_out(batch_offset & 0xFFFFFFFF);
+	dword_out(batch_offset >> 32);
+	dword_out(0 /* MI_NOOP */);
+
+	mem_trace_memory_write_header_out(ring_addr + 8192 + 20, 4,
+					  AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
+	dword_out(0); /* RING_BUFFER_HEAD */
+	mem_trace_memory_write_header_out(ring_addr + 8192 + 28, 4,
+					  AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_LOCAL);
+	dword_out(16); /* RING_BUFFER_TAIL */
+
+	register_write_out(elsp_reg, 0);
+	register_write_out(elsp_reg, 8);
+	register_write_out(elsp_reg, descriptor >> 32);
+	register_write_out(elsp_reg, descriptor & 0xFFFFFFFF);
+
+	dword_out(CMD_MEM_TRACE_REGISTER_POLL | (5 + 1 - 1));
+	dword_out(status_reg);
+	dword_out(AUB_MEM_TRACE_REGISTER_SIZE_DWORD |
+		  AUB_MEM_TRACE_REGISTER_SPACE_MMIO);
+	dword_out(0x00000010);	/* mask lo */
+	dword_out(0x00000000);	/* mask hi */
+	dword_out(0x00000000);
+}
+
 static void
 aub_dump_ringbuffer(uint64_t batch_offset, uint64_t offset, int ring_flag)
 {
@@ -404,7 +753,7 @@  dump_execbuffer2(int fd, struct drm_i915_gem_execbuffer2 *execbuffer2)
 	struct drm_i915_gem_exec_object2 *exec_objects =
 		(struct drm_i915_gem_exec_object2 *) (uintptr_t) execbuffer2->buffers_ptr;
 	uint32_t ring_flag = execbuffer2->flags & I915_EXEC_RING_MASK;
-	uint32_t offset = gtt_size();
+	uint32_t offset;
 	struct drm_i915_gem_exec_object2 *obj;
 	struct bo *bo, *batch_bo;
 	int batch_index;
@@ -417,7 +766,18 @@  dump_execbuffer2(int fd, struct drm_i915_gem_execbuffer2 *execbuffer2)
 	}
 	if (gen == 0) {
 		gen = intel_gen(device);
-		write_header();
+		if (gen == 0)
+			gen = 99;
+
+		/* If we don't know the device gen, then it probably is a
+		 * newer device which uses 48-bit addresses.
+		 */
+		addr_bits = (gen >= 8) ? 48 : 32;
+
+		if (gen >= 10)
+			gen10_write_header();
+		else
+			write_header();
 
 		if (verbose)
 			printf("[intel_aubdump: running, "
@@ -425,10 +785,10 @@  dump_execbuffer2(int fd, struct drm_i915_gem_execbuffer2 *execbuffer2)
 			       filename, device, gen);
 	}
 
-	/* If we don't know the device gen, then it probably is a
-	 * newer device which uses 48-bit addresses.
-	 */
-	addr_bits = (gen >= 8 || gen == 0) ? 48 : 32;
+	if (gen >= 10)
+		offset = STATIC_GGTT_MAP_END;
+	else
+		offset = gtt_size();
 
 	if (verbose)
 		printf("Dumping execbuffer2:\n");
@@ -484,9 +844,15 @@  dump_execbuffer2(int fd, struct drm_i915_gem_execbuffer2 *execbuffer2)
 			free(data);
 	}
 
-	/* Dump ring buffer */
-	aub_dump_ringbuffer(batch_bo->offset + execbuffer2->batch_start_offset,
-			    offset, ring_flag);
+	if (gen >= 10) {
+		aub_dump_execlist(batch_bo->offset +
+				  execbuffer2->batch_start_offset, ring_flag);
+	} else {
+		/* Dump ring buffer */
+		aub_dump_ringbuffer(batch_bo->offset +
+				    execbuffer2->batch_start_offset, offset,
+				    ring_flag);
+	}
 
 	for (int i = 0; i < ARRAY_SIZE(files); i++) {
 		if (files[i] != NULL)