From 804328390488f76c1bda02cdf0a4081e55657ff9 Mon Sep 17 00:00:00 2001
From: Zhenyu Wang <zhenyuw@linux.intel.com>
Date: Tue, 22 Feb 2011 14:53:42 +0800
Subject: [PATCH] intel: Add AUB file dump support
This adds AUB file dump support, which generates an execution
trace for the internal GPU simulator.
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
intel/Makefile.am | 3 +-
intel/intel_bufmgr.h | 40 +++++
intel/intel_bufmgr_gem.c | 397 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 439 insertions(+), 1 deletions(-)
@@ -41,7 +41,8 @@ libdrm_intel_la_SOURCES = \
intel_bufmgr_gem.c \
intel_chipset.h \
mm.c \
- mm.h
+ mm.h \
+ intel_aub.h
libdrm_intelincludedir = ${includedir}/libdrm
libdrm_intelinclude_HEADERS = intel_bufmgr.h
@@ -35,6 +35,7 @@
#define INTEL_BUFMGR_H
#include <stdint.h>
+#include <stdio.h>
struct drm_clip_rect;
@@ -83,6 +84,41 @@ struct _drm_intel_bo {
int handle;
};
+/*
+ * Surface pixel formats that can be named in an AUB bitmap dump request.
+ * The enumerators carry their implicit values, counting up from 0 in the
+ * order listed, so reordering them changes the values.
+ *
+ * NOTE(review): presumably stored in drm_intel_aub_surface_bmp.format,
+ * which the dump code shifts into bits 24..31 of a DUMP_BMP dword --
+ * confirm the numeric values against the AUB file format definition.
+ */
+enum drm_intel_aub_bmp_format {
+ AUB_DUMP_BMP_LEGACY,
+ AUB_DUMP_BMP_8BIT,
+ AUB_DUMP_BMP_ARGB_0555,
+ AUB_DUMP_BMP_ARGB_0565,
+ AUB_DUMP_BMP_ARGB_4444,
+ AUB_DUMP_BMP_ARGB_1555,
+ AUB_DUMP_BMP_ARGB_0888,
+ AUB_DUMP_BMP_ARGB_8888,
+ AUB_DUMP_BMP_YCRCB_SWAPY,
+ AUB_DUMP_BMP_YCRCB_NORMAL,
+ AUB_DUMP_BMP_YCRCB_SWAPUV,
+ AUB_DUMP_BMP_YCRCB_SWAPUVY,
+ AUB_DUMP_BMP_ABGR_8888,
+};
+
+/*
+ * Description of a surface to be emitted with the AUB DUMP_BMP command.
+ *
+ * Passed to drm_intel_gem_aub_dump_bmp(), which stores a copy. The copy
+ * is packed into DUMP_BMP dwords the next time a batch is submitted
+ * through drm_intel_gem_bo_mrb_exec2() while an AUB file is set.
+ *
+ * The per-field notes below describe where the dump code places each
+ * value; units (pixels vs. bytes) are not established by this file.
+ */
+struct drm_intel_aub_surface_bmp {
+ uint16_t x_offset; /* low 16 bits of the offset dword */
+ uint16_t y_offset; /* high 16 bits of the offset dword */
+ uint16_t pitch; /* low 16 bits of the format dword */
+ uint8_t bits_per_pixel; /* bits 16..23 of the format dword */
+ uint8_t format; /* bits 24..31; presumably an enum drm_intel_aub_bmp_format value -- confirm */
+ uint16_t width; /* low 16 bits of the size dword */
+ uint16_t height; /* high 16 bits of the size dword */
+ uint32_t tiling_walk_y:1; /* emitted as bit 3 of the last dword */
+ uint32_t tiling:1; /* emitted as bit 2 of the last dword */
+ uint32_t pad:30; /* not read by the dump code */
+};
+
+
#define BO_ALLOC_FOR_RENDER (1<<0)
drm_intel_bo *drm_intel_bo_alloc(drm_intel_bufmgr *bufmgr, const char *name,
@@ -150,6 +186,10 @@ int drm_intel_gem_bo_unmap_gtt(drm_intel_bo *bo);
void drm_intel_gem_bo_start_gtt_access(drm_intel_bo *bo, int write_enable);
int drm_intel_get_pipe_from_crtc_id(drm_intel_bufmgr *bufmgr, int crtc_id);
+/*
+ * Start AUB tracing into `file`: records the stream on the buffer manager
+ * and immediately writes the AUB version header and GTT setup blocks.
+ * A NULL `file` is ignored. The caller keeps ownership of the stream.
+ */
+void drm_intel_bufmgr_gem_set_aubfile(drm_intel_bufmgr *bufmgr, FILE *file);
+/*
+ * Stop AUB tracing. Only clears the stream pointer held by the buffer
+ * manager; the FILE is neither flushed nor closed here.
+ */
+void drm_intel_bufmgr_gem_stop_aubfile(drm_intel_bufmgr *bufmgr);
+/*
+ * Queue a DUMP_BMP request for `bo`, described by a copy of `*bmp`.
+ * Returns 0 on success and -1 if the request could not be allocated.
+ */
+int drm_intel_gem_aub_dump_bmp(drm_intel_bufmgr *bufmgr, drm_intel_bo *bo,
+ struct drm_intel_aub_surface_bmp *bmp);
/* drm_intel_bufmgr_fake.c */
drm_intel_bufmgr *drm_intel_bufmgr_fake_init(int fd,
@@ -57,6 +57,7 @@
#include "intel_bufmgr.h"
#include "intel_bufmgr_priv.h"
#include "intel_chipset.h"
+#include "intel_aub.h"
#include "string.h"
#include "i915_drm.h"
@@ -75,6 +76,12 @@ struct drm_intel_gem_bo_bucket {
unsigned long size;
};
+/*
+ * One pending DUMP_BMP request. Requests form a singly linked list headed
+ * at drm_intel_bufmgr_gem.aub_bmp; nodes are malloc'ed when queued and
+ * freed once they have been written to the AUB file.
+ */
+struct drm_intel_aub_bmp {
+ drm_intel_bo *bo; /* surface to dump; only the pointer is stored */
+ struct drm_intel_aub_surface_bmp bmp; /* private copy of the caller's description */
+ struct drm_intel_aub_bmp *next; /* next request, NULL at the tail */
+};
+
typedef struct _drm_intel_bufmgr_gem {
drm_intel_bufmgr bufmgr;
@@ -106,6 +113,10 @@ typedef struct _drm_intel_bufmgr_gem {
unsigned int has_relaxed_fencing : 1;
unsigned int bo_reuse : 1;
char fenced_relocs;
+
+ FILE *aub_file;
+ uint32_t aub_offset;
+ struct drm_intel_aub_bmp *aub_bmp;
} drm_intel_bufmgr_gem;
#define DRM_INTEL_RELOC_FENCE (1<<0)
@@ -195,8 +206,392 @@ struct _drm_intel_bo_gem {
* relocations.
*/
int reloc_tree_fences;
+
+ uint32_t aub_offset;
};
+/* AUB trace dump support */
+
+/*
+ * Append a single 32-bit word to the active AUB file, as the raw bytes
+ * of `data` in host byte order. Write errors are not reported.
+ */
+static void
+aub_out(drm_intel_bufmgr_gem *bufmgr_gem, uint32_t data)
+{
+	FILE *aub = bufmgr_gem->aub_file;
+
+	fwrite(&data, 1, sizeof(data), aub);
+}
+
+/*
+ * Append `size` raw bytes starting at `data` to the active AUB file.
+ * Write errors are not reported.
+ */
+static void
+aub_out_data(drm_intel_bufmgr_gem *bufmgr_gem, void *data, size_t size)
+{
+	FILE *aub = bufmgr_gem->aub_file;
+
+	fwrite(data, 1, size, aub);
+}
+
+/*
+ * Write `size` bytes of `bo`, starting at byte `offset`, to the AUB file.
+ * The caller is expected to have emitted the trace block header already.
+ *
+ * The contents are read back with drm_intel_bo_get_subdata(). If the bo
+ * carries relocations, every dword whose buffer offset matches a
+ * relocation entry is replaced by the relocation delta plus the target
+ * bo's AUB address, so the trace refers to AUB addresses rather than to
+ * whatever value currently sits in the buffer. On that path the data is
+ * written dword by dword, so a trailing partial dword is not emitted.
+ *
+ * If the staging buffer cannot be allocated a message is printed to
+ * stderr and nothing is written; the trace is then short by `size` bytes.
+ */
+static void
+aub_write_bo_data(drm_intel_bo *bo, uint32_t offset, uint32_t size)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	uint32_t *data;
+	unsigned int i;
+
+	/* Nothing to emit; also avoids treating malloc(0) == NULL as failure. */
+	if (size == 0)
+		return;
+
+	data = malloc(bo->size);
+	if (!data) {
+		fprintf(stderr, "aub: out of memory while dumping buffer object\n");
+		return;
+	}
+	drm_intel_bo_get_subdata(bo, offset, size, data);
+
+	if (!bo_gem->reloc_count) {
+		/* Easy mode: write out bo with no relocations */
+		aub_out_data(bufmgr_gem, data, size);
+	} else {
+		/* Otherwise, handle the relocations while writing. */
+		for (i = 0; i < size / 4; i++) {
+			int r;
+
+			for (r = 0; r < bo_gem->reloc_count; r++) {
+				struct drm_i915_gem_relocation_entry *reloc;
+				drm_intel_reloc_target *info;
+				drm_intel_bo_gem *target_gem;
+
+				reloc = &bo_gem->relocs[r];
+				info = &bo_gem->reloc_target_info[r];
+
+				if (reloc->offset != offset + i * 4)
+					continue;
+
+				/* First matching entry wins. */
+				target_gem = (drm_intel_bo_gem *) info->bo;
+				aub_out(bufmgr_gem,
+					reloc->delta + target_gem->aub_offset);
+				break;
+			}
+			if (r == bo_gem->reloc_count) {
+				/* no relocation, just the data */
+				aub_out(bufmgr_gem, data[i]);
+			}
+		}
+	}
+
+	/* Single exit so the staging buffer is released on every path. */
+	free(data);
+}
+
+/*
+ * Assign `bo` the next free graphics address in the AUB address space and
+ * advance the buffer manager's allocation cursor by the bo's size.
+ */
+static void
+aub_bo_get_address(drm_intel_bo *bo)
+{
+	drm_intel_bo_gem *gem_bo = (drm_intel_bo_gem *) bo;
+	drm_intel_bufmgr_gem *mgr = (drm_intel_bufmgr_gem *) bo->bufmgr;
+
+	/* The real GEM address is deliberately not used: dumping happens
+	 * before execution, so the trace must be complete even if the
+	 * hardware then hangs, and it should stay usable when the kernel
+	 * exec call is disabled in order to capture a trace for a
+	 * different hardware generation.
+	 */
+	gem_bo->aub_offset = mgr->aub_offset;
+	mgr->aub_offset = mgr->aub_offset + bo->size;
+
+	/* XXX: Handle aperture overflow. */
+	assert(mgr->aub_offset < 256 * 1024 * 1024);
+}
+
+/*
+ * Maps a buffer object's debug name to the AUB trace type and subtype
+ * used when its contents are dumped. aub_write_bo() compares names with
+ * strcmp(), so matches are exact and case-sensitive, and the first
+ * matching row wins. Objects whose name is not listed keep
+ * AUB_TRACE_TYPE_NOTYPE with subtype 0.
+ */
+static const struct {
+ const char *name;
+ uint32_t type;
+ uint32_t subtype;
+} name_to_type_mapping[] = {
+ { "VS_UNIT", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_VS_STATE},
+ { "GS_UNIT", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_GS_STATE},
+ { "CLIP_UNIT", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_CL_STATE},
+ { "SF_UNIT", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_SF_STATE},
+ { "WM_UNIT", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_WM_STATE},
+ { "CC_UNIT", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_CC_STATE},
+ { "CLIP_VP", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_CL_VP},
+ { "SF_VP", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_SF_VP},
+ { "SF_SCISSOR_UNIT",
+ AUB_TRACE_TYPE_GENERAL, AUB_TRACE_SF_SCISSOR_RECT},
+ { "CC_VP", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_CC_VP},
+ { "SAMPLER", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_SAMPLER_STATE},
+ { "SAMPLER_DEFAULT_COLOR", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_SDC},
+ { "VS_PROG", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_KERNEL},
+ { "GS_PROG", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_KERNEL},
+ { "CLIP_PROG", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_KERNEL},
+ { "SF_PROG", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_KERNEL},
+ { "WM_PROG", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_KERNEL},
+ { "MEDIA_PROG", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_KERNEL},
+ { "BLEND_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_BLEND_STATE},
+ { "DEPTH_STENCIL_STATE",
+ AUB_TRACE_TYPE_GENERAL, AUB_TRACE_DEPTH_STENCIL_STATE},
+ { "COLOR_CALC_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_CC_STATE},
+ { "SS_SURF_BIND", AUB_TRACE_TYPE_SURFACE, AUB_TRACE_BINDING_TABLE},
+ { "SS_SURFACE", AUB_TRACE_TYPE_SURFACE, AUB_TRACE_SURFACE_STATE},
+ { "temporary VBO", AUB_TRACE_TYPE_VERTEX_BUFFER, 0},
+ { "CURBE", AUB_TRACE_TYPE_CONSTANT_URB, 0},
+ { "INTERFACE_DESC", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_INTERFACE_DESC},
+ { "VLD_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_VLD_STATE},
+ { "VFE_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_VFE_STATE},
+ { "IT_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_IT_STATE},
+ { "DI_SAMPLE_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_DI_SAMPLE_STATE},
+ { "IEF_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_IEF_STATE},
+ { "AVS_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_AVS_STATE},
+ { "VME_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_VME_STATE},
+};
+
+/*
+ * Emit one GTT data-write trace block for `size` bytes of `bo` starting
+ * at byte `offset`: a five-dword header (command, memtype/type/op,
+ * subtype, AUB address, size) followed by the buffer contents.
+ */
+static void
+aub_write_trace_block(drm_intel_bo *bo, uint32_t type, uint32_t subtype,
+		      uint32_t offset, uint32_t size)
+{
+	drm_intel_bufmgr_gem *mgr = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *gem_bo = (drm_intel_bo_gem *) bo;
+	const uint32_t header[5] = {
+		CMD_AUB_TRACE_HEADER_BLOCK | (5 - 2),
+		AUB_TRACE_MEMTYPE_GTT | type | AUB_TRACE_OP_DATA_WRITE,
+		subtype,
+		gem_bo->aub_offset + offset,
+		size,
+	};
+	unsigned int word;
+
+	for (word = 0; word < 5; word++)
+		aub_out(mgr, header[word]);
+
+	aub_write_bo_data(bo, offset, size);
+}
+
+/*
+ * Give `bo` an AUB address and dump its whole contents to the AUB file.
+ * The trace type and subtype are looked up from the bo's name; a message
+ * is printed to stdout when the resulting type is 0.
+ */
+static void
+aub_write_bo(drm_intel_bo *bo)
+{
+	drm_intel_bo_gem *gem_bo = (drm_intel_bo_gem *) bo;
+	uint32_t type = AUB_TRACE_TYPE_NOTYPE;
+	uint32_t subtype = 0;
+	uint32_t pos;
+	uint32_t chunk;
+	unsigned int idx = 0;
+
+	aub_bo_get_address(bo);
+
+	/* First table row whose name matches exactly decides the type. */
+	while (idx < ARRAY_SIZE(name_to_type_mapping)) {
+		if (!strcmp(gem_bo->name, name_to_type_mapping[idx].name)) {
+			type = name_to_type_mapping[idx].type;
+			subtype = name_to_type_mapping[idx].subtype;
+			break;
+		}
+		idx++;
+	}
+
+	if (type == 0) {
+		printf("Failed to find type for object %s(size: 0x%lx, aub_offset: 0x%08x)\n",
+		       gem_bo->name, bo->size, gem_bo->aub_offset);
+	}
+
+	/* Large objects are split into blocks of at most 8 KiB. A single
+	 * block for e.g. a 128kb VBO would overflow the 16-bit size field
+	 * of the packet header and corrupt everything after it.
+	 */
+	pos = 0;
+	while (pos < bo->size) {
+		chunk = bo->size - pos;
+		if (chunk > 2 * 4096)
+			chunk = 2 * 4096;
+
+		aub_write_trace_block(bo, type, subtype, pos, chunk);
+		pos += chunk;
+	}
+}
+
+/*
+ * Make a ringbuffer on the fly and dump it.
+ *
+ * Writes a command-write trace block containing a 4096-byte ring placed
+ * at the current AUB allocation cursor. The ring holds a
+ * MI_BATCH_BUFFER_START pointing at `batch_buffer` (an AUB address),
+ * followed by MI_NOOPs, and the cursor is advanced past the ring.
+ *
+ * `flags` selects the ring: RENDER/DEFAULT, BSD and BLT map to PRB0,
+ * PRB1 and PRB2. Any other value leaves the ring type field at 0.
+ */
+static void
+aub_generate_ringbuffer(drm_intel_bufmgr_gem *bufmgr_gem,
+			uint32_t batch_buffer, unsigned int flags)
+{
+	/* The ring is 4096 bytes, i.e. 1024 dwords -- not 4096 dwords. */
+	enum { RING_BYTES = 4096, RING_DWORDS = RING_BYTES / 4 };
+	uint32_t ringbuffer[RING_DWORDS];
+	unsigned int i;
+	int ring = 0;
+
+	switch (flags) {
+	case I915_EXEC_RENDER:
+	case I915_EXEC_DEFAULT:
+		ring = AUB_TRACE_TYPE_RING_PRB0;
+		break;
+	case I915_EXEC_BSD:
+		ring = AUB_TRACE_TYPE_RING_PRB1;
+		break;
+	case I915_EXEC_BLT:
+		ring = AUB_TRACE_TYPE_RING_PRB2;
+		break;
+	default:
+		/* Unknown ring selector: keep ring == 0. */
+		break;
+	}
+
+	aub_out(bufmgr_gem,
+		CMD_AUB_TRACE_HEADER_BLOCK |
+		(5 - 2));
+	aub_out(bufmgr_gem,
+		AUB_TRACE_MEMTYPE_GTT | ring | AUB_TRACE_OP_COMMAND_WRITE);
+	aub_out(bufmgr_gem, 0); /* general/surface subtype */
+	aub_out(bufmgr_gem, bufmgr_gem->aub_offset);
+	aub_out(bufmgr_gem, RING_BYTES);
+
+	/* Fill dword by dword: memset() would replicate only the low byte
+	 * of the opcode, which is right only when the opcode is 0.
+	 */
+	for (i = 0; i < RING_DWORDS; i++)
+		ringbuffer[i] = AUB_MI_NOOP;
+	ringbuffer[0] = AUB_MI_BATCH_BUFFER_START;
+	ringbuffer[1] = batch_buffer;
+
+	/* FIXME: Need some flush operations here? */
+
+	aub_out_data(bufmgr_gem, ringbuffer, RING_BYTES);
+
+	/* Update offset pointer */
+	bufmgr_gem->aub_offset += RING_BYTES;
+}
+
+/*
+ * Emit a DUMP_BMP command for every queued bitmap request and free the
+ * queue. Each command is six dwords: command word, x/y offset,
+ * format/bpp/pitch, height/width, the surface's AUB address and the
+ * tiling bits. The list head is advanced before a node is freed, so the
+ * queue is empty on return.
+ */
+static void
+aub_dump_bmp(drm_intel_bufmgr_gem *bufmgr_gem)
+{
+	struct drm_intel_aub_bmp *p = bufmgr_gem->aub_bmp;
+
+	while (p) {
+		drm_intel_bo_gem *surface = (drm_intel_bo_gem *) p->bo;
+
+		/* The narrow fields are widened to uint32_t before shifting:
+		 * integer promotion would otherwise make them signed int,
+		 * and shifting a set bit into the sign bit is undefined.
+		 */
+		aub_out(bufmgr_gem, CMD_AUB_DUMP_BMP | 4);
+		aub_out(bufmgr_gem,
+			((uint32_t) p->bmp.y_offset << 16) | p->bmp.x_offset);
+		aub_out(bufmgr_gem,
+			((uint32_t) p->bmp.format << 24) |
+			((uint32_t) p->bmp.bits_per_pixel << 16) |
+			p->bmp.pitch);
+		aub_out(bufmgr_gem,
+			((uint32_t) p->bmp.height << 16) | p->bmp.width);
+		/* surface bo should already be written out */
+		assert(surface->aub_offset != 0);
+		aub_out(bufmgr_gem, surface->aub_offset);
+		aub_out(bufmgr_gem,
+			(p->bmp.tiling << 2) | (p->bmp.tiling_walk_y << 3));
+
+		bufmgr_gem->aub_bmp = p->next;
+		free(p);
+		p = bufmgr_gem->aub_bmp;
+	}
+}
+
+/*
+ * Record one batch submission in the AUB file; does nothing when no AUB
+ * file is set.
+ *
+ * Order of output: every exec-list object except the last entry and the
+ * batch itself, then the batch buffer `bo`, then a generated ring that
+ * starts the batch, then any queued DUMP_BMP requests. The stream is
+ * flushed and the AUB allocation cursor is reset to 0x10000.
+ */
+static void
+aub_exec(drm_intel_bo *bo, unsigned int flags)
+{
+	drm_intel_bufmgr_gem *mgr = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *batch = (drm_intel_bo_gem *) bo;
+	int n = 0;
+
+	if (mgr->aub_file == NULL)
+		return;
+
+	/* Write out all but the batchbuffer to AUB memory */
+	while (n < mgr->exec_count - 1) {
+		drm_intel_bo *cur = mgr->exec_bos[n];
+
+		n++;
+		if (cur == bo)
+			continue;
+		aub_write_bo(cur);
+	}
+
+	aub_bo_get_address(bo);
+
+	/* Dump the batchbuffer as a single data-write block. */
+	aub_out(mgr, CMD_AUB_TRACE_HEADER_BLOCK | (5 - 2));
+	aub_out(mgr,
+		AUB_TRACE_MEMTYPE_GTT | AUB_TRACE_TYPE_BATCH | AUB_TRACE_OP_DATA_WRITE);
+	aub_out(mgr, 0); /* general/surface subtype */
+	aub_out(mgr, batch->aub_offset);
+	aub_out(mgr, batch->bo.size);
+	aub_write_bo_data(bo, 0, batch->bo.size);
+
+	/* Dump ring buffer */
+	aub_generate_ringbuffer(mgr, batch->aub_offset, flags);
+
+	/* Dump BMP file for any requested surface */
+	aub_dump_bmp(mgr);
+
+	fflush(mgr->aub_file);
+
+	/*
+	 * One frame has been dumped, so the AUB address cursor is rewound
+	 * for the next frame.
+	 *
+	 * FIXME: Can we do this?
+	 */
+	mgr->aub_offset = 0x10000;
+}
+
+/*
+ * Stop dumping data to the AUB file.
+ *
+ * Clears the stream pointer under the buffer manager lock. The FILE is
+ * not flushed or closed here, and queued bitmap requests are left alone.
+ */
+void drm_intel_bufmgr_gem_stop_aubfile(drm_intel_bufmgr *bufmgr)
+{
+	drm_intel_bufmgr_gem *mgr = (drm_intel_bufmgr_gem *)bufmgr;
+	pthread_mutex_t *lock = &mgr->lock;
+
+	pthread_mutex_lock(lock);
+	mgr->aub_file = NULL;
+	pthread_mutex_unlock(lock);
+}
+
+/*
+ * Start dumping to `file`. A NULL `file` is ignored.
+ *
+ * Under the buffer manager lock this records the stream, sets the AUB
+ * allocation cursor to 0x10000, and writes the AUB version packet
+ * followed by one trace block per GTT entry.
+ */
+void drm_intel_bufmgr_gem_set_aubfile(drm_intel_bufmgr *bufmgr, FILE *file)
+{
+	drm_intel_bufmgr_gem *mgr = (drm_intel_bufmgr_gem *)bufmgr;
+	int pte;
+	int gtt_off;
+	int word;
+
+	if (file == NULL)
+		return;
+
+	pthread_mutex_lock(&mgr->lock);
+
+	mgr->aub_file = file;
+
+	/* Start from 0x10000, since the addresses below that are used for
+	 * GTT entry building.
+	 */
+	mgr->aub_offset = 0x10000;
+
+	/* Start with a (required) version packet. */
+	aub_out(mgr, CMD_AUB_HEADER | (13 - 2));
+	aub_out(mgr,
+		(4 << AUB_HEADER_MAJOR_SHIFT) |
+		(0 << AUB_HEADER_MINOR_SHIFT));
+	for (word = 0; word < 8; word++)
+		aub_out(mgr, 0); /* app name */
+	aub_out(mgr, 0); /* timestamp */
+	aub_out(mgr, 0); /* timestamp */
+	aub_out(mgr, 0); /* comment len */
+
+	/* Set up the GTT, one 4-byte entry per block; each entry advances
+	 * by 0x1000 and carries the uc/valid bits (0x3).
+	 * NOTE(review): the original comment says the max we can handle
+	 * is 256M, but 0x10000 / 4 entries at 0x1000 each cover 64 MiB --
+	 * confirm which is intended.
+	 */
+	pte = 0x3;
+	for (gtt_off = 0x000; gtt_off < 0x10000; gtt_off += 4) {
+		aub_out(mgr, CMD_AUB_TRACE_HEADER_BLOCK | (5 - 2));
+		aub_out(mgr, AUB_TRACE_MEMTYPE_NONLOCAL | 0 | AUB_TRACE_OP_DATA_WRITE);
+		aub_out(mgr, 0);
+		aub_out(mgr, gtt_off);
+		aub_out(mgr, 4);
+		aub_out(mgr, pte);
+		pte += 0x1000;
+	}
+
+	pthread_mutex_unlock(&mgr->lock);
+}
+
+/*
+ * Queue a DUMP_BMP request for `bo`.
+ *
+ * A copy of `*bmp` is appended to the buffer manager's pending list, so
+ * the caller's structure need not outlive the call. Only the `bo`
+ * pointer is stored -- no reference is taken here -- so the bo must stay
+ * valid until the request has been written out.
+ *
+ * Returns 0 on success, or -1 if `bo` or `bmp` is NULL or the request
+ * cannot be allocated.
+ */
+int drm_intel_gem_aub_dump_bmp(drm_intel_bufmgr *bufmgr,
+			       drm_intel_bo *bo,
+			       struct drm_intel_aub_surface_bmp *bmp)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bufmgr;
+	struct drm_intel_aub_bmp *aub_bmp;
+	struct drm_intel_aub_bmp **link;
+
+	if (!bo || !bmp)
+		return -1;
+
+	aub_bmp = malloc(sizeof(*aub_bmp));
+	if (!aub_bmp)
+		return -1;
+
+	aub_bmp->bo = bo;
+	memcpy(&aub_bmp->bmp, bmp, sizeof(aub_bmp->bmp));
+	aub_bmp->next = NULL;
+
+	pthread_mutex_lock(&bufmgr_gem->lock);
+
+	/* Insert last. Walking the link pointers instead of the nodes
+	 * also handles the empty list, where the link to update is the
+	 * list head itself.
+	 */
+	for (link = &bufmgr_gem->aub_bmp; *link; link = &(*link)->next)
+		;
+	*link = aub_bmp;
+
+	pthread_mutex_unlock(&bufmgr_gem->lock);
+
+	return 0;
+}
+
static unsigned int
drm_intel_gem_estimate_batch_space(drm_intel_bo ** bo_array, int count);
@@ -1624,6 +2019,8 @@ drm_intel_gem_bo_mrb_exec2(drm_intel_bo *bo, int used,
execbuf.rsvd1 = 0;
execbuf.rsvd2 = 0;
+ aub_exec(bo, flags);
+
ret = drmIoctl(bufmgr_gem->fd,
DRM_IOCTL_I915_GEM_EXECBUFFER2,
&execbuf);
--
1.7.1