
intel: Add AUB file dump support

Message ID 1297836461-31543-2-git-send-email-zhenyuw@linux.intel.com (mailing list archive)

Commit Message

Zhenyu Wang Feb. 16, 2011, 6:07 a.m. UTC
Add support for dumping execution contents into an AUB file.

drm_intel_bufmgr_gem_set_aubfile() starts the dump: it writes the AUB header and the GTT entries, and from then on each execbuffer is captured, writing out every referenced bo plus a ring buffer synthesized around the batch. drm_intel_gem_aub_dump_bmp() queues a surface for which a DUMP_BMP block is emitted at the next exec, and drm_intel_bufmgr_gem_stop_aubfile() ends the dump.
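
A minimal usage sketch of the new API (the file name, surface geometry and error handling here are illustrative, not part of the patch):

#include <stdio.h>
#include "intel_bufmgr.h"

static void
dump_frame_to_aub(drm_intel_bufmgr *bufmgr, drm_intel_bo *render_target)
{
	FILE *aub = fopen("frame.aub", "w+");
	struct drm_intel_aub_surface_bmp bmp = {
		.x_offset = 0,
		.y_offset = 0,
		.pitch = 1024 * 4,	/* illustrative: bytes per scanline */
		.bits_per_pixel = 32,
		.format = AUB_DUMP_BMP_ARGB_8888,
		.width = 1024,		/* illustrative geometry */
		.height = 768,
		.tiling = 0,		/* linear surface */
		.tiling_walk_y = 0,
	};

	if (aub == NULL)
		return;

	/* Writes the AUB header and GTT setup; every exec from now on
	 * is captured into the file. */
	drm_intel_bufmgr_gem_set_aubfile(bufmgr, aub);

	/* Queue a DUMP_BMP block for the render target; it is emitted
	 * (and the request freed) at the next exec. */
	drm_intel_gem_aub_dump_bmp(bufmgr, render_target, 0, &bmp);

	/* ... build and submit a batch buffer as usual ... */

	drm_intel_bufmgr_gem_stop_aubfile(bufmgr);
	fclose(aub);
}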

Patch

diff --git a/intel/Makefile.am b/intel/Makefile.am
index 1ae92f8..398cd2f 100644
--- a/intel/Makefile.am
+++ b/intel/Makefile.am
@@ -41,7 +41,8 @@  libdrm_intel_la_SOURCES = \
 	intel_bufmgr_gem.c \
 	intel_chipset.h \
 	mm.c \
-	mm.h
+	mm.h \
+	intel_aub.h
 
 libdrm_intelincludedir = ${includedir}/libdrm
 libdrm_intelinclude_HEADERS = intel_bufmgr.h
diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
index daa18b4..bb4158a 100644
--- a/intel/intel_bufmgr.h
+++ b/intel/intel_bufmgr.h
@@ -35,6 +35,7 @@ 
 #define INTEL_BUFMGR_H
 
 #include <stdint.h>
+#include <stdio.h>
 
 struct drm_clip_rect;
 
@@ -83,6 +84,39 @@  struct _drm_intel_bo {
 	int handle;
 };
 
+enum drm_intel_aub_bmp_format {
+	AUB_DUMP_BMP_LEGACY,
+	AUB_DUMP_BMP_8BIT,
+	AUB_DUMP_BMP_ARGB_0555,
+	AUB_DUMP_BMP_ARGB_0565,
+	AUB_DUMP_BMP_ARGB_4444,
+	AUB_DUMP_BMP_ARGB_1555,
+	AUB_DUMP_BMP_ARGB_0888,
+	AUB_DUMP_BMP_ARGB_8888,
+	AUB_DUMP_BMP_YCRCB_SWAPY,
+	AUB_DUMP_BMP_YCRCB_NORMAL,
+	AUB_DUMP_BMP_YCRCB_SWAPUV,
+	AUB_DUMP_BMP_YCRCB_SWAPUVY,
+	AUB_DUMP_BMP_ABGR_8888,
+};
+
+/*
+ * Surface information needed by the AUB DUMP_BMP block.
+ */
+struct drm_intel_aub_surface_bmp {
+	uint16_t x_offset;
+	uint16_t y_offset;
+	uint16_t pitch;
+	uint8_t bits_per_pixel;
+	uint8_t format;
+	uint16_t width;
+	uint16_t height;
+	uint32_t tiling_walk_y:1;
+	uint32_t tiling:1;
+	uint32_t pad:30;
+};
+
+
 #define BO_ALLOC_FOR_RENDER (1<<0)
 
 drm_intel_bo *drm_intel_bo_alloc(drm_intel_bufmgr *bufmgr, const char *name,
@@ -150,6 +184,10 @@  int drm_intel_gem_bo_unmap_gtt(drm_intel_bo *bo);
 void drm_intel_gem_bo_start_gtt_access(drm_intel_bo *bo, int write_enable);
 
 int drm_intel_get_pipe_from_crtc_id(drm_intel_bufmgr *bufmgr, int crtc_id);
+void drm_intel_bufmgr_gem_set_aubfile(drm_intel_bufmgr *bufmgr, FILE *file);
+void drm_intel_bufmgr_gem_stop_aubfile(drm_intel_bufmgr *bufmgr);
+int drm_intel_gem_aub_dump_bmp(drm_intel_bufmgr *bufmgr, drm_intel_bo *bo,
+			       unsigned int offset, struct drm_intel_aub_surface_bmp *bmp);
 
 /* drm_intel_bufmgr_fake.c */
 drm_intel_bufmgr *drm_intel_bufmgr_fake_init(int fd,
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 3cdffce..654bc31 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -57,6 +57,7 @@ 
 #include "intel_bufmgr.h"
 #include "intel_bufmgr_priv.h"
 #include "intel_chipset.h"
+#include "intel_aub.h"
 #include "string.h"
 
 #include "i915_drm.h"
@@ -75,6 +76,13 @@  struct drm_intel_gem_bo_bucket {
 	unsigned long size;
 };
 
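+/*
+ * One surface registered via drm_intel_gem_aub_dump_bmp(); the pending
+ * list is walked, emitted as DUMP_BMP blocks and freed at exec time.
+ */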
+struct drm_intel_aub_bmp {
+	drm_intel_bo *bo; /* surface bo */
+	unsigned int offset;
+	struct drm_intel_aub_surface_bmp bmp;
+	struct drm_intel_aub_bmp *next;
+};
+
 typedef struct _drm_intel_bufmgr_gem {
 	drm_intel_bufmgr bufmgr;
 
@@ -106,6 +114,10 @@  typedef struct _drm_intel_bufmgr_gem {
 	unsigned int has_relaxed_fencing : 1;
 	unsigned int bo_reuse : 1;
 	char fenced_relocs;
+
+	FILE *aub_file;
+	uint32_t aub_offset;
+	struct drm_intel_aub_bmp *aub_bmp;
 } drm_intel_bufmgr_gem;
 
 #define DRM_INTEL_RELOC_FENCE (1<<0)
@@ -195,8 +207,396 @@  struct _drm_intel_bo_gem {
 	 * relocations.
 	 */
 	int reloc_tree_fences;
+
+	uint32_t aub_offset;
 };
 
+/* AUB trace dump support.
+ *
+ * On exec, the contents of every referenced bo, the batch buffer and a
+ * synthesized ring buffer are written to the AUB file for offline
+ * decoding or replay.
+ */
+
+static void
+aub_out(drm_intel_bufmgr_gem *bufmgr_gem, uint32_t data)
+{
+	fwrite(&data, 1, 4, bufmgr_gem->aub_file);
+}
+
+static void
+aub_out_data(drm_intel_bufmgr_gem *bufmgr_gem, void *data, size_t size)
+{
+	fwrite(data, 1, size, bufmgr_gem->aub_file);
+}
+
+static void
+aub_write_bo_data(drm_intel_bo *bo, uint32_t offset, uint32_t size)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	uint32_t *data;
+	unsigned int i;
+
+	data = malloc(size);
+	if (data == NULL)
+		return;
+	drm_intel_bo_get_subdata(bo, offset, size, data);
+
+	/* Easy mode: write out bo with no relocations */
+	if (!bo_gem->reloc_count) {
+		aub_out_data(bufmgr_gem, data, size);
+		free(data);
+		return;
+	}
+
+	/* Otherwise, handle the relocations while writing. */
+	for (i = 0; i < size / 4; i++) {
+		int r;
+		for (r = 0; r < bo_gem->reloc_count; r++) {
+			struct drm_i915_gem_relocation_entry *reloc;
+			drm_intel_reloc_target *info;
+
+			reloc = &bo_gem->relocs[r];
+			info = &bo_gem->reloc_target_info[r];
+
+			if (reloc->offset == offset + i * 4) {
+				drm_intel_bo_gem *target_gem;
+				uint32_t val;
+
+				target_gem = (drm_intel_bo_gem *)info->bo;
+
+				val = reloc->delta;
+				val += target_gem->aub_offset;
+
+				aub_out(bufmgr_gem, val);
+				data[i] = val;
+				break;
+			}
+		}
+		if (r == bo_gem->reloc_count) {
+			/* no relocation, just the data */
+			aub_out(bufmgr_gem, data[i]);
+		}
+	}
+
+	free(data);
+}
+
+static void
+aub_bo_get_address(drm_intel_bo *bo)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+
+	/* Give the object a graphics address in the AUB file.  We
+	 * don't just use the GEM object address because we do AUB
+	 * dumping before execution -- we want to successfully log
+	 * when the hardware might hang, and we might even want to aub
+	 * capture for a driver trying to execute on a different
+	 * generation of hardware by disabling the actual kernel exec
+	 * call.
+	 */
+	bo_gem->aub_offset = bufmgr_gem->aub_offset;
+	bufmgr_gem->aub_offset += bo->size;
+	/* XXX: Handle aperture overflow. */
+	assert(bufmgr_gem->aub_offset < 256 * 1024 * 1024);
+}
+
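+/*
+ * Map buffer object names, as passed to drm_intel_bo_alloc() by the
+ * driver, to AUB trace types.  Objects whose names are not listed
+ * here are dumped as AUB_TRACE_TYPE_NOTYPE.
+ */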
+static const struct {
+	const char *name;
+	uint32_t type;
+	uint32_t subtype;
+} name_to_type_mapping[] = {
+	{ "VS_UNIT",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_VS_STATE},
+	{ "GS_UNIT",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_GS_STATE},
+	{ "CLIP_UNIT",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_CL_STATE},
+	{ "SF_UNIT",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_SF_STATE},
+	{ "WM_UNIT",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_WM_STATE},
+	{ "CC_UNIT",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_CC_STATE},
+	{ "CLIP_VP",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_CL_VP},
+	{ "SF_VP",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_SF_VP},
+	{ "SF_SCISSOR_UNIT",
+	  AUB_TRACE_TYPE_GENERAL, AUB_TRACE_SF_SCISSOR_RECT},
+	{ "CC_VP",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_CC_VP},
+	{ "SAMPLER",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_SAMPLER_STATE},
+	{ "SAMPLER_DEFAULT_COLOR", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_SDC},
+	{ "VS_PROG",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_KERNEL},
+	{ "GS_PROG",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_KERNEL},
+	{ "CLIP_PROG",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_KERNEL},
+	{ "SF_PROG",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_KERNEL},
+	{ "WM_PROG",	AUB_TRACE_TYPE_GENERAL, AUB_TRACE_KERNEL},
+	{ "BLEND_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_BLEND_STATE},
+	{ "DEPTH_STENCIL_STATE",
+	  AUB_TRACE_TYPE_GENERAL, AUB_TRACE_DEPTH_STENCIL_STATE},
+	{ "COLOR_CALC_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_CC_STATE},
+	{ "SS_SURF_BIND", AUB_TRACE_TYPE_SURFACE, AUB_TRACE_BINDING_TABLE},
+	{ "SS_SURFACE",	AUB_TRACE_TYPE_SURFACE, AUB_TRACE_SURFACE_STATE},
+	{ "temporary VBO", AUB_TRACE_TYPE_VERTEX_BUFFER, 0},
+	{ "CURBE",	AUB_TRACE_TYPE_CONSTANT_URB, 0},
+	{ "VS constant_bo", AUB_TRACE_TYPE_CONSTANT_BUFFER, 0},
+	{ "WM constant_bo", AUB_TRACE_TYPE_CONSTANT_BUFFER, 0},
+	{ "INTERFACE_DESC", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_INTERFACE_DESC},
+	{ "VLD_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_VLD_STATE},
+	{ "VFE_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_VFE_STATE},
+	{ "IT_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_IT_STATE},
+	{ "DI_SAMPLE_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_DI_SAMPLE_STATE},
+	{ "IEF_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_IEF_STATE},
+	{ "AVS_STATE", AUB_TRACE_TYPE_GENERAL, AUB_TRACE_AVS_STATE},
+};
+
+static void
+aub_write_trace_block(drm_intel_bo *bo, uint32_t type, uint32_t subtype,
+		      uint32_t offset, uint32_t size)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+
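+	/* A trace block is five dwords: a header whose low bits encode
+	 * the total dword count minus two, then operation/type, subtype,
+	 * GTT address and size in bytes.  The payload follows.
+	 */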
+	aub_out(bufmgr_gem,
+		CMD_AUB_TRACE_HEADER_BLOCK |
+		(5 - 2));
+	aub_out(bufmgr_gem, AUB_TRACE_MEMTYPE_GTT | type | AUB_TRACE_OP_DATA_WRITE);
+	aub_out(bufmgr_gem, subtype);
+	aub_out(bufmgr_gem, bo_gem->aub_offset + offset);
+	aub_out(bufmgr_gem, size);
+	aub_write_bo_data(bo, offset, size);
+}
+
+static void
+aub_write_bo(drm_intel_bo *bo)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	uint32_t type = AUB_TRACE_TYPE_NOTYPE;
+	uint32_t subtype = 0;
+	uint32_t block_size;
+	uint32_t offset;
+	unsigned int i;
+
+	aub_bo_get_address(bo);
+
+	for (i = 0; i < ARRAY_SIZE(name_to_type_mapping); i++) {
+		if (strcmp(bo_gem->name,
+			   name_to_type_mapping[i].name) == 0) {
+			type = name_to_type_mapping[i].type;
+			subtype = name_to_type_mapping[i].subtype;
+			break;
+		}
+	}
+
+	if (type == AUB_TRACE_TYPE_NOTYPE) {
+		DBG("Failed to find type for object %s (size: 0x%lx, aub_offset: 0x%08x)\n",
+		    bo_gem->name, bo->size, bo_gem->aub_offset);
+	}
+
+	/* Break up large objects into multiple writes.  Otherwise a
+	 * 128kb VBO would overflow the 16-bit size field in the
+	 * packet header and everything goes badly after that.
+	 */
+	for (offset = 0; offset < bo->size; offset += block_size) {
+		block_size = bo->size - offset;
+
+		if (block_size > 2 * 4096)
+			block_size = 2 * 4096;
+
+		aub_write_trace_block(bo, type, subtype,
+				      offset, block_size);
+	}
+}
+
+/*
+ * Synthesize a ring buffer on the fly and dump it.
+ */
+static void
+aub_generate_ringbuffer(drm_intel_bufmgr_gem *bufmgr_gem,
+			  uint32_t batch_buffer, unsigned int flags)
+{
+	uint32_t ringbuffer[1024];
+	int ring = 0;
+
+	switch (flags) {
+	case I915_EXEC_RENDER:
+	case I915_EXEC_DEFAULT:
+		ring = AUB_TRACE_TYPE_RING_PRB0;
+		break;
+	case I915_EXEC_BSD:
+		ring = AUB_TRACE_TYPE_RING_PRB1;
+		break;
+	case I915_EXEC_BLT:
+		ring = AUB_TRACE_TYPE_RING_PRB2;
+		break;
+	}
+
+	aub_out(bufmgr_gem,
+		CMD_AUB_TRACE_HEADER_BLOCK |
+		(5 - 2));
+	aub_out(bufmgr_gem,
+		AUB_TRACE_MEMTYPE_GTT | ring | AUB_TRACE_OP_COMMAND_WRITE);
+	aub_out(bufmgr_gem, 0); /* general/surface subtype */
+	aub_out(bufmgr_gem, bufmgr_gem->aub_offset);
+	aub_out(bufmgr_gem, 4096);
+
+	/* Build the ring buffer contents: MI_NOOP is 0, so a byte-wise
+	 * memset fills the buffer with NOOPs; then point a batch buffer
+	 * start at the batch.
+	 */
+	memset(ringbuffer, AUB_MI_NOOP, sizeof(ringbuffer));
+	ringbuffer[0] = AUB_MI_BATCH_BUFFER_START;
+	ringbuffer[1] = batch_buffer;
+
+	/* FIXME: Need some flush operations here? */
+
+	aub_out_data(bufmgr_gem, ringbuffer, 4096);
+
+	/* Update offset pointer */
+	bufmgr_gem->aub_offset += 4096;
+}
+
+static void
+aub_dump_bmp(drm_intel_bufmgr_gem *bufmgr_gem)
+{
+	struct drm_intel_aub_bmp *p = bufmgr_gem->aub_bmp;
+
+	while (p) {
+		aub_out(bufmgr_gem, CMD_AUB_DUMP_BMP | 4);
+		aub_out(bufmgr_gem, (p->bmp.y_offset << 16) | p->bmp.x_offset);
+		aub_out(bufmgr_gem, (p->bmp.format << 24) |
+				    (p->bmp.bits_per_pixel << 16) | p->bmp.pitch);
+		aub_out(bufmgr_gem, (p->bmp.height << 16) | p->bmp.width);
+		/* surface bo should already be written out */
+		assert(((drm_intel_bo_gem *)p->bo)->aub_offset != 0);
+		aub_out(bufmgr_gem, ((drm_intel_bo_gem *)p->bo)->aub_offset + p->offset);
+		aub_out(bufmgr_gem, (p->bmp.tiling << 2) | (p->bmp.tiling_walk_y << 3));
+
+		bufmgr_gem->aub_bmp = p->next;
+		free(p);
+		p = bufmgr_gem->aub_bmp;
+	}
+}
+
+static void
+aub_exec(drm_intel_bo *bo, unsigned int flags)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	int i;
+
+	if (!bufmgr_gem->aub_file)
+		return;
+
+	/* Write out all but the batchbuffer to AUB memory */
+	for (i = 0; i < bufmgr_gem->exec_count - 1; i++) {
+		if (bufmgr_gem->exec_bos[i] != bo)
+			aub_write_bo(bufmgr_gem->exec_bos[i]);
+	}
+
+	aub_bo_get_address(bo);
+
+	/* Dump the batchbuffer. */
+	aub_out(bufmgr_gem,
+		CMD_AUB_TRACE_HEADER_BLOCK |
+		(5 - 2));
+	aub_out(bufmgr_gem,
+		AUB_TRACE_MEMTYPE_GTT | AUB_TRACE_TYPE_BATCH | AUB_TRACE_OP_DATA_WRITE);
+	aub_out(bufmgr_gem, 0); /* general/surface subtype */
+	aub_out(bufmgr_gem, bo_gem->aub_offset);
+	aub_out(bufmgr_gem, bo_gem->bo.size);
+	aub_write_bo_data(bo, 0, bo_gem->bo.size);
+
+	/* Dump ring buffer */
+	aub_generate_ringbuffer(bufmgr_gem, bo_gem->aub_offset, flags);
+
+	/* Dump BMP file for any requested surface */
+	aub_dump_bmp(bufmgr_gem);
+
+	fflush(bufmgr_gem->aub_file);
+
+	/*
+	 * One frame has been dumped. So reset the aub_offset for the next frame.
+	 *
+	 * FIXME: Can we do this?
+	 */
+	bufmgr_gem->aub_offset = 0x10000;
+}
+
+/*
+ * Stop dumping data to the AUB file.
+ */
+void drm_intel_bufmgr_gem_stop_aubfile(drm_intel_bufmgr *bufmgr)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bufmgr;
+
+	pthread_mutex_lock(&bufmgr_gem->lock);
+	bufmgr_gem->aub_file = NULL;
+	pthread_mutex_unlock(&bufmgr_gem->lock);
+}
+
+void drm_intel_bufmgr_gem_set_aubfile(drm_intel_bufmgr *bufmgr, FILE *file)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bufmgr;
+	int entry = 0x3; /* uc/valid GTT */
+	int i;
+
+	if (!file)
+		return;
+
+	pthread_mutex_lock(&bufmgr_gem->lock);
+
+	bufmgr_gem->aub_file = file;
+
+	/* Start at 0x10000; the addresses below that are taken up by
+	 * the GTT entries written out below. */
+	bufmgr_gem->aub_offset = 0x10000;
+
+	/* Start with a (required) version packet. */
+	aub_out(bufmgr_gem, CMD_AUB_HEADER | (13 - 2));
+	aub_out(bufmgr_gem,
+		(4 << AUB_HEADER_MAJOR_SHIFT) |
+		(0 << AUB_HEADER_MINOR_SHIFT));
+	for (i = 0; i < 8; i++) {
+		aub_out(bufmgr_gem, 0); /* app name */
+	}
+	aub_out(bufmgr_gem, 0); /* timestamp */
+	aub_out(bufmgr_gem, 0); /* timestamp */
+	aub_out(bufmgr_gem, 0); /* comment len */
+
+	/* Set up the GTT.  The maximum we can handle is 256M.
+	 *
+	 * Needs improvement: dynamically allocate and write a GTT entry
+	 * block for each bo, so the AUB output doesn't start with the
+	 * whole GTT entry block; that would be easier to parse.
+	 */
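+	/* Each iteration writes one 4-byte PTE at GTT offset i; the PTE
+	 * value is the 4KB page address with the valid/uncached bits
+	 * from 'entry' above set.
+	 */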
+	for (i = 0x000; i < 0x10000; i += 4, entry += 0x1000) {
+		aub_out(bufmgr_gem, CMD_AUB_TRACE_HEADER_BLOCK | (5 - 2));
+		aub_out(bufmgr_gem, AUB_TRACE_MEMTYPE_NONLOCAL | 0 | AUB_TRACE_OP_DATA_WRITE);
+		aub_out(bufmgr_gem, 0);
+		aub_out(bufmgr_gem, i);
+		aub_out(bufmgr_gem, 4);
+		aub_out(bufmgr_gem, entry);
+	}
+
+	pthread_mutex_unlock(&bufmgr_gem->lock);
+}
+
+int drm_intel_gem_aub_dump_bmp(drm_intel_bufmgr *bufmgr,
+			       drm_intel_bo *bo, unsigned int offset,
+			       struct drm_intel_aub_surface_bmp *bmp)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bufmgr;
+	struct drm_intel_aub_bmp *aub_bmp, *p, *last;
+
+	aub_bmp = malloc(sizeof(*aub_bmp));
+	if (aub_bmp == NULL)
+		return -ENOMEM;
+
+	aub_bmp->bo = bo;
+	aub_bmp->offset = offset;
+	memcpy(&aub_bmp->bmp, bmp, sizeof(*bmp));
+	aub_bmp->next = NULL;
+
+	pthread_mutex_lock(&bufmgr_gem->lock);
+
+	/* Append to the end of the pending list */
+	p = last = bufmgr_gem->aub_bmp;
+	while (p) {
+		last = p;
+		p = p->next;
+	}
+	if (last == NULL)
+		bufmgr_gem->aub_bmp = aub_bmp;
+	else
+		last->next = aub_bmp;
+
+	pthread_mutex_unlock(&bufmgr_gem->lock);
+
+	return 0;
+}
+
 static unsigned int
 drm_intel_gem_estimate_batch_space(drm_intel_bo ** bo_array, int count);
 
@@ -1624,6 +2024,8 @@  drm_intel_gem_bo_mrb_exec2(drm_intel_bo *bo, int used,
 	execbuf.rsvd1 = 0;
 	execbuf.rsvd2 = 0;
 
+	aub_exec(bo, flags);
+
 	ret = drmIoctl(bufmgr_gem->fd,
 		       DRM_IOCTL_I915_GEM_EXECBUFFER2,
 		       &execbuf);