diff mbox

[v2] tests/gem_error_capture: Initial testcase for error state capture/dump

Message ID 1397234892-13820-1-git-send-email-oscar.mateo@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

oscar.mateo@intel.com April 11, 2014, 4:48 p.m. UTC
From: Oscar Mateo <oscar.mateo@intel.com>

Guarantees that error capture works at a very basic level.

v2: Also check that the ring object contains a reloc with MI_BB_START
for the presumed batch object's address.

Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>
---
 tests/.gitignore          |   1 +
 tests/Makefile.sources    |   1 +
 tests/gem_error_capture.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 271 insertions(+)
 create mode 100644 tests/gem_error_capture.c

Comments

Chris Wilson April 11, 2014, 5:12 p.m. UTC | #1
On Fri, Apr 11, 2014 at 05:48:12PM +0100, oscar.mateo@intel.com wrote:
> +static void check_error_state(const char *expected_ring_name,
> +				uint64_t expected_offset)
> +{
> +	FILE *file;
> +	int debug_fd;
> +	char *line = NULL;
> +	size_t line_size = 0;
> +	char *dashes = NULL;
> +	char *ring_name = NULL;
> +	int bb_matched = 0;
> +	uint32_t gtt_offset;
> +	char expected_line[32];
> +	int req_matched = 0;
> +	int requests;
> +	uint32_t seqno, tail;
> +	long jiffies;
> +	int ringbuf_matched = 0;
> +	unsigned int offset, command, expected_addr = 0;
> +	int i, items;
> +	bool bb_ok = false, req_ok = false, ringbuf_ok = false;

Most of these are only of use in local scope.

> +	debug_fd = igt_debugfs_open("i915_error_state", O_RDONLY);
> +	igt_assert(debug_fd >= 0);
> +	file = fdopen(debug_fd, "r");
> +
> +	while (getline(&line, &line_size, file) > 0) {
> +		dashes = strstr(line, "---");
> +		if (!dashes)
> +			continue;
> +
> +		ring_name = realloc(ring_name, dashes - line);
> +		strncpy(ring_name, line, dashes - line);
> +		ring_name[dashes - line - 1] = '\0';
> +
> +		bb_matched = sscanf(dashes, "--- gtt_offset = 0x%08x\n",
> +				&gtt_offset);
> +		if (bb_matched == 1) {
> +			igt_assert(strstr(ring_name, expected_ring_name));
> +			igt_assert(gtt_offset == expected_offset);
> +
> +			for (i = 0; i < sizeof(batch) / 4; i++) {
> +				igt_assert(getline(&line, &line_size, file) > 0);
> +				snprintf(expected_line, sizeof(expected_line), "%08x :  %08x",
> +						4*i, batch[i]);
> +				igt_assert(strstr(line, expected_line));
> +			}
> +			bb_ok = true;
> +			continue;
> +		}
> +
> +		req_matched = sscanf(dashes, "--- %d requests\n",
> +				&requests);
> +		if (req_matched == 1) {
> +			igt_assert(strstr(ring_name, expected_ring_name));
> +			igt_assert(requests == 1);

Bad assumption. You could still have the request from
gem_quiescent_gpu() which may not have been retired before the error
triggers.

> +
> +			igt_assert(getline(&line, &line_size, file) > 0);
> +			items = sscanf(line, "  seqno 0x%08x, emitted %ld, tail 0x%08x\n",
> +					&seqno, &jiffies, &tail);
> +			igt_assert(items == 3);

Bad. I may change the format. s/may/will/

> +			req_ok = true;
> +			continue;
> +		}
> +
> +		ringbuf_matched = sscanf(dashes, "--- ringbuffer = 0x%08x\n",
> +				&gtt_offset);
> +		if (ringbuf_matched == 1) {
> +			if (!strstr(ring_name, expected_ring_name))
> +				continue;
> +			igt_assert(req_ok);
> +
> +			for (i = 0; i < tail / 4; i++) {
> +				igt_assert(getline(&line, &line_size, file) > 0);
> +				items = sscanf(line, "%08x :  %08x\n",
> +						&offset, &command);
> +				igt_assert(items == 2);
> +				if ((command & 0x1F800000) == MI_BATCH_BUFFER_START) {
> +					igt_assert(getline(&line, &line_size, file) > 0);
> +					items = sscanf(line, "%08x :  %08x\n",
> +							&offset, &expected_addr);
> +					igt_assert(items == 2);
> +					i++;
> +				}
> +			}
> +			igt_assert(expected_addr == expected_offset);

Bad. Some gen encode flags into the BB start address.

> +			ringbuf_ok = true;
> +			continue;
> +		}
> +
> +		if (bb_ok && req_ok && ringbuf_ok)
> +			break;
> +	}
> +	igt_assert(bb_ok && req_ok && ringbuf_ok);
> +
> +	free(line);
> +	free(ring_name);
> +	close(debug_fd);
> +}
oscar.mateo@intel.com April 14, 2014, 10:01 a.m. UTC | #2
> > +	FILE *file;
> > +	int debug_fd;
> > +	char *line = NULL;
> > +	size_t line_size = 0;
> > +	char *dashes = NULL;
> > +	char *ring_name = NULL;
> > +	int bb_matched = 0;
> > +	uint32_t gtt_offset;
> > +	char expected_line[32];
> > +	int req_matched = 0;
> > +	int requests;
> > +	uint32_t seqno, tail;
> > +	long jiffies;
> > +	int ringbuf_matched = 0;
> > +	unsigned int offset, command, expected_addr = 0;
> > +	int i, items;
> > +	bool bb_ok = false, req_ok = false, ringbuf_ok = false;
> 
> Most of these are only of use in local scope.

ACK, will change.

> > +	debug_fd = igt_debugfs_open("i915_error_state", O_RDONLY);
> > +	igt_assert(debug_fd >= 0);
> > +	file = fdopen(debug_fd, "r");
> > +
> > +	while (getline(&line, &line_size, file) > 0) {
> > +		dashes = strstr(line, "---");
> > +		if (!dashes)
> > +			continue;
> > +
> > +		ring_name = realloc(ring_name, dashes - line);
> > +		strncpy(ring_name, line, dashes - line);
> > +		ring_name[dashes - line - 1] = '\0';
> > +
> > +		bb_matched = sscanf(dashes, "--- gtt_offset = 0x%08x\n",
> > +				&gtt_offset);
> > +		if (bb_matched == 1) {
> > +			igt_assert(strstr(ring_name, expected_ring_name));
> > +			igt_assert(gtt_offset == expected_offset);
> > +
> > +			for (i = 0; i < sizeof(batch) / 4; i++) {
> > +				igt_assert(getline(&line, &line_size, file) > 0);
> > +				snprintf(expected_line, sizeof(expected_line),
> "%08x :  %08x",
> > +						4*i, batch[i]);
> > +				igt_assert(strstr(line, expected_line));
> > +			}
> > +			bb_ok = true;
> > +			continue;
> > +		}
> > +
> > +		req_matched = sscanf(dashes, "--- %d requests\n",
> > +				&requests);
> > +		if (req_matched == 1) {
> > +			igt_assert(strstr(ring_name, expected_ring_name));
> > +			igt_assert(requests == 1);
> 
> Bad assumption. You could still have the request from
> gem_quiescent_gpu() which may not have been retired before the error
> triggers.

Hmmmm... but I make a first valid gem_execbuf()+gem_sync() before the one that actually hangs. Shouldn´t that make sure that all previous requests have been retired?

> > +
> > +			igt_assert(getline(&line, &line_size, file) > 0);
> > +			items = sscanf(line, "  seqno 0x%08x, emitted %ld, tail
> 0x%08x\n",
> > +					&seqno, &jiffies, &tail);
> > +			igt_assert(items == 3);
> 
> Bad. I may change the format. s/may/will/

I still need to make sure I got a valid tail, so I need to know the format and fail if don´t understand it. Or do you want me to use a different tail, maybe the ringbuffer one?

> > +			req_ok = true;
> > +			continue;
> > +		}
> > +
> > +		ringbuf_matched = sscanf(dashes, "--- ringbuffer = 0x%08x\n",
> > +				&gtt_offset);
> > +		if (ringbuf_matched == 1) {
> > +			if (!strstr(ring_name, expected_ring_name))
> > +				continue;
> > +			igt_assert(req_ok);
> > +
> > +			for (i = 0; i < tail / 4; i++) {
> > +				igt_assert(getline(&line, &line_size, file) > 0);
> > +				items = sscanf(line, "%08x :  %08x\n",
> > +						&offset, &command);
> > +				igt_assert(items == 2);
> > +				if ((command & 0x1F800000) ==
> MI_BATCH_BUFFER_START) {
> > +					igt_assert(getline(&line, &line_size,
> file) > 0);
> > +					items = sscanf(line, "%08x :  %08x\n",
> > +							&offset,
> &expected_addr);
> > +					igt_assert(items == 2);
> > +					i++;
> > +				}
> > +			}
> > +			igt_assert(expected_addr == expected_offset);
> 
> Bad. Some gen encode flags into the BB start address.

ACK, will change.

> > +			ringbuf_ok = true;
> > +			continue;
> > +		}
> > +
> > +		if (bb_ok && req_ok && ringbuf_ok)
> > +			break;
> > +	}
> > +	igt_assert(bb_ok && req_ok && ringbuf_ok);
> > +
> > +	free(line);
> > +	free(ring_name);
> > +	close(debug_fd);
> > +}
Chris Wilson April 14, 2014, 10:14 a.m. UTC | #3
On Mon, Apr 14, 2014 at 10:01:12AM +0000, Mateo Lozano, Oscar wrote:
> > > +				&requests);
> > > +		if (req_matched == 1) {
> > > +			igt_assert(strstr(ring_name, expected_ring_name));
> > > +			igt_assert(requests == 1);
> > 
> > Bad assumption. You could still have the request from
> > gem_quiescent_gpu() which may not have been retired before the error
> > triggers.
> 
> Hmmmm... but I make a first valid gem_execbuf()+gem_sync() before the one that actually hangs. Shouldn´t that make sure that all previous requests have been retired?

No, I would not make that assumption about kernel behaviour. The only
thing that it will guarantee is that the last request with that handle
is retired. (In other words, forthcoming bug fixes already break this
assumption.)
 
> > > +
> > > +			igt_assert(getline(&line, &line_size, file) > 0);
> > > +			items = sscanf(line, "  seqno 0x%08x, emitted %ld, tail
> > 0x%08x\n",
> > > +					&seqno, &jiffies, &tail);
> > > +			igt_assert(items == 3);
> > 
> > Bad. I may change the format. s/may/will/
> 
> I still need to make sure I got a valid tail, so I need to know the format and fail if don´t understand it. Or do you want me to use a different tail, maybe the ringbuffer one?

I didn't spot where you used tail. Care to elaborate? I may be able to
suggest an alternative, or we may either code this more defensively or
make the kernel emission easier to use and hopefully more informative
for debugging.
-Chris
oscar.mateo@intel.com April 14, 2014, 10:23 a.m. UTC | #4
> > > > +				&requests);
> > > > +		if (req_matched == 1) {
> > > > +			igt_assert(strstr(ring_name, expected_ring_name));
> > > > +			igt_assert(requests == 1);
> > >
> > > Bad assumption. You could still have the request from
> > > gem_quiescent_gpu() which may not have been retired before the error
> > > triggers.
> >
> > Hmmmm... but I make a first valid gem_execbuf()+gem_sync() before the
> one that actually hangs. Shouldn´t that make sure that all previous requests
> have been retired?
> 
> No, I would not make that assumption about kernel behaviour. The only thing
> that it will guarantee is that the last request with that handle is retired. (In
> other words, forthcoming bug fixes already break this
> assumption.)

Ok, then I´ll check all the reported requests (can I at least assume that the one I am interested in will be the last one? request_list seems to be FIFO...).
 
> > > > +
> > > > +			igt_assert(getline(&line, &line_size, file) > 0);
> > > > +			items = sscanf(line, "  seqno 0x%08x, emitted %ld, tail
> > > 0x%08x\n",
> > > > +					&seqno, &jiffies, &tail);
> > > > +			igt_assert(items == 3);
> > >
> > > Bad. I may change the format. s/may/will/
> >
> > I still need to make sure I got a valid tail, so I need to know the format and
> fail if don´t understand it. Or do you want me to use a different tail, maybe the
> ringbuffer one?
> 
> I didn't spot where you used tail. Care to elaborate? I may be able to suggest an
> alternative, or we may either code this more defensively or make the kernel
> emission easier to use and hopefully more informative for debugging.

I´m using the request tail to traverse the ring buffer, make sure there is a MI_BB_START and then our object´s address shortly before that tail. I could use ring->tail, or I could skip this check altogether (after all, I´m already checking that the GPU was executing a Batch Buffer, and that the BB had my magic number on it).
Chris Wilson April 14, 2014, 10:30 a.m. UTC | #5
On Mon, Apr 14, 2014 at 10:23:20AM +0000, Mateo Lozano, Oscar wrote:
> > > > > +				&requests);
> > > > > +		if (req_matched == 1) {
> > > > > +			igt_assert(strstr(ring_name, expected_ring_name));
> > > > > +			igt_assert(requests == 1);
> > > >
> > > > Bad assumption. You could still have the request from
> > > > gem_quiescent_gpu() which may not have been retired before the error
> > > > triggers.
> > >
> > > Hmmmm... but I make a first valid gem_execbuf()+gem_sync() before the
> > one that actually hangs. Shouldn´t that make sure that all previous requests
> > have been retired?
> > 
> > No, I would not make that assumption about kernel behaviour. The only thing
> > that it will guarantee is that the last request with that handle is retired. (In
> > other words, forthcoming bug fixes already break this
> > assumption.)
> 
> Ok, then I´ll check all the reported requests (can I at least assume that the one I am interested in will be the last one? request_list seems to be FIFO...).

In general, even that may be dangerous. (But less so if can be sure that
is no batchbuffer injection from the testsuite, and that there are no
other clients, or that the kernel doesn't start having to do its own
w/a.)

> > > > > +
> > > > > +			igt_assert(getline(&line, &line_size, file) > 0);
> > > > > +			items = sscanf(line, "  seqno 0x%08x, emitted %ld, tail
> > > > 0x%08x\n",
> > > > > +					&seqno, &jiffies, &tail);
> > > > > +			igt_assert(items == 3);
> > > >
> > > > Bad. I may change the format. s/may/will/
> > >
> > > I still need to make sure I got a valid tail, so I need to know the format and
> > fail if don´t understand it. Or do you want me to use a different tail, maybe the
> > ringbuffer one?
> > 
> > I didn't spot where you used tail. Care to elaborate? I may be able to suggest an
> > alternative, or we may either code this more defensively or make the kernel
> > emission easier to use and hopefully more informative for debugging.
> 
> I´m using the request tail to traverse the ring buffer, make sure there is a MI_BB_START and then our object´s address shortly before that tail. I could use ring->tail, or I could skip this check altogether (after all, I´m already checking that the GPU was executing a Batch Buffer, and that the BB had my magic number on it).

I would add a little more smarts to both the kernel and error-decode.
In the kernel, we can print the guilty request, which you can then use
to confirm that it is yours. That seems to me to be a stronger
validation of gem_error_capture, and a useful bit of information from
hangstats that we do not expose currently.
-Chris
oscar.mateo@intel.com April 14, 2014, 1:03 p.m. UTC | #6
> I would add a little more smarts to both the kernel and error-decode.
> In the kernel, we can print the guilty request, which you can then use to
> confirm that it is yours. That seems to me to be a stronger validation of
> gem_error_capture, and a useful bit of information from hangstats that we do
> not expose currently.

That sounds good. I have to add a number of other things to i915_gpu_error as part of the Execlists code, so I´ll add a "--- guilty request" as well and resubmit this test together with the series.

Thanks,
Oscar
Mika Kuoppala April 15, 2014, 2:49 p.m. UTC | #7
Hi,

oscar.mateo@intel.com writes:

> From: Oscar Mateo <oscar.mateo@intel.com>
>
> Guarantees that error capture works at a very basic level.
>
> v2: Also check that the ring object contains a reloc with MI_BB_START
> for the presumed batch object's address.
>
> Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>
> ---
>  tests/.gitignore          |   1 +
>  tests/Makefile.sources    |   1 +
>  tests/gem_error_capture.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 271 insertions(+)
>  create mode 100644 tests/gem_error_capture.c
>
> diff --git a/tests/.gitignore b/tests/.gitignore
> index 146bab0..945574c 100644
> --- a/tests/.gitignore
> +++ b/tests/.gitignore
> @@ -27,6 +27,7 @@ gem_ctx_create
>  gem_ctx_exec
>  gem_double_irq_loop
>  gem_dummy_reloc_loop
> +gem_error_capture
>  gem_evict_alignment
>  gem_evict_everything
>  gem_exec_bad_domains
> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> index bf02a48..612beb6 100644
> --- a/tests/Makefile.sources
> +++ b/tests/Makefile.sources
> @@ -24,6 +24,7 @@ TESTS_progs_M = \
>  	gem_ctx_bad_exec \
>  	gem_ctx_exec \
>  	gem_dummy_reloc_loop \
> +	gem_error_capture \
>  	gem_evict_alignment \
>  	gem_evict_everything \
>  	gem_exec_bad_domains \
> diff --git a/tests/gem_error_capture.c b/tests/gem_error_capture.c
> new file mode 100644
> index 0000000..88845c9
> --- /dev/null
> +++ b/tests/gem_error_capture.c
> @@ -0,0 +1,269 @@
> +/*
> + * Copyright © 2014 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + * Authors:
> + *    Oscar Mateo <oscar.mateo@intel.com>
> + *
> + */
> +
> +/*
> + * Testcase: Check whether basic error state capture/dump mechanism is correctly
> + * working for all rings.
> + */
> +
> +#include <unistd.h>
> +#include <stdlib.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +#include <errno.h>
> +#include <sys/stat.h>
> +#include <sys/ioctl.h>
> +#include "drm.h"
> +#include "ioctl_wrappers.h"
> +#include "drmtest.h"
> +#include "intel_io.h"
> +#include "igt_debugfs.h"
> +
> +#define MAGIC_NUMBER 0x10001
> +uint32_t batch[4] = {MI_NOOP, MI_BATCH_BUFFER_END, MAGIC_NUMBER, MAGIC_NUMBER};
> +
> +static void stop_rings(void)
> +{
> +	int fd;
> +	static const char buf[] = "0xf";
> +
> +	fd = igt_debugfs_open("i915_ring_stop", O_WRONLY);
> +	igt_assert(fd >= 0);
> +
> +	igt_assert(write(fd, buf, sizeof(buf)) == sizeof(buf));
> +
> +	close(fd);
> +}
> +
> +static bool rings_stopped(void)
> +{
> +	int fd;
> +	static char buf[128];
> +	unsigned long long val;
> +
> +	fd = igt_debugfs_open("i915_ring_stop", O_RDONLY);
> +	igt_assert(fd >= 0);
> +
> +	igt_assert(read(fd, buf, sizeof(buf)) > 0);
> +	close(fd);
> +
> +	sscanf(buf, "%llx", &val);
> +
> +	return (bool)val;
> +}

Please use igt_set_stop_rings() and igt_get_stop_rings().

Also consider stopping only the ring you are testing for.

> +static void clear_error_state(void)
> +{
> +	int fd;
> +	static const char buf[] = "";
> +
> +	fd = igt_debugfs_open("i915_error_state", O_WRONLY);
> +	igt_assert(fd >= 0);
> +
> +	igt_assert(write(fd, buf, sizeof(buf)) == sizeof(buf));
> +	close(fd);
> +}
> +
> +static void check_error_state(const char *expected_ring_name,
> +				uint64_t expected_offset)
> +{
> +	FILE *file;
> +	int debug_fd;
> +	char *line = NULL;
> +	size_t line_size = 0;
> +	char *dashes = NULL;
> +	char *ring_name = NULL;
> +	int bb_matched = 0;
> +	uint32_t gtt_offset;
> +	char expected_line[32];
> +	int req_matched = 0;
> +	int requests;
> +	uint32_t seqno, tail;
> +	long jiffies;
> +	int ringbuf_matched = 0;
> +	unsigned int offset, command, expected_addr = 0;
> +	int i, items;
> +	bool bb_ok = false, req_ok = false, ringbuf_ok = false;
> +
> +	debug_fd = igt_debugfs_open("i915_error_state", O_RDONLY);
> +	igt_assert(debug_fd >= 0);
> +	file = fdopen(debug_fd, "r");
> +
> +	while (getline(&line, &line_size, file) > 0) {
> +		dashes = strstr(line, "---");
> +		if (!dashes)
> +			continue;
> +
> +		ring_name = realloc(ring_name, dashes - line);
> +		strncpy(ring_name, line, dashes - line);
> +		ring_name[dashes - line - 1] = '\0';
> +
> +		bb_matched = sscanf(dashes, "--- gtt_offset = 0x%08x\n",
> +				&gtt_offset);
> +		if (bb_matched == 1) {
> +			igt_assert(strstr(ring_name, expected_ring_name));
> +			igt_assert(gtt_offset == expected_offset);
> +
> +			for (i = 0; i < sizeof(batch) / 4; i++) {
> +				igt_assert(getline(&line, &line_size, file) > 0);
> +				snprintf(expected_line, sizeof(expected_line), "%08x :  %08x",
> +						4*i, batch[i]);
> +				igt_assert(strstr(line, expected_line));
> +			}
> +			bb_ok = true;
> +			continue;
> +		}
> +
> +		req_matched = sscanf(dashes, "--- %d requests\n",
> +				&requests);
> +		if (req_matched == 1) {
> +			igt_assert(strstr(ring_name, expected_ring_name));
> +			igt_assert(requests == 1);
> +
> +			igt_assert(getline(&line, &line_size, file) > 0);
> +			items = sscanf(line, "  seqno 0x%08x, emitted %ld, tail 0x%08x\n",
> +					&seqno, &jiffies, &tail);
> +			igt_assert(items == 3);
> +			req_ok = true;
> +			continue;
> +		}
> +
> +		ringbuf_matched = sscanf(dashes, "--- ringbuffer = 0x%08x\n",
> +				&gtt_offset);
> +		if (ringbuf_matched == 1) {
> +			if (!strstr(ring_name, expected_ring_name))
> +				continue;
> +			igt_assert(req_ok);
> +
> +			for (i = 0; i < tail / 4; i++) {
> +				igt_assert(getline(&line, &line_size, file) > 0);
> +				items = sscanf(line, "%08x :  %08x\n",
> +						&offset, &command);
> +				igt_assert(items == 2);
> +				if ((command & 0x1F800000) == MI_BATCH_BUFFER_START) {
> +					igt_assert(getline(&line, &line_size, file) > 0);
> +					items = sscanf(line, "%08x :  %08x\n",
> +							&offset, &expected_addr);
> +					igt_assert(items == 2);

Should check for MAGIC_NUMBER here?

-Mika

> +					i++;
> +				}
> +			}
> +			igt_assert(expected_addr == expected_offset);
> +			ringbuf_ok = true;
> +			continue;
> +		}
> +
> +		if (bb_ok && req_ok && ringbuf_ok)
> +			break;
> +	}
> +	igt_assert(bb_ok && req_ok && ringbuf_ok);
> +
> +	free(line);
> +	free(ring_name);
> +	close(debug_fd);
> +}
> +
> +static void test(int fd, uint32_t handle, unsigned ring_id, const char *ring_name)
> +{
> +	struct drm_i915_gem_execbuffer2 execbuf;
> +	struct drm_i915_gem_exec_object2 exec;
> +	uint64_t presumed_offset;
> +
> +	gem_require_ring(fd, ring_id);
> +
> +	clear_error_state();
> +
> +	exec.handle = handle;
> +	exec.relocation_count = 0;
> +	exec.relocs_ptr = 0;
> +	exec.alignment = 0;
> +	exec.offset = 0;
> +	exec.flags = 0;
> +	exec.rsvd1 = 0;
> +	exec.rsvd2 = 0;
> +
> +	execbuf.buffers_ptr = (uintptr_t)&exec;
> +	execbuf.buffer_count = 1;
> +	execbuf.batch_start_offset = 0;
> +	execbuf.batch_len = 16;
> +	execbuf.cliprects_ptr = 0;
> +	execbuf.num_cliprects = 0;
> +	execbuf.DR1 = 0;
> +	execbuf.DR4 = 0;
> +	execbuf.flags = ring_id;
> +	i915_execbuffer2_set_context_id(execbuf, 0);
> +	execbuf.rsvd2 = 0;
> +
> +	gem_execbuf(fd, &execbuf);
> +	gem_sync(fd, handle);
> +
> +	presumed_offset = exec.offset;
> +
> +	stop_rings();
> +
> +	gem_execbuf(fd, &execbuf);
> +	gem_sync(fd, handle);
> +
> +	igt_assert(rings_stopped() == false);
> +	igt_assert(presumed_offset == exec.offset);
> +
> +	check_error_state(ring_name, exec.offset);
> +}
> +
> +uint32_t handle;
> +int fd;
> +
> +igt_main
> +{
> +	igt_fixture {
> +		fd = drm_open_any();
> +
> +		handle = gem_create(fd, 4096);
> +		gem_write(fd, handle, 0, batch, sizeof(batch));
> +	}
> +
> +	igt_subtest("render")
> +		test(fd, handle, I915_EXEC_RENDER, "render ring");
> +
> +	igt_subtest("bsd")
> +		test(fd, handle, I915_EXEC_BSD, "bsd ring");
> +
> +	igt_subtest("blt")
> +		test(fd, handle, I915_EXEC_BLT, "blitter ring");
> +
> +	igt_subtest("vebox")
> +		test(fd, handle, I915_EXEC_VEBOX, "video enhancement ring");
> +
> +	igt_fixture {
> +		gem_close(fd, handle);
> +
> +		close(fd);
> +	}
> +}
> -- 
> 1.9.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Daniel Vetter April 15, 2014, 7:38 p.m. UTC | #8
On Mon, Apr 14, 2014 at 01:03:58PM +0000, Mateo Lozano, Oscar wrote:
> > I would add a little more smarts to both the kernel and error-decode.
> > In the kernel, we can print the guilty request, which you can then use to
> > confirm that it is yours. That seems to me to be a stronger validation of
> > gem_error_capture, and a useful bit of information from hangstats that we do
> > not expose currently.
> 
> That sounds good. I have to add a number of other things to
> i915_gpu_error as part of the Execlists code, so I´ll add a "--- guilty
> request" as well and resubmit this test together with the series.

If we want this much smarts then we need a properly hanging batch, e.g.
like the looping batch used in gem_reset_stats.

The problem with that is that this will kill the gpu if reset doesn't work
(i.e. gen2/3) so we need to skip this test there. Or maybe split things
into 2 subtests and use the properly hanging batch only when we do the
extended guilty testing under discussion here.

But in any case just checking that the batch is somewhere in the ring
(properly masking of lower bits 0-11 ofc) and checking whether the batch
is correctl dumped (with the magic value) would catch a lot of the
past&present execbuf bugs - we've had issues with dumping fancy values of
0 a lot.

For the guilty stuff we have an extensive set of tests in gem_reset_stat
using the reset stat ioctl already. And for the occasional "the hang
detection logic is busted bug" I think nothing short of a human brain
locking at the batch really helps. At least if we want to be somewhat
platform agnostic ...

So imo the current level of checking loosk Good Enough. But I'm certainly
not going to stop you ;-)

Cheers, Daniel
oscar.mateo@intel.com May 9, 2014, 12:04 p.m. UTC | #9
Hi Mika,

> > +static bool rings_stopped(void)
> > +{
> > +	int fd;
> > +	static char buf[128];
> > +	unsigned long long val;
> > +
> > +	fd = igt_debugfs_open("i915_ring_stop", O_RDONLY);
> > +	igt_assert(fd >= 0);
> > +
> > +	igt_assert(read(fd, buf, sizeof(buf)) > 0);
> > +	close(fd);
> > +
> > +	sscanf(buf, "%llx", &val);
> > +
> > +	return (bool)val;
> > +}
> 
> Please use igt_set_stop_rings() and igt_get_stop_rings().
> 
> Also consider stopping only the ring you are testing for.

Oops, I didn see these before. Ok, done in the new version.

> > +			for (i = 0; i < tail / 4; i++) {
> > +				igt_assert(getline(&line, &line_size, file) > 0);
> > +				items = sscanf(line, "%08x :  %08x\n",
> > +						&offset, &command);
> > +				igt_assert(items == 2);
> > +				if ((command & 0x1F800000) == MI_BATCH_BUFFER_START) {
> > +					igt_assert(getline(&line, &line_size, file) > 0);
> > +					items = sscanf(line, "%08x :  %08x\n",
> > +							&offset, &expected_addr);
> > +					igt_assert(items == 2);
> 
> Should check for MAGIC_NUMBER here?

I'm doing it above already, inside bb_matched:

			for (i = 0; i < sizeof(batch) / 4; i++) {
				igt_assert(getline(&line, &line_size, file) > 0);
				snprintf(expected_line, sizeof(expected_line), "%08x :  %08x",
						4*i, batch[i]);
				igt_assert(strstr(line, expected_line));
			}

-- Oscar
oscar.mateo@intel.com May 9, 2014, 12:07 p.m. UTC | #10
On Tue, 2014-04-15 at 21:38 +0200, Daniel Vetter wrote:
> On Mon, Apr 14, 2014 at 01:03:58PM +0000, Mateo Lozano, Oscar wrote:

> > > I would add a little more smarts to both the kernel and error-decode.

> > > In the kernel, we can print the guilty request, which you can then use to

> > > confirm that it is yours. That seems to me to be a stronger validation of

> > > gem_error_capture, and a useful bit of information from hangstats that we do

> > > not expose currently.

> > 

> > That sounds good. I have to add a number of other things to

> > i915_gpu_error as part of the Execlists code, so I´ll add a "--- guilty

> > request" as well and resubmit this test together with the series.

> 

> If we want this much smarts then we need a properly hanging batch, e.g.

> like the looping batch used in gem_reset_stats.

> 

> The problem with that is that this will kill the gpu if reset doesn't work

> (i.e. gen2/3) so we need to skip this test there. Or maybe split things

> into 2 subtests and use the properly hanging batch only when we do the

> extended guilty testing under discussion here.

> 

> But in any case just checking that the batch is somewhere in the ring

> (properly masking of lower bits 0-11 ofc) and checking whether the batch

> is correctl dumped (with the magic value) would catch a lot of the

> past&present execbuf bugs - we've had issues with dumping fancy values of

> 0 a lot.

> 

> For the guilty stuff we have an extensive set of tests in gem_reset_stat

> using the reset stat ioctl already. And for the occasional "the hang

> detection logic is busted bug" I think nothing short of a human brain

> locking at the batch really helps. At least if we want to be somewhat

> platform agnostic ...

> 

> So imo the current level of checking loosk Good Enough. But I'm certainly

> not going to stop you ;-)


Ok, for the moment I'm happy that I can unblock Execlists. I don't mind
looking at the "--- guilty requests" in the future, but first I need to
get Execlists out of the way...

Cheers,
Oscar
diff mbox

Patch

diff --git a/tests/.gitignore b/tests/.gitignore
index 146bab0..945574c 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -27,6 +27,7 @@  gem_ctx_create
 gem_ctx_exec
 gem_double_irq_loop
 gem_dummy_reloc_loop
+gem_error_capture
 gem_evict_alignment
 gem_evict_everything
 gem_exec_bad_domains
diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index bf02a48..612beb6 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -24,6 +24,7 @@  TESTS_progs_M = \
 	gem_ctx_bad_exec \
 	gem_ctx_exec \
 	gem_dummy_reloc_loop \
+	gem_error_capture \
 	gem_evict_alignment \
 	gem_evict_everything \
 	gem_exec_bad_domains \
diff --git a/tests/gem_error_capture.c b/tests/gem_error_capture.c
new file mode 100644
index 0000000..88845c9
--- /dev/null
+++ b/tests/gem_error_capture.c
@@ -0,0 +1,269 @@ 
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Oscar Mateo <oscar.mateo@intel.com>
+ *
+ */
+
+/*
+ * Testcase: Check whether basic error state capture/dump mechanism is correctly
+ * working for all rings.
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include "drm.h"
+#include "ioctl_wrappers.h"
+#include "drmtest.h"
+#include "intel_io.h"
+#include "igt_debugfs.h"
+
+#define MAGIC_NUMBER 0x10001
+uint32_t batch[4] = {MI_NOOP, MI_BATCH_BUFFER_END, MAGIC_NUMBER, MAGIC_NUMBER};
+
+static void stop_rings(void)
+{
+	int fd;
+	static const char buf[] = "0xf";
+
+	fd = igt_debugfs_open("i915_ring_stop", O_WRONLY);
+	igt_assert(fd >= 0);
+
+	igt_assert(write(fd, buf, sizeof(buf)) == sizeof(buf));
+
+	close(fd);
+}
+
+static bool rings_stopped(void)
+{
+	int fd;
+	static char buf[128];
+	unsigned long long val;
+
+	fd = igt_debugfs_open("i915_ring_stop", O_RDONLY);
+	igt_assert(fd >= 0);
+
+	igt_assert(read(fd, buf, sizeof(buf)) > 0);
+	close(fd);
+
+	sscanf(buf, "%llx", &val);
+
+	return (bool)val;
+}
+
+static void clear_error_state(void)
+{
+	int fd;
+	static const char buf[] = "";
+
+	fd = igt_debugfs_open("i915_error_state", O_WRONLY);
+	igt_assert(fd >= 0);
+
+	igt_assert(write(fd, buf, sizeof(buf)) == sizeof(buf));
+	close(fd);
+}
+
+static void check_error_state(const char *expected_ring_name,
+				uint64_t expected_offset)
+{
+	FILE *file;
+	int debug_fd;
+	char *line = NULL;
+	size_t line_size = 0;
+	char *dashes = NULL;
+	char *ring_name = NULL;
+	int bb_matched = 0;
+	uint32_t gtt_offset;
+	char expected_line[32];
+	int req_matched = 0;
+	int requests;
+	uint32_t seqno, tail;
+	long jiffies;
+	int ringbuf_matched = 0;
+	unsigned int offset, command, expected_addr = 0;
+	int i, items;
+	bool bb_ok = false, req_ok = false, ringbuf_ok = false;
+
+	debug_fd = igt_debugfs_open("i915_error_state", O_RDONLY);
+	igt_assert(debug_fd >= 0);
+	file = fdopen(debug_fd, "r");
+
+	while (getline(&line, &line_size, file) > 0) {
+		dashes = strstr(line, "---");
+		if (!dashes)
+			continue;
+
+		ring_name = realloc(ring_name, dashes - line);
+		strncpy(ring_name, line, dashes - line);
+		ring_name[dashes - line - 1] = '\0';
+
+		bb_matched = sscanf(dashes, "--- gtt_offset = 0x%08x\n",
+				&gtt_offset);
+		if (bb_matched == 1) {
+			igt_assert(strstr(ring_name, expected_ring_name));
+			igt_assert(gtt_offset == expected_offset);
+
+			for (i = 0; i < sizeof(batch) / 4; i++) {
+				igt_assert(getline(&line, &line_size, file) > 0);
+				snprintf(expected_line, sizeof(expected_line), "%08x :  %08x",
+						4*i, batch[i]);
+				igt_assert(strstr(line, expected_line));
+			}
+			bb_ok = true;
+			continue;
+		}
+
+		req_matched = sscanf(dashes, "--- %d requests\n",
+				&requests);
+		if (req_matched == 1) {
+			igt_assert(strstr(ring_name, expected_ring_name));
+			igt_assert(requests == 1);
+
+			igt_assert(getline(&line, &line_size, file) > 0);
+			items = sscanf(line, "  seqno 0x%08x, emitted %ld, tail 0x%08x\n",
+					&seqno, &jiffies, &tail);
+			igt_assert(items == 3);
+			req_ok = true;
+			continue;
+		}
+
+		ringbuf_matched = sscanf(dashes, "--- ringbuffer = 0x%08x\n",
+				&gtt_offset);
+		if (ringbuf_matched == 1) {
+			if (!strstr(ring_name, expected_ring_name))
+				continue;
+			igt_assert(req_ok);
+
+			for (i = 0; i < tail / 4; i++) {
+				igt_assert(getline(&line, &line_size, file) > 0);
+				items = sscanf(line, "%08x :  %08x\n",
+						&offset, &command);
+				igt_assert(items == 2);
+				if ((command & 0x1F800000) == MI_BATCH_BUFFER_START) {
+					igt_assert(getline(&line, &line_size, file) > 0);
+					items = sscanf(line, "%08x :  %08x\n",
+							&offset, &expected_addr);
+					igt_assert(items == 2);
+					i++;
+				}
+			}
+			igt_assert(expected_addr == expected_offset);
+			ringbuf_ok = true;
+			continue;
+		}
+
+		if (bb_ok && req_ok && ringbuf_ok)
+			break;
+	}
+	igt_assert(bb_ok && req_ok && ringbuf_ok);
+
+	free(line);
+	free(ring_name);
+	close(debug_fd);
+}
+
+static void test(int fd, uint32_t handle, unsigned ring_id, const char *ring_name)
+{
+	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_exec_object2 exec;
+	uint64_t presumed_offset;
+
+	gem_require_ring(fd, ring_id);
+
+	clear_error_state();
+
+	exec.handle = handle;
+	exec.relocation_count = 0;
+	exec.relocs_ptr = 0;
+	exec.alignment = 0;
+	exec.offset = 0;
+	exec.flags = 0;
+	exec.rsvd1 = 0;
+	exec.rsvd2 = 0;
+
+	execbuf.buffers_ptr = (uintptr_t)&exec;
+	execbuf.buffer_count = 1;
+	execbuf.batch_start_offset = 0;
+	execbuf.batch_len = 16;
+	execbuf.cliprects_ptr = 0;
+	execbuf.num_cliprects = 0;
+	execbuf.DR1 = 0;
+	execbuf.DR4 = 0;
+	execbuf.flags = ring_id;
+	i915_execbuffer2_set_context_id(execbuf, 0);
+	execbuf.rsvd2 = 0;
+
+	gem_execbuf(fd, &execbuf);
+	gem_sync(fd, handle);
+
+	presumed_offset = exec.offset;
+
+	stop_rings();
+
+	gem_execbuf(fd, &execbuf);
+	gem_sync(fd, handle);
+
+	igt_assert(rings_stopped() == false);
+	igt_assert(presumed_offset == exec.offset);
+
+	check_error_state(ring_name, exec.offset);
+}
+
+uint32_t handle;
+int fd;
+
+igt_main
+{
+	igt_fixture {
+		fd = drm_open_any();
+
+		handle = gem_create(fd, 4096);
+		gem_write(fd, handle, 0, batch, sizeof(batch));
+	}
+
+	igt_subtest("render")
+		test(fd, handle, I915_EXEC_RENDER, "render ring");
+
+	igt_subtest("bsd")
+		test(fd, handle, I915_EXEC_BSD, "bsd ring");
+
+	igt_subtest("blt")
+		test(fd, handle, I915_EXEC_BLT, "blitter ring");
+
+	igt_subtest("vebox")
+		test(fd, handle, I915_EXEC_VEBOX, "video enhancement ring");
+
+	igt_fixture {
+		gem_close(fd, handle);
+
+		close(fd);
+	}
+}