Message ID | 1397234892-13820-1-git-send-email-oscar.mateo@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Apr 11, 2014 at 05:48:12PM +0100, oscar.mateo@intel.com wrote: > +static void check_error_state(const char *expected_ring_name, > + uint64_t expected_offset) > +{ > + FILE *file; > + int debug_fd; > + char *line = NULL; > + size_t line_size = 0; > + char *dashes = NULL; > + char *ring_name = NULL; > + int bb_matched = 0; > + uint32_t gtt_offset; > + char expected_line[32]; > + int req_matched = 0; > + int requests; > + uint32_t seqno, tail; > + long jiffies; > + int ringbuf_matched = 0; > + unsigned int offset, command, expected_addr = 0; > + int i, items; > + bool bb_ok = false, req_ok = false, ringbuf_ok = false; Most of these are only of use in local scope. > + debug_fd = igt_debugfs_open("i915_error_state", O_RDONLY); > + igt_assert(debug_fd >= 0); > + file = fdopen(debug_fd, "r"); > + > + while (getline(&line, &line_size, file) > 0) { > + dashes = strstr(line, "---"); > + if (!dashes) > + continue; > + > + ring_name = realloc(ring_name, dashes - line); > + strncpy(ring_name, line, dashes - line); > + ring_name[dashes - line - 1] = '\0'; > + > + bb_matched = sscanf(dashes, "--- gtt_offset = 0x%08x\n", > + >t_offset); > + if (bb_matched == 1) { > + igt_assert(strstr(ring_name, expected_ring_name)); > + igt_assert(gtt_offset == expected_offset); > + > + for (i = 0; i < sizeof(batch) / 4; i++) { > + igt_assert(getline(&line, &line_size, file) > 0); > + snprintf(expected_line, sizeof(expected_line), "%08x : %08x", > + 4*i, batch[i]); > + igt_assert(strstr(line, expected_line)); > + } > + bb_ok = true; > + continue; > + } > + > + req_matched = sscanf(dashes, "--- %d requests\n", > + &requests); > + if (req_matched == 1) { > + igt_assert(strstr(ring_name, expected_ring_name)); > + igt_assert(requests == 1); Bad assumption. You could still have the request from gem_quiescent_gpu() which may not have been retired before the error triggers. > + > + igt_assert(getline(&line, &line_size, file) > 0); > + items = sscanf(line, " seqno 0x%08x, emitted %ld, tail 0x%08x\n", > + &seqno, &jiffies, &tail); > + igt_assert(items == 3); Bad. I may change the format. s/may/will/ > + req_ok = true; > + continue; > + } > + > + ringbuf_matched = sscanf(dashes, "--- ringbuffer = 0x%08x\n", > + >t_offset); > + if (ringbuf_matched == 1) { > + if (!strstr(ring_name, expected_ring_name)) > + continue; > + igt_assert(req_ok); > + > + for (i = 0; i < tail / 4; i++) { > + igt_assert(getline(&line, &line_size, file) > 0); > + items = sscanf(line, "%08x : %08x\n", > + &offset, &command); > + igt_assert(items == 2); > + if ((command & 0x1F800000) == MI_BATCH_BUFFER_START) { > + igt_assert(getline(&line, &line_size, file) > 0); > + items = sscanf(line, "%08x : %08x\n", > + &offset, &expected_addr); > + igt_assert(items == 2); > + i++; > + } > + } > + igt_assert(expected_addr == expected_offset); Bad. Some gen encode flags into the BB start address. > + ringbuf_ok = true; > + continue; > + } > + > + if (bb_ok && req_ok && ringbuf_ok) > + break; > + } > + igt_assert(bb_ok && req_ok && ringbuf_ok); > + > + free(line); > + free(ring_name); > + close(debug_fd); > +}
> > + FILE *file; > > + int debug_fd; > > + char *line = NULL; > > + size_t line_size = 0; > > + char *dashes = NULL; > > + char *ring_name = NULL; > > + int bb_matched = 0; > > + uint32_t gtt_offset; > > + char expected_line[32]; > > + int req_matched = 0; > > + int requests; > > + uint32_t seqno, tail; > > + long jiffies; > > + int ringbuf_matched = 0; > > + unsigned int offset, command, expected_addr = 0; > > + int i, items; > > + bool bb_ok = false, req_ok = false, ringbuf_ok = false; > > Most of these are only of use in local scope. ACK, will change. > > + debug_fd = igt_debugfs_open("i915_error_state", O_RDONLY); > > + igt_assert(debug_fd >= 0); > > + file = fdopen(debug_fd, "r"); > > + > > + while (getline(&line, &line_size, file) > 0) { > > + dashes = strstr(line, "---"); > > + if (!dashes) > > + continue; > > + > > + ring_name = realloc(ring_name, dashes - line); > > + strncpy(ring_name, line, dashes - line); > > + ring_name[dashes - line - 1] = '\0'; > > + > > + bb_matched = sscanf(dashes, "--- gtt_offset = 0x%08x\n", > > + >t_offset); > > + if (bb_matched == 1) { > > + igt_assert(strstr(ring_name, expected_ring_name)); > > + igt_assert(gtt_offset == expected_offset); > > + > > + for (i = 0; i < sizeof(batch) / 4; i++) { > > + igt_assert(getline(&line, &line_size, file) > 0); > > + snprintf(expected_line, sizeof(expected_line), > "%08x : %08x", > > + 4*i, batch[i]); > > + igt_assert(strstr(line, expected_line)); > > + } > > + bb_ok = true; > > + continue; > > + } > > + > > + req_matched = sscanf(dashes, "--- %d requests\n", > > + &requests); > > + if (req_matched == 1) { > > + igt_assert(strstr(ring_name, expected_ring_name)); > > + igt_assert(requests == 1); > > Bad assumption. You could still have the request from > gem_quiescent_gpu() which may not have been retired before the error > triggers. Hmmmm... but I make a first valid gem_execbuf()+gem_sync() before the one that actually hangs. Shouldn´t that make sure that all previous requests have been retired? > > + > > + igt_assert(getline(&line, &line_size, file) > 0); > > + items = sscanf(line, " seqno 0x%08x, emitted %ld, tail > 0x%08x\n", > > + &seqno, &jiffies, &tail); > > + igt_assert(items == 3); > > Bad. I may change the format. s/may/will/ I still need to make sure I got a valid tail, so I need to know the format and fail if don´t understand it. Or do you want me to use a different tail, maybe the ringbuffer one? > > + req_ok = true; > > + continue; > > + } > > + > > + ringbuf_matched = sscanf(dashes, "--- ringbuffer = 0x%08x\n", > > + >t_offset); > > + if (ringbuf_matched == 1) { > > + if (!strstr(ring_name, expected_ring_name)) > > + continue; > > + igt_assert(req_ok); > > + > > + for (i = 0; i < tail / 4; i++) { > > + igt_assert(getline(&line, &line_size, file) > 0); > > + items = sscanf(line, "%08x : %08x\n", > > + &offset, &command); > > + igt_assert(items == 2); > > + if ((command & 0x1F800000) == > MI_BATCH_BUFFER_START) { > > + igt_assert(getline(&line, &line_size, > file) > 0); > > + items = sscanf(line, "%08x : %08x\n", > > + &offset, > &expected_addr); > > + igt_assert(items == 2); > > + i++; > > + } > > + } > > + igt_assert(expected_addr == expected_offset); > > Bad. Some gen encode flags into the BB start address. ACK, will change. > > + ringbuf_ok = true; > > + continue; > > + } > > + > > + if (bb_ok && req_ok && ringbuf_ok) > > + break; > > + } > > + igt_assert(bb_ok && req_ok && ringbuf_ok); > > + > > + free(line); > > + free(ring_name); > > + close(debug_fd); > > +}
On Mon, Apr 14, 2014 at 10:01:12AM +0000, Mateo Lozano, Oscar wrote: > > > + &requests); > > > + if (req_matched == 1) { > > > + igt_assert(strstr(ring_name, expected_ring_name)); > > > + igt_assert(requests == 1); > > > > Bad assumption. You could still have the request from > > gem_quiescent_gpu() which may not have been retired before the error > > triggers. > > Hmmmm... but I make a first valid gem_execbuf()+gem_sync() before the one that actually hangs. Shouldn´t that make sure that all previous requests have been retired? No, I would not make that assumption about kernel behaviour. The only thing that it will guarantee is that the last request with that handle is retired. (In other words, forthcoming bug fixes already break this assumption.) > > > + > > > + igt_assert(getline(&line, &line_size, file) > 0); > > > + items = sscanf(line, " seqno 0x%08x, emitted %ld, tail > > 0x%08x\n", > > > + &seqno, &jiffies, &tail); > > > + igt_assert(items == 3); > > > > Bad. I may change the format. s/may/will/ > > I still need to make sure I got a valid tail, so I need to know the format and fail if don´t understand it. Or do you want me to use a different tail, maybe the ringbuffer one? I didn't spot where you used tail. Care to elaborate? I may be able to suggest an alternative, or we may either code this more defensively or make the kernel emission easier to use and hopefully more informative for debugging. -Chris
> > > > + &requests); > > > > + if (req_matched == 1) { > > > > + igt_assert(strstr(ring_name, expected_ring_name)); > > > > + igt_assert(requests == 1); > > > > > > Bad assumption. You could still have the request from > > > gem_quiescent_gpu() which may not have been retired before the error > > > triggers. > > > > Hmmmm... but I make a first valid gem_execbuf()+gem_sync() before the > one that actually hangs. Shouldn´t that make sure that all previous requests > have been retired? > > No, I would not make that assumption about kernel behaviour. The only thing > that it will guarantee is that the last request with that handle is retired. (In > other words, forthcoming bug fixes already break this > assumption.) Ok, then I´ll check all the reported requests (can I at least assume that the one I am interested in will be the last one? request_list seems to be FIFO...). > > > > + > > > > + igt_assert(getline(&line, &line_size, file) > 0); > > > > + items = sscanf(line, " seqno 0x%08x, emitted %ld, tail > > > 0x%08x\n", > > > > + &seqno, &jiffies, &tail); > > > > + igt_assert(items == 3); > > > > > > Bad. I may change the format. s/may/will/ > > > > I still need to make sure I got a valid tail, so I need to know the format and > fail if don´t understand it. Or do you want me to use a different tail, maybe the > ringbuffer one? > > I didn't spot where you used tail. Care to elaborate? I may be able to suggest an > alternative, or we may either code this more defensively or make the kernel > emission easier to use and hopefully more informative for debugging. I´m using the request tail to traverse the ring buffer, make sure there is a MI_BB_START and then our object´s address shortly before that tail. I could use ring->tail, or I could skip this check altogether (after all, I´m already checking that the GPU was executing a Batch Buffer, and that the BB had my magic number on it).
On Mon, Apr 14, 2014 at 10:23:20AM +0000, Mateo Lozano, Oscar wrote: > > > > > + &requests); > > > > > + if (req_matched == 1) { > > > > > + igt_assert(strstr(ring_name, expected_ring_name)); > > > > > + igt_assert(requests == 1); > > > > > > > > Bad assumption. You could still have the request from > > > > gem_quiescent_gpu() which may not have been retired before the error > > > > triggers. > > > > > > Hmmmm... but I make a first valid gem_execbuf()+gem_sync() before the > > one that actually hangs. Shouldn´t that make sure that all previous requests > > have been retired? > > > > No, I would not make that assumption about kernel behaviour. The only thing > > that it will guarantee is that the last request with that handle is retired. (In > > other words, forthcoming bug fixes already break this > > assumption.) > > Ok, then I´ll check all the reported requests (can I at least assume that the one I am interested in will be the last one? request_list seems to be FIFO...). In general, even that may be dangerous. (But less so if can be sure that is no batchbuffer injection from the testsuite, and that there are no other clients, or that the kernel doesn't start having to do its own w/a.) > > > > > + > > > > > + igt_assert(getline(&line, &line_size, file) > 0); > > > > > + items = sscanf(line, " seqno 0x%08x, emitted %ld, tail > > > > 0x%08x\n", > > > > > + &seqno, &jiffies, &tail); > > > > > + igt_assert(items == 3); > > > > > > > > Bad. I may change the format. s/may/will/ > > > > > > I still need to make sure I got a valid tail, so I need to know the format and > > fail if don´t understand it. Or do you want me to use a different tail, maybe the > > ringbuffer one? > > > > I didn't spot where you used tail. Care to elaborate? I may be able to suggest an > > alternative, or we may either code this more defensively or make the kernel > > emission easier to use and hopefully more informative for debugging. > > I´m using the request tail to traverse the ring buffer, make sure there is a MI_BB_START and then our object´s address shortly before that tail. I could use ring->tail, or I could skip this check altogether (after all, I´m already checking that the GPU was executing a Batch Buffer, and that the BB had my magic number on it). I would add a little more smarts to both the kernel and error-decode. In the kernel, we can print the guilty request, which you can then use to confirm that it is yours. That seems to me to be a stronger validation of gem_error_capture, and a useful bit of information from hangstats that we do not expose currently. -Chris
> I would add a little more smarts to both the kernel and error-decode. > In the kernel, we can print the guilty request, which you can then use to > confirm that it is yours. That seems to me to be a stronger validation of > gem_error_capture, and a useful bit of information from hangstats that we do > not expose currently. That sounds good. I have to add a number of other things to i915_gpu_error as part of the Execlists code, so I´ll add a "--- guilty request" as well and resubmit this test together with the series. Thanks, Oscar
Hi, oscar.mateo@intel.com writes: > From: Oscar Mateo <oscar.mateo@intel.com> > > Guarantees that error capture works at a very basic level. > > v2: Also check that the ring object contains a reloc with MI_BB_START > for the presumed batch object's address. > > Signed-off-by: Oscar Mateo <oscar.mateo@intel.com> > --- > tests/.gitignore | 1 + > tests/Makefile.sources | 1 + > tests/gem_error_capture.c | 269 ++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 271 insertions(+) > create mode 100644 tests/gem_error_capture.c > > diff --git a/tests/.gitignore b/tests/.gitignore > index 146bab0..945574c 100644 > --- a/tests/.gitignore > +++ b/tests/.gitignore > @@ -27,6 +27,7 @@ gem_ctx_create > gem_ctx_exec > gem_double_irq_loop > gem_dummy_reloc_loop > +gem_error_capture > gem_evict_alignment > gem_evict_everything > gem_exec_bad_domains > diff --git a/tests/Makefile.sources b/tests/Makefile.sources > index bf02a48..612beb6 100644 > --- a/tests/Makefile.sources > +++ b/tests/Makefile.sources > @@ -24,6 +24,7 @@ TESTS_progs_M = \ > gem_ctx_bad_exec \ > gem_ctx_exec \ > gem_dummy_reloc_loop \ > + gem_error_capture \ > gem_evict_alignment \ > gem_evict_everything \ > gem_exec_bad_domains \ > diff --git a/tests/gem_error_capture.c b/tests/gem_error_capture.c > new file mode 100644 > index 0000000..88845c9 > --- /dev/null > +++ b/tests/gem_error_capture.c > @@ -0,0 +1,269 @@ > +/* > + * Copyright © 2014 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS > + * IN THE SOFTWARE. > + * > + * Authors: > + * Oscar Mateo <oscar.mateo@intel.com> > + * > + */ > + > +/* > + * Testcase: Check whether basic error state capture/dump mechanism is correctly > + * working for all rings. > + */ > + > +#include <unistd.h> > +#include <stdlib.h> > +#include <stdint.h> > +#include <stdio.h> > +#include <string.h> > +#include <fcntl.h> > +#include <inttypes.h> > +#include <errno.h> > +#include <sys/stat.h> > +#include <sys/ioctl.h> > +#include "drm.h" > +#include "ioctl_wrappers.h" > +#include "drmtest.h" > +#include "intel_io.h" > +#include "igt_debugfs.h" > + > +#define MAGIC_NUMBER 0x10001 > +uint32_t batch[4] = {MI_NOOP, MI_BATCH_BUFFER_END, MAGIC_NUMBER, MAGIC_NUMBER}; > + > +static void stop_rings(void) > +{ > + int fd; > + static const char buf[] = "0xf"; > + > + fd = igt_debugfs_open("i915_ring_stop", O_WRONLY); > + igt_assert(fd >= 0); > + > + igt_assert(write(fd, buf, sizeof(buf)) == sizeof(buf)); > + > + close(fd); > +} > + > +static bool rings_stopped(void) > +{ > + int fd; > + static char buf[128]; > + unsigned long long val; > + > + fd = igt_debugfs_open("i915_ring_stop", O_RDONLY); > + igt_assert(fd >= 0); > + > + igt_assert(read(fd, buf, sizeof(buf)) > 0); > + close(fd); > + > + sscanf(buf, "%llx", &val); > + > + return (bool)val; > +} Please use igt_set_stop_rings() and igt_get_stop_rings(). Also consider stopping only the ring you are testing for. > +static void clear_error_state(void) > +{ > + int fd; > + static const char buf[] = ""; > + > + fd = igt_debugfs_open("i915_error_state", O_WRONLY); > + igt_assert(fd >= 0); > + > + igt_assert(write(fd, buf, sizeof(buf)) == sizeof(buf)); > + close(fd); > +} > + > +static void check_error_state(const char *expected_ring_name, > + uint64_t expected_offset) > +{ > + FILE *file; > + int debug_fd; > + char *line = NULL; > + size_t line_size = 0; > + char *dashes = NULL; > + char *ring_name = NULL; > + int bb_matched = 0; > + uint32_t gtt_offset; > + char expected_line[32]; > + int req_matched = 0; > + int requests; > + uint32_t seqno, tail; > + long jiffies; > + int ringbuf_matched = 0; > + unsigned int offset, command, expected_addr = 0; > + int i, items; > + bool bb_ok = false, req_ok = false, ringbuf_ok = false; > + > + debug_fd = igt_debugfs_open("i915_error_state", O_RDONLY); > + igt_assert(debug_fd >= 0); > + file = fdopen(debug_fd, "r"); > + > + while (getline(&line, &line_size, file) > 0) { > + dashes = strstr(line, "---"); > + if (!dashes) > + continue; > + > + ring_name = realloc(ring_name, dashes - line); > + strncpy(ring_name, line, dashes - line); > + ring_name[dashes - line - 1] = '\0'; > + > + bb_matched = sscanf(dashes, "--- gtt_offset = 0x%08x\n", > + >t_offset); > + if (bb_matched == 1) { > + igt_assert(strstr(ring_name, expected_ring_name)); > + igt_assert(gtt_offset == expected_offset); > + > + for (i = 0; i < sizeof(batch) / 4; i++) { > + igt_assert(getline(&line, &line_size, file) > 0); > + snprintf(expected_line, sizeof(expected_line), "%08x : %08x", > + 4*i, batch[i]); > + igt_assert(strstr(line, expected_line)); > + } > + bb_ok = true; > + continue; > + } > + > + req_matched = sscanf(dashes, "--- %d requests\n", > + &requests); > + if (req_matched == 1) { > + igt_assert(strstr(ring_name, expected_ring_name)); > + igt_assert(requests == 1); > + > + igt_assert(getline(&line, &line_size, file) > 0); > + items = sscanf(line, " seqno 0x%08x, emitted %ld, tail 0x%08x\n", > + &seqno, &jiffies, &tail); > + igt_assert(items == 3); > + req_ok = true; > + continue; > + } > + > + ringbuf_matched = sscanf(dashes, "--- ringbuffer = 0x%08x\n", > + >t_offset); > + if (ringbuf_matched == 1) { > + if (!strstr(ring_name, expected_ring_name)) > + continue; > + igt_assert(req_ok); > + > + for (i = 0; i < tail / 4; i++) { > + igt_assert(getline(&line, &line_size, file) > 0); > + items = sscanf(line, "%08x : %08x\n", > + &offset, &command); > + igt_assert(items == 2); > + if ((command & 0x1F800000) == MI_BATCH_BUFFER_START) { > + igt_assert(getline(&line, &line_size, file) > 0); > + items = sscanf(line, "%08x : %08x\n", > + &offset, &expected_addr); > + igt_assert(items == 2); Should check for MAGIC_NUMBER here? -Mika > + i++; > + } > + } > + igt_assert(expected_addr == expected_offset); > + ringbuf_ok = true; > + continue; > + } > + > + if (bb_ok && req_ok && ringbuf_ok) > + break; > + } > + igt_assert(bb_ok && req_ok && ringbuf_ok); > + > + free(line); > + free(ring_name); > + close(debug_fd); > +} > + > +static void test(int fd, uint32_t handle, unsigned ring_id, const char *ring_name) > +{ > + struct drm_i915_gem_execbuffer2 execbuf; > + struct drm_i915_gem_exec_object2 exec; > + uint64_t presumed_offset; > + > + gem_require_ring(fd, ring_id); > + > + clear_error_state(); > + > + exec.handle = handle; > + exec.relocation_count = 0; > + exec.relocs_ptr = 0; > + exec.alignment = 0; > + exec.offset = 0; > + exec.flags = 0; > + exec.rsvd1 = 0; > + exec.rsvd2 = 0; > + > + execbuf.buffers_ptr = (uintptr_t)&exec; > + execbuf.buffer_count = 1; > + execbuf.batch_start_offset = 0; > + execbuf.batch_len = 16; > + execbuf.cliprects_ptr = 0; > + execbuf.num_cliprects = 0; > + execbuf.DR1 = 0; > + execbuf.DR4 = 0; > + execbuf.flags = ring_id; > + i915_execbuffer2_set_context_id(execbuf, 0); > + execbuf.rsvd2 = 0; > + > + gem_execbuf(fd, &execbuf); > + gem_sync(fd, handle); > + > + presumed_offset = exec.offset; > + > + stop_rings(); > + > + gem_execbuf(fd, &execbuf); > + gem_sync(fd, handle); > + > + igt_assert(rings_stopped() == false); > + igt_assert(presumed_offset == exec.offset); > + > + check_error_state(ring_name, exec.offset); > +} > + > +uint32_t handle; > +int fd; > + > +igt_main > +{ > + igt_fixture { > + fd = drm_open_any(); > + > + handle = gem_create(fd, 4096); > + gem_write(fd, handle, 0, batch, sizeof(batch)); > + } > + > + igt_subtest("render") > + test(fd, handle, I915_EXEC_RENDER, "render ring"); > + > + igt_subtest("bsd") > + test(fd, handle, I915_EXEC_BSD, "bsd ring"); > + > + igt_subtest("blt") > + test(fd, handle, I915_EXEC_BLT, "blitter ring"); > + > + igt_subtest("vebox") > + test(fd, handle, I915_EXEC_VEBOX, "video enhancement ring"); > + > + igt_fixture { > + gem_close(fd, handle); > + > + close(fd); > + } > +} > -- > 1.9.0 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Mon, Apr 14, 2014 at 01:03:58PM +0000, Mateo Lozano, Oscar wrote: > > I would add a little more smarts to both the kernel and error-decode. > > In the kernel, we can print the guilty request, which you can then use to > > confirm that it is yours. That seems to me to be a stronger validation of > > gem_error_capture, and a useful bit of information from hangstats that we do > > not expose currently. > > That sounds good. I have to add a number of other things to > i915_gpu_error as part of the Execlists code, so I´ll add a "--- guilty > request" as well and resubmit this test together with the series. If we want this much smarts then we need a properly hanging batch, e.g. like the looping batch used in gem_reset_stats. The problem with that is that this will kill the gpu if reset doesn't work (i.e. gen2/3) so we need to skip this test there. Or maybe split things into 2 subtests and use the properly hanging batch only when we do the extended guilty testing under discussion here. But in any case just checking that the batch is somewhere in the ring (properly masking of lower bits 0-11 ofc) and checking whether the batch is correctl dumped (with the magic value) would catch a lot of the past&present execbuf bugs - we've had issues with dumping fancy values of 0 a lot. For the guilty stuff we have an extensive set of tests in gem_reset_stat using the reset stat ioctl already. And for the occasional "the hang detection logic is busted bug" I think nothing short of a human brain locking at the batch really helps. At least if we want to be somewhat platform agnostic ... So imo the current level of checking loosk Good Enough. But I'm certainly not going to stop you ;-) Cheers, Daniel
Hi Mika, > > +static bool rings_stopped(void) > > +{ > > + int fd; > > + static char buf[128]; > > + unsigned long long val; > > + > > + fd = igt_debugfs_open("i915_ring_stop", O_RDONLY); > > + igt_assert(fd >= 0); > > + > > + igt_assert(read(fd, buf, sizeof(buf)) > 0); > > + close(fd); > > + > > + sscanf(buf, "%llx", &val); > > + > > + return (bool)val; > > +} > > Please use igt_set_stop_rings() and igt_get_stop_rings(). > > Also consider stopping only the ring you are testing for. Oops, I didn see these before. Ok, done in the new version. > > + for (i = 0; i < tail / 4; i++) { > > + igt_assert(getline(&line, &line_size, file) > 0); > > + items = sscanf(line, "%08x : %08x\n", > > + &offset, &command); > > + igt_assert(items == 2); > > + if ((command & 0x1F800000) == MI_BATCH_BUFFER_START) { > > + igt_assert(getline(&line, &line_size, file) > 0); > > + items = sscanf(line, "%08x : %08x\n", > > + &offset, &expected_addr); > > + igt_assert(items == 2); > > Should check for MAGIC_NUMBER here? I'm doing it above already, inside bb_matched: for (i = 0; i < sizeof(batch) / 4; i++) { igt_assert(getline(&line, &line_size, file) > 0); snprintf(expected_line, sizeof(expected_line), "%08x : %08x", 4*i, batch[i]); igt_assert(strstr(line, expected_line)); } -- Oscar
On Tue, 2014-04-15 at 21:38 +0200, Daniel Vetter wrote: > On Mon, Apr 14, 2014 at 01:03:58PM +0000, Mateo Lozano, Oscar wrote: > > > I would add a little more smarts to both the kernel and error-decode. > > > In the kernel, we can print the guilty request, which you can then use to > > > confirm that it is yours. That seems to me to be a stronger validation of > > > gem_error_capture, and a useful bit of information from hangstats that we do > > > not expose currently. > > > > That sounds good. I have to add a number of other things to > > i915_gpu_error as part of the Execlists code, so I´ll add a "--- guilty > > request" as well and resubmit this test together with the series. > > If we want this much smarts then we need a properly hanging batch, e.g. > like the looping batch used in gem_reset_stats. > > The problem with that is that this will kill the gpu if reset doesn't work > (i.e. gen2/3) so we need to skip this test there. Or maybe split things > into 2 subtests and use the properly hanging batch only when we do the > extended guilty testing under discussion here. > > But in any case just checking that the batch is somewhere in the ring > (properly masking of lower bits 0-11 ofc) and checking whether the batch > is correctl dumped (with the magic value) would catch a lot of the > past&present execbuf bugs - we've had issues with dumping fancy values of > 0 a lot. > > For the guilty stuff we have an extensive set of tests in gem_reset_stat > using the reset stat ioctl already. And for the occasional "the hang > detection logic is busted bug" I think nothing short of a human brain > locking at the batch really helps. At least if we want to be somewhat > platform agnostic ... > > So imo the current level of checking loosk Good Enough. But I'm certainly > not going to stop you ;-) Ok, for the moment I'm happy that I can unblock Execlists. I don't mind looking at the "--- guilty requests" in the future, but first I need to get Execlists out of the way... Cheers, Oscar
diff --git a/tests/.gitignore b/tests/.gitignore index 146bab0..945574c 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -27,6 +27,7 @@ gem_ctx_create gem_ctx_exec gem_double_irq_loop gem_dummy_reloc_loop +gem_error_capture gem_evict_alignment gem_evict_everything gem_exec_bad_domains diff --git a/tests/Makefile.sources b/tests/Makefile.sources index bf02a48..612beb6 100644 --- a/tests/Makefile.sources +++ b/tests/Makefile.sources @@ -24,6 +24,7 @@ TESTS_progs_M = \ gem_ctx_bad_exec \ gem_ctx_exec \ gem_dummy_reloc_loop \ + gem_error_capture \ gem_evict_alignment \ gem_evict_everything \ gem_exec_bad_domains \ diff --git a/tests/gem_error_capture.c b/tests/gem_error_capture.c new file mode 100644 index 0000000..88845c9 --- /dev/null +++ b/tests/gem_error_capture.c @@ -0,0 +1,269 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Oscar Mateo <oscar.mateo@intel.com> + * + */ + +/* + * Testcase: Check whether basic error state capture/dump mechanism is correctly + * working for all rings. + */ + +#include <unistd.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <fcntl.h> +#include <inttypes.h> +#include <errno.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include "drm.h" +#include "ioctl_wrappers.h" +#include "drmtest.h" +#include "intel_io.h" +#include "igt_debugfs.h" + +#define MAGIC_NUMBER 0x10001 +uint32_t batch[4] = {MI_NOOP, MI_BATCH_BUFFER_END, MAGIC_NUMBER, MAGIC_NUMBER}; + +static void stop_rings(void) +{ + int fd; + static const char buf[] = "0xf"; + + fd = igt_debugfs_open("i915_ring_stop", O_WRONLY); + igt_assert(fd >= 0); + + igt_assert(write(fd, buf, sizeof(buf)) == sizeof(buf)); + + close(fd); +} + +static bool rings_stopped(void) +{ + int fd; + static char buf[128]; + unsigned long long val; + + fd = igt_debugfs_open("i915_ring_stop", O_RDONLY); + igt_assert(fd >= 0); + + igt_assert(read(fd, buf, sizeof(buf)) > 0); + close(fd); + + sscanf(buf, "%llx", &val); + + return (bool)val; +} + +static void clear_error_state(void) +{ + int fd; + static const char buf[] = ""; + + fd = igt_debugfs_open("i915_error_state", O_WRONLY); + igt_assert(fd >= 0); + + igt_assert(write(fd, buf, sizeof(buf)) == sizeof(buf)); + close(fd); +} + +static void check_error_state(const char *expected_ring_name, + uint64_t expected_offset) +{ + FILE *file; + int debug_fd; + char *line = NULL; + size_t line_size = 0; + char *dashes = NULL; + char *ring_name = NULL; + int bb_matched = 0; + uint32_t gtt_offset; + char expected_line[32]; + int req_matched = 0; + int requests; + uint32_t seqno, tail; + long jiffies; + int ringbuf_matched = 0; + unsigned int offset, command, expected_addr = 0; + int i, items; + bool bb_ok = false, req_ok = false, ringbuf_ok = false; + + debug_fd = igt_debugfs_open("i915_error_state", O_RDONLY); + igt_assert(debug_fd >= 0); + file = fdopen(debug_fd, "r"); + + while (getline(&line, &line_size, file) > 0) { + dashes = strstr(line, "---"); + if (!dashes) + continue; + + ring_name = realloc(ring_name, dashes - line); + strncpy(ring_name, line, dashes - line); + ring_name[dashes - line - 1] = '\0'; + + bb_matched = sscanf(dashes, "--- gtt_offset = 0x%08x\n", + >t_offset); + if (bb_matched == 1) { + igt_assert(strstr(ring_name, expected_ring_name)); + igt_assert(gtt_offset == expected_offset); + + for (i = 0; i < sizeof(batch) / 4; i++) { + igt_assert(getline(&line, &line_size, file) > 0); + snprintf(expected_line, sizeof(expected_line), "%08x : %08x", + 4*i, batch[i]); + igt_assert(strstr(line, expected_line)); + } + bb_ok = true; + continue; + } + + req_matched = sscanf(dashes, "--- %d requests\n", + &requests); + if (req_matched == 1) { + igt_assert(strstr(ring_name, expected_ring_name)); + igt_assert(requests == 1); + + igt_assert(getline(&line, &line_size, file) > 0); + items = sscanf(line, " seqno 0x%08x, emitted %ld, tail 0x%08x\n", + &seqno, &jiffies, &tail); + igt_assert(items == 3); + req_ok = true; + continue; + } + + ringbuf_matched = sscanf(dashes, "--- ringbuffer = 0x%08x\n", + >t_offset); + if (ringbuf_matched == 1) { + if (!strstr(ring_name, expected_ring_name)) + continue; + igt_assert(req_ok); + + for (i = 0; i < tail / 4; i++) { + igt_assert(getline(&line, &line_size, file) > 0); + items = sscanf(line, "%08x : %08x\n", + &offset, &command); + igt_assert(items == 2); + if ((command & 0x1F800000) == MI_BATCH_BUFFER_START) { + igt_assert(getline(&line, &line_size, file) > 0); + items = sscanf(line, "%08x : %08x\n", + &offset, &expected_addr); + igt_assert(items == 2); + i++; + } + } + igt_assert(expected_addr == expected_offset); + ringbuf_ok = true; + continue; + } + + if (bb_ok && req_ok && ringbuf_ok) + break; + } + igt_assert(bb_ok && req_ok && ringbuf_ok); + + free(line); + free(ring_name); + close(debug_fd); +} + +static void test(int fd, uint32_t handle, unsigned ring_id, const char *ring_name) +{ + struct drm_i915_gem_execbuffer2 execbuf; + struct drm_i915_gem_exec_object2 exec; + uint64_t presumed_offset; + + gem_require_ring(fd, ring_id); + + clear_error_state(); + + exec.handle = handle; + exec.relocation_count = 0; + exec.relocs_ptr = 0; + exec.alignment = 0; + exec.offset = 0; + exec.flags = 0; + exec.rsvd1 = 0; + exec.rsvd2 = 0; + + execbuf.buffers_ptr = (uintptr_t)&exec; + execbuf.buffer_count = 1; + execbuf.batch_start_offset = 0; + execbuf.batch_len = 16; + execbuf.cliprects_ptr = 0; + execbuf.num_cliprects = 0; + execbuf.DR1 = 0; + execbuf.DR4 = 0; + execbuf.flags = ring_id; + i915_execbuffer2_set_context_id(execbuf, 0); + execbuf.rsvd2 = 0; + + gem_execbuf(fd, &execbuf); + gem_sync(fd, handle); + + presumed_offset = exec.offset; + + stop_rings(); + + gem_execbuf(fd, &execbuf); + gem_sync(fd, handle); + + igt_assert(rings_stopped() == false); + igt_assert(presumed_offset == exec.offset); + + check_error_state(ring_name, exec.offset); +} + +uint32_t handle; +int fd; + +igt_main +{ + igt_fixture { + fd = drm_open_any(); + + handle = gem_create(fd, 4096); + gem_write(fd, handle, 0, batch, sizeof(batch)); + } + + igt_subtest("render") + test(fd, handle, I915_EXEC_RENDER, "render ring"); + + igt_subtest("bsd") + test(fd, handle, I915_EXEC_BSD, "bsd ring"); + + igt_subtest("blt") + test(fd, handle, I915_EXEC_BLT, "blitter ring"); + + igt_subtest("vebox") + test(fd, handle, I915_EXEC_VEBOX, "video enhancement ring"); + + igt_fixture { + gem_close(fd, handle); + + close(fd); + } +}