@@ -39,7 +39,7 @@ dist_bin_SCRIPTS = intel_gpu_abrt
AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib
AM_CFLAGS = $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS)
-LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS)
+LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUDEV_LIBS)
intel_dump_decode_SOURCES = \
intel_dump_decode.c
@@ -50,3 +50,7 @@ intel_error_decode_SOURCES = \
intel_bios_reader_SOURCES = \
intel_bios_reader.c \
intel_bios.h
+
+# The header is listed so that it is shipped by "make dist", like intel_bios.h above.
+intel_l3_parity_SOURCES = \
+	intel_l3_parity.c \
+	intel_l3_parity.h \
+	intel_l3_udev_listener.c
@@ -37,6 +37,14 @@
#include "intel_chipset.h"
#include "intel_gpu_tools.h"
#include "drmtest.h"
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#if HAVE_UDEV
+#include <libudev.h>
+#include <syslog.h>
+#endif
+#include "intel_l3_parity.h"
static unsigned int devid;
/* L3 size is always a function of banks. The number of banks cannot be
@@ -157,7 +165,8 @@ static void usage(const char *name)
" -r, --row=[row] The row to act upon (default 0)\n"
" -b, --bank=[bank] The bank to act upon (default 0)\n"
" -s, --subbank=[subbank] The subbank to act upon (default 0)\n"
- " -w, --slice=[slice] Which slice to act on (default: -1 [all])"
+ " -w, --slice=[slice] Which slice to act on (default: -1 [all])\n"
+ " , --daemon Run the listener (-L) as a daemon\n"
" ACTIONS (only 1 may be specified at a time):\n"
" -h, --help Display this help\n"
" -H, --hw-info Display the current L3 properties\n"
@@ -166,7 +175,8 @@ static void usage(const char *name)
" -e, --enable Enable row, bank, subbank (undo -d)\n"
" -d, --disable=<row,bank,subbank> Disable row, bank, subbank (inline arguments are deprecated. Please use -r, -b, -s instead\n"
" -i, --inject [HSW only] Cause hardware to inject a row errors\n"
- " -u, --uninject [HSW only] Turn off hardware error injectection (undo -i)\n",
+ " -u, --uninject [HSW only] Turn off hardware error injectection (undo -i)\n"
+ " -L, --listen Listen for uevent errors\n",
name);
}
@@ -179,6 +189,7 @@ int main(int argc, char *argv[])
int fd[REAL_MAX_SLICES] = {0}, ret, i;
int action = '0';
int drm_fd = drm_open_any();
+ int daemonize = 0;
devid = intel_get_drm_devid(drm_fd);
if (intel_gen(devid) < 7 || IS_VALLEYVIEW(devid))
@@ -202,11 +213,18 @@ int main(int argc, char *argv[])
assert(lseek(fd[i], 0, SEEK_SET) == 0);
}
+ /* NB: It is potentially unsafe to read this register if the kernel is
+ * actively using this register range, or we're running multiple
+ * instances of this tool. Since neither of those cases should occur
+ * (and the tool should be root only) we can safely ignore this for
+ * now. Just be aware of this if for some reason a hang is reported
+ * when using this tool.
+ */
dft = intel_register_read(0xb038);
while (1) {
int c, option_index = 0;
- static struct option long_options[] = {
+ struct option long_options[] = {
{ "help", no_argument, 0, 'h' },
{ "list", no_argument, 0, 'l' },
{ "clear-all", no_argument, 0, 'a' },
@@ -215,18 +233,23 @@ int main(int argc, char *argv[])
{ "inject", no_argument, 0, 'i' },
{ "uninject", no_argument, 0, 'u' },
{ "hw-info", no_argument, 0, 'H' },
+ { "listen", no_argument, 0, 'L' },
{ "row", required_argument, 0, 'r' },
{ "bank", required_argument, 0, 'b' },
{ "subbank", required_argument, 0, 's' },
{ "slice", required_argument, 0, 'w' },
+ { "daemon", no_argument, &daemonize, 1 },
{0, 0, 0, 0}
};
- c = getopt_long(argc, argv, "hHr:b:s:w:aled::iu", long_options,
+ c = getopt_long(argc, argv, "hHr:b:s:w:aled::iuL", long_options,
&option_index);
if (c == -1)
break;
+ if (c == 0)
+ continue;
+
switch (c) {
case '?':
case 'h':
@@ -274,6 +297,7 @@ int main(int argc, char *argv[])
case 'a':
case 'l':
case 'e':
+ case 'L':
if (action != '0') {
fprintf(stderr, "Only one action may be specified\n");
exit(EXIT_FAILURE);
@@ -299,6 +323,20 @@ int main(int argc, char *argv[])
printf("warning: overwriting existing injections. This is very dangerous.\n");
}
+ /* Daemon doesn't work like the other commands */
+ if (action == 'L') {
+ struct l3_parity par;
+ struct l3_location loc;
+ if (daemonize) {
+ assert(daemon(0, 0) == 0);
+ openlog(argv[0], LOG_CONS | LOG_PID, LOG_USER);
+ }
+ memset(&par, 0, sizeof(par));
+ assert(l3_uevent_setup(&par) == 0);
+ assert(l3_listen(&par, daemonize == 1, &loc) == 0);
+ exit(EXIT_SUCCESS);
+ }
+
if (action == 'l')
decode_dft(dft);
new file mode 100644
@@ -0,0 +1,31 @@
+#ifndef INTEL_L3_PARITY_H_
+#define INTEL_L3_PARITY_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+struct l3_parity {
+ struct udev *udev;
+ struct udev_monitor *uevent_monitor;
+ int fd;
+ fd_set fdset;
+};
+
+struct l3_location {
+ uint8_t slice;
+ uint16_t row;
+ uint8_t bank;
+ uint8_t subbank;
+};
+
+#if HAVE_UDEV
+int l3_uevent_setup(struct l3_parity *par);
+/* Listens (blocks) for an l3 parity event. Returns the location of the error. */
+int l3_listen(struct l3_parity *par, bool daemon, struct l3_location *loc);
+#define l3_uevent_teardown(par) {}
+#else
+#define l3_uevent_setup(par, daemon, loc) -1
+#define l3_listen(par) -1
+#endif
+
+#endif
new file mode 100644
@@ -0,0 +1,108 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#if HAVE_UDEV
+/* Feature-test macros only take effect when defined before the first system header. */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+#include <sys/select.h>
+#include <unistd.h>
+#include <libudev.h>
+#include "i915_drm.h"
+#include "intel_l3_parity.h"
+
+#ifndef I915_L3_PARITY_UEVENT
+#define I915_L3_PARITY_UEVENT "L3_PARITY_ERROR"
+#endif
+
+/*
+ * Open a udev netlink monitor for DRM minor uevents.
+ *
+ * Creates a udev context and a "udev" netlink monitor filtered on the
+ * "drm" subsystem / "drm_minor" devtype, enables receiving, and stores the
+ * context, the monitor, its file descriptor and an fd_set containing that
+ * descriptor in par.
+ *
+ * Returns 0 on success and a negative value on failure. On failure every
+ * libudev object created here is released again and par is left untouched.
+ */
+int l3_uevent_setup(struct l3_parity *par)
+{
+	struct udev *udev;
+	struct udev_monitor *uevent_monitor;
+	int fd, ret;
+
+	udev = udev_new();
+	if (!udev)
+		return -1;
+
+	uevent_monitor = udev_monitor_new_from_netlink(udev, "udev");
+	if (!uevent_monitor) {
+		ret = -1;
+		goto err_udev;
+	}
+
+	ret = udev_monitor_filter_add_match_subsystem_devtype(uevent_monitor, "drm", "drm_minor");
+	if (ret < 0)
+		goto err_monitor;
+
+	ret = udev_monitor_enable_receiving(uevent_monitor);
+	if (ret < 0)
+		goto err_monitor;
+
+	/* FD_SET() on a negative descriptor is undefined, so validate it first. */
+	fd = udev_monitor_get_fd(uevent_monitor);
+	if (fd < 0) {
+		ret = fd;
+		goto err_monitor;
+	}
+
+	par->udev = udev;
+	par->uevent_monitor = uevent_monitor;
+	par->fd = fd;
+	FD_ZERO(&par->fdset);
+	FD_SET(fd, &par->fdset);
+	return 0;
+
+err_monitor:
+	/* The monitor holds its own reference; drop it before the context. */
+	udev_monitor_unref(uevent_monitor);
+err_udev:
+	udev_unref(udev);
+	return ret;
+}
+
+/*
+ * Read an integer uevent property from dev.
+ * A property that is absent is reported as 0 rather than passing NULL to atoi().
+ */
+static int l3_property_to_int(struct udev_device *dev, const char *key)
+{
+	const char *value = udev_device_get_property_value(dev, key);
+
+	return value ? atoi(value) : 0;
+}
+
+/*
+ * Block until an L3 parity uevent arrives on the monitor in par.
+ *
+ * Events whose I915_L3_PARITY_UEVENT property is missing or does not start
+ * with "1" are ignored. For a parity event the SLICE, ROW, BANK and SUBBANK
+ * properties are stored in loc and a message is emitted:
+ *   - daemon == false: the message goes to stderr and 0 is returned;
+ *   - daemon == true:  the message goes to syslog and listening continues,
+ *                      so the function only returns on failure.
+ *
+ * Returns 0 after reporting one event (non-daemon mode), the non-positive
+ * result of select() if waiting fails, or -1 if the event cannot be received.
+ */
+int l3_listen(struct l3_parity *par, bool daemon, struct l3_location *loc)
+{
+	/* Large enough for the fixed text plus eight small integers. */
+	char msg[256];
+
+	for (;;) {
+		struct udev_device *udev_dev;
+		const char *parity_status;
+		int ret;
+
+		/* select() overwrites the set it is given, so re-arm it on every pass. */
+		FD_ZERO(&par->fdset);
+		FD_SET(par->fd, &par->fdset);
+
+		ret = select(par->fd + 1, &par->fdset, NULL, NULL, NULL);
+		/* Number of bits set is returned, must be >= 1 */
+		if (ret <= 0)
+			return ret;
+
+		assert(FD_ISSET(par->fd, &par->fdset));
+
+		udev_dev = udev_monitor_receive_device(par->uevent_monitor);
+		if (!udev_dev)
+			return -1;
+
+		parity_status = udev_device_get_property_value(udev_dev, I915_L3_PARITY_UEVENT);
+		if (!parity_status || strncmp(parity_status, "1", 1)) {
+			/* Not a parity event: release it and keep waiting. */
+			udev_device_unref(udev_dev);
+			continue;
+		}
+
+		loc->slice = l3_property_to_int(udev_dev, "SLICE");
+		loc->row = l3_property_to_int(udev_dev, "ROW");
+		loc->bank = l3_property_to_int(udev_dev, "BANK");
+		loc->subbank = l3_property_to_int(udev_dev, "SUBBANK");
+
+		udev_device_unref(udev_dev);
+
+		snprintf(msg, sizeof(msg), "Parity error detected on: %d,%d,%d,%d. "
+			 "Try to run intel_l3_parity -r %d -b %d -s %d -w %d -d",
+			 loc->slice, loc->row, loc->bank, loc->subbank,
+			 loc->row, loc->bank, loc->subbank, loc->slice);
+
+		if (!daemon) {
+			fprintf(stderr, "%s\n", msg);
+			return 0;
+		}
+
+		syslog(LOG_INFO, "%s\n", msg);
+	}
+}
+#endif
v2: Add a comment explaining the dangers of directly accessing the DFT register (Daniel) Signed-off-by: Ben Widawsky <ben@bwidawsk.net> --- tools/Makefile.am | 6 ++- tools/intel_l3_parity.c | 46 ++++++++++++++++-- tools/intel_l3_parity.h | 31 ++++++++++++ tools/intel_l3_udev_listener.c | 108 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 186 insertions(+), 5 deletions(-) create mode 100644 tools/intel_l3_parity.h create mode 100644 tools/intel_l3_udev_listener.c