Adding a userspace application crash handling system to autotest

Message ID	1252045654-16861-1-git-send-email-lmr@redhat.com (mailing list archive)
State	New, archived
Headers	show Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by demeter.kernel.org (8.14.2/8.14.2) with ESMTP id n846SR1B028615 for <patchwork-kvm@patchwork.kernel.org>; Fri, 4 Sep 2009 06:28:28 GMT From: Lucas Meneghel Rodrigues <lmr@redhat.com> To: autotest@test.kernel.org Cc: kvm@vger.kernel.org, Lucas Meneghel Rodrigues <lmr@redhat.com> Subject: [PATCH] Adding a userspace application crash handling system to autotest Date: Fri, 4 Sep 2009 03:27:34 -0300 Message-Id: <1252045654-16861-1-git-send-email-lmr@redhat.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk

diff --git a/client/common_lib/test.py b/client/common_lib/test.py
index 362c960..7d2877a 100644
--- a/client/common_lib/test.py
+++ b/client/common_lib/test.py
@@ -17,7 +17,7 @@
 # tmpdir eg. tmp/<tempname>_<testname.tag>
 
 import fcntl, os, re, sys, shutil, tarfile, tempfile, time, traceback
-import warnings, logging
+import warnings, logging, glob
 
 from autotest_lib.client.common_lib import error
 from autotest_lib.client.bin import utils
@@ -31,7 +31,6 @@ class base_test:
         self.job = job
         self.pkgmgr = job.pkgmgr
         self.autodir = job.autodir
-
         self.outputdir = outputdir
         self.tagged_testname = os.path.basename(self.outputdir)
         self.resultsdir = os.path.join(self.outputdir, 'results')
@@ -40,6 +39,7 @@ class base_test:
         os.mkdir(self.profdir)
         self.debugdir = os.path.join(self.outputdir, 'debug')
         os.mkdir(self.debugdir)
+        self.configure_crash_handler()
         self.bindir = bindir
         if hasattr(job, 'libdir'):
             self.libdir = job.libdir
@@ -54,6 +54,54 @@ class base_test:
         self.after_iteration_hooks = []
 
 
+    def configure_crash_handler(self):
+        """
+        Configure the crash handler by:
+         * Putting an appropriate crash handler on /proc/sys/kernel/core_pattern
+         * Creating the file that the crash handler will use to figure which
+           tests are active at a given moment
+
+        The crash handler will pick up the core file and write it to
+        self.debugdir, and perform analysis on it to generate a report. The
+        program also outputs some results to syslog.
+
+        If multiple tests are running, the crash handler checks whether the
+        crashed process descends from the process of each active test. If it
+        can't determine it, the core file and the report file will be copied
+        to all test debug dirs.
+
+        Sets self.crash_handling_enabled, which tells whether the handler
+        could be registered.
+        """
+        self.pattern_file = '/proc/sys/kernel/core_pattern'
+        try:
+            # Trying to backup core pattern and register our script
+            pattern_file = open(self.pattern_file, 'r')
+            try:
+                self.core_pattern_backup = pattern_file.read()
+            finally:
+                pattern_file.close()
+            tools_dir = os.path.join(self.autodir, 'tools')
+            crash_handler_path = os.path.join(tools_dir, 'crash_handler.py')
+            # Write through open_write_close, like the other files here,
+            # instead of leaving an unclosed file object around: the new
+            # pattern must reach the kernel before the test starts.
+            utils.open_write_close(self.pattern_file,
+                                   '|' + crash_handler_path +
+                                   ' %p %t %u %s %h %e')
+            # Writing the files that the crash handler is going to use
+            self.debugdir_tmp_file = ('/tmp/autotest_results_dir.%s' %
+                                      os.getpid())
+            utils.open_write_close(self.debugdir_tmp_file, self.debugdir + "\n")
+            self.crash_handling_enabled = True
+        except Exception, e:
+            self.crash_handling_enabled = False
+            logging.error('Crash handling system disabled: %s' % e)
+
+        if self.crash_handling_enabled:
+            logging.debug('Crash handling system enabled.')
+
+
     def assert_(self, expr, msg='Assertion failed.'):
         if not expr:
             raise error.TestError(msg)
@@ -388,6 +436,26 @@ class base_test:
                 try:
                     if run_cleanup:
                         _cherry_pick_call(self.cleanup, *args, **dargs)
+                    # If core dumps are found on the debugdir after the
+                    # execution of the test, let the user know.
+                    if self.crash_handling_enabled:
+                        # The crash handler writes to directories named
+                        # crash.<exe>.<pid> inside the debugdir
+                        crash_dirs = glob.glob('%s/crash.*' % self.debugdir)
+                        if crash_dirs:
+                            logging.warning('Programs crashed during test '
+                                            'execution:')
+                            for crash_dir in crash_dirs:
+                                logging.warning('Please verify %s for more '
+                                                'info', crash_dir)
+                        # Remove the debugdir info file
+                        os.unlink(self.debugdir_tmp_file)
+                        # Restore the core pattern backup
+                        try:
+                            utils.open_write_close(self.pattern_file,
+                                                   self.core_pattern_backup)
+                        except EnvironmentError:
+                            pass
                 finally:
                     self.job.logging.restore()
         except error.AutotestError:
diff --git a/client/tools/crash_handler.py b/client/tools/crash_handler.py
new file mode 100755
index 0000000..390643a
--- /dev/null
+++ b/client/tools/crash_handler.py
@@ -0,0 +1,238 @@
+#!/usr/bin/python
+"""
+Simple crash handling application for autotest
+
+@copyright Red Hat Inc 2009
+@author Lucas Meneghel Rodrigues <lmr@redhat.com>
+"""
+import sys, os, commands, glob, tempfile, shutil, syslog
+
+
+def get_parent_pid(pid):
+    """
+    Returns the parent PID for a given PID, converted to an integer.
+
+    @param pid: Process ID.
+    @return: Parent PID, or 1 if it could not be determined.
+    """
+    try:
+        stat_file = open('/proc/%s/stat' % pid, 'r')
+        try:
+            stat_file_contents = stat_file.readline()
+        finally:
+            stat_file.close()
+        # The command name (second field) is enclosed in parentheses and may
+        # contain spaces, so only split what comes after the last ')'. The
+        # fields that follow it are the process state and the parent PID.
+        ppid = int(stat_file_contents.rsplit(')', 1)[1].split()[1])
+    except (EnvironmentError, IndexError, ValueError):
+        # It is not possible to determine the parent because the process
+        # already left the process table.
+        ppid = 1
+
+    return ppid
+
+
+def pid_descends_from(pid_a, pid_b):
+    """
+    Check whether pid_a descends from pid_b.
+
+    @param pid_a: Process ID.
+    @param pid_b: Process ID.
+    @return: True if pid_b is pid_a itself or one of its ancestors.
+    """
+    pid_a = int(pid_a)
+    pid_b = int(pid_b)
+    current_pid = pid_a
+    while current_pid > 1:
+        if current_pid == pid_b:
+            syslog.syslog(syslog.LOG_INFO,
+                          "PID %s descends from PID %s!" % (pid_a, pid_b))
+            return True
+        else:
+            current_pid = get_parent_pid(current_pid)
+    syslog.syslog(syslog.LOG_INFO,
+                  "PID %s does not descend from PID %s" % (pid_a, pid_b))
+    return False
+
+
+def write_to_file(file_path, contents):
+    """
+    Write contents to a given file path specified. If the file does not
+    exist, it will be created.
+
+    @param file_path: Path to a given file.
+    @param contents: File contents.
+    """
+    file_object = open(file_path, 'w')
+    try:
+        file_object.write(contents)
+    finally:
+        file_object.close()
+
+
+def get_results_dir_list(pid, core_dir_basename):
+    """
+    Get all valid output directories for the core file and the report. It works
+    by inspecting files created by each test on /tmp and verifying if the
+    PID of the process that crashed is a child or grandchild of the autotest
+    test process. If it can't find any relationship (maybe a daemon that died
+    during a test execution), it will write the core file to the debug dirs
+    of all tests currently being executed. If there are no active autotest
+    tests at a particular moment, it will return a list with a single
+    directory under /tmp.
+
+    @param pid: PID for the process that generated the core
+    @param core_dir_basename: Basename for the directory that will hold both
+            the core dump and the crash report.
+    @return: List of directories where core and report should be written.
+    """
+    # Get all active test debugdir path files present
+    pid_dir_dict = {}
+    for debugdir_file in glob.glob("/tmp/autotest_results_dir.*"):
+        a_pid = debugdir_file.split('.')[-1]
+        try:
+            results_dir_file = open(debugdir_file, 'r')
+        except IOError:
+            # The test finished and removed its file in the meantime
+            continue
+        try:
+            results_dir = results_dir_file.read().strip()
+        finally:
+            results_dir_file.close()
+        pid_dir_dict[a_pid] = os.path.join(results_dir, core_dir_basename)
+
+    if not pid_dir_dict:
+        path_inactive_autotest = os.path.join('/tmp', core_dir_basename)
+        return [path_inactive_autotest]
+
+    results_dir_list = []
+    for a_pid, a_path in pid_dir_dict.iteritems():
+        if pid_descends_from(pid, a_pid):
+            results_dir_list.append(a_path)
+
+    # If we could not find any relations between the pids in the list with
+    # the process that crashed, we can't tell for sure which test spawned
+    # the process (maybe it is a daemon and started even before autotest
+    # started), so we will have to output the core file to all active test
+    # directories.
+    if not results_dir_list:
+        return pid_dir_dict.values()
+
+    return results_dir_list
+
+
+def get_env_var_from_core(path, var_name):
+    """
+    Look for an environment variable among the printable strings of a core.
+
+    @param path: Path to core file.
+    @param var_name: Name of the environment variable.
+    @return: Value of the first NAME=value string found, '' if there is none.
+    """
+    prefix = '%s=' % var_name
+    output = commands.getoutput('strings %s | grep "^%s"' % (path, prefix))
+    for line in output.splitlines():
+        if line.startswith(prefix):
+            return line[len(prefix):]
+    return ''
+
+
+def get_info_from_core(path):
+    """
+    Reads a core file and extracts a dictionary with useful core information.
+    Right now, the only information extracted is the full executable name.
+
+    @param path: Path to core file.
+    @return: Dictionary with the keys 'core_file' and 'full_exe_path'.
+    """
+    # Here we are getting the executable full path in a very inelegant way :(
+    # Since the 'right' solution for it is to make a library to get information
+    # from core dump files, properly written, I'll leave this as it is for now.
+    full_exe_path = get_env_var_from_core(path, '_')
+    if full_exe_path.startswith("./"):
+        pwd = get_env_var_from_core(path, 'PWD')
+        # Slice the prefix off: str.strip() takes a set of characters, not a
+        # prefix, and would also eat leading/trailing dots and slashes.
+        full_exe_path = os.path.join(pwd, full_exe_path[2:])
+
+    return {'core_file': path, 'full_exe_path': full_exe_path}
+
+
+if __name__ == "__main__":
+    syslog.openlog('AutotestCrashHandler', 0, syslog.LOG_DAEMON)
+    (crashed_pid, time, uid, signal, hostname, exe) = sys.argv[1:]
+    core_name = 'core'
+    report_name = 'report'
+    core_dir_name = 'crash.%s.%s' % (exe, crashed_pid)
+    core_tmp_dir = tempfile.mkdtemp(prefix='core_', dir='/tmp')
+    core_tmp_path = os.path.join(core_tmp_dir, core_name)
+    gdb_command_path = os.path.join(core_tmp_dir, 'gdb_command')
+
+    try:
+        # Get the filtered results dir list
+        current_results_dir_list = get_results_dir_list(crashed_pid,
+                                                        core_dir_name)
+
+        # Write the core file to the appropriate directory
+        # (we are piping it to this script)
+        core_file = sys.stdin.read()
+        write_to_file(core_tmp_path, core_file)
+
+        # Write a command file for GDB
+        gdb_command = 'bt full\n'
+        write_to_file(gdb_command_path, gdb_command)
+
+        # Get full command path
+        exe_path = get_info_from_core(core_tmp_path)['full_exe_path']
+
+        # Take a backtrace from the core dump
+        gdb_cmd = 'gdb -e %s -c %s -x %s -n -batch -quiet' % (exe_path,
+                                                              core_tmp_path,
+                                                              gdb_command_path)
+        backtrace = commands.getoutput(gdb_cmd)
+        # Sanitize output before passing it to the report: drop invalid
+        # UTF-8 sequences, then encode back to a byte string so that writing
+        # the report does not trigger an implicit ASCII encoding.
+        backtrace = backtrace.decode('utf-8', 'ignore').encode('utf-8')
+
+        # Composing the format_dict
+        format_dict = {}
+        format_dict['program'] = exe_path
+        format_dict['pid'] = crashed_pid
+        format_dict['signal'] = signal
+        format_dict['hostname'] = hostname
+        format_dict['time'] = time
+        format_dict['backtrace'] = backtrace
+
+        report = """Autotest crash report
+
+Program: %(program)s
+PID: %(pid)s
+Signal: %(signal)s
+Hostname: %(hostname)s
+Time of the crash: %(time)s
+Program backtrace:
+%(backtrace)s
+""" % format_dict
+
+        syslog.syslog(syslog.LOG_INFO,
+                      "Application %s, PID %s crashed" %
+                      (exe_path, crashed_pid))
+
+        # Now, for all results dir, let's create the directory if it doesn't
+        # exist, and write the core file and the report to it.
+        syslog.syslog(syslog.LOG_INFO,
+                      "Writing core files and reports to %s" %
+                      current_results_dir_list)
+        for result_dir in current_results_dir_list:
+            if not os.path.isdir(result_dir):
+                os.makedirs(result_dir)
+            core_path = os.path.join(result_dir, core_name)
+            write_to_file(core_path, core_file)
+            report_path = os.path.join(result_dir, report_name)
+            write_to_file(report_path, report)
+
+    finally:
+        # Cleanup temporary directories
+        shutil.rmtree(core_tmp_dir)

Adding a userspace application crash handling system to autotest

Commit Message

Patch