From: Lucas Meneghel Rodrigues
To: autotest@test.kernel.org
Cc: kvm@vger.kernel.org, jadmanski@google.com, Lucas Meneghel Rodrigues
Subject: [PATCH] Adding a userspace application crash handling system to autotest
Date: Tue, 8 Sep 2009 10:53:50 -0300
Message-Id: <1252418030-27982-1-git-send-email-lmr@redhat.com>

This patch adds a system that watches for user space segmentation faults,
writes core dumps and produces a basic analysis report for each core dump.
We believe such a system will benefit autotest as a whole, since core dumps
and dump analysis for every application that crashes during an autotest
execution give test engineers richer debugging information.

The system consists of 2 parts:

 * Modifications to the test code that enable core dump generation, register
   a core handler script with the kernel and check for generated core files
   at the end of each test.
 * A core handler script that writes the core to each test debug dir in a
   convenient way, together with a report that currently consists of the
   process that died and a gdb stack trace of the process.

As the system takes shape, we could add more scripts that do fancier things
(such as handlers that use frysk to gather more information, like memory
maps, provided frysk is installed on the machine).

This is a proof of concept of the system. I am sending it to the mailing
list at this early stage so I can get feedback on the feature. The system
passes my basic tests:

 * Run a simple long test, such as the kvm test, and crash an application
   while the test is running. Reports are generated in test.debugdir.
 * Run a slightly more complex control file, with 3 parallel bonnie
   instances at once, and crash an application while the test is running.
   Reports are generated in all test.debugdirs.

3rd try:

 * Explicitly enable core dumps using the resource module.
 * Fixed a bug in the crash detection code and factored it into a utility
   function.

I believe we are good to go now.

Signed-off-by: Lucas Meneghel Rodrigues
---
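For reviewers who have not used the pipe-based core_pattern facility before:
when /proc/sys/kernel/core_pattern starts with '|', the kernel runs the named
program on every crash, expands the % specifiers (%p PID, %t time, %u uid,
%s signal, %h hostname, %e executable name) into its arguments and writes the
core dump to the program's stdin. A minimal sketch of such a handler, for
illustration only (the /tmp/minimal_handler.py path is made up and is not
part of this patch):

#!/usr/bin/python
# Illustration only: register it with
#   echo '|/tmp/minimal_handler.py %p %s %e' > /proc/sys/kernel/core_pattern
# and make the script executable. The kernel passes the expanded specifiers
# as arguments and pipes the core dump itself to stdin.
import sys

if __name__ == "__main__":
    pid, sig, exe = sys.argv[1:4]
    core = sys.stdin.read()
    open('/tmp/core.%s.%s' % (exe, pid), 'w').write(core)

crash_handler.py below does essentially this, plus figuring out which test
debug dirs should receive the core and generating a gdb report.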
 client/common_lib/test.py     |   66 ++++++++++++++-
 client/tools/crash_handler.py |  202 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 266 insertions(+), 2 deletions(-)
 create mode 100755 client/tools/crash_handler.py

diff --git a/client/common_lib/test.py b/client/common_lib/test.py
index 362c960..65b78a3 100644
--- a/client/common_lib/test.py
+++ b/client/common_lib/test.py
@@ -17,7 +17,7 @@
 #       tmpdir          eg. tmp/<tempname>_<testname.tag>
 
 import fcntl, os, re, sys, shutil, tarfile, tempfile, time, traceback
-import warnings, logging
+import warnings, logging, glob, resource
 
 from autotest_lib.client.common_lib import error
 from autotest_lib.client.bin import utils
@@ -31,7 +31,6 @@ class base_test:
         self.job = job
         self.pkgmgr = job.pkgmgr
         self.autodir = job.autodir
-        self.outputdir = outputdir
 
         self.tagged_testname = os.path.basename(self.outputdir)
         self.resultsdir = os.path.join(self.outputdir, 'results')
@@ -40,6 +39,7 @@ class base_test:
         os.mkdir(self.profdir)
         self.debugdir = os.path.join(self.outputdir, 'debug')
         os.mkdir(self.debugdir)
+        self.configure_crash_handler()
         self.bindir = bindir
         if hasattr(job, 'libdir'):
             self.libdir = job.libdir
@@ -54,6 +54,66 @@ class base_test:
         self.after_iteration_hooks = []
 
 
+    def configure_crash_handler(self):
+        """
+        Configure the crash handler by:
+         * Setting the core size limit to unlimited
+         * Registering an appropriate crash handler in
+           /proc/sys/kernel/core_pattern
+         * Creating files that the crash handler will use to figure out which
+           tests are active at a given moment
+
+        The crash handler will pick up the core file and write it to
+        self.debugdir, and perform analysis on it to generate a report. The
+        program also outputs some results to syslog.
+
+        If multiple tests are running, the handler checks whether the crashed
+        process descends from the PID recorded by one of them, to determine
+        which test it belongs to. If that can't be determined, the core file
+        and the report file will be copied to all test debug dirs.
+        """
+        self.pattern_file = '/proc/sys/kernel/core_pattern'
+        try:
+            # Enable core dumps
+            resource.setrlimit(resource.RLIMIT_CORE, (-1, -1))
+            # Back up the core pattern and register our handler script
+            self.core_pattern_backup = open(self.pattern_file, 'r').read()
+            pattern_file = open(self.pattern_file, 'w')
+            tools_dir = os.path.join(self.autodir, 'tools')
+            crash_handler_path = os.path.join(tools_dir, 'crash_handler.py')
+            pattern_file.write('|' + crash_handler_path + ' %p %t %u %s %h %e')
+            # Write the file that the crash handler is going to use to find
+            # this test's debug dir
+            self.debugdir_tmp_file = ('/tmp/autotest_results_dir.%s' %
+                                      os.getpid())
+            utils.open_write_close(self.debugdir_tmp_file, self.debugdir + "\n")
+        except Exception, e:
+            self.crash_handling_enabled = False
+            logging.error('Crash handling system disabled: %s' % e)
+        else:
+            self.crash_handling_enabled = True
+            logging.debug('Crash handling system enabled.')
+
+
+    def crash_handler_report(self):
+        """
+        If core dumps are found in the debugdir after the execution of the
+        test, let the user know.
+ """ + if self.crash_handling_enabled: + core_dirs = glob.glob('%s/crash.*' % self.debugdir) + if core_dirs: + logging.warning('Programs crashed during test execution:') + for dir in core_dirs: + logging.warning('Please verify %s for more info', dir) + # Remove the debugdir info file + os.unlink(self.debugdir_tmp_file) + # Restore the core pattern backup + try: + utils.open_write_close(self.pattern_file, + self.core_pattern_backup) + except EnvironmentError: + pass + + def assert_(self, expr, msg='Assertion failed.'): if not expr: raise error.TestError(msg) @@ -377,6 +437,7 @@ class base_test: traceback.print_exc() print 'Now raising the earlier %s error' % exc_info[0] finally: + self.crash_handler_report() self.job.logging.restore() try: raise exc_info[0], exc_info[1], exc_info[2] @@ -389,6 +450,7 @@ class base_test: if run_cleanup: _cherry_pick_call(self.cleanup, *args, **dargs) finally: + self.crash_handler_report() self.job.logging.restore() except error.AutotestError: if self.network_destabilizing: diff --git a/client/tools/crash_handler.py b/client/tools/crash_handler.py new file mode 100755 index 0000000..e281eb5 --- /dev/null +++ b/client/tools/crash_handler.py @@ -0,0 +1,202 @@ +#!/usr/bin/python +""" +Simple crash handling application for autotest + +@copyright Red Hat Inc 2009 +@author Lucas Meneghel Rodrigues +""" +import sys, os, commands, glob, tempfile, shutil, syslog + + +def get_parent_pid(pid): + """ + Returns the parent PID for a given PID, converted to an integer. + + @param pid: Process ID. + """ + try: + stat_file_contents = open('/proc/%s/stat' % pid, 'r').readline() + ppid = int(stat_file_contents.split(" ")[3]) + except: + # It is not possible to determine the parent because the process + # already left the process table. + ppid = 1 + + return ppid + + +def pid_descends_from(pid_a, pid_b): + """ + Check whether pid_a descends from pid_b. + + @param pid_a: Process ID. + @param pid_b: Process ID. + """ + pid_a = int(pid_a) + pid_b = int(pid_b) + current_pid = pid_a + while current_pid > 1: + if current_pid == pid_b: + syslog.syslog(syslog.LOG_INFO, + "PID %s descends from PID %s!" % (pid_a, pid_b)) + return True + else: + current_pid = get_parent_pid(current_pid) + syslog.syslog(syslog.LOG_INFO, + "PID %s does not descend from PID %s" % (pid_a, pid_b)) + return False + + +def write_to_file(file_path, contents): + """ + Write contents to a given file path specified. If not specified, the file + will be created. + + @param file_path: Path to a given file. + @param contents: File contents. + """ + file_object = open(file_path, 'w') + file_object.write(contents) + file_object.close() + + +def get_results_dir_list(pid, core_dir_basename): + """ + Get all valid output directories for the core file and the report. It works + by inspecting files created by each test on /tmp and verifying if the + PID of the process that crashed is a child or grandchild of the autotest + test process. If it can't find any relationship (maybe a daemon that died + during a test execution), it will write the core file to the debug dirs + of all tests currently being executed. If there are no active autotest + tests at a particular moment, it will return a list with ['/tmp']. + + @param pid: PID for the process that generated the core + @param core_dir_basename: Basename for the directory that will hold both + the core dump and the crash report. 
+ """ + # Get all active test debugdir path files present + debugdir_files = glob.glob("/tmp/autotest_results_dir.*") + if debugdir_files: + pid_dir_dict = {} + for debugdir_file in debugdir_files: + a_pid = debugdir_file.split('.')[-1] + results_dir = open(debugdir_file, 'r').read().strip() + pid_dir_dict[a_pid] = os.path.join(results_dir, core_dir_basename) + + results_dir_list = [] + found_relation = False + for a_pid, a_path in pid_dir_dict.iteritems(): + if pid_descends_from(pid, a_pid): + results_dir_list.append(a_path) + found_relation = True + + # If we could not find any relations between the pids in the list with + # the process that crashed, we can't tell for sure which test spawned + # the process (maybe it is a daemon and started even before autotest + # started), so we will have to output the core file to all active test + # directories. + if not found_relation: + return pid_dir_dict.values() + else: + return results_dir_list + + else: + path_inactive_autotest = os.path.join('/tmp', core_dir_basename) + return [path_inactive_autotest] + + +def get_info_from_core(path): + """ + Reads a core file and extracts a dictionary with useful core information. + Right now, the only information extracted is the full executable name. + + @param path: Path to core file. + """ + # Here we are getting the executable full path in a very inelegant way :( + # Since the 'right' solution for it is to make a library to get information + # from core dump files, properly written, I'll leave this as it is for now. + full_exe_path = commands.getoutput('strings %s | grep "_="' % + path).strip("_=") + if full_exe_path.startswith("./"): + pwd = commands.getoutput('strings %s | grep "^PWD="' % + path).strip("PWD=") + full_exe_path = os.path.join(pwd, full_exe_path.strip("./")) + + return {'core_file': path, 'full_exe_path': full_exe_path} + + +if __name__ == "__main__": + syslog.openlog('AutotestCrashHandler', 0, syslog.LOG_DAEMON) + (crashed_pid, time, uid, signal, hostname, exe) = sys.argv[1:] + core_name = 'core' + report_name = 'report' + core_dir_name = 'crash.%s.%s' % (exe, crashed_pid) + core_tmp_dir = tempfile.mkdtemp(prefix='core_', dir='/tmp') + core_tmp_path = os.path.join(core_tmp_dir, core_name) + gdb_command_path = os.path.join(core_tmp_dir, 'gdb_command') + + try: + # Get the filtered results dir list + current_results_dir_list = get_results_dir_list(crashed_pid, + core_dir_name) + + # Write the core file to the appropriate directory + # (we are piping it to this script) + core_file = sys.stdin.read() + write_to_file(core_tmp_path, core_file) + + # Write a command file for GDB + gdb_command = 'bt full\n' + write_to_file(gdb_command_path, gdb_command) + + # Get full command path + exe_path = get_info_from_core(core_tmp_path)['full_exe_path'] + + # Take a backtrace from the running program + gdb_cmd = 'gdb -e %s -c %s -x %s -n -batch -quiet' % (exe_path, + core_tmp_path, + gdb_command_path) + backtrace = commands.getoutput(gdb_cmd) + # Sanitize output before passing it to the report + backtrace = backtrace.decode('utf-8', 'ignore') + + # Composing the format_dict + format_dict = {} + format_dict['program'] = exe_path + format_dict['pid'] = crashed_pid + format_dict['signal'] = signal + format_dict['hostname'] = hostname + format_dict['time'] = time + format_dict['backtrace'] = backtrace + + report = """Autotest crash report + +Program: %(program)s +PID: %(pid)s +Signal: %(signal)s +Hostname: %(hostname)s +Time of the crash: %(time)s +Program backtrace: +%(backtrace)s +""" % format_dict + + 
+        syslog.syslog(syslog.LOG_INFO,
+                      "Application %s, PID %s crashed" %
+                      (exe_path, crashed_pid))
+
+        # Now, for all results dirs, create the directory if it doesn't
+        # exist, and write the core file and the report to it.
+        syslog.syslog(syslog.LOG_INFO,
+                      "Writing core files and reports to %s" %
+                      current_results_dir_list)
+        for result_dir in current_results_dir_list:
+            if not os.path.isdir(result_dir):
+                os.makedirs(result_dir)
+            core_path = os.path.join(result_dir, 'core')
+            write_to_file(core_path, core_file)
+            report_path = os.path.join(result_dir, 'report')
+            write_to_file(report_path, report)
+
+    finally:
+        # Clean up the temporary directory
+        shutil.rmtree(core_tmp_dir)
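To reproduce the basic tests described above, any program that dies with a
core-dumping signal while a test is running will exercise the handler. One
quick way to trigger that from a Python prompt (example only; it assumes the
shell's core size limit, ulimit -c, is not zero so the kernel actually
produces a dump):

import os, signal, subprocess, time

# Start a throwaway process and force it to dump core while a test runs.
p = subprocess.Popen(['sleep', '60'])
time.sleep(1)
# SIGSEGV's default action is terminate + core dump, so the kernel will
# invoke the registered crash_handler.py with the core on its stdin.
os.kill(p.pid, signal.SIGSEGV)

A crash.sleep.<pid> directory containing 'core' and 'report' should then
show up under the debug dir of each active test (or under /tmp if no
autotest test is running).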