From patchwork Fri Apr 30 21:20:31 2010
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Lucas Meneghel Rodrigues <lmr@redhat.com>
X-Patchwork-Id: 96070
Received: from vger.kernel.org (vger.kernel.org [209.132.180.67])
	by demeter.kernel.org (8.14.3/8.14.3) with ESMTP id o3ULKhnR002734
	for <patchwork-kvm@patchwork.kernel.org>;
	Fri, 30 Apr 2010 21:20:43 GMT
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1759599Ab0D3VUj (ORCPT
	<rfc822;patchwork-kvm@patchwork.kernel.org>);
	Fri, 30 Apr 2010 17:20:39 -0400
Received: from mx1.redhat.com ([209.132.183.28]:27851 "EHLO mx1.redhat.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1759567Ab0D3VUi (ORCPT <rfc822;kvm@vger.kernel.org>);
	Fri, 30 Apr 2010 17:20:38 -0400
Received: from int-mx03.intmail.prod.int.phx2.redhat.com
	(int-mx03.intmail.prod.int.phx2.redhat.com [10.5.11.16])
	by mx1.redhat.com (8.13.8/8.13.8) with ESMTP id o3ULKaoq029157
	(version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=OK);
	Fri, 30 Apr 2010 17:20:36 -0400
Received: from localhost.localdomain (vpn-9-198.rdu.redhat.com [10.11.9.198])
	by int-mx03.intmail.prod.int.phx2.redhat.com (8.13.8/8.13.8) with
	ESMTP id o3ULKXD6008154; Fri, 30 Apr 2010 17:20:34 -0400
From: Lucas Meneghel Rodrigues <lmr@redhat.com>
To: autotest@test.kernel.org
Cc: kvm@vger.kernel.org, Lucas Meneghel Rodrigues <lmr@redhat.com>
Subject: [PATCH 1/2] IOzone test: Introduce postprocessing module
Date: Fri, 30 Apr 2010 18:20:31 -0300
Message-Id: <1272662432-27875-1-git-send-email-lmr@redhat.com>
X-Scanned-By: MIMEDefang 2.67 on 10.5.11.16
Sender: kvm-owner@vger.kernel.org
Precedence: bulk
List-ID: <kvm.vger.kernel.org>
X-Mailing-List: kvm@vger.kernel.org
X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by
	milter-greylist-4.2.3 (demeter.kernel.org [140.211.167.41]);
	Fri, 30 Apr 2010 21:20:45 +0000 (UTC)


diff --git a/client/tests/iozone/postprocessing.py b/client/tests/iozone/postprocessing.py
new file mode 100755
index 0000000..b495502
--- /dev/null
+++ b/client/tests/iozone/postprocessing.py
@@ -0,0 +1,487 @@
+#!/usr/bin/python
+"""
+Postprocessing module for IOzone. It is capable of picking results from an
+IOzone run, calculating the geometric mean of all throughput results for
+a given file size or record size, and then generating a series of 2D and 3D
+graphs. The graph generation functionality depends on gnuplot, and if it
+is not present, functionality degrades gracefully.
+
+@copyright: Red Hat 2010
+"""
+import os, sys, optparse, logging, math, time
+import common
+from autotest_lib.client.common_lib import logging_config, logging_manager
+from autotest_lib.client.common_lib import error
+from autotest_lib.client.bin import utils, os_dep
+
+
+_LABELS = ('file_size', 'record_size', 'write', 'rewrite', 'read', 'reread',
+           'randread', 'randwrite', 'bkwdread', 'recordrewrite', 'strideread',
+           'fwrite', 'frewrite', 'fread', 'freread')
+
+
+def unique(list):
+    """
+    Build a list holding each distinct element of list exactly once.
+
+    No particular order is promised for the returned elements.
+
+    @param list: Sized collection with the (hashable) values to filter.
+    @return: List with the distinct elements, [] for an empty input, or
+            None when some element is not hashable.
+    """
+    if len(list) == 0:
+        return []
+    try:
+        seen = dict.fromkeys(list, 1)
+    except TypeError:
+        # Unhashable elements cannot be used as dictionary keys.
+        return None
+    return seen.keys()
+
+
+def geometric_mean(values):
+    """
+    Evaluates the geometric mean for a list of numeric values.
+
+    @param values: List with non negative values that int() can convert.
+    @return: Geometric mean (float), 0.0 if a value is 0, None for bad input.
+    @see: http://en.wikipedia.org/wiki/Geometric_mean
+    """
+    try:
+        values = [int(value) for value in values]
+    except (TypeError, ValueError):
+        return None
+    if not values or min(values) < 0:  # empty list or negative value
+        return None
+    if 0 in values:  # log(0) is undefined, but such a mean is simply 0
+        return 0.0
+    return math.exp(sum([math.log(x) for x in values]) / len(values))
+
+
+def compare_matrices(matrix1, matrix2, treshold=0.05):
+    """
+    Compare 2 matrices nxm and return a matrix nxm with comparison data
+
+    Result cells hold the percent difference of matrix2 against matrix1:
+    a negative float for regressions, a '+' prefixed string for
+    improvements. Cells within the treshold hold '.', or the reference
+    value when they belong to the first column.
+
+    @param matrix1: Reference Matrix with numeric (non zero) data
+    @param matrix2: Matrix that will be compared
+    @param treshold: Differences above this fraction (0.05 = 5%) are reported.
+    @return: (comparison matrix, improvements, regressions, cells compared)
+    """
+    improvements = regressions = same = 0
+    new_matrix = []
+    for line1, line2 in zip(matrix1, matrix2):
+        new_line = []
+        for column, (element1, element2) in enumerate(zip(line1, line2)):
+            ratio = float(element2) / float(element1)
+            difference = 100 * (ratio - 1)
+            if ratio < (1 - treshold):
+                regressions += 1
+                new_line.append(difference)
+            elif ratio > (1 + treshold):
+                improvements += 1
+                new_line.append("+" + str(difference))
+            else:
+                same += 1
+                if column == 0:  # keep the size column readable
+                    new_line.append(element1)
+                else:
+                    new_line.append(".")
+        new_matrix.append(new_line)
+    total = improvements + regressions + same
+    return (new_matrix, improvements, regressions, total)
+
+
+class IOzoneAnalyzer(object):
+    """
+    Analyze an unprocessed IOzone file, and generate the following types of
+    report:
+
+    * Summary of throughput for all file and record sizes combined
+    * Summary of throughput for all file sizes
+    * Summary of throughput for all record sizes
+
+    If more than one file is provided to the analyzer object, a comparison
+    between the two runs is made, searching for regressions in performance.
+    """
+    def __init__(self, list_files, output_dir):
+        """Remember the files to analyze; create output_dir if missing."""
+        self.list_files, self.output_dir = list_files, output_dir
+        if not os.path.isdir(output_dir):
+            os.makedirs(output_dir)
+        logging.info("Results will be stored in %s", output_dir)
+
+
+    def average_performance(self, results, size=None):
+        """
+        Collapse several IOzone result rows into one row of averages.
+
+        @param results: List of rows; columns 2 to 14 hold throughput in KB/s.
+        @param size: Optional size (file or record size) the rows were
+                filtered by; when given it becomes the first element.
+        @return: Flat list: [size] (if given) followed by the 13 geometric
+                means, converted to MB/s and truncated to int.
+        """
+        if size is None:
+            summary = []
+        else:
+            summary = [size]
+        for column in range(2, 15):
+            samples = [row[column] for row in results]
+            summary.append(int(geometric_mean(samples) / 1024.0))
+        return summary
+
+
+    def process_results(self, results, label=None):
+        """
+        Average IOzone results, optionally grouped by a size column.
+
+        @param results: List of rows with the original IOzone results.
+        @param label: Name of the _LABELS column to group rows by (in
+                practice 'file_size' or 'record_size'). With None, all
+                rows are averaged together.
+        @return: List of averaged rows: one per distinct value of the label
+                column, in ascending order, or a single row if label is None.
+        """
+        if label is None:
+            return [self.average_performance(results)]
+
+        # Group the rows by each distinct value found in the chosen column.
+        column = _LABELS.index(label)
+        sizes = unique([row[column] for row in results])
+        sizes.sort()
+        performance = []
+        for size in sizes:
+            matching = [row for row in results if row[column] == size]
+            performance.append(self.average_performance(matching, size))
+        return performance
+
+
+    def parse_file(self, file):
+        """
+        Extract the numeric result rows from an IOzone output file.
+
+        @param file: File object (anything with readlines()) to be parsed.
+        @return: List of rows, each a list of 15 ints. Lines that do not
+                split into exactly 15 integer fields are skipped.
+        """
+        matrix = []
+        for raw_line in file.readlines():
+            columns = raw_line.split()
+            if len(columns) == 15:
+                try:
+                    matrix.append([int(column) for column in columns])
+                except ValueError:
+                    pass  # not a numeric row, skip it
+        return matrix
+
+
+    def report(self, overall_results, record_size_results, file_size_results):
+        """
+        Generates analysis data for IOZone run.
+
+        Generates a report to both logs (where it goes with nice headers) and
+        output files for further processing (graph generation).
+
+        @param overall_results: List of rows holding 13 throughput averages,
+                computed over all file and record sizes.
+        @param record_size_results: List of rows (a record size followed by
+                13 averages), also written to the file 2d-datasource-file.
+        @param file_size_results: List of rows (a file size followed by 13
+                averages), also written to the file 2d-datasource-record.
+        """
+        logging.info("")
+        logging.info("TABLE:  SUMMARY of ALL FILE and RECORD SIZES                        Results in MB/sec")
+        logging.info("")
+        logging.info("FILE & RECORD  INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE")
+        logging.info("SIZES (KB)     WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
+        logging.info("-------------------------------------------------------------------------------------------------------------------")
+        for result_line in overall_results:
+            logging.info("ALL            %-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s" % tuple(result_line))
+        logging.info("")
+
+        logging.info("DRILLED DATA:")
+
+        logging.info("")
+        logging.info("TABLE:  RECORD Size against all FILE Sizes                          Results in MB/sec")
+        logging.info("")
+        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
+        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
+        logging.info("--------------------------------------------------------------------------------------------------------------")
+        self._log_data_rows('2d-datasource-file', record_size_results)
+
+        logging.info("")
+
+        logging.info("")
+        logging.info("TABLE:  FILE Size against all RECORD Sizes                          Results in MB/sec")
+        logging.info("")
+        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
+        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
+        logging.info("--------------------------------------------------------------------------------------------------------------")
+        self._log_data_rows('2d-datasource-record', file_size_results)
+
+        logging.info("")
+
+
+    def _log_data_rows(self, name, rows):
+        """
+        Log one line per row, mirroring those lines to a fresh data file.
+
+        @param name: Name of the data file, created inside self.output_dir.
+        @param rows: List of rows with 14 elements each.
+        """
+        path = os.path.join(self.output_dir, name)
+        if os.path.isfile(path):
+            os.unlink(path)
+        handler = logging.FileHandler(path)
+        handler.setFormatter(logging.Formatter(""))
+        logger = logging.getLogger()
+        logger.addHandler(handler)
+        # Always detach and close the handler so the file is released.
+        try:
+            for row in rows:
+                logging.info("%-10s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s" % tuple(row))
+        finally:
+            logger.removeHandler(handler)
+            handler.close()
+
+
+    def report_comparison(self, record, file):
+        """
+        Generates comparison data for 2 IOZone runs.
+
+        Logs 2 tables with the differences between sets of nxm results, each
+        followed by the amount and percentage of regressions/improvements.
+        A total of 0 (nothing was compared) is reported as 0.00%.
+
+        @param record: Tuple with 4 elements containing results for record size.
+        @param file: Tuple with 4 elements containing results for file size.
+        """
+        (record_size, record_improvements, record_regressions,
+         record_total) = record
+        (file_size, file_improvements, file_regressions,
+         file_total) = file
+        logging.info("ANALYSIS of DRILLED DATA:")
+
+        logging.info("")
+        logging.info("TABLE:  RECsize Difference between runs                            Results are % DIFF")
+        logging.info("")
+        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
+        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
+        logging.info("--------------------------------------------------------------------------------------------------------------")
+        for result_line in record_size:
+            logging.info("%-10s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s" % tuple(result_line))
+        logging.info("REGRESSIONS: %d (%.2f%%)    Improvements: %d (%.2f%%)",
+                     record_regressions,
+                     (100 * record_regressions/float(max(record_total, 1))),
+                     record_improvements,
+                     (100 * record_improvements/float(max(record_total, 1))))
+        logging.info("")
+
+        logging.info("")
+        logging.info("TABLE:  FILEsize Difference between runs                           Results are % DIFF")
+        logging.info("")
+        logging.info("RECORD    INIT    RE              RE    RANDOM  RANDOM  BACKWD   RECRE  STRIDE    F       FRE     F       FRE ")
+        logging.info("SIZE (KB) WRITE   WRITE   READ    READ    READ   WRITE    READ   WRITE    READ    WRITE   WRITE   READ    READ")
+        logging.info("--------------------------------------------------------------------------------------------------------------")
+        for result_line in file_size:
+            logging.info("%-10s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s%-8.6s" % tuple(result_line))
+        logging.info("REGRESSIONS: %d (%.2f%%)    Improvements: %d (%.2f%%)",
+                     file_regressions,
+                     (100 * file_regressions/float(max(file_total, 1))),
+                     file_improvements,
+                     (100 * file_improvements/float(max(file_total, 1))))
+        logging.info("")
+
+
+    def analyze(self):
+        """
+        Parse and report every file in self.list_files; when exactly two
+        files were given, also compare their per size results.
+        """
+        record_size = []
+        file_size = []
+        for path in self.list_files:
+            file = open(path, 'r')
+            logging.info('FILE: %s', path)
+            # Release the file even if parsing it raises.
+            try:
+                results = self.parse_file(file)
+            finally:
+                file.close()
+            overall_results = self.process_results(results)
+            record_size_results = self.process_results(results, 'record_size')
+            file_size_results = self.process_results(results, 'file_size')
+            self.report(overall_results, record_size_results, file_size_results)
+            if len(self.list_files) == 2:
+                record_size.append(record_size_results)
+                file_size.append(file_size_results)
+
+        if len(self.list_files) == 2:
+            record_comparison = compare_matrices(*record_size)
+            file_comparison = compare_matrices(*file_size)
+            self.report_comparison(record_comparison, file_comparison)
+
+
+class IOzonePlotter(object):
+    """
+    Plots graphs based on the results of an IOzone run.
+
+    Plots graphs based on the results of an IOzone run. Uses gnuplot to
+    generate the graphs.
+    """
+    def __init__(self, results_file, output_dir):
+        """Locate gnuplot and prepare the 3D data source, if possible."""
+        self.active = True
+        self.gnuplot = None
+        try:
+            self.gnuplot = os_dep.command("gnuplot")
+        except Exception:  # let KeyboardInterrupt/SystemExit through
+            logging.error("Command gnuplot not found, disabling graph "
+                          "generation")
+            self.active = False
+        if not os.path.isdir(output_dir):
+            os.makedirs(output_dir)
+        self.output_dir = output_dir
+        if not os.path.isfile(results_file):
+            logging.error("Invalid file %s provided, disabling graph "
+                          "generation", results_file)
+            self.active = False
+            self.results_file = None
+        else:
+            self.results_file = results_file
+            self.generate_data_source()
+
+
+    def generate_data_source(self):
+        """Write the numeric rows of the results file to 3d-datasource."""
+        results_file = open(self.results_file, 'r')
+        self.datasource = os.path.join(self.output_dir, '3d-datasource')
+        datasource = open(self.datasource, 'w')
+        try:
+            for line in results_file:
+                fields = line.split()
+                # A row is data if it has 15 fields that int() can convert.
+                try:
+                    if len(fields) == 15 and [int(i) for i in fields]:
+                        datasource.write(line)
+                except ValueError:
+                    continue
+        finally:
+            datasource.close()
+            results_file.close()
+
+
+    def plot_2d_graphs(self):
+        """
+        Plot, via gnuplot, each throughput column of 2d-datasource-file.
+        NOTE(review): report() fills that file with per record size rows,
+        yet the x axis is labeled 'File size (KB)' - confirm.
+        """
+        datasource_2d = os.path.join(self.output_dir, '2d-datasource-file')
+        # Column 1 holds the size; throughput starts at (1-based) column 2.
+        for index, label in zip(range(2, 15), _LABELS[2:]):
+            commands_path = os.path.join(self.output_dir, '2d-%s.do' % label)
+            commands = "set title 'Iozone performance: %s'\n" % label
+            commands += "set logscale x\n"
+            commands += "set xlabel 'File size (KB)'\n"
+            commands += "set ylabel 'Througput (MB/s)'\n"
+            commands += "set terminal png small size 450 350\n"
+            commands += "set output '%s'\n" % os.path.join(self.output_dir,
+                                                           '2d-%s.png' % label)
+            commands += ("plot '%s' using 1:%s title '%s' with lines \n" %
+                         (datasource_2d, index, label))
+            commands_file = open(commands_path, 'w')
+            commands_file.write(commands)
+            commands_file.close()
+            try:
+                utils.run("%s %s" % (self.gnuplot, commands_path))
+            except error.CmdError, e:
+                logging.error("Problem plotting from commands file %s: %s",
+                              commands_path, str(e))
+
+
+    def plot_3d_graphs(self):
+        """
+        For each one of the throughput parameters, generate a set of gnuplot
+        commands that will create a parametric surface with file size vs.
+        record size vs. throughput.
+        """
+        # Columns 1 and 2 are the sizes; throughput starts at column 3.
+        for index, label in zip(range(3, 16), _LABELS[2:]):
+            commands_path = os.path.join(self.output_dir, '%s.do' % label)
+            commands = "set title 'Iozone performance: %s'\n" % label
+            commands += "set grid lt 2 lw 1\n"
+            commands += "set surface\n"
+            commands += "set parametric\n"
+            commands += "set xtics\n"
+            commands += "set ytics\n"
+            commands += "set logscale x 2\n"
+            commands += "set logscale y 2\n"
+            commands += "set logscale z\n"
+            commands += "set xrange [2.**5:2.**24]\n"
+            commands += "set xlabel 'File size (KB)'\n"
+            commands += "set ylabel 'Record size (KB)'\n"
+            commands += "set zlabel 'Througput (KB/s)'\n"
+            commands += "set data style lines\n"
+            commands += "set dgrid3d 80,80, 3\n"
+            commands += "set terminal png small size 900 700\n"
+            commands += "set output '%s'\n" % os.path.join(self.output_dir,
+                                                           '%s.png' % label)
+            commands += ("splot '%s' using 1:2:%s title '%s'\n" %
+                         (self.datasource, index, label))
+            commands_file = open(commands_path, 'w')
+            commands_file.write(commands)
+            commands_file.close()
+            try:
+                utils.run("%s %s" % (self.gnuplot, commands_path))
+            except error.CmdError, e:
+                logging.error("Problem plotting from commands file %s: %s",
+                              commands_path, str(e))
+
+
+    def plot_all(self):
+        """Plot the 2D and then the 3D graphs, unless plotting is disabled."""
+        # active is cleared by __init__ (no gnuplot or no results file).
+        if not self.active:
+            return
+        self.plot_2d_graphs()
+        self.plot_3d_graphs()
+
+
+class AnalyzerLoggingConfig(logging_config.LoggingConfig):
+    def configure_logging(self, results_dir=None, verbose=False):
+        parent = super(AnalyzerLoggingConfig, self)  # results_dir is unused
+        parent.configure_logging(use_console=True, verbose=verbose)
+
+
+if __name__ == "__main__":
+    parser = optparse.OptionParser("usage: %prog [options] [filenames]")
+    options, args = parser.parse_args()
+
+    logging_manager.configure_logging(AnalyzerLoggingConfig())
+
+    # One file is analyzed on its own, two files are also compared with
+    # each other; any other number of arguments is a usage error.
+    if not args or len(args) > 2:
+        parser.print_help()
+        sys.exit(1)
+    filenames = args
+
+    # Results are stored in a timestamped directory under the current one.
+    o = os.path.join(os.getcwd(),
+                     "iozone-graphs-%s" % time.strftime('%Y-%m-%d-%H.%M.%S'))
+    if not os.path.isdir(o):
+        os.makedirs(o)
+
+    a = IOzoneAnalyzer(list_files=filenames, output_dir=o)
+    a.analyze()
+
+    # The plotter is fed with the first results file only.
+    p = IOzonePlotter(results_file=filenames[0], output_dir=o)
+    p.plot_all()