swift-mirror/benchmark/scripts/Benchmark_Driver
Pavol Vaskovic a84db83062 [benchmark] BenchmarkDriver can run tests
The `run` method on `BenchmarkDriver` invokes the test harness with the specified number of iterations and samples. It supports measuring memory use, and in verbose mode it also collects individual samples and monitors system load by counting the number of voluntary and involuntary context switches.

Output is parsed using `LogParser` from `compare_perf_tests.py`. This makes that file a required dependency of the driver, so it is also copied to the bin directory during the build.
2018-08-17 08:39:50 +02:00
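
For illustration, a minimal sketch of exercising the new `run` method from Python. The harness directory and the benchmark name below are hypothetical placeholders; the sketch assumes a built `Benchmark_O` binary in that directory and uses the attributes of `PerformanceTestResult` from `compare_perf_tests.py`.

    # Illustrative only: parse hypothetical arguments, build the driver, and
    # take three samples of a single benchmark while measuring memory use.
    args = parse_args(['run', '-t', '/path/to/benchmark/bin', 'AngryPhonebook'])
    driver = BenchmarkDriver(args)
    result = driver.run('AngryPhonebook', num_samples=3, num_iters=1,
                        verbose=True, measure_memory=True)
    # `result` is a PerformanceTestResult parsed by LogParser (times in μs).
    print('%s: min %s, max %s' % (result.name, result.min, result.max))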


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ===--- Benchmark_Driver ------------------------------------------------===//
#
# This source file is part of the Swift.org open source project
#
# Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
# Licensed under Apache License v2.0 with Runtime Library Exception
#
# See https://swift.org/LICENSE.txt for license information
# See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
#
# ===---------------------------------------------------------------------===//
import argparse
import glob
import os
import re
import subprocess
import sys
import time

from compare_perf_tests import LogParser

DRIVER_DIR = os.path.dirname(os.path.realpath(__file__))


class BenchmarkDriver(object):
"""Executes tests from Swift Benchmark Suite."""

    def __init__(self, args, tests=None, _subprocess=None, parser=None):
"""Initialized with command line arguments.
Optional parameters for injecting dependencies; used for testing.
"""
self.args = args
self._subprocess = _subprocess or subprocess
self.all_tests = []
self.tests = tests or self._get_tests()
self.parser = parser or LogParser()
self.results = {}

    def _invoke(self, cmd):
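        """Invoke the given command and return its combined stdout/stderr."""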
return self._subprocess.check_output(
cmd, stderr=self._subprocess.STDOUT)

    @property
def test_harness(self):
"""Full path to test harness binary."""
suffix = (self.args.optimization if hasattr(self.args, 'optimization')
else 'O')
return os.path.join(self.args.tests, "Benchmark_" + suffix)

    @property
def _cmd_list_benchmarks(self):
# Use tab delimiter for easier parsing to override the default comma.
# (The third 'column' is always comma-separated list of tags in square
# brackets -- currently unused here.)
return [self.test_harness, '--list', '--delim=\t'] + (
['--skip-tags='] if (self.args.benchmarks or
self.args.filters) else [])

    def _get_tests(self):
"""Return a list of performance tests to run."""
index_name_pairs = [
line.split('\t')[:2] for line in
self._invoke(self._cmd_list_benchmarks).split('\n')[1:-1]
]
# unzip list of pairs into 2 lists
indices, self.all_tests = map(list, zip(*index_name_pairs))
if self.args.filters:
return self._tests_matching_patterns()
if self.args.benchmarks:
return self._tests_by_name_or_index(indices)
return self.all_tests

    def _tests_matching_patterns(self):
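        """Return tests with names matching any of the regex filters."""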
regexes = [re.compile(pattern) for pattern in self.args.filters]
return sorted(list(set([name for pattern in regexes
for name in self.all_tests
if pattern.match(name)])))

    def _tests_by_name_or_index(self, indices):
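        """Return tests requested by name or by their ordinal number."""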
benchmarks = set(self.args.benchmarks)
index_to_name = dict(zip(indices, self.all_tests))
indexed_names = [index_to_name[i]
for i in benchmarks.intersection(set(indices))]
return sorted(list(
benchmarks.intersection(set(self.all_tests)).union(indexed_names)))

    def run(self, test, num_samples=None, num_iters=None,
verbose=None, measure_memory=False):
"""Execute benchmark and gather results."""
num_samples = num_samples or 1
num_iters = num_iters or 0 # automatically determine N to run for 1s
cmd = self._cmd_run(
test, num_samples, num_iters, verbose, measure_memory)
output = self._invoke(cmd)
result = self.parser.results_from_string(output).items()[0][1]
return result

    def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory):
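        """Compose the command line to run the test with given options."""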
cmd = [self.test_harness, test]
if num_samples > 1:
cmd.append('--num-samples={0}'.format(num_samples))
if num_iters > 0:
cmd.append('--num-iters={0}'.format(num_iters))
if verbose:
cmd.append('--verbose')
if measure_memory:
cmd.append('--memory')
return cmd


def instrument_test(driver_path, test, num_samples):
    """Run a test and instrument its peak memory use."""
test_outputs = []
for _ in range(num_samples):
test_output_raw = subprocess.check_output(
['time', '-lp', driver_path, test],
stderr=subprocess.STDOUT
)
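        # `time -l` reports peak memory as "maximum resident set size";
        # the value is read from a fixed offset near the end of the output.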
        peak_memory = re.match(r'\s*(\d+)\s*maximum resident set size',
                               test_output_raw.split('\n')[-15]).group(1)
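        # Keep the harness's comma-separated result row (the second
        # whitespace-separated token) and append the peak memory figure.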
test_outputs.append(test_output_raw.split()[1].split(',') +
[peak_memory])
# Average sample results
num_samples_index = 2
min_index = 3
max_index = 4
avg_start_index = 5
# TODO: Correctly take stdev
avg_test_output = test_outputs[0]
avg_test_output[avg_start_index:] = map(int,
avg_test_output[avg_start_index:])
for test_output in test_outputs[1:]:
for i in range(avg_start_index, len(test_output)):
avg_test_output[i] += int(test_output[i])
for i in range(avg_start_index, len(avg_test_output)):
avg_test_output[i] = int(round(avg_test_output[i] /
float(len(test_outputs))))
avg_test_output[num_samples_index] = num_samples
avg_test_output[min_index] = min(
test_outputs, key=lambda x: int(x[min_index]))[min_index]
avg_test_output[max_index] = max(
test_outputs, key=lambda x: int(x[max_index]))[max_index]
avg_test_output = map(str, avg_test_output)
return avg_test_output


def get_current_git_branch(git_repo_path):
    """Return the currently checked out branch of the repo `git_repo_path`."""
return subprocess.check_output(
['git', '-C', git_repo_path, 'rev-parse',
'--abbrev-ref', 'HEAD'], stderr=subprocess.STDOUT).strip()


def get_git_head_ID(git_repo_path):
    """Return the short identifier for the HEAD commit of the repo
    `git_repo_path`."""
return subprocess.check_output(
['git', '-C', git_repo_path, 'rev-parse',
'--short', 'HEAD'], stderr=subprocess.STDOUT).strip()


def log_results(log_directory, driver, formatted_output, swift_repo=None):
    """Log `formatted_output` to a branch-specific directory in
    `log_directory`.
    """
try:
branch = get_current_git_branch(swift_repo)
except (OSError, subprocess.CalledProcessError):
branch = None
try:
head_ID = '-' + get_git_head_ID(swift_repo)
except (OSError, subprocess.CalledProcessError):
head_ID = ''
timestamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
if branch:
output_directory = os.path.join(log_directory, branch)
else:
output_directory = log_directory
driver_name = os.path.basename(driver)
try:
os.makedirs(output_directory)
except OSError:
pass
log_file = os.path.join(output_directory,
driver_name + '-' + timestamp + head_ID + '.log')
print('Logging results to: %s' % log_file)
with open(log_file, 'w') as f:
f.write(formatted_output)


def run_benchmarks(driver, benchmarks=[], num_samples=10, verbose=False,
log_directory=None, swift_repo=None):
"""Run perf tests individually and return results in a format that's
compatible with `parse_results`. If `benchmarks` is not empty,
only run tests included in it.
"""
# Set a constant hash seed. Some tests are currently sensitive to
# fluctuations in the number of hash collisions.
#
# FIXME: This should only be set in the environment of the child process
# that runs the tests.
os.environ["SWIFT_DETERMINISTIC_HASHING"] = "1"
(total_tests, total_min, total_max, total_mean) = (0, 0, 0, 0)
output = []
headings = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'MAX(μs)', 'MEAN(μs)',
'SD(μs)', 'MEDIAN(μs)', 'MAX_RSS(B)']
line_format = '{:>3} {:<25} {:>7} {:>7} {:>7} {:>8} {:>6} {:>10} {:>10}'
if verbose and log_directory:
print(line_format.format(*headings))
for test in benchmarks:
test_output = instrument_test(driver, test, num_samples)
if test_output[0] == 'Totals':
continue
if verbose:
if log_directory:
print(line_format.format(*test_output))
else:
print(','.join(test_output))
output.append(test_output)
(samples, _min, _max, mean) = map(int, test_output[2:6])
total_tests += 1
total_min += _min
total_max += _max
total_mean += mean
if not output:
return
formatted_output = '\n'.join([','.join(l) for l in output])
totals = map(str, ['Totals', total_tests, total_min, total_max,
total_mean, '0', '0', '0'])
totals_output = '\n\n' + ','.join(totals)
if verbose:
if log_directory:
print(line_format.format(*([''] + totals)))
else:
print(totals_output[1:])
formatted_output += totals_output
if log_directory:
log_results(log_directory, driver, formatted_output, swift_repo)
return formatted_output


def run(args):
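    """Run the benchmarks and log results if an output directory is set."""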
driver = BenchmarkDriver(args)
run_benchmarks(
driver.test_harness, benchmarks=driver.tests,
num_samples=args.iterations, verbose=True,
log_directory=args.output_dir,
swift_repo=args.swift_repo)
return 0


def format_name(log_path):
    """Return the directory and filename portion of a log path."""
return '/'.join(log_path.split('/')[-2:])


def compare_logs(compare_script, new_log, old_log, log_dir, opt):
    """Compare the logs at paths `old_log` and `new_log` and write a
    markdown report into `log_dir`."""
print('Comparing %s %s ...' % (format_name(old_log), format_name(new_log)))
subprocess.call([compare_script, '--old-file', old_log,
'--new-file', new_log, '--format', 'markdown',
'--output', os.path.join(log_dir, 'latest_compare_{0}.md'
.format(opt))])


def compare(args):
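    """Compare benchmark results of the current and baseline branches."""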
log_dir = args.log_dir
swift_repo = args.swift_repo
compare_script = args.compare_script
baseline_branch = args.baseline_branch
current_branch = get_current_git_branch(swift_repo)
current_branch_dir = os.path.join(log_dir, current_branch)
baseline_branch_dir = os.path.join(log_dir, baseline_branch)
if current_branch != baseline_branch and \
not os.path.isdir(baseline_branch_dir):
print(('Unable to find benchmark logs for {baseline_branch} branch. ' +
'Set a baseline benchmark log by passing --benchmark to ' +
'build-script while on {baseline_branch} branch.')
.format(baseline_branch=baseline_branch))
return 1
recent_logs = {}
for branch_dir in [current_branch_dir, baseline_branch_dir]:
for opt in ['O', 'Onone']:
recent_logs[os.path.basename(branch_dir) + '_' + opt] = sorted(
glob.glob(os.path.join(
branch_dir, 'Benchmark_' + opt + '-*.log')),
key=os.path.getctime, reverse=True)
if current_branch == baseline_branch:
if len(recent_logs[baseline_branch + '_O']) > 1 and \
len(recent_logs[baseline_branch + '_Onone']) > 1:
compare_logs(compare_script,
recent_logs[baseline_branch + '_O'][0],
recent_logs[baseline_branch + '_O'][1],
log_dir, 'O')
compare_logs(compare_script,
recent_logs[baseline_branch + '_Onone'][0],
recent_logs[baseline_branch + '_Onone'][1],
log_dir, 'Onone')
else:
print(('{baseline_branch}/{baseline_branch} comparison ' +
'skipped: no previous {baseline_branch} logs')
.format(baseline_branch=baseline_branch))
else:
# TODO: Check for outdated baseline branch log
if len(recent_logs[current_branch + '_O']) == 0 or \
len(recent_logs[current_branch + '_Onone']) == 0:
print('branch sanity failure: missing branch logs')
return 1
if len(recent_logs[current_branch + '_O']) == 1 or \
len(recent_logs[current_branch + '_Onone']) == 1:
print('branch/branch comparison skipped: no previous branch logs')
else:
compare_logs(compare_script,
recent_logs[current_branch + '_O'][0],
recent_logs[current_branch + '_O'][1],
log_dir, 'O')
compare_logs(compare_script,
recent_logs[current_branch + '_Onone'][0],
recent_logs[current_branch + '_Onone'][1],
log_dir, 'Onone')
if len(recent_logs[baseline_branch + '_O']) == 0 or \
len(recent_logs[baseline_branch + '_Onone']) == 0:
print(('branch/{baseline_branch} failure: no {baseline_branch} ' +
'logs')
.format(baseline_branch=baseline_branch))
return 1
else:
compare_logs(compare_script,
recent_logs[current_branch + '_O'][0],
recent_logs[baseline_branch + '_O'][0],
log_dir, 'O')
compare_logs(compare_script,
recent_logs[current_branch + '_Onone'][0],
recent_logs[baseline_branch + '_Onone'][0],
log_dir, 'Onone')
# TODO: Fail on large regressions
return 0


def positive_int(value):
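    """Verify the value is a positive integer and return it."""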
ivalue = int(value)
if not (ivalue > 0):
raise ValueError
return ivalue


def parse_args(args):
"""Parse command line arguments and set default values."""
parser = argparse.ArgumentParser(
epilog='Example: ./Benchmark_Driver run -i 5 -f Prefix -f .*Suffix.*'
)
subparsers = parser.add_subparsers(
title='Swift benchmark driver commands',
help='See COMMAND -h for additional arguments', metavar='COMMAND')
shared_benchmarks_parser = argparse.ArgumentParser(add_help=False)
benchmarks_group = shared_benchmarks_parser.add_mutually_exclusive_group()
benchmarks_group.add_argument(
'benchmarks',
default=[],
help='benchmark to run (default: all)', nargs='*', metavar="BENCHMARK")
benchmarks_group.add_argument(
'-f', '--filter', dest='filters', action='append',
help='run all tests whose name match regular expression PATTERN, ' +
'multiple filters are supported', metavar="PATTERN")
shared_benchmarks_parser.add_argument(
'-t', '--tests',
help='directory containing Benchmark_O{,none,size} ' +
'(default: DRIVER_DIR)',
default=DRIVER_DIR)
shared_benchmarks_parser.add_argument(
'-o', '--optimization',
metavar='OPT',
choices=['O', 'Onone', 'Osize'],
help='optimization level to use: {O,Onone,Osize}, (default: O)',
default='O')
run_parser = subparsers.add_parser(
'run',
help='Run benchmarks and output results to stdout',
parents=[shared_benchmarks_parser])
run_parser.add_argument(
'-i', '--iterations',
help='number of times to run each test (default: 1)',
type=positive_int, default=1)
run_parser.add_argument(
'--output-dir',
help='log results to directory (default: no logging)')
run_parser.add_argument(
'--swift-repo',
help='absolute path to the Swift source repository')
run_parser.set_defaults(func=run)
compare_parser = subparsers.add_parser(
'compare',
help='Compare benchmark results')
compare_parser.add_argument(
'--log-dir', required=True,
help='directory containing benchmark logs')
compare_parser.add_argument(
'--swift-repo', required=True,
help='absolute path to the Swift source repository')
compare_parser.add_argument(
'--compare-script', required=True,
help='absolute path to compare script')
compare_parser.add_argument(
'--baseline-branch', default='master',
help='attempt to compare results to baseline results for specified '
'branch (default: master)')
compare_parser.set_defaults(func=compare)
return parser.parse_args(args)


def main():
args = parse_args(sys.argv[1:])
return args.func(args)


if __name__ == '__main__':
exit(main())