#!/usr/bin/python # -*- coding: utf-8 -*- # ===--- compare_perf_tests.py -------------------------------------------===// # # This source file is part of the Swift.org open source project # # Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors # Licensed under Apache License v2.0 with Runtime Library Exception # # See https://swift.org/LICENSE.txt for license information # See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors # # ===---------------------------------------------------------------------===// """ This script compares performance test logs and issues a formatted report. Invoke `$ compare_perf_tests.py -h ` for complete list of options. class `Sample` is single benchmark measurement. class `PerformanceTestSamples` is collection of `Sample`s and their statistics. class `PerformanceTestResult` is a summary of performance test execution. class `LogParser` converts log files into `PerformanceTestResult`s. class `ResultComparison` compares new and old `PerformanceTestResult`s. class `TestComparator` analyzes changes betweeen the old and new test results. class `ReportFormatter` creates the test comparison report in specified format. """ from __future__ import print_function import argparse import functools import re import sys from bisect import bisect, bisect_left, bisect_right from collections import namedtuple from math import ceil, sqrt class Sample(namedtuple("Sample", "i num_iters runtime")): u"""Single benchmark measurement. Initialized with: `i`: ordinal number of the sample taken, `num-num_iters`: number or iterations used to compute it, `runtime`: in microseconds (μs). """ def __repr__(self): """Shorter Sample formatting for debugging purposes.""" return "s({0.i!r}, {0.num_iters!r}, {0.runtime!r})".format(self) class Yield(namedtuple("Yield", "before_sample after")): u"""Meta-measurement of when the Benchmark_X voluntarily yielded process. `before_sample`: index of measurement taken just after returning from yield `after`: time elapsed since the previous yield in microseconds (μs) """ class PerformanceTestSamples(object): """Collection of runtime samples from the benchmark execution. Computes the sample population statistics. """ def __init__(self, name, samples=None): """Initialize with benchmark name and optional list of Samples.""" self.name = name # Name of the performance test self.samples = [] self.outliers = [] self._runtimes = [] self.mean = 0.0 self.S_runtime = 0.0 # For computing running variance for sample in samples or []: self.add(sample) def __str__(self): """Text summary of benchmark statistics.""" return ( "{0.name!s} n={0.count!r} " "Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} " "Max={0.max!r} " "R={0.range!r} {0.spread:.2%} IQR={0.iqr!r} " "Mean={0.mean:.0f} SD={0.sd:.0f} CV={0.cv:.2%}".format(self) if self.samples else "{0.name!s} n=0".format(self) ) def add(self, sample): """Add sample to collection and recompute statistics.""" assert isinstance(sample, Sample) self._update_stats(sample) i = bisect(self._runtimes, sample.runtime) self._runtimes.insert(i, sample.runtime) self.samples.insert(i, sample) def _update_stats(self, sample): old_stats = (self.count, self.mean, self.S_runtime) _, self.mean, self.S_runtime = self.running_mean_variance( old_stats, sample.runtime ) def exclude_outliers(self, top_only=False): """Exclude outliers by applying Interquartile Range Rule. Moves the samples outside of the inner fences (Q1 - 1.5*IQR and Q3 + 1.5*IQR) into outliers list and recomputes statistics for the remaining sample population. Optionally apply only the top inner fence, preserving the small outliers. Experimentally, this rule seems to perform well-enough on the benchmark runtimes in the microbenchmark range to filter out the environment noise caused by preemtive multitasking. """ lo = ( 0 if top_only else bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr)) ) hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr)) outliers = self.samples[:lo] + self.samples[hi:] samples = self.samples[lo:hi] self.__init__(self.name) # re-initialize for sample in samples: # and self.add(sample) # re-compute stats self.outliers = outliers @property def count(self): """Number of samples used to compute the statistics.""" return len(self.samples) @property def num_samples(self): """Number of all samples in the collection.""" return len(self.samples) + len(self.outliers) @property def all_samples(self): """List of all samples in ascending order.""" return sorted(self.samples + self.outliers, key=lambda s: s.i or -1) @property def min(self): """Minimum sampled value.""" return self.samples[0].runtime @property def max(self): """Maximum sampled value.""" return self.samples[-1].runtime def quantile(self, q): """Return runtime for given quantile. Equivalent to quantile estimate type R-1, SAS-3. See: https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample """ index = max(0, int(ceil(self.count * float(q))) - 1) return self.samples[index].runtime @property def median(self): """Median sampled value.""" return self.quantile(0.5) @property def q1(self): """First Quartile (25th Percentile).""" return self.quantile(0.25) @property def q3(self): """Third Quartile (75th Percentile).""" return self.quantile(0.75) @property def iqr(self): """Interquartile Range.""" return self.q3 - self.q1 @property def sd(self): u"""Standard Deviation (μs).""" return 0 if self.count < 2 else sqrt(self.S_runtime / (self.count - 1)) @staticmethod def running_mean_variance(stats, x): """Compute running variance, B. P. Welford's method. See Knuth TAOCP vol 2, 3rd edition, page 232, or https://www.johndcook.com/blog/standard_deviation/ M is mean, Standard Deviation is defined as sqrt(S/k-1) """ (k, M_, S_) = stats k = float(k + 1) M = M_ + (x - M_) / k S = S_ + (x - M_) * (x - M) return (k, M, S) @property def cv(self): """Coeficient of Variation (%).""" return (self.sd / self.mean) if self.mean else 0 @property def range(self): """Range of samples values (Max - Min).""" return self.max - self.min @property def spread(self): """Sample Spread; i.e. Range as (%) of Min.""" return self.range / float(self.min) if self.min else 0 class PerformanceTestResult(object): u"""Result from executing an individual Swift Benchmark Suite benchmark. Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize or Benchmark_Driver). It suppors 2 log formats emitted by the test driver. Legacy format with statistics for normal distribution (MEAN, SD): #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B) And new quantiles format with variable number of columns: #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs) #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B) The number of columns between MIN and MAX depends on the test driver's `--quantile`parameter. In both cases, the last column, MAX_RSS is optional. """ def __init__(self, csv_row, quantiles=False, memory=False, delta=False, meta=False): """Initialize from a row of multiple columns with benchmark summary. The row is an iterable, such as a row provided by the CSV parser. """ self.test_num = csv_row[0] # Ordinal number of the test self.name = csv_row[1] # Name of the performance test self.num_samples = int(csv_row[2]) # Number of measurements taken if quantiles: # Variable number of columns representing quantiles mem_index = (-1 if memory else 0) + (-3 if meta else 0) runtimes = csv_row[3:mem_index] if memory or meta else csv_row[3:] if delta: runtimes = [int(x) if x else 0 for x in runtimes] runtimes = functools.reduce( lambda l, x: l.append(l[-1] + x) or l if l else [x], # runnin runtimes, None, ) # total num_values = len(runtimes) if self.num_samples < num_values: # remove repeated samples quantile = num_values - 1 qs = [float(i) / float(quantile) for i in range(0, num_values)] indices = [ max(0, int(ceil(self.num_samples * float(q))) - 1) for q in qs ] runtimes = [ runtimes[indices.index(i)] for i in range(0, self.num_samples) ] self.samples = PerformanceTestSamples( self.name, [Sample(None, None, int(runtime)) for runtime in runtimes] ) self.samples.exclude_outliers(top_only=True) sams = self.samples self.min, self.max, self.median, self.mean, self.sd = ( sams.min, sams.max, sams.median, sams.mean, sams.sd, ) self.max_rss = ( # Maximum Resident Set Size (B) int(csv_row[mem_index]) if memory else None ) else: # Legacy format with statistics for normal distribution. self.min = int(csv_row[3]) # Minimum runtime (μs) self.max = int(csv_row[4]) # Maximum runtime (μs) self.mean = float(csv_row[5]) # Mean (average) runtime (μs) self.sd = float(csv_row[6]) # Standard Deviation (μs) self.median = int(csv_row[7]) # Median runtime (μs) self.max_rss = ( # Maximum Resident Set Size (B) int(csv_row[8]) if len(csv_row) > 8 else None ) self.samples = None # Optional measurement metadata. The number of: # memory pages used, involuntary context switches and voluntary yields self.mem_pages, self.involuntary_cs, self.yield_count = ( [int(x) for x in csv_row[-3:]] if meta else (None, None, None) ) self.yields = None self.setup = None def __repr__(self): """Short summary for debugging purposes.""" return ( "".format(self) ) def merge(self, r): """Merge two results. Recomputes min, max and mean statistics. If all `samples` are avaliable, it recomputes all the statistics. The use case here is comparing test results parsed from concatenated log files from multiple runs of benchmark driver. """ # Statistics if self.samples and r.samples: for sample in r.samples.samples: self.samples.add(sample) sams = self.samples self.num_samples = sams.num_samples self.min, self.max, self.median, self.mean, self.sd = ( sams.min, sams.max, sams.median, sams.mean, sams.sd, ) else: self.min = min(self.min, r.min) self.max = max(self.max, r.max) self.mean = ( # pooled mean is the weighted sum of means (self.mean * self.num_samples) + (r.mean * r.num_samples) ) / float(self.num_samples + r.num_samples) self.num_samples += r.num_samples self.median, self.sd = None, None # Metadata def minimum(a, b): # work around None being less than everything return min(filter(lambda x: x is not None, [a, b])) if any([a, b]) else None self.max_rss = minimum(self.max_rss, r.max_rss) self.setup = minimum(self.setup, r.setup) class ResultComparison(object): """ResultComparison compares MINs from new and old PerformanceTestResult. It computes speedup ratio and improvement delta (%). """ def __init__(self, old, new): """Initialize with old and new `PerformanceTestResult`s to compare.""" self.old = old self.new = new assert old.name == new.name self.name = old.name # Test name, convenience accessor # Speedup ratio self.ratio = (old.min + 0.001) / (new.min + 0.001) # Test runtime improvement in % ratio = (new.min + 0.001) / (old.min + 0.001) self.delta = (ratio - 1) * 100 # Indication of dubious changes: when result's MIN falls inside the # (MIN, MAX) interval of result they are being compared with. self.is_dubious = (old.min < new.min and new.min < old.max) or ( new.min < old.min and old.min < new.max ) class LogParser(object): """Converts log outputs into `PerformanceTestResult`s. Supports various formats produced by the `Benchmark_Driver` and `Benchmark_O`('Onone', 'Osize'). It can also merge together the results from concatenated log files. """ def __init__(self): """Create instance of `LogParser`.""" self.results = [] self.quantiles, self.delta, self.memory = False, False, False self.meta = False self._reset() def _reset(self): """Reset parser to the default state for reading a new result.""" self.samples, self.yields, self.num_iters = [], [], 1 self.setup, self.max_rss, self.mem_pages = None, None, None self.voluntary_cs, self.involuntary_cs = None, None # Parse lines like this # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs) results_re = re.compile( r"( *\d+[, \t]+[\w.\-\?!]+[, \t]+" + r"[, \t]+".join([r"\d+"] * 2) # #,TEST + r"(?:[, \t]+\d*)*)" # at least 2... ) # ...or more numeric columns def _append_result(self, result): columns = result.split(",") if "," in result else result.split() r = PerformanceTestResult( columns, quantiles=self.quantiles, memory=self.memory, delta=self.delta, meta=self.meta, ) r.setup = self.setup r.max_rss = r.max_rss or self.max_rss r.mem_pages = r.mem_pages or self.mem_pages r.voluntary_cs = self.voluntary_cs r.involuntary_cs = r.involuntary_cs or self.involuntary_cs if self.samples: r.samples = PerformanceTestSamples(r.name, self.samples) r.samples.exclude_outliers() self.results.append(r) r.yields = self.yields or None self._reset() def _store_memory_stats(self, max_rss, mem_pages): self.max_rss = int(max_rss) self.mem_pages = int(mem_pages) def _configure_format(self, header): self.quantiles = "MEAN" not in header self.memory = "MAX_RSS" in header self.meta = "PAGES" in header self.delta = "𝚫" in header # Regular expression and action to take when it matches the parsed line state_actions = { results_re: _append_result, # Verbose mode adds new productions: # Adaptively determined N; test loop multiple adjusting runtime to ~1s re.compile(r"\s+Measuring with scale (\d+)."): ( lambda self, num_iters: setattr(self, "num_iters", num_iters) ), re.compile(r"\s+Sample (\d+),(\d+)"): ( lambda self, i, runtime: self.samples.append( Sample(int(i), int(self.num_iters), int(runtime)) ) ), re.compile(r"\s+SetUp (\d+)"): ( lambda self, setup: setattr(self, "setup", int(setup)) ), re.compile(r"\s+Yielding after ~(\d+) μs"): ( lambda self, since_last_yield: self.yields.append( Yield(len(self.samples), int(since_last_yield)) ) ), re.compile(r"( *#[, \t]+TEST[, \t]+SAMPLES[, \t]+MIN.*)"): _configure_format, # Environmental statistics: memory usage and context switches re.compile( r"\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)" ): _store_memory_stats, re.compile(r"\s+VCS \d+ - \d+ = (\d+)"): ( lambda self, vcs: setattr(self, "voluntary_cs", int(vcs)) ), re.compile(r"\s+ICS \d+ - \d+ = (\d+)"): ( lambda self, ics: setattr(self, "involuntary_cs", int(ics)) ), } def parse_results(self, lines): """Parse results from the lines of the log output from Benchmark*. Returns a list of `PerformanceTestResult`s. """ for line in lines: for regexp, action in LogParser.state_actions.items(): match = regexp.match(line) if match: action(self, *match.groups()) break # stop after 1st match else: # If none matches, skip the line. # print('skipping: ' + line.rstrip('\n')) continue return self.results @staticmethod def _results_from_lines(lines): tests = LogParser().parse_results(lines) def add_or_merge(names, r): if r.name not in names: names[r.name] = r else: names[r.name].merge(r) return names return functools.reduce(add_or_merge, tests, dict()) @staticmethod def results_from_string(log_contents): """Parse `PerformanceTestResult`s from the supplied string. Returns dictionary of test names and `PerformanceTestResult`s. """ return LogParser._results_from_lines(log_contents.splitlines()) @staticmethod def results_from_file(log_file): """Parse `PerformanceTestResult`s from the log file. Returns dictionary of test names and `PerformanceTestResult`s. """ with open(log_file) as f: return LogParser._results_from_lines(f.readlines()) class TestComparator(object): """Analyzes changes betweeen the old and new test results. It determines which tests were `added`, `removed` and which can be compared. It then splits the `ResultComparison`s into 3 groups according to the `delta_threshold` by the change in performance: `increased`, `descreased` and `unchanged`. Whole computaion is performed during initialization and results are provided as properties on this object. The lists of `added`, `removed` and `unchanged` tests are sorted alphabetically. The `increased` and `decreased` lists are sorted in descending order by the amount of change. """ def __init__(self, old_results, new_results, delta_threshold): """Initialize with dictionaries of old and new benchmark results. Dictionary keys are benchmark names, values are `PerformanceTestResult`s. """ old_tests = set(old_results.keys()) new_tests = set(new_results.keys()) comparable_tests = new_tests.intersection(old_tests) added_tests = new_tests.difference(old_tests) removed_tests = old_tests.difference(new_tests) self.added = sorted([new_results[t] for t in added_tests], key=lambda r: r.name) self.removed = sorted( [old_results[t] for t in removed_tests], key=lambda r: r.name ) def compare(name): return ResultComparison(old_results[name], new_results[name]) comparisons = list(map(compare, comparable_tests)) def partition(items, p): return functools.reduce( lambda x, y: x[not p(y)].append(y) or x, items, ([], []) ) decreased, not_decreased = partition( comparisons, lambda c: c.ratio < (1 - delta_threshold) ) increased, unchanged = partition( not_decreased, lambda c: c.ratio > (1 + delta_threshold) ) # sorted partitions names = [c.name for c in comparisons] comparisons = dict(zip(names, comparisons)) self.decreased = [ comparisons[c.name] for c in sorted(decreased, key=lambda c: -c.delta) ] self.increased = [ comparisons[c.name] for c in sorted(increased, key=lambda c: c.delta) ] self.unchanged = [ comparisons[c.name] for c in sorted(unchanged, key=lambda c: c.name) ] class ReportFormatter(object): """Creates the report from perfromance test comparison in specified format. `ReportFormatter` formats the `PerformanceTestResult`s and `ResultComparison`s provided by `TestComparator` into report table. Supported formats are: `markdown` (used for displaying benchmark results on GitHub), `git` and `html`. """ def __init__(self, comparator, changes_only, single_table=False): """Initialize with `TestComparator` and names of branches.""" self.comparator = comparator self.changes_only = changes_only self.single_table = single_table PERFORMANCE_TEST_RESULT_HEADER = ("TEST", "MIN", "MAX", "MEAN", "MAX_RSS") RESULT_COMPARISON_HEADER = ("TEST", "OLD", "NEW", "DELTA", "RATIO") @staticmethod def header_for(result): """Column labels for header row in results table.""" return ( ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER if isinstance(result, PerformanceTestResult) else # isinstance(result, ResultComparison) ReportFormatter.RESULT_COMPARISON_HEADER ) @staticmethod def values(result): """Format values from PerformanceTestResult or ResultComparison. Returns tuple of strings to display in the results table. """ return ( ( result.name, str(result.min), str(result.max), str(int(result.mean)), str(result.max_rss) if result.max_rss else "—", ) if isinstance(result, PerformanceTestResult) else # isinstance(result, ResultComparison) ( result.name, str(result.old.min), str(result.new.min), "{0:+.1f}%".format(result.delta), "{0:.2f}x{1}".format(result.ratio, " (?)" if result.is_dubious else ""), ) ) def markdown(self): """Report results of benchmark comparisons in Markdown format.""" return self._formatted_text( label_formatter=lambda s: ("**" + s + "**"), COLUMN_SEPARATOR=" | ", DELIMITER_ROW=([":---"] + ["---:"] * 4), SEPARATOR="  | | | | \n", SECTION="""
{0} ({1}) {2}
""", ) def git(self): """Report results of benchmark comparisons in 'git' format.""" return self._formatted_text( label_formatter=lambda s: s.upper(), COLUMN_SEPARATOR=" ", DELIMITER_ROW=None, SEPARATOR="\n", SECTION=""" {0} ({1}): \n{2}""", ) def _column_widths(self): changed = self.comparator.decreased + self.comparator.increased results = changed if self.changes_only else changed + self.comparator.unchanged results += self.comparator.added + self.comparator.removed widths = [ map(len, columns) for columns in [ ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER, ReportFormatter.RESULT_COMPARISON_HEADER, ] + [ReportFormatter.values(r) for r in results] ] def max_widths(maximum, widths): return map(max, zip(maximum, widths)) return list(functools.reduce(max_widths, widths, [0] * 5)) def _formatted_text( self, label_formatter, COLUMN_SEPARATOR, DELIMITER_ROW, SEPARATOR, SECTION ): widths = self._column_widths() self.header_printed = False def justify_columns(contents): return [c.ljust(w) for w, c in zip(widths, contents)] def row(contents): return ( "" if not contents else COLUMN_SEPARATOR.join(justify_columns(contents)) + "\n" ) def header(title, column_labels): labels = ( column_labels if not self.single_table else map(label_formatter, (title,) + column_labels[1:]) ) h = ( ("" if not self.header_printed else SEPARATOR) + row(labels) + (row(DELIMITER_ROW) if not self.header_printed else "") ) if self.single_table and not self.header_printed: self.header_printed = True return h def format_columns(r, is_strong): return r if not is_strong else r[:-1] + ("**" + r[-1] + "**",) def table(title, results, is_strong=False, is_open=False): if not results: return "" rows = [ row(format_columns(ReportFormatter.values(r), is_strong)) for r in results ] table = header( title if self.single_table else "", ReportFormatter.header_for(results[0]), ) + "".join(rows) return ( table if self.single_table else SECTION.format( title, len(results), table, "open" if is_open else "" ) ) return "\n" + "".join( [ table("Regression", self.comparator.decreased, True, True), table("Improvement", self.comparator.increased, True), ( "" if self.changes_only else table("No Changes", self.comparator.unchanged) ), table("Added", self.comparator.added, is_open=True), table("Removed", self.comparator.removed, is_open=True), ] ) HTML = """ {0}
""" HTML_HEADER_ROW = """ {0} ({1}) {2} {3} {4} {5} """ HTML_ROW = """ {0} {1} {2} {3} {5} """ def html(self): """Report results of benchmark comparisons in HTML format.""" def row(name, old, new, delta, speedup, speedup_color): return self.HTML_ROW.format(name, old, new, delta, speedup_color, speedup) def header(contents): return self.HTML_HEADER_ROW.format(*contents) def table(title, results, speedup_color): rows = [ row(*(ReportFormatter.values(r) + (speedup_color,))) for r in results ] return ( "" if not rows else header( (title, len(results)) + ReportFormatter.header_for(results[0])[1:] ) + "".join(rows) ) return self.HTML.format( "".join( [ table("Regression", self.comparator.decreased, "red"), table("Improvement", self.comparator.increased, "green"), ( "" if self.changes_only else table("No Changes", self.comparator.unchanged, "black") ), table("Added", self.comparator.added, ""), table("Removed", self.comparator.removed, ""), ] ) ) def parse_args(args): """Parse command line arguments and set default values.""" parser = argparse.ArgumentParser(description="Compare Performance tests.") parser.add_argument( "--old-file", help="Baseline performance test suite (csv file)", required=True ) parser.add_argument( "--new-file", help="New performance test suite (csv file)", required=True ) parser.add_argument( "--format", choices=["markdown", "git", "html"], help="Output format. Default is markdown.", default="markdown", ) parser.add_argument("--output", help="Output file name") parser.add_argument( "--changes-only", help="Output only affected tests", action="store_true" ) parser.add_argument( "--single-table", help="Combine data in a single table in git and markdown formats", action="store_true", ) parser.add_argument( "--delta-threshold", help="Delta threshold. Default 0.05.", type=float, default=0.05, ) return parser.parse_args(args) def create_report( old_results, new_results, delta_threshold, format, changes_only=True, single_table=True, ): comparator = TestComparator(old_results, new_results, delta_threshold) formatter = ReportFormatter(comparator, changes_only, single_table) formats = { "markdown": formatter.markdown, "git": formatter.git, "html": formatter.html, } report = formats[format]() return report def main(): """Compare benchmarks for changes in a formatted report.""" args = parse_args(sys.argv[1:]) report = create_report( LogParser.results_from_file(args.old_file), LogParser.results_from_file(args.new_file), args.delta_threshold, args.format, args.changes_only, args.single_table, ) print(report) if args.output: with open(args.output, "w") as f: f.write(report) if __name__ == "__main__": sys.exit(main())