Merge pull request #61559 from tbkka/tbkka-benchmarking

Overhaul Benchmarking pipeline to use complete sample data, not summaries
Author: Tim Kientzle
Date: 2022-11-09 07:38:58 -08:00
Committed by: GitHub
6 changed files with 967 additions and 1153 deletions


@@ -88,9 +88,10 @@ class BenchmarkDriver(object):
def test_harness(self): def test_harness(self):
"""Full path to test harness binary.""" """Full path to test harness binary."""
suffix = self.args.optimization if hasattr(self.args, "optimization") else "O" suffix = self.args.optimization if hasattr(self.args, "optimization") else "O"
suffix += "-"
if hasattr(self.args, "architecture") and self.args.architecture: if hasattr(self.args, "architecture") and self.args.architecture:
suffix += "-" + self.args.architecture + "*" suffix += self.args.architecture
pattern = os.path.join(self.args.tests, "Benchmark_" + suffix) pattern = os.path.join(self.args.tests, "Benchmark_" + suffix + "*")
executables = [] executables = []
if hasattr(self._subprocess, "test_mode") and self._subprocess.test_mode: if hasattr(self._subprocess, "test_mode") and self._subprocess.test_mode:
executables = [pattern] executables = [pattern]
@@ -134,22 +135,32 @@ class BenchmarkDriver(object):
@property @property
def _cmd_list_benchmarks(self): def _cmd_list_benchmarks(self):
# Use tab delimiter for easier parsing to override the default comma. # TODO: Switch to JSON format: add "--json" here
# (The third 'column' is always comma-separated list of tags in square return [self.test_harness, "--list"] + (
# brackets -- currently unused here.)
return [self.test_harness, "--list", "--delim=\t"] + (
["--skip-tags="] if (self.args.benchmarks or self.args.filters) else [] ["--skip-tags="] if (self.args.benchmarks or self.args.filters) else []
) )
def _get_tests(self): def _get_tests(self):
"""Return a list of performance tests to run.""" """Return a list of performance tests to run."""
number_name_pairs = [ lines = self._invoke(self._cmd_list_benchmarks).split("\n")
line.split("\t")[:2] json_tests = []
for line in self._invoke(self._cmd_list_benchmarks).split("\n")[1:-1] for line in lines:
] columns = re.split(r'[ ,]+', line.strip())
# unzip list of pairs into 2 lists try:
test_numbers, self.all_tests = map(list, zip(*number_name_pairs)) number = int(columns[0])
self.test_number = dict(zip(self.all_tests, test_numbers)) name = columns[1]
json_descr = {"number": number, "name": name}
json_tests.append(json_descr)
except Exception:
continue
# TODO: Replace the above with the following to
# use the JSON output from the benchmark driver
# directly
# if line.strip() != "":
# json_tests.append(json.loads(line))
self.all_tests = [json["name"] for json in json_tests]
test_numbers = [json["number"] for json in json_tests]
self.test_number = dict([(json["name"], json["number"]) for json in json_tests])
if self.args.filters: if self.args.filters:
return self._tests_matching_patterns() return self._tests_matching_patterns()
if self.args.benchmarks: if self.args.benchmarks:
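
Note: the new listing parser above splits each `--list` line on spaces/commas, keeps the leading test number and name, and silently skips anything it cannot parse (headers, blank lines). A minimal standalone sketch of that step; the sample output format is an assumption based on the test fixtures later in this commit:

    import re

    def parse_benchmark_list(output):
        """Map test names to test numbers from `Benchmark_O --list` output (sketch)."""
        tests = {}
        for line in output.splitlines():
            columns = re.split(r"[ ,]+", line.strip())
            try:
                # First column is the test number, second the test name;
                # header lines, blank lines and tag columns are ignored.
                tests[columns[1]] = int(columns[0])
            except (IndexError, ValueError):
                continue
        return tests

    print(parse_benchmark_list('1 Benchmark1 ["t1","t2"]\n2 Benchmark2 ["t3"]\n'))
    # {'Benchmark1': 1, 'Benchmark2': 2}
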
@@ -157,25 +168,19 @@ class BenchmarkDriver(object):
return self.all_tests return self.all_tests
def _tests_matching_patterns(self): def _tests_matching_patterns(self):
regexes = [re.compile(pattern) for pattern in self.args.filters] matches = set()
return sorted( for fil in self.args.filters:
list( pattern = re.compile(fil)
set( new_matches = filter(pattern.match, self.all_tests)
[ matches = matches.union(new_matches)
name return sorted(list(matches))
for pattern in regexes
for name in self.all_tests
if pattern.match(name)
]
)
)
)
def _tests_by_name_or_number(self, test_numbers): def _tests_by_name_or_number(self, test_numbers):
benchmarks = set(self.args.benchmarks) benchmarks = set(self.args.benchmarks)
number_to_name = dict(zip(test_numbers, self.all_tests)) numbers = list(map(str, test_numbers))
number_to_name = dict(zip(numbers, self.all_tests))
tests_by_number = [ tests_by_number = [
number_to_name[i] for i in benchmarks.intersection(set(test_numbers)) number_to_name[i] for i in benchmarks.intersection(numbers)
] ]
return sorted( return sorted(
list(benchmarks.intersection(set(self.all_tests)).union(tests_by_number)) list(benchmarks.intersection(set(self.all_tests)).union(tests_by_number))
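
For illustration, the name-or-number resolution above can be exercised on its own; user-supplied numbers arrive as strings, which is why the integer test numbers are stringified before the intersection. A sketch, not the driver itself:

    def resolve(requested, all_tests, test_numbers):
        """Resolve a mix of test names and test numbers to test names (sketch)."""
        wanted = set(requested)
        numbers = list(map(str, test_numbers))          # 1 -> "1", ...
        number_to_name = dict(zip(numbers, all_tests))  # "1" -> "Benchmark1"
        by_number = [number_to_name[n] for n in wanted.intersection(numbers)]
        return sorted(wanted.intersection(all_tests).union(by_number))

    print(resolve(["Benchmark2", "1"], ["Benchmark1", "Benchmark2", "Benchmark3"], [1, 2, 3]))
    # ['Benchmark1', 'Benchmark2']
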
@@ -188,8 +193,7 @@ class BenchmarkDriver(object):
num_iters=None, num_iters=None,
sample_time=None, sample_time=None,
verbose=None, verbose=None,
measure_memory=False, measure_memory=False
quantile=None,
): ):
"""Execute benchmark and gather results.""" """Execute benchmark and gather results."""
num_samples = num_samples or 0 num_samples = num_samples or 0
@@ -197,11 +201,14 @@ class BenchmarkDriver(object):
sample_time = sample_time or 0 # default is 1s sample_time = sample_time or 0 # default is 1s
cmd = self._cmd_run( cmd = self._cmd_run(
test, num_samples, num_iters, sample_time, verbose, measure_memory, quantile test, num_samples, num_iters, sample_time, verbose, measure_memory
) )
output = self._invoke(cmd) output = self._invoke(cmd)
results = self.parser.results_from_string(output) results = self.parser.results_from_string(output)
return list(results.items())[0][1] if test else results if test:
return list(results.items())[0][1]
else:
return results
def _cmd_run( def _cmd_run(
self, self,
@@ -210,14 +217,13 @@ class BenchmarkDriver(object):
num_iters, num_iters,
sample_time, sample_time,
verbose, verbose,
measure_memory, measure_memory
quantile,
): ):
cmd = [self.test_harness] cmd = [self.test_harness]
if test: if test:
cmd.append(test) cmd.append(test)
else: else:
cmd.extend([self.test_number.get(name, name) for name in self.tests]) cmd.extend([str(self.test_number.get(name, name)) for name in self.tests])
if num_samples > 0: if num_samples > 0:
cmd.append("--num-samples={0}".format(num_samples)) cmd.append("--num-samples={0}".format(num_samples))
if num_iters > 0: if num_iters > 0:
@@ -228,9 +234,8 @@ class BenchmarkDriver(object):
cmd.append("--verbose") cmd.append("--verbose")
if measure_memory: if measure_memory:
cmd.append("--memory") cmd.append("--memory")
if quantile: # TODO: Uncomment this as soon as the new Benchmark Swift logic is available everywhere
cmd.append("--quantile={0}".format(quantile)) # cmd.append("--json")
cmd.append("--delta")
return cmd return cmd
def run_independent_samples(self, test): def run_independent_samples(self, test):
@@ -246,12 +251,12 @@ class BenchmarkDriver(object):
return functools.reduce( return functools.reduce(
merge_results, merge_results,
[ [
self.run(test, measure_memory=True, num_iters=1, quantile=20) self.run(test, measure_memory=True, num_iters=1)
for _ in range(self.args.independent_samples) for _ in range(self.args.independent_samples)
], ],
) )
def log_results(self, output, log_file=None): def log_results(self, results, log_file=None):
"""Log output to `log_file`. """Log output to `log_file`.
Creates `args.output_dir` if it doesn't exist yet. Creates `args.output_dir` if it doesn't exist yet.
@@ -262,7 +267,8 @@ class BenchmarkDriver(object):
os.makedirs(dir) os.makedirs(dir)
print("Logging results to: %s" % log_file) print("Logging results to: %s" % log_file)
with open(log_file, "w") as f: with open(log_file, "w") as f:
f.write(output) for r in results:
print(r, file=f)
RESULT = "{:>3} {:<40} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}" RESULT = "{:>3} {:<40} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}"
@@ -284,7 +290,7 @@ class BenchmarkDriver(object):
def console_log(values): def console_log(values):
print(format(values)) print(format(values))
def result_values(r): def summary(r):
return list( return list(
map( map(
str, str,
@@ -292,17 +298,17 @@ class BenchmarkDriver(object):
r.test_num, r.test_num,
r.name, r.name,
r.num_samples, r.num_samples,
r.min, r.min_value,
r.samples.q1, r.q1,
r.median, r.median,
r.samples.q3, r.q3,
r.max, r.max_value,
r.max_rss, r.max_rss,
], ],
) )
) )
header = [ summary_header = [
"#", "#",
"TEST", "TEST",
"SAMPLES", "SAMPLES",
@@ -313,25 +319,23 @@ class BenchmarkDriver(object):
"MAX(μs)", "MAX(μs)",
"MAX_RSS(B)", "MAX_RSS(B)",
] ]
console_log(header) console_log(summary_header)
results = [header] results = []
for test in self.tests: for test in self.tests:
result = result_values(self.run_independent_samples(test)) result = self.run_independent_samples(test)
console_log(result) console_log(summary(result))
results.append(result) results.append(result)
print("\nTotal performance tests executed: {0}".format(len(self.tests))) print("\nTotal performance tests executed: {0}".format(len(self.tests)))
return ( return results
None if csv_console else ("\n".join([",".join(r) for r in results]) + "\n")
) # csv_log
@staticmethod @staticmethod
def run_benchmarks(args): def run_benchmarks(args):
"""Run benchmarks and log results.""" """Run benchmarks and log results."""
driver = BenchmarkDriver(args) driver = BenchmarkDriver(args)
csv_log = driver.run_and_log(csv_console=(args.output_dir is None)) results = driver.run_and_log(csv_console=(args.output_dir is None))
if csv_log: if args.output_dir:
driver.log_results(csv_log) driver.log_results([r.json for r in results])
return 0 return 0
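
Because `log_results` now writes one entry per line and `run_benchmarks` feeds it the JSON form of each result, the output log is effectively a JSON-lines file. A hedged sketch of writing and reading such a log (the file name is illustrative):

    import json

    results = [
        {"number": 3, "name": "b1", "samples": [101, 102, 103]},
        {"number": 4, "name": "b2", "samples": [205, 207]},
    ]

    # Write: one JSON object per line, mirroring log_results() above.
    with open("benchmark.log", "w") as f:
        for r in results:
            print(json.dumps(r), file=f)

    # Read back: every non-empty line parses independently.
    with open("benchmark.log") as f:
        parsed = [json.loads(line) for line in f if line.strip()]

    assert parsed == results
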
@@ -445,7 +449,6 @@ class BenchmarkDoctor(object):
Optional `driver` parameter for injecting dependency; used for testing. Optional `driver` parameter for injecting dependency; used for testing.
""" """
super(BenchmarkDoctor, self).__init__() super(BenchmarkDoctor, self).__init__()
self.driver = driver or BenchmarkDriver(args)
self.results = {} self.results = {}
if hasattr(args, "markdown") and args.markdown: if hasattr(args, "markdown") and args.markdown:
@@ -458,6 +461,7 @@ class BenchmarkDoctor(object):
self.console_handler.setLevel( self.console_handler.setLevel(
logging.DEBUG if args.verbose else logging.INFO logging.DEBUG if args.verbose else logging.INFO
) )
self.driver = driver or BenchmarkDriver(args)
self.log.addHandler(self.console_handler) self.log.addHandler(self.console_handler)
self.log.debug("Checking tests: %s", ", ".join(self.driver.tests)) self.log.debug("Checking tests: %s", ", ".join(self.driver.tests))
self.requirements = [ self.requirements = [
@@ -532,7 +536,7 @@ class BenchmarkDoctor(object):
correction = setup / i correction = setup / i
i_series = BenchmarkDoctor._select(measurements, num_iters=i) i_series = BenchmarkDoctor._select(measurements, num_iters=i)
for result in i_series: for result in i_series:
runtimes.append(result.samples.min - correction) runtimes.append(result.min_value - correction)
runtime = min(runtimes) runtime = min(runtimes)
threshold = 1000 threshold = 1000
@@ -584,7 +588,7 @@ class BenchmarkDoctor(object):
ti1, ti2 = [ ti1, ti2 = [
float(min(mins)) float(min(mins))
for mins in [ for mins in [
[result.samples.min for result in i_series] [result.min_value for result in i_series]
for i_series in [select(measurements, num_iters=i) for i in [1, 2]] for i_series in [select(measurements, num_iters=i) for i in [1, 2]]
] ]
] ]
@@ -679,7 +683,7 @@ class BenchmarkDoctor(object):
r = self.driver.run( r = self.driver.run(
benchmark, num_samples=3, num_iters=1, verbose=True benchmark, num_samples=3, num_iters=1, verbose=True
) # calibrate ) # calibrate
num_samples = self._adjusted_1s_samples(r.samples.min) num_samples = self._adjusted_1s_samples(r.min_value)
def capped(s): def capped(s):
return min(s, 200) return min(s, 200)
@@ -689,7 +693,7 @@ class BenchmarkDoctor(object):
opts = opts if isinstance(opts, list) else [opts] opts = opts if isinstance(opts, list) else [opts]
self.log.debug( self.log.debug(
"Runtime {0} μs yields {1} adjusted samples per second.".format( "Runtime {0} μs yields {1} adjusted samples per second.".format(
r.samples.min, num_samples r.min_value, num_samples
) )
) )
self.log.debug( self.log.debug(


@@ -17,9 +17,7 @@ This script compares performance test logs and issues a formatted report.
Invoke `$ compare_perf_tests.py -h ` for complete list of options. Invoke `$ compare_perf_tests.py -h ` for complete list of options.
class `Sample` is single benchmark measurement. class `PerformanceTestResult` collects information about a single test
class `PerformanceTestSamples` is collection of `Sample`s and their statistics.
class `PerformanceTestResult` is a summary of performance test execution.
class `LogParser` converts log files into `PerformanceTestResult`s. class `LogParser` converts log files into `PerformanceTestResult`s.
class `ResultComparison` compares new and old `PerformanceTestResult`s. class `ResultComparison` compares new and old `PerformanceTestResult`s.
class `TestComparator` analyzes changes between the old and new test results. class `TestComparator` analyzes changes between the old and new test results.
@@ -29,194 +27,10 @@ class `ReportFormatter` creates the test comparison report in specified format.
import argparse import argparse
import functools import functools
import json
import re import re
import statistics
import sys import sys
from bisect import bisect, bisect_left, bisect_right
from collections import namedtuple
from math import ceil, sqrt
class Sample(namedtuple("Sample", "i num_iters runtime")):
u"""Single benchmark measurement.
Initialized with:
`i`: ordinal number of the sample taken,
`num-num_iters`: number or iterations used to compute it,
`runtime`: in microseconds (μs).
"""
def __repr__(self):
"""Shorter Sample formatting for debugging purposes."""
return "s({0.i!r}, {0.num_iters!r}, {0.runtime!r})".format(self)
class Yield(namedtuple("Yield", "before_sample after")):
u"""Meta-measurement of when the Benchmark_X voluntarily yielded process.
`before_sample`: index of measurement taken just after returning from yield
`after`: time elapsed since the previous yield in microseconds (μs)
"""
class PerformanceTestSamples(object):
"""Collection of runtime samples from the benchmark execution.
Computes the sample population statistics.
"""
def __init__(self, name, samples=None):
"""Initialize with benchmark name and optional list of Samples."""
self.name = name # Name of the performance test
self.samples = []
self.outliers = []
self._runtimes = []
self.mean = 0.0
self.S_runtime = 0.0 # For computing running variance
for sample in samples or []:
self.add(sample)
def __str__(self):
"""Text summary of benchmark statistics."""
return (
"{0.name!s} n={0.count!r} "
"Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} "
"Max={0.max!r} "
"R={0.range!r} {0.spread:.2%} IQR={0.iqr!r} "
"Mean={0.mean:.0f} SD={0.sd:.0f} CV={0.cv:.2%}".format(self)
if self.samples
else "{0.name!s} n=0".format(self)
)
def add(self, sample):
"""Add sample to collection and recompute statistics."""
assert isinstance(sample, Sample)
self._update_stats(sample)
i = bisect(self._runtimes, sample.runtime)
self._runtimes.insert(i, sample.runtime)
self.samples.insert(i, sample)
def _update_stats(self, sample):
old_stats = (self.count, self.mean, self.S_runtime)
_, self.mean, self.S_runtime = self.running_mean_variance(
old_stats, sample.runtime
)
def exclude_outliers(self, top_only=False):
"""Exclude outliers by applying Interquartile Range Rule.
Moves the samples outside of the inner fences
(Q1 - 1.5*IQR and Q3 + 1.5*IQR) into outliers list and recomputes
statistics for the remaining sample population. Optionally apply
only the top inner fence, preserving the small outliers.
Experimentally, this rule seems to perform well-enough on the
benchmark runtimes in the microbenchmark range to filter out
the environment noise caused by preemptive multitasking.
"""
lo = (
0
if top_only
else bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr))
)
hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr))
outliers = self.samples[:lo] + self.samples[hi:]
samples = self.samples[lo:hi]
self.__init__(self.name) # re-initialize
for sample in samples: # and
self.add(sample) # re-compute stats
self.outliers = outliers
@property
def count(self):
"""Number of samples used to compute the statistics."""
return len(self.samples)
@property
def num_samples(self):
"""Number of all samples in the collection."""
return len(self.samples) + len(self.outliers)
@property
def all_samples(self):
"""List of all samples in ascending order."""
return sorted(self.samples + self.outliers, key=lambda s: s.i or -1)
@property
def min(self):
"""Minimum sampled value."""
return self.samples[0].runtime
@property
def max(self):
"""Maximum sampled value."""
return self.samples[-1].runtime
def quantile(self, q):
"""Return runtime for given quantile.
Equivalent to quantile estimate type R-1, SAS-3. See:
https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample
"""
index = max(0, int(ceil(self.count * float(q))) - 1)
return self.samples[index].runtime
@property
def median(self):
"""Median sampled value."""
return self.quantile(0.5)
@property
def q1(self):
"""First Quartile (25th Percentile)."""
return self.quantile(0.25)
@property
def q3(self):
"""Third Quartile (75th Percentile)."""
return self.quantile(0.75)
@property
def iqr(self):
"""Interquartile Range."""
return self.q3 - self.q1
@property
def sd(self):
u"""Standard Deviation (μs)."""
return 0 if self.count < 2 else sqrt(self.S_runtime / (self.count - 1))
@staticmethod
def running_mean_variance(stats, x):
"""Compute running variance, B. P. Welford's method.
See Knuth TAOCP vol 2, 3rd edition, page 232, or
https://www.johndcook.com/blog/standard_deviation/
M is mean, Standard Deviation is defined as sqrt(S/k-1)
"""
(k, M_, S_) = stats
k = float(k + 1)
M = M_ + (x - M_) / k
S = S_ + (x - M_) * (x - M)
return (k, M, S)
@property
def cv(self):
"""Coefficient of Variation (%)."""
return (self.sd / self.mean) if self.mean else 0
@property
def range(self):
"""Range of samples values (Max - Min)."""
return self.max - self.min
@property
def spread(self):
"""Sample Spread; i.e. Range as (%) of Min."""
return self.range / float(self.min) if self.min else 0
class PerformanceTestResult(object): class PerformanceTestResult(object):
@@ -225,126 +39,402 @@ class PerformanceTestResult(object):
Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize
or Benchmark_Driver). or Benchmark_Driver).
It supports 2 log formats emitted by the test driver. Legacy format with It supports log formats emitted by the test driver.
statistics for normal distribution (MEAN, SD):
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B)
And new quantiles format with variable number of columns:
#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs)
#,TEST,SAMPLES,QMIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
The number of columns between MIN and MAX depends on the test driver's
`--quantile`parameter. In both cases, the last column, MAX_RSS is optional.
""" """
def __init__(self, csv_row, quantiles=False, memory=False, delta=False, meta=False): # TODO: Delete after December 2023
"""Initialize from a row of multiple columns with benchmark summary. @classmethod
def fromOldFormat(cls, header, line):
The row is an iterable, such as a row provided by the CSV parser. """Original format with statistics for normal distribution (MEAN, SD):
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B),PAGES,ICS,YIELD
Note that MAX_RSS, PAGES, ICS, YIELD are all optional
""" """
self.test_num = csv_row[0] # Ordinal number of the test csv_row = line.split(",") if "," in line else line.split()
self.name = csv_row[1] # Name of the performance test labels = header.split(",") if "," in header else header.split()
self.num_samples = int(csv_row[2]) # Number of measurements taken
mem_index = (-1 if memory else 0) + (-3 if meta else 0) # Synthesize a JSON form with the basic values:
if quantiles: # Variable number of columns representing quantiles num_samples = int(csv_row[2])
runtimes = csv_row[3:mem_index] if memory or meta else csv_row[3:] json_data = {
last_runtime_index = mem_index - 1 "number": int(csv_row[0]),
if delta: "name": csv_row[1],
runtimes = [int(x) if x else 0 for x in runtimes] "num_samples": num_samples,
runtimes = functools.reduce( }
lambda l, x: l.append(l[-1] + x) or l if l else [x], # runnin
runtimes,
None,
) # total
num_values = len(runtimes)
if self.num_samples < num_values: # remove repeated samples
quantile = num_values - 1
qs = [float(i) / float(quantile) for i in range(0, num_values)]
indices = [
max(0, int(ceil(self.num_samples * float(q))) - 1) for q in qs
]
runtimes = [
runtimes[indices.index(i)] for i in range(0, self.num_samples)
]
self.samples = PerformanceTestSamples( # Map remaining columns according to label
self.name, [Sample(None, None, int(runtime)) for runtime in runtimes] field_map = [
) ("ICS", "ics"),
self.samples.exclude_outliers(top_only=True) ("MAX_RSS", "max_rss"), # Must precede "MAX"
sams = self.samples ("MAX", "max"),
self.min, self.max, self.median, self.mean, self.sd = ( ("MEAN", "mean"),
sams.min, ("MEDIAN", "median"),
sams.max, ("MIN", "min"),
sams.median, ("PAGES", "pages"),
sams.mean, ("SD", "sd"),
sams.sd, ("YIELD", "yield")
) ]
else: # Legacy format with statistics for normal distribution. for label, value in zip(labels, csv_row):
self.min = int(csv_row[3]) # Minimum runtime (μs) for match, json_key in field_map:
self.max = int(csv_row[4]) # Maximum runtime (μs) if match in label:
self.mean = float(csv_row[5]) # Mean (average) runtime (μs) json_data[json_key] = float(value)
self.sd = float(csv_row[6]) # Standard Deviation (μs) break
self.median = int(csv_row[7]) # Median runtime (μs)
last_runtime_index = 7
self.samples = None
self.max_rss = ( # Maximum Resident Set Size (B) # Heroic: Reconstruct samples if we have enough info
int(csv_row[mem_index]) if ( # This is generally a bad idea, but sadly necessary for the
memory and len(csv_row) > (last_runtime_index + 1) # old format that doesn't provide raw sample data.
) else None if num_samples == 1 and "min" in json_data:
) json_data["samples"] = [
json_data["min"]
]
elif num_samples == 2 and "min" in json_data and "max" in json_data:
json_data["samples"] = [
json_data["min"],
json_data["max"]
]
elif (num_samples == 3
and "min" in json_data
and "max" in json_data
and "median" in json_data):
json_data["samples"] = [
json_data["min"],
json_data["median"],
json_data["max"]
]
# Optional measurement metadata. The number of: return PerformanceTestResult(json_data)
# memory pages used, involuntary context switches and voluntary yields
self.mem_pages, self.involuntary_cs, self.yield_count = ( # TODO: Delete after December 2023
[int(x) for x in csv_row[-3:]] if meta else (None, None, None) @classmethod
) def fromQuantileFormat(cls, header, line):
self.yields = None """Quantiles format with variable number of columns depending on the
self.setup = None number of quantiles:
#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs)
#,TEST,SAMPLES,QMIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
The number of columns between QMIN and MAX depends on the test driver's
`--quantile` parameter. In both cases, the last column, MAX_RSS, is optional.
Delta encoding: If a header name includes 𝚫, that column stores the
difference from the previous column. E.g., a header
"#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),𝚫MAX(μs)" indicates the final "MAX"
column must be computed by adding the value in that column to the value
of the previous "MEDIAN" column.
"""
csv_row = line.split(",") if "," in line else line.split()
labels = header.split(",")
for i in range(1, len(labels)):
if "𝚫" in labels[i] or "Δ" in labels[i]:
prev = int(csv_row[i - 1])
inc = int(csv_row[i]) if csv_row[i] != '' else 0
csv_row[i] = str(prev + inc)
# Synthesize a JSON form and then initialize from that
json_data = {
"number": int(csv_row[0]),
"name": csv_row[1],
"num_samples": int(csv_row[2]),
}
# Process optional trailing fields MAX_RSS, PAGES, ICS, YIELD
i = len(labels) - 1
while True:
if "MAX_RSS" in labels[i]:
json_data["max_rss"] = float(csv_row[i])
elif "PAGES" in labels[i]:
json_data["pages"] = float(csv_row[i])
elif "ICS" in labels[i]:
json_data["ics"] = float(csv_row[i])
elif "YIELD" in labels[i]:
json_data["yield"] = float(csv_row[i])
else:
break
i -= 1
if i < 0:
break
# Rest is the quantiles (includes min/max columns)
quantiles = [float(q) for q in csv_row[3:i + 1]]
# Heroic effort:
# If we have enough quantiles, we can reconstruct the samples
# This is generally a bad idea, but sadly necessary since
# the quantile format doesn't provide raw sample data.
if json_data["num_samples"] == len(quantiles):
json_data["samples"] = sorted(quantiles)
elif json_data["num_samples"] == 2:
json_data["samples"] = [quantiles[0], quantiles[-1]]
elif json_data["num_samples"] == 1:
json_data["samples"] = [quantiles[0]]
else:
json_data["quantiles"] = quantiles
if len(quantiles) > 0:
json_data["min"] = quantiles[0]
json_data["max"] = quantiles[-1]
json_data["median"] = quantiles[(len(quantiles) - 1) // 2]
return PerformanceTestResult(json_data)
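
The 𝚫 decoding above is easiest to see on a concrete row. A small worked example, assuming both MEDIAN and MAX are delta-encoded (real logs may carry more quantile columns):

    header = "#,TEST,SAMPLES,QMIN(μs),𝚫MEDIAN(μs),𝚫MAX(μs)"
    line = "1,Ackermann,3,100,5,7"

    labels = header.split(",")
    csv_row = line.split(",")
    for i in range(1, len(labels)):
        if "𝚫" in labels[i]:
            prev = int(csv_row[i - 1])                  # already-decoded column to the left
            inc = int(csv_row[i]) if csv_row[i] else 0  # empty delta means "same value"
            csv_row[i] = str(prev + inc)

    print(csv_row)
    # ['1', 'Ackermann', '3', '100', '105', '112']  (MEDIAN = 100+5, MAX = 105+7)
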
@classmethod
def fromJSONFormat(cls, line):
"""JSON format stores a test result as a JSON object on a single line
Compared to the legacy tab-separated/comma-separated formats, this makes
it much easier to add new fields, handle optional fields, and allows us
to include the full set of samples so we can use better statistics
downstream.
The code here includes optional support for min, max,
median, mean, etc. supported by the older formats, though in practice,
you shouldn't rely on those: Just store the full samples and then
compute whatever statistics you need as required.
"""
json_data = json.loads(line)
return PerformanceTestResult(json_data)
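
For reference, a single result in the JSON format is just one JSON object per line; the keys below match the ones this class reads, while the values are made up:

    import json

    line = '{"number": 3, "name": "Ackermann", "samples": [101, 102, 103], "max_rss": 16384}'
    data = json.loads(line)
    print(data["name"], len(data["samples"]), min(data["samples"]))
    # Ackermann 3 101
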
def __init__(self, json_data):
# Ugly hack to get the old tests to run
if isinstance(json_data, str):
json_data = json.loads(json_data)
# We always have these
assert (json_data.get("number") is not None)
assert (json_data.get("name") is not None)
self.test_num = json_data["number"]
self.name = json_data["name"]
# We always have either samples or num_samples
assert (json_data.get("num_samples") is not None
or json_data.get("samples") is not None)
self.num_samples = json_data.get("num_samples") or len(json_data["samples"])
self.samples = json_data.get("samples") or []
# Everything else is optional and can be read
# out of the JSON data if needed
# See max_rss() below for an example of this.
self.json_data = dict(json_data)
def __repr__(self): def __repr__(self):
"""Short summary for debugging purposes.""" return "PerformanceTestResult(" + json.dumps(self.json_data) + ")"
return (
"<PerformanceTestResult name:{0.name!r} "
"samples:{0.num_samples!r} min:{0.min!r} max:{0.max!r} "
"mean:{0.mean:.0f} sd:{0.sd:.0f} median:{0.median!r}>".format(self)
)
def merge(self, r): def json(self):
"""Return a single-line JSON form of this result
This can be parsed back via fromJSONFormat above.
It can also represent all data stored by the older
formats, so there's no reason not to use it everywhere.
"""
data = dict(self.json_data)
# In case these got modified
data["number"] = self.test_num
data["name"] = self.name
# If we have full sample data, use that and
# drop any lingering pre-computed statistics
# (It's better for downstream consumers to just
# compute whatever statistics they need from scratch.)
# After December 2023, uncomment the next line:
# assert len(self.samples) == self.num_samples
if len(self.samples) == self.num_samples:
data["samples"] = self.samples
data.pop("num_samples", None)
# TODO: Delete min/max/mean/sd/q1/median/q3/quantiles
# after December 2023
data.pop("min", None)
data.pop("max", None)
data.pop("mean", None)
data.pop("sd", None)
data.pop("q1", None)
data.pop("median", None)
data.pop("q3", None)
data.pop("quantiles", None)
else:
# Preserve other pre-existing JSON statistics
data["num_samples"] = self.num_samples
return json.dumps(data)
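
A quick roundtrip sketch, assuming compare_perf_tests is importable (it sits next to the other benchmark scripts): the single-line JSON emitted here feeds straight back into fromJSONFormat, so results survive being written to a log and re-read:

    from compare_perf_tests import PerformanceTestResult

    r = PerformanceTestResult('{"number": 1, "name": "Foo", "samples": [10, 12, 11]}')
    line = str(r)                                     # __str__ returns the single-line JSON form
    r2 = PerformanceTestResult.fromJSONFormat(line)   # parse it back
    assert r2.name == r.name and r2.samples == r.samples and r2.min_value == 10
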
def __str__(self):
return self.json()
@property
def setup(self):
"""TODO: Implement this
"""
return 0
@property
def max_rss(self):
"""Return max_rss if available
"""
return self.json_data.get("max_rss")
@property
def mem_pages(self):
"""Return pages if available
"""
return self.json_data.get("pages")
@property
def involuntary_cs(self):
"""Return involuntary context switches if available
"""
return self.json_data.get("ics")
@property
def yield_count(self):
"""Return voluntary yield count if available
"""
return self.json_data.get("yield")
@property
def min_value(self):
"""Return the minimum value from all samples
If we have full samples, compute it directly.
In the legacy case, we might not have full samples,
so in that case we'll return a value that was given
to us initially (if any).
Eventually (after December 2023), this can be simplified
to just `return min(self.samples)`, since by then
the legacy forms should no longer be in use.
"""
if self.num_samples == len(self.samples):
return min(self.samples)
return self.json_data.get("min")
@property
def max_value(self):
"""Return the maximum sample value
See min_value comments for details on the legacy behavior."""
if self.num_samples == len(self.samples):
return max(self.samples)
return self.json_data.get("max")
@property
def median(self):
"""Return the median sample value
See min_value comments for details on the legacy behavior."""
if self.num_samples == len(self.samples):
return statistics.median(self.samples)
return self.json_data.get("median")
# TODO: Eliminate q1 and q3. They're kept for now
# to preserve compatibility with older reports. But quantiles
# aren't really useful statistics, so just drop them.
@property
def q1(self):
"""Return the 25% quantile
See min_value comments for details on the legacy behavior."""
if self.num_samples == len(self.samples):
q = statistics.quantiles(self.samples, n=4)
return q[0]
return self.json_data.get("q1")
@property
def q3(self):
"""Return the 75% quantile
See min_value comments for details on the legacy behavior."""
if self.num_samples == len(self.samples):
q = statistics.quantiles(self.samples, n=4)
return q[2]
return self.json_data.get("q3")
@property
def mean(self):
"""Return the average
TODO: delete this; it's not useful"""
if self.num_samples == len(self.samples):
return statistics.mean(self.samples)
return self.json_data.get("mean")
@property
def sd(self):
"""Return the standard deviation
TODO: delete this; it's not useful"""
if self.num_samples == len(self.samples):
if len(self.samples) > 1:
return statistics.stdev(self.samples)
else:
return 0
return self.json_data.get("sd")
def merge(self, other):
"""Merge two results. """Merge two results.
Recomputes min, max and mean statistics. If all `samples` are This is trivial in the non-legacy case: We just
available, it recomputes all the statistics. pool all the samples.
The use case here is comparing test results parsed from concatenated
log files from multiple runs of benchmark driver. In the legacy case (or the mixed legacy/non-legacy cases),
we try to estimate the min/max/mean/sd/median/etc based
on whatever information is available. After Dec 2023,
we should be able to drop the legacy support.
""" """
# Statistics # The following can be removed after Dec 2023
if self.samples and r.samples: # (by which time the legacy support should no longer
for sample in r.samples.samples: # be necessary)
self.samples.add(sample) if self.num_samples != len(self.samples):
sams = self.samples # If we don't have samples, we can't rely on being
self.num_samples = sams.num_samples # able to compute real statistics from those samples,
self.min, self.max, self.median, self.mean, self.sd = ( # so we make a best-effort attempt to estimate a joined
sams.min, # statistic from whatever data we actually have.
sams.max,
sams.median, # If both exist, take the minimum, else take whichever is set
sams.mean, other_min_value = other.min_value
sams.sd, if other_min_value is not None:
) self_min_value = self.min_value
else: if self_min_value is not None:
self.min = min(self.min, r.min) self.json_data["min"] = min(other_min_value, self_min_value)
self.max = max(self.max, r.max) else:
self.mean = ( # pooled mean is the weighted sum of means self.json_data["min"] = other_min_value
(self.mean * self.num_samples) + (r.mean * r.num_samples)
) / float(self.num_samples + r.num_samples) # If both exist, take the maximum, else take whichever is set
self.num_samples += r.num_samples other_max_value = other.max_value
self.median, self.sd = None, None if other_max_value is not None:
self_max_value = self.max_value
if self_max_value is not None:
self.json_data["max"] = max(other_max_value, self_max_value)
else:
self.json_data["max"] = other_max_value
# If both exist, take the weighted average, else take whichever is set
other_mean = other.mean
if other_mean is not None:
self_mean = self.mean
if self_mean is not None:
self.json_data["mean"] = (
(other_mean * other.num_samples
+ self_mean * self.num_samples)
/ (self.num_samples + other.num_samples)
)
else:
self.json_data["mean"] = other_mean
self.json_data.pop("median", None) # Remove median
self.json_data.pop("sd", None) # Remove stdev
self.json_data.pop("q1", None) # Remove 25% quantile
self.json_data.pop("q3", None) # Remove 75% quantile
self.json_data.pop("quantiles", None) # Remove quantiles
# Accumulate samples (if present) and num_samples (always)
self.samples += other.samples
self.num_samples += other.num_samples
# Metadata # Metadata
def minimum(a, b): # work around None being less than everything # Use the smaller if both have a max_rss value
return min(filter(lambda x: x is not None, [a, b])) if any([a, b]) else None self.json_data["max_rss"] = other.max_rss
other_max_rss = other.max_rss
self.max_rss = minimum(self.max_rss, r.max_rss) if other_max_rss is not None:
self.setup = minimum(self.setup, r.setup) self_max_rss = self.max_rss
if self_max_rss is not None:
self.json_data["max_rss"] = min(self_max_rss, other_max_rss)
else:
self.json_data["max_rss"] = other_max_rss
class ResultComparison(object): class ResultComparison(object):
@@ -361,16 +451,37 @@ class ResultComparison(object):
self.name = old.name # Test name, convenience accessor self.name = old.name # Test name, convenience accessor
# Speedup ratio # Speedup ratio
self.ratio = (old.min + 0.001) / (new.min + 0.001) self.ratio = (old.min_value + 0.001) / (new.min_value + 0.001)
# Test runtime improvement in % # Test runtime improvement in %
ratio = (new.min + 0.001) / (old.min + 0.001) ratio = (new.min_value + 0.001) / (old.min_value + 0.001)
self.delta = (ratio - 1) * 100 self.delta = (ratio - 1) * 100
# If we have full samples for both old and new...
if (
len(old.samples) == old.num_samples
and len(new.samples) == new.num_samples
):
# TODO: Use a T-Test or U-Test to determine whether
# one set of samples should be considered reliably better than
# the other.
None
# If we do not have full samples, we'll use the
# legacy calculation for compatibility.
# TODO: After Dec 2023, we should always be using full samples
# everywhere and can delete the following entirely.
#
# Indication of dubious changes: when result's MIN falls inside the # Indication of dubious changes: when result's MIN falls inside the
# (MIN, MAX) interval of result they are being compared with. # (MIN, MAX) interval of result they are being compared with.
self.is_dubious = (old.min < new.min and new.min < old.max) or ( self.is_dubious = (
new.min < old.min and old.min < new.max (
old.min_value < new.min_value
and new.min_value < old.max_value
) or (
new.min_value < old.min_value
and old.min_value < new.max_value
)
) )
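
A worked example of the ratio/delta arithmetic above, with illustrative numbers: if the old minimum is 200 μs and the new minimum is 100 μs, the result is roughly a 2.0x speedup and a -50% runtime delta.

    old_min, new_min = 200.0, 100.0
    ratio = (old_min + 0.001) / (new_min + 0.001)               # speedup ratio
    delta = ((new_min + 0.001) / (old_min + 0.001) - 1) * 100   # runtime change in %
    print(round(ratio, 2), round(delta, 1))
    # 2.0 -50.0
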
@@ -385,117 +496,49 @@ class LogParser(object):
def __init__(self): def __init__(self):
"""Create instance of `LogParser`.""" """Create instance of `LogParser`."""
self.results = [] self.results = []
self.quantiles, self.delta, self.memory = False, False, False
self.meta = False
self._reset()
def _reset(self):
"""Reset parser to the default state for reading a new result."""
self.samples, self.yields, self.num_iters = [], [], 1
self.setup, self.max_rss, self.mem_pages = None, None, None
self.voluntary_cs, self.involuntary_cs = None, None
# Parse lines like this
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)
results_re = re.compile(
r"( *\d+[, \t]+[\w.\-\?!]+[, \t]+"
+ r"[, \t]+".join([r"\d+"] * 2) # #,TEST
+ r"(?:[, \t]+\d*)*)" # at least 2...
) # ...or more numeric columns
def _append_result(self, result):
columns = result.split(",") if "," in result else result.split()
r = PerformanceTestResult(
columns,
quantiles=self.quantiles,
memory=self.memory,
delta=self.delta,
meta=self.meta,
)
r.setup = self.setup
r.max_rss = r.max_rss or self.max_rss
r.mem_pages = r.mem_pages or self.mem_pages
r.voluntary_cs = self.voluntary_cs
r.involuntary_cs = r.involuntary_cs or self.involuntary_cs
if self.samples:
r.samples = PerformanceTestSamples(r.name, self.samples)
r.samples.exclude_outliers()
self.results.append(r)
r.yields = self.yields or None
self._reset()
def _store_memory_stats(self, max_rss, mem_pages):
self.max_rss = int(max_rss)
self.mem_pages = int(mem_pages)
def _configure_format(self, header):
self.quantiles = "QMIN" in header
self.memory = "MAX_RSS" in header
self.meta = "PAGES" in header
self.delta = "𝚫" in header
# Regular expression and action to take when it matches the parsed line
state_actions = {
results_re: _append_result,
# Verbose mode adds new productions:
# Adaptively determined N; test loop multiple adjusting runtime to ~1s
re.compile(r"\s+Measuring with scale (\d+)."): (
lambda self, num_iters: setattr(self, "num_iters", num_iters)
),
re.compile(r"\s+Sample (\d+),(\d+)"): (
lambda self, i, runtime: self.samples.append(
Sample(int(i), int(self.num_iters), int(runtime))
)
),
re.compile(r"\s+SetUp (\d+)"): (
lambda self, setup: setattr(self, "setup", int(setup))
),
re.compile(r"\s+Yielding after ~(\d+) μs"): (
lambda self, since_last_yield: self.yields.append(
Yield(len(self.samples), int(since_last_yield))
)
),
re.compile(r"( *#[, \t]+TEST[, \t]+SAMPLES[, \t].*)"): _configure_format,
# Environmental statistics: memory usage and context switches
re.compile(
r"\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)"
): _store_memory_stats,
re.compile(r"\s+VCS \d+ - \d+ = (\d+)"): (
lambda self, vcs: setattr(self, "voluntary_cs", int(vcs))
),
re.compile(r"\s+ICS \d+ - \d+ = (\d+)"): (
lambda self, ics: setattr(self, "involuntary_cs", int(ics))
),
}
def parse_results(self, lines): def parse_results(self, lines):
"""Parse results from the lines of the log output from Benchmark*. """Parse results from the lines of the log output from Benchmark*.
Returns a list of `PerformanceTestResult`s. Returns a list of `PerformanceTestResult`s.
""" """
match_json = re.compile(r"\s*({.*)")
match_header = re.compile(r"( *#[, \t]+TEST.*)")
match_legacy = re.compile(r" *(\d+[, \t].*)")
header = ""
for line in lines: for line in lines:
for regexp, action in LogParser.state_actions.items(): # Current format has a JSON-encoded object on each line
match = regexp.match(line) # That format is flexible so should be the only format
if match: # used going forward
action(self, *match.groups()) if match_json.match(line):
break # stop after 1st match r = PerformanceTestResult.fromJSONFormat(line)
else: # If none matches, skip the line. self.results.append(r)
# print('skipping: ' + line.rstrip('\n')) elif match_header.match(line):
# Legacy formats use a header line (which can be
# inspected to determine the presence and order of columns)
header = line
elif match_legacy.match(line):
# Legacy format: lines of space- or tab-separated values
if "QMIN" in header:
r = PerformanceTestResult.fromQuantileFormat(header, line)
else:
r = PerformanceTestResult.fromOldFormat(header, line)
self.results.append(r)
else:
# Ignore unrecognized lines
# print('Skipping: ' + line.rstrip('\n'), file=sys.stderr, flush=True)
continue continue
return self.results return self.results
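
A usage sketch for the dispatch above, assuming compare_perf_tests is importable: a single log may mix JSON lines, a legacy header, and legacy CSV rows, and the parser handles each line independently (the values are illustrative):

    from compare_perf_tests import LogParser

    log = "\n".join([
        '{"number": 1, "name": "Ackermann", "samples": [101, 102, 103]}',
        "#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)",
        "2,AngryPhonebook,3,2879,3004,2930,54,2910",
    ])
    results = LogParser.results_from_string(log)
    print(sorted(results.keys()))
    # ['Ackermann', 'AngryPhonebook']
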
@staticmethod @staticmethod
def _results_from_lines(lines): def _results_from_lines(lines):
tests = LogParser().parse_results(lines) names = dict()
for r in LogParser().parse_results(lines):
def add_or_merge(names, r):
if r.name not in names: if r.name not in names:
names[r.name] = r names[r.name] = r
else: else:
names[r.name].merge(r) names[r.name].merge(r)
return names return names
return functools.reduce(add_or_merge, tests, dict())
@staticmethod @staticmethod
def results_from_string(log_contents): def results_from_string(log_contents):
@@ -615,18 +658,18 @@ class ReportFormatter(object):
return ( return (
( (
result.name, result.name,
str(result.min), str(result.min_value) if result.min_value is not None else "-",
str(result.max), str(result.max_value) if result.max_value is not None else "-",
str(int(result.mean)), str(result.mean) if result.mean is not None else "-",
str(result.max_rss) if result.max_rss else "", str(result.max_rss) if result.max_rss is not None else "",
) )
if isinstance(result, PerformanceTestResult) if isinstance(result, PerformanceTestResult)
else else
# isinstance(result, ResultComparison) # isinstance(result, ResultComparison)
( (
result.name, result.name,
str(result.old.min), str(result.old.min_value) if result.old.min_value is not None else "-",
str(result.new.min), str(result.new.min_value) if result.new.min_value is not None else "-",
"{0:+.1f}%".format(result.delta), "{0:+.1f}%".format(result.delta),
"{0:.2f}x{1}".format(result.ratio, " (?)" if result.is_dubious else ""), "{0:.2f}x{1}".format(result.ratio, " (?)" if result.is_dubious else ""),
) )


@@ -28,7 +28,7 @@ import subprocess
import sys import sys
from imp import load_source from imp import load_source
from compare_perf_tests import LogParser, TestComparator, create_report from compare_perf_tests import PerformanceTestResult, TestComparator, create_report
# import Benchmark_Driver # doesn't work because it misses '.py' extension # import Benchmark_Driver # doesn't work because it misses '.py' extension
Benchmark_Driver = load_source( Benchmark_Driver = load_source(
@@ -204,12 +204,12 @@ def test_opt_levels(args):
return 0 return 0
def measure(driver, tests, i): def measure(driver, tests, i, min_num_samples):
"""Log and measure samples of the tests with the given driver. """Log and measure samples of the tests with the given driver.
Collect increasing number of samples, depending on the iteration. Collect increasing number of samples, depending on the iteration.
""" """
num_samples = min(i + 3, 10) num_samples = min(i + min_num_samples, 4 * min_num_samples)
msg = " Iteration {0} for {1}: num samples = {2}, ".format( msg = " Iteration {0} for {1}: num samples = {2}, ".format(
i, driver.args.tests, num_samples i, driver.args.tests, num_samples
) )
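
The new sample-count schedule grows with the retry index but is capped at four times the requested minimum, e.g. with min_num_samples = 3 it asks for 3, 4, 5, ... samples and tops out at 12:

    def samples_for_iteration(i, min_num_samples):
        # Same schedule as measure() above: grow per iteration, cap at 4x the minimum.
        return min(i + min_num_samples, 4 * min_num_samples)

    print([samples_for_iteration(i, 3) for i in range(12)])
    # [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12]
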
@@ -246,7 +246,7 @@ def test_performance(
optimization=opt_level)) optimization=opt_level))
for dir in [old_dir, new_dir] for dir in [old_dir, new_dir]
] ]
results = [measure(driver, driver.tests, i) for driver in [old, new]] results = [measure(driver, driver.tests, i, num_samples) for driver in [old, new]]
tests = TestComparator(results[0], results[1], threshold) tests = TestComparator(results[0], results[1], threshold)
changed = tests.decreased + tests.increased changed = tests.decreased + tests.increased
@@ -254,11 +254,11 @@ def test_performance(
i += 1 i += 1
if VERBOSE: if VERBOSE:
log(" test again: " + str([test.name for test in changed])) log(" test again: " + str([test.name for test in changed]))
results = [ old_measurement = measure(old, [test.name for test in changed], i, num_samples)
merge(the_results, measure(driver, [test.name for test in changed], i)) old_results = merge(results[0], old_measurement)
for the_results, driver in zip(results, [old, new]) new_measurement = measure(new, [test.name for test in changed], i, num_samples)
] new_results = merge(results[1], new_measurement)
tests = TestComparator(results[0], results[1], threshold) tests = TestComparator(old_results, new_results, threshold)
changed = tests.decreased + tests.increased changed = tests.decreased + tests.increased
if len(old.tests) == len(changed): if len(old.tests) == len(changed):
@@ -269,7 +269,7 @@ def test_performance(
log("") log("")
report_title = "Performance ({}): -{}".format(arch, opt_level) report_title = "Performance ({}): -{}".format(arch, opt_level)
return report_results( return report_results(
report_title, None, None, threshold * 1.4, output_file, *results report_title, threshold * 1.4, output_file, old_results, new_results
) )
@@ -283,8 +283,8 @@ def report_code_size(opt_level, old_dir, new_dir, architecture, platform, output
) )
idx = 1 idx = 1
old_lines = "" old_results = {}
new_lines = "" new_results = {}
for oldfile in files: for oldfile in files:
new_dir = os.path.join(new_dir, '') new_dir = os.path.join(new_dir, '')
newfile = oldfile.replace(old_dir, new_dir, 1) newfile = oldfile.replace(old_dir, new_dir, 1)
@@ -292,17 +292,13 @@ def report_code_size(opt_level, old_dir, new_dir, architecture, platform, output
oldsize = get_codesize(oldfile) oldsize = get_codesize(oldfile)
newsize = get_codesize(newfile) newsize = get_codesize(newfile)
bname = os.path.basename(oldfile) bname = os.path.basename(oldfile)
old_json = {"number": idx, "name": bname, "samples": [oldsize]}
def result_line(value): new_json = {"number": idx, "name": bname, "samples": [newsize]}
v = "," + str(value) old_results[bname] = PerformanceTestResult(old_json)
return str(idx) + "," + bname + ",1" + (v * 3) + ",0" + v + "\n" new_results[bname] = PerformanceTestResult(new_json)
old_lines += result_line(oldsize)
new_lines += result_line(newsize)
idx += 1 idx += 1
return report_results( return report_results(
"Code size: -" + opt_level, old_lines, new_lines, 0.01, output_file "Code size: -" + opt_level, 0.01, output_file, old_results, new_results
) )
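
A sketch of the idea behind the code-size change above, assuming compare_perf_tests is importable: each binary's size is wrapped as a one-sample PerformanceTestResult, so the same comparison and reporting machinery serves both runtime and code size (the sizes below are made up):

    from compare_perf_tests import PerformanceTestResult

    old = PerformanceTestResult({"number": 1, "name": "Benchmark_O", "samples": [1048576]})
    new = PerformanceTestResult({"number": 1, "name": "Benchmark_O", "samples": [1032192]})
    print(old.min_value, new.min_value)
    # 1048576 1032192
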
@@ -318,16 +314,11 @@ def get_codesize(filename):
def report_results( def report_results(
title, title,
old_lines,
new_lines,
threshold, threshold,
output_file, output_file,
old_results=None, old_results,
new_results=None, new_results,
): ):
old_results = old_results or LogParser.results_from_string(old_lines)
new_results = new_results or LogParser.results_from_string(new_lines)
print("------- " + title + " -------") print("------- " + title + " -------")
print(create_report(old_results, new_results, threshold, "git")) print(create_report(old_results, new_results, threshold, "git"))


@@ -208,7 +208,7 @@ class TestBenchmarkDriverInitialization(unittest.TestCase):
self.args, self.args,
tests=["ignored"], tests=["ignored"],
_subprocess=self.subprocess_mock).test_harness, _subprocess=self.subprocess_mock).test_harness,
"/benchmarks/Benchmark_O", "/benchmarks/Benchmark_O-*",
) )
self.args.tests = "/path" self.args.tests = "/path"
self.args.optimization = "Suffix" self.args.optimization = "Suffix"
@@ -217,28 +217,27 @@ class TestBenchmarkDriverInitialization(unittest.TestCase):
self.args, self.args,
tests=["ignored"], tests=["ignored"],
_subprocess=self.subprocess_mock).test_harness, _subprocess=self.subprocess_mock).test_harness,
"/path/Benchmark_Suffix", "/path/Benchmark_Suffix-*",
) )
def test_gets_list_of_precommit_benchmarks(self): def test_gets_list_of_precommit_benchmarks(self):
self.subprocess_mock.expect( self.subprocess_mock.expect(
"/benchmarks/Benchmark_O --list --delim=\t".split(" "), "/benchmarks/Benchmark_O-* --list".split(" "),
"#\tTest\t[Tags]\n1\tBenchmark1\t[t1, t2]\n2\tBenchmark2\t[t3]\n", """1 Benchmark1 ["t1" "t2"]\n"""
+ """2 Benchmark2 ["t3"]\n""",
) )
driver = BenchmarkDriver(self.args, _subprocess=self.subprocess_mock) driver = BenchmarkDriver(self.args, _subprocess=self.subprocess_mock)
self.subprocess_mock.assert_called_all_expected() self.subprocess_mock.assert_called_all_expected()
self.assertEqual(driver.tests, ["Benchmark1", "Benchmark2"]) self.assertEqual(driver.tests, ["Benchmark1", "Benchmark2"])
self.assertEqual(driver.all_tests, ["Benchmark1", "Benchmark2"]) self.assertEqual(driver.all_tests, ["Benchmark1", "Benchmark2"])
self.assertEqual(driver.test_number["Benchmark1"], "1") self.assertEqual(driver.test_number["Benchmark1"], 1)
self.assertEqual(driver.test_number["Benchmark2"], "2") self.assertEqual(driver.test_number["Benchmark2"], 2)
list_all_tests = ( list_all_tests = (
"/benchmarks/Benchmark_O --list --delim=\t --skip-tags=".split(" "), "/benchmarks/Benchmark_O-* --list --skip-tags=".split(" "),
"""# Test [Tags] """1 Benchmark1 ["t1","t2"]\n"""
1 Benchmark1 [t1, t2] + """2 Benchmark2 ["t3"]\n"""
2 Benchmark2 [t3] + """3 Benchmark3 ["t3","t4"]\n""",
3 Benchmark3 [t3, t4]
""",
) )
def test_gets_list_of_all_benchmarks_when_benchmarks_args_exist(self): def test_gets_list_of_all_benchmarks_when_benchmarks_args_exist(self):
@@ -251,7 +250,7 @@ class TestBenchmarkDriverInitialization(unittest.TestCase):
self.assertEqual(driver.all_tests, ["Benchmark1", "Benchmark2", "Benchmark3"]) self.assertEqual(driver.all_tests, ["Benchmark1", "Benchmark2", "Benchmark3"])
def test_filters_benchmarks_by_pattern(self): def test_filters_benchmarks_by_pattern(self):
self.args.filters = "-f .+3".split() self.args.filters = [".+3"]
self.subprocess_mock.expect(*self.list_all_tests) self.subprocess_mock.expect(*self.list_all_tests)
driver = BenchmarkDriver(self.args, _subprocess=self.subprocess_mock) driver = BenchmarkDriver(self.args, _subprocess=self.subprocess_mock)
self.subprocess_mock.assert_called_all_expected() self.subprocess_mock.assert_called_all_expected()
@@ -310,7 +309,7 @@ class LogParserStub(object):
@staticmethod @staticmethod
def results_from_string(log_contents): def results_from_string(log_contents):
LogParserStub.results_from_string_called = True LogParserStub.results_from_string_called = True
r = PerformanceTestResult("3,b1,1,123,123,123,0,123".split(",")) r = PerformanceTestResult("""{"number":3,"name":"b1","samples":[123]}""")
return {"b1": r} return {"b1": r}
@@ -320,8 +319,8 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
self.parser_stub = LogParserStub() self.parser_stub = LogParserStub()
self.subprocess_mock = SubprocessMock() self.subprocess_mock = SubprocessMock()
self.subprocess_mock.expect( self.subprocess_mock.expect(
"/benchmarks/Benchmark_O --list --delim=\t".split(" "), "/benchmarks/Benchmark_O-* --list".split(" "),
"#\tTest\t[Tags]\n1\tb1\t[tag]\n", """1 b1 ["tag"]""",
) )
self.driver = BenchmarkDriver( self.driver = BenchmarkDriver(
self.args, _subprocess=self.subprocess_mock, parser=self.parser_stub self.args, _subprocess=self.subprocess_mock, parser=self.parser_stub
@@ -329,28 +328,30 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
def test_run_benchmark_with_multiple_samples(self): def test_run_benchmark_with_multiple_samples(self):
self.driver.run("b1") self.driver.run("b1")
self.subprocess_mock.assert_called_with(("/benchmarks/Benchmark_O", "b1")) self.subprocess_mock.assert_called_with(
("/benchmarks/Benchmark_O-*", "b1")
)
self.driver.run("b2", num_samples=5) self.driver.run("b2", num_samples=5)
self.subprocess_mock.assert_called_with( self.subprocess_mock.assert_called_with(
("/benchmarks/Benchmark_O", "b2", "--num-samples=5") ("/benchmarks/Benchmark_O-*", "b2", "--num-samples=5")
) )
def test_run_benchmark_with_specified_number_of_iterations(self): def test_run_benchmark_with_specified_number_of_iterations(self):
self.driver.run("b", num_iters=1) self.driver.run("b", num_iters=1)
self.subprocess_mock.assert_called_with( self.subprocess_mock.assert_called_with(
("/benchmarks/Benchmark_O", "b", "--num-iters=1") ("/benchmarks/Benchmark_O-*", "b", "--num-iters=1")
) )
def test_run_benchmark_for_specified_time(self): def test_run_benchmark_for_specified_time(self):
self.driver.run("b", sample_time=0.5) self.driver.run("b", sample_time=0.5)
self.subprocess_mock.assert_called_with( self.subprocess_mock.assert_called_with(
("/benchmarks/Benchmark_O", "b", "--sample-time=0.5") ("/benchmarks/Benchmark_O-*", "b", "--sample-time=0.5")
) )
def test_run_benchmark_in_verbose_mode(self): def test_run_benchmark_in_verbose_mode(self):
self.driver.run("b", verbose=True) self.driver.run("b", verbose=True)
self.subprocess_mock.assert_called_with( self.subprocess_mock.assert_called_with(
("/benchmarks/Benchmark_O", "b", "--verbose") ("/benchmarks/Benchmark_O-*", "b", "--verbose")
) )
def test_run_batch(self): def test_run_batch(self):
@@ -361,7 +362,9 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
""" """
self.driver.tests = ["b1", "bx"] self.driver.tests = ["b1", "bx"]
self.driver.run() self.driver.run()
self.subprocess_mock.assert_called_with(("/benchmarks/Benchmark_O", "1", "bx")) self.subprocess_mock.assert_called_with(
("/benchmarks/Benchmark_O-*", "1", "bx")
)
def test_parse_results_from_running_benchmarks(self): def test_parse_results_from_running_benchmarks(self):
"""Parse measurements results using LogParser. """Parse measurements results using LogParser.
@@ -379,14 +382,7 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
def test_measure_memory(self): def test_measure_memory(self):
self.driver.run("b", measure_memory=True) self.driver.run("b", measure_memory=True)
self.subprocess_mock.assert_called_with( self.subprocess_mock.assert_called_with(
("/benchmarks/Benchmark_O", "b", "--memory") ("/benchmarks/Benchmark_O-*", "b", "--memory")
)
def test_report_quantiles(self):
"""Use delta compression for quantile reports."""
self.driver.run("b", quantile=4)
self.subprocess_mock.assert_called_with(
("/benchmarks/Benchmark_O", "b", "--quantile=4", "--delta")
) )
def test_run_benchmark_independent_samples(self): def test_run_benchmark_independent_samples(self):
@@ -396,12 +392,10 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
self.assertEqual( self.assertEqual(
self.subprocess_mock.calls.count( self.subprocess_mock.calls.count(
( (
"/benchmarks/Benchmark_O", "/benchmarks/Benchmark_O-*",
"b1", "b1",
"--num-iters=1", "--num-iters=1",
"--memory", "--memory",
"--quantile=20",
"--delta",
) )
), ),
3, 3,
@@ -412,38 +406,36 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
def mock_run(test): def mock_run(test):
self.assertEqual(test, "b1") self.assertEqual(test, "b1")
return PerformanceTestResult( return PerformanceTestResult(
"3,b1,5,101,1,1,1,1,888".split(","), """{"number":3,"""
quantiles=True, + """"name":"b1","""
delta=True, + """"samples":[101,102,103,104,105],"""
memory=True, + """"max_rss":888}"""
) )
driver = BenchmarkDriver(tests=["b1"], args=Stub(output_dir=None)) driver = BenchmarkDriver(tests=["b1"], args=Stub(output_dir=None))
driver.run_independent_samples = mock_run # patching driver.run_independent_samples = mock_run # patching
with captured_output() as (out, _): with captured_output() as (out, _):
log = driver.run_and_log() driver.run_and_log()
header = ( header = (
"#,TEST,SAMPLES,MIN(μs),Q1(μs),MEDIAN(μs),Q3(μs),MAX(μs)," + "MAX_RSS(B)\n" "#,TEST,SAMPLES,MIN(μs),Q1(μs),MEDIAN(μs),Q3(μs),MAX(μs)," + "MAX_RSS(B)\n"
) )
csv_log = "3,b1,5,101,102,103,104,105,888\n" csv_log = "3,b1,5,101,101.5,103,104.5,105,888\n"
self.assertEqual(log, None)
self.assertEqual( self.assertEqual(
out.getvalue(), out.getvalue(),
header + csv_log + "\n" + "Total performance tests executed: 1\n", header + csv_log + "\n" + "Total performance tests executed: 1\n",
) )
with captured_output() as (out, _): with captured_output() as (out, _):
log = driver.run_and_log(csv_console=False) driver.run_and_log(csv_console=False)
self.assertEqual(log, header + csv_log)
self.assertEqual( self.assertEqual(
out.getvalue(), out.getvalue(),
" # TEST SAMPLES MIN(μs)" " # TEST SAMPLES MIN(μs)"
+ " Q1(μs) MEDIAN(μs) Q3(μs) MAX(μs) MAX_RSS(B)\n" + " Q1(μs) MEDIAN(μs) Q3(μs) MAX(μs) MAX_RSS(B)\n"
+ " 3 b1 5 101" + " 3 b1 5 101"
+ " 102 103 104 105 888\n" + " 101.5 103 104.5 105 888\n"
+ "\n" + "\n"
+ "Total performance tests executed: 1\n", + "Total performance tests executed: 1\n",
) )
@@ -459,7 +451,7 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
openmode = "r" # 'U' mode is deprecated in Python 3 openmode = "r" # 'U' mode is deprecated in Python 3
with open(log_file, openmode) as f: with open(log_file, openmode) as f:
text = f.read() text = f.read()
self.assertEqual(text, "formatted output") self.assertEqual(text, "formatted output\n")
try: try:
import tempfile # setUp import tempfile # setUp
@@ -469,7 +461,7 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
driver = BenchmarkDriver(Stub(), tests=[""]) driver = BenchmarkDriver(Stub(), tests=[""])
self.assertFalse(os.path.exists(log_dir)) self.assertFalse(os.path.exists(log_dir))
content = "formatted output" content = ["formatted output"]
log_file = os.path.join(log_dir, "1.log") log_file = os.path.join(log_dir, "1.log")
with captured_output() as (out, _): with captured_output() as (out, _):
driver.log_results(content, log_file=log_file) driver.log_results(content, log_file=log_file)
@@ -512,7 +504,7 @@ class BenchmarkDriverMock(Mock):
def record_and_respond(self, test, num_samples, num_iters, verbose, measure_memory): def record_and_respond(self, test, num_samples, num_iters, verbose, measure_memory):
args = (test, num_samples, num_iters, verbose, measure_memory) args = (test, num_samples, num_iters, verbose, measure_memory)
self.calls.append(args) self.calls.append(args)
return self.respond.get(args, _PTR(min=700)) return self.respond.get(args, _PTR(min_value=700))
class TestLoggingReportFormatter(unittest.TestCase): class TestLoggingReportFormatter(unittest.TestCase):
@@ -615,9 +607,9 @@ class TestMarkdownReportHandler(unittest.TestCase):
self.assert_contains(["| `QuotedName`"]) self.assert_contains(["| `QuotedName`"])
def _PTR(min=700, mem_pages=1000, setup=None): def _PTR(min_value=700, mem_pages=1000, setup=None):
"""Create PerformanceTestResult Stub.""" """Create PerformanceTestResult Stub."""
return Stub(samples=Stub(min=min), mem_pages=mem_pages, setup=setup) return Stub(min_value=min_value, mem_pages=mem_pages, setup=setup)
def _run(test, num_samples=None, num_iters=None, verbose=None, measure_memory=False): def _run(test, num_samples=None, num_iters=None, verbose=None, measure_memory=False):
@@ -688,7 +680,7 @@ class TestBenchmarkDoctor(unittest.TestCase):
# calibration run, returns a stand-in for PerformanceTestResult # calibration run, returns a stand-in for PerformanceTestResult
( (
_run("B1", num_samples=3, num_iters=1, verbose=True), _run("B1", num_samples=3, num_iters=1, verbose=True),
_PTR(min=300), _PTR(min_value=300),
) )
] ]
+ +
@@ -704,7 +696,7 @@ class TestBenchmarkDoctor(unittest.TestCase):
verbose=True, verbose=True,
measure_memory=True, measure_memory=True,
), ),
_PTR(min=300), _PTR(min_value=300),
) )
] ]
* 5 * 5
@@ -721,7 +713,7 @@ class TestBenchmarkDoctor(unittest.TestCase):
verbose=True, verbose=True,
measure_memory=True, measure_memory=True,
), ),
_PTR(min=300), _PTR(min_value=300),
) )
] ]
* 5 * 5
@@ -849,8 +841,8 @@ class TestBenchmarkDoctor(unittest.TestCase):
def measurements(name, runtime): def measurements(name, runtime):
return { return {
"name": name, "name": name,
name + " O i1a": _PTR(min=runtime + 2), name + " O i1a": _PTR(min_value=runtime + 2),
name + " O i2a": _PTR(min=runtime), name + " O i2a": _PTR(min_value=runtime),
} }
with captured_output() as (out, _): with captured_output() as (out, _):
@@ -863,8 +855,8 @@ class TestBenchmarkDoctor(unittest.TestCase):
doctor.analyze( doctor.analyze(
{ {
"name": "OverheadTurtle", "name": "OverheadTurtle",
"OverheadTurtle O i1a": _PTR(min=800000), "OverheadTurtle O i1a": _PTR(min_value=800000),
"OverheadTurtle O i2a": _PTR(min=700000), "OverheadTurtle O i2a": _PTR(min_value=700000),
} }
) )
output = out.getvalue() output = out.getvalue()
@@ -920,30 +912,34 @@ class TestBenchmarkDoctor(unittest.TestCase):
{ {
"name": "NoOverhead", # not 'significant' enough "name": "NoOverhead", # not 'significant' enough
# Based on DropFirstArray a10/e10: overhead 3.7% (6 μs) # Based on DropFirstArray a10/e10: overhead 3.7% (6 μs)
"NoOverhead O i1a": _PTR(min=162), "NoOverhead O i1a": _PTR(min_value=162),
"NoOverhead O i2a": _PTR(min=159), "NoOverhead O i2a": _PTR(min_value=159),
} }
) )
doctor.analyze( doctor.analyze(
{ {
"name": "SO", # Setup Overhead "name": "SO", # Setup Overhead
# Based on SuffixArrayLazy a10/e10: overhead 5.8% (4 μs) # Based on SuffixArrayLazy a10/e10: overhead 5.8% (4 μs)
"SO O i1a": _PTR(min=69), "SO O i1a": _PTR(min_value=69),
"SO O i1b": _PTR(min=70), "SO O i1b": _PTR(min_value=70),
"SO O i2a": _PTR(min=67), "SO O i2a": _PTR(min_value=67),
"SO O i2b": _PTR(min=68), "SO O i2b": _PTR(min_value=68),
} }
) )
doctor.analyze( doctor.analyze(
{"name": "Zero", "Zero O i1a": _PTR(min=0), "Zero O i2a": _PTR(min=0)} {
"name": "Zero",
"Zero O i1a": _PTR(min_value=0),
"Zero O i2a": _PTR(min_value=0)
}
) )
doctor.analyze( doctor.analyze(
{ {
"name": "LOA", # Limit of Accuracy "name": "LOA", # Limit of Accuracy
# Impossible to detect overhead: # Impossible to detect overhead:
# Even 1μs change in 20μs runtime is 5%. # Even 1μs change in 20μs runtime is 5%.
"LOA O i1a": _PTR(min=21), "LOA O i1a": _PTR(min_value=21),
"LOA O i2a": _PTR(min=20), "LOA O i2a": _PTR(min_value=20),
} }
) )
output = out.getvalue() output = out.getvalue()
View File
@@ -13,6 +13,7 @@
# #
# ===---------------------------------------------------------------------===// # ===---------------------------------------------------------------------===//
import json
import os import os
import shutil import shutil
import sys import sys
@@ -21,10 +22,8 @@ import unittest
from compare_perf_tests import LogParser from compare_perf_tests import LogParser
from compare_perf_tests import PerformanceTestResult from compare_perf_tests import PerformanceTestResult
from compare_perf_tests import PerformanceTestSamples
from compare_perf_tests import ReportFormatter from compare_perf_tests import ReportFormatter
from compare_perf_tests import ResultComparison from compare_perf_tests import ResultComparison
from compare_perf_tests import Sample
from compare_perf_tests import TestComparator from compare_perf_tests import TestComparator
from compare_perf_tests import main from compare_perf_tests import main
from compare_perf_tests import parse_args from compare_perf_tests import parse_args
@@ -32,227 +31,70 @@ from compare_perf_tests import parse_args
from test_utils import captured_output from test_utils import captured_output
class TestSample(unittest.TestCase):
def test_has_named_fields(self):
s = Sample(1, 2, 3)
self.assertEqual(s.i, 1)
self.assertEqual(s.num_iters, 2)
self.assertEqual(s.runtime, 3)
def test_is_iterable(self):
s = Sample(1, 2, 3)
self.assertEqual(s[0], 1)
self.assertEqual(s[1], 2)
self.assertEqual(s[2], 3)
class TestPerformanceTestSamples(unittest.TestCase):
def setUp(self):
self.samples = PerformanceTestSamples("B1")
self.samples.add(Sample(7, 42, 1000))
def test_has_name(self):
self.assertEqual(self.samples.name, "B1")
def test_stores_samples(self):
self.assertEqual(self.samples.count, 1)
s = self.samples.samples[0]
self.assertTrue(isinstance(s, Sample))
self.assertEqual(s.i, 7)
self.assertEqual(s.num_iters, 42)
self.assertEqual(s.runtime, 1000)
def test_quantile(self):
self.assertEqual(self.samples.quantile(1), 1000)
self.assertEqual(self.samples.quantile(0), 1000)
self.samples.add(Sample(2, 1, 1100))
self.assertEqual(self.samples.quantile(0), 1000)
self.assertEqual(self.samples.quantile(1), 1100)
self.samples.add(Sample(3, 1, 1050))
self.assertEqual(self.samples.quantile(0), 1000)
self.assertEqual(self.samples.quantile(0.5), 1050)
self.assertEqual(self.samples.quantile(1), 1100)
def assertEqualFiveNumberSummary(self, ss, expected_fns):
e_min, e_q1, e_median, e_q3, e_max = expected_fns
self.assertEqual(ss.min, e_min)
self.assertEqual(ss.q1, e_q1)
self.assertEqual(ss.median, e_median)
self.assertEqual(ss.q3, e_q3)
self.assertEqual(ss.max, e_max)
def test_computes_five_number_summary(self):
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1000, 1000, 1000))
self.samples.add(Sample(2, 1, 1100))
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1000, 1100, 1100))
self.samples.add(Sample(3, 1, 1050))
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1050, 1100, 1100))
self.samples.add(Sample(4, 1, 1025))
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1025, 1050, 1100))
self.samples.add(Sample(5, 1, 1075))
self.assertEqualFiveNumberSummary(self.samples, (1000, 1025, 1050, 1075, 1100))
def test_computes_inter_quartile_range(self):
self.assertEqual(self.samples.iqr, 0)
self.samples.add(Sample(2, 1, 1025))
self.samples.add(Sample(3, 1, 1050))
self.samples.add(Sample(4, 1, 1075))
self.samples.add(Sample(5, 1, 1100))
self.assertEqual(self.samples.iqr, 50)
def assertEqualStats(self, stats, expected_stats):
for actual, expected in zip(stats, expected_stats):
self.assertAlmostEqual(actual, expected, places=2)
def test_computes_mean_sd_cv(self):
ss = self.samples
self.assertEqualStats((ss.mean, ss.sd, ss.cv), (1000.0, 0.0, 0.0))
self.samples.add(Sample(2, 1, 1100))
self.assertEqualStats((ss.mean, ss.sd, ss.cv), (1050.0, 70.71, 6.7 / 100))
def test_computes_range_spread(self):
ss = self.samples
self.assertEqualStats((ss.range, ss.spread), (0, 0))
self.samples.add(Sample(2, 1, 1100))
self.assertEqualStats((ss.range, ss.spread), (100, 10.0 / 100))
def test_init_with_samples(self):
self.samples = PerformanceTestSamples(
"B2", [Sample(0, 1, 1000), Sample(1, 1, 1100)]
)
self.assertEqual(self.samples.count, 2)
self.assertEqualStats(
(
self.samples.mean,
self.samples.sd,
self.samples.range,
self.samples.spread,
),
(1050.0, 70.71, 100, 9.52 / 100),
)
def test_can_handle_zero_runtime(self):
# guard against dividing by 0
self.samples = PerformanceTestSamples("Zero")
self.samples.add(Sample(0, 1, 0))
self.assertEqualStats(
(
self.samples.mean,
self.samples.sd,
self.samples.cv,
self.samples.range,
self.samples.spread,
),
(0, 0, 0.0, 0, 0.0),
)
def test_excludes_outliers(self):
ss = [
Sample(*map(int, s.split()))
for s in "0 1 1000, 1 1 1025, 2 1 1050, 3 1 1075, 4 1 1100, "
"5 1 1000, 6 1 1025, 7 1 1050, 8 1 1075, 9 1 1100, "
"10 1 1050, 11 1 949, 12 1 1151".split(",")
]
self.samples = PerformanceTestSamples("Outliers", ss)
self.assertEqual(self.samples.count, 13)
self.assertEqualStats((self.samples.mean, self.samples.sd), (1050, 52.36))
self.samples.exclude_outliers()
self.assertEqual(self.samples.count, 11)
self.assertEqual(self.samples.outliers, ss[11:])
self.assertEqualFiveNumberSummary(self.samples, (1000, 1025, 1050, 1075, 1100))
self.assertEqualStats((self.samples.mean, self.samples.sd), (1050, 35.36))
def test_excludes_outliers_zero_IQR(self):
self.samples = PerformanceTestSamples("Tight")
self.samples.add(Sample(0, 2, 23))
self.samples.add(Sample(1, 2, 18))
self.samples.add(Sample(2, 2, 18))
self.samples.add(Sample(3, 2, 18))
self.assertEqual(self.samples.iqr, 0)
self.samples.exclude_outliers()
self.assertEqual(self.samples.count, 3)
self.assertEqualStats((self.samples.min, self.samples.max), (18, 18))
def test_excludes_outliers_top_only(self):
ss = [
Sample(*map(int, s.split()))
for s in "0 1 1, 1 1 2, 2 1 2, 3 1 2, 4 1 3".split(",")
]
self.samples = PerformanceTestSamples("Top", ss)
self.assertEqualFiveNumberSummary(self.samples, (1, 2, 2, 2, 3))
self.assertEqual(self.samples.iqr, 0)
self.samples.exclude_outliers(top_only=True)
self.assertEqual(self.samples.count, 4)
self.assertEqualStats((self.samples.min, self.samples.max), (1, 2))
class TestPerformanceTestResult(unittest.TestCase): class TestPerformanceTestResult(unittest.TestCase):
def test_init(self): def test_init(self):
header = "#,TEST,SAMPLES,MIN,MAX,MEAN,SD,MEDIAN"
log_line = "1,AngryPhonebook,20,10664,12933,11035,576,10884" log_line = "1,AngryPhonebook,20,10664,12933,11035,576,10884"
r = PerformanceTestResult(log_line.split(",")) r = PerformanceTestResult.fromOldFormat(header, log_line)
self.assertEqual(r.test_num, "1") self.assertEqual(r.test_num, 1)
self.assertEqual(r.name, "AngryPhonebook") self.assertEqual(r.name, "AngryPhonebook")
self.assertEqual( self.assertEqual(
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median), (r.num_samples, r.min_value, r.max_value, r.mean, r.sd, r.median),
(20, 10664, 12933, 11035, 576, 10884), (20, 10664, 12933, 11035, 576, 10884),
) )
self.assertEqual(r.samples, None) self.assertEqual(r.samples, [])
header = "#,TEST,SAMPLES,MIN,MAX,MEAN,SD,MEDIAN,MAX_RSS"
log_line = "1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336" log_line = "1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336"
r = PerformanceTestResult(log_line.split(","), memory=True) r = PerformanceTestResult.fromOldFormat(header, log_line)
self.assertEqual(r.max_rss, 10510336) self.assertEqual(r.max_rss, 10510336)
def test_init_quantiles(self): def test_init_quantiles(self):
# #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs) header = "#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)"
log = "1,Ackermann,3,54383,54512,54601" log = "1,Ackermann,3,54383,54512,54601"
r = PerformanceTestResult(log.split(","), quantiles=True) r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual(r.test_num, "1") self.assertEqual(r.test_num, 1)
self.assertEqual(r.name, "Ackermann") self.assertEqual(r.name, "Ackermann")
self.assertEqual( self.assertEqual(
(r.num_samples, r.min, r.median, r.max), (3, 54383, 54512, 54601) (r.num_samples, r.min_value, r.median, r.max_value),
(3, 54383, 54512, 54601)
) )
self.assertAlmostEqual(r.mean, 54498.67, places=2) self.assertAlmostEqual(r.mean, 54498.67, places=2)
self.assertAlmostEqual(r.sd, 109.61, places=2) self.assertAlmostEqual(r.sd, 109.61, places=2)
self.assertEqual(r.samples.count, 3) self.assertEqual(r.samples, [54383, 54512, 54601])
self.assertEqual(r.samples.num_samples, 3)
self.assertEqual(
[s.runtime for s in r.samples.all_samples], [54383, 54512, 54601]
)
# #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B) header = "#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)"
log = "1,Ackermann,3,54529,54760,55807,266240" log = "1,Ackermann,3,54529,54760,55807,266240"
r = PerformanceTestResult(log.split(","), quantiles=True, memory=True) r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual((r.samples.count, r.max_rss), (3, 266240)) self.assertEqual((len(r.samples), r.max_rss), (3, 266240))
# #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs)
header = "#,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs)"
log = "1,Ackermann,5,54570,54593,54644,57212,58304" log = "1,Ackermann,5,54570,54593,54644,57212,58304"
r = PerformanceTestResult(log.split(","), quantiles=True, memory=False) r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual( self.assertEqual(
(r.num_samples, r.min, r.median, r.max), (5, 54570, 54644, 58304) (r.num_samples, r.min_value, r.median, r.max_value),
(5, 54570, 54644, 58304)
) )
self.assertEqual((r.samples.q1, r.samples.q3), (54593, 57212)) self.assertEqual((r.q1, r.q3), (54581.5, 57758))
self.assertEqual(r.samples.count, 5) self.assertEqual(len(r.samples), 5)
# #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
header = "#,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)"
log = "1,Ackermann,5,54686,54731,54774,55030,63466,270336" log = "1,Ackermann,5,54686,54731,54774,55030,63466,270336"
r = PerformanceTestResult(log.split(","), quantiles=True, memory=True) r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual(r.samples.num_samples, 5) self.assertEqual(r.num_samples, 5)
self.assertEqual(r.samples.count, 4) # outlier was excluded self.assertEqual(len(r.samples), 5)
self.assertEqual(r.max_rss, 270336) self.assertEqual(r.max_rss, 270336)
def test_init_delta_quantiles(self): def test_init_delta_quantiles(self):
# #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX
# 2-quantile from 2 samples in repeated min, when delta encoded, # 2-quantile from 2 samples in repeated min, when delta encoded,
# the difference is 0, which is omitted -- only separator remains # the difference is 0, which is omitted -- only separator remains
header = "#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX"
log = "202,DropWhileArray,2,265,,22" log = "202,DropWhileArray,2,265,,22"
r = PerformanceTestResult(log.split(","), quantiles=True, delta=True) r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual((r.num_samples, r.min, r.median, r.max), (2, 265, 265, 287)) self.assertEqual((r.num_samples, r.min_value, r.median, r.max_value),
self.assertEqual(r.samples.count, 2) (2, 265, 276, 287))
self.assertEqual(r.samples.num_samples, 2) self.assertEqual(len(r.samples), 2)
self.assertEqual(r.num_samples, 2)
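For context on the expectations above: in the delta-quantile format each field after QMIN is a difference from the previous quantile, with an empty field standing for 0, so "265,,22" decodes to the quantiles 265, 265, 287. Keeping the reconstructed endpoints as the two samples is consistent with the asserted median of 276, the midpoint of 265 and 287. A hypothetical decoder, for illustration only (not the compare_perf_tests implementation):

    def decode_delta_quantiles(fields):
        # First field is the absolute QMIN; the rest are deltas, "" meaning 0.
        values = [int(fields[0])]
        for field in fields[1:]:
            values.append(values[-1] + (int(field) if field else 0))
        return values

    print(decode_delta_quantiles(["265", "", "22"]))  # [265, 265, 287]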
def test_init_oversampled_quantiles(self): def test_init_oversampled_quantiles(self):
"""When num_samples is < quantile + 1, some of the measurements are """When num_samples is < quantile + 1, some of the measurements are
@@ -265,6 +107,16 @@ class TestPerformanceTestResult(unittest.TestCase):
tbl <- function(s) t(sapply(1:s, function(x) { tbl <- function(s) t(sapply(1:s, function(x) {
qs <- subsample(x, s); c(qs[1], diff(qs)) })) qs <- subsample(x, s); c(qs[1], diff(qs)) }))
sapply(c(3, 5, 11, 21), tbl) sapply(c(3, 5, 11, 21), tbl)
TODO: Delete this test when we delete quantile support from the
benchmark harness. Reconstructing samples from quantiles as this code is
trying to do is not really statistically sound, which is why we're going
to delete most of this in favor of an architecture where the
lowest-level benchmarking logic reports samples, we store and pass
raw sample data around as much as possible, and summary statistics are
only computed as necessary for actual reporting (and then discarded,
since we can recompute anything we need if we always have the raw
samples available).
""" """
def validatePTR(deq): # construct from delta encoded quantiles string def validatePTR(deq): # construct from delta encoded quantiles string
@@ -273,10 +125,8 @@ class TestPerformanceTestResult(unittest.TestCase):
r = PerformanceTestResult( r = PerformanceTestResult(
["0", "B", str(num_samples)] + deq, quantiles=True, delta=True ["0", "B", str(num_samples)] + deq, quantiles=True, delta=True
) )
self.assertEqual(r.samples.num_samples, num_samples) self.assertEqual(len(r.samples), num_samples)
self.assertEqual( self.assertEqual(r.samples, range(1, num_samples + 1))
[s.runtime for s in r.samples.all_samples], range(1, num_samples + 1)
)
delta_encoded_quantiles = """ delta_encoded_quantiles = """
1,, 1,,
@@ -318,119 +168,152 @@ class TestPerformanceTestResult(unittest.TestCase):
map(validatePTR, delta_encoded_quantiles.split("\n")[1:]) map(validatePTR, delta_encoded_quantiles.split("\n")[1:])
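The TODO above describes where this change is headed: keep every raw measurement and derive summaries only when a report needs them. A minimal sketch of that samples-first shape, for illustration only (the names here are not the actual PerformanceTestResult API):

    import statistics

    class RawSamplesResult:
        """Stores raw measurements; summaries are recomputed on demand."""

        def __init__(self, name, samples):
            self.name = name
            self.samples = list(samples)

        @property
        def min_value(self):
            return min(self.samples)

        @property
        def median(self):
            return statistics.median(self.samples)

        def merge(self, other):
            # Merging is lossless: pool the raw samples and every summary
            # statistic stays recomputable.
            self.samples.extend(other.samples)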
def test_init_meta(self): def test_init_meta(self):
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),… header = (
# …PAGES,ICS,YIELD "#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),"
+ "MEDIAN(μs),PAGES,ICS,YIELD"
)
log = "1,Ackermann,200,715,1281,726,47,715,7,29,15" log = "1,Ackermann,200,715,1281,726,47,715,7,29,15"
r = PerformanceTestResult(log.split(","), meta=True) r = PerformanceTestResult.fromOldFormat(header, log)
self.assertEqual((r.test_num, r.name), ("1", "Ackermann")) self.assertEqual((r.test_num, r.name), (1, "Ackermann"))
self.assertEqual( self.assertEqual(
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median), (r.num_samples, r.min_value, r.max_value, r.mean, r.sd, r.median),
(200, 715, 1281, 726, 47, 715), (200, 715, 1281, 726, 47, 715),
) )
self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (7, 29, 15)) self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (7, 29, 15))
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B),… header = (
# …PAGES,ICS,YIELD "#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),"
+ "MAX_RSS(B),PAGES,ICS,YIELD"
)
log = "1,Ackermann,200,715,1951,734,97,715,36864,9,50,15" log = "1,Ackermann,200,715,1951,734,97,715,36864,9,50,15"
r = PerformanceTestResult(log.split(","), memory=True, meta=True) r = PerformanceTestResult.fromOldFormat(header, log)
self.assertEqual( self.assertEqual(
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median), (r.num_samples, r.min_value, r.max_value, r.mean, r.sd, r.median),
(200, 715, 1951, 734, 97, 715), (200, 715, 1951, 734, 97, 715),
) )
self.assertEqual( self.assertEqual(
(r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss), (r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(9, 50, 15, 36864), (9, 50, 15, 36864),
) )
# #,TEST,SAMPLES,MIN(μs),MAX(μs),PAGES,ICS,YIELD header = "#,TEST,SAMPLES,MIN(μs),MAX(μs),PAGES,ICS,YIELD"
log = "1,Ackermann,200,715,3548,8,31,15" log = "1,Ackermann,200,715,3548,8,31,15"
r = PerformanceTestResult(log.split(","), quantiles=True, meta=True) r = PerformanceTestResult.fromOldFormat(header, log)
self.assertEqual((r.num_samples, r.min, r.max), (200, 715, 3548)) self.assertEqual((r.num_samples, r.min_value, r.max_value), (200, 715, 3548))
self.assertEqual( self.assertEqual(r.samples, [])
(r.samples.count, r.samples.min, r.samples.max), (2, 715, 3548)
)
self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 31, 15)) self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 31, 15))
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD
header = "#,TEST,SAMPLES,MIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD"
log = "1,Ackermann,200,715,1259,32768,8,28,15" log = "1,Ackermann,200,715,1259,32768,8,28,15"
r = PerformanceTestResult( r = PerformanceTestResult.fromOldFormat(header, log)
log.split(","), quantiles=True, memory=True, meta=True self.assertEqual((r.num_samples, r.min_value, r.max_value), (200, 715, 1259))
) self.assertEqual(r.samples, [])
self.assertEqual((r.num_samples, r.min, r.max), (200, 715, 1259))
self.assertEqual(
(r.samples.count, r.samples.min, r.samples.max), (2, 715, 1259)
)
self.assertEqual(r.max_rss, 32768) self.assertEqual(r.max_rss, 32768)
self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 28, 15)) self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 28, 15))
def test_repr(self):
log_line = "1,AngryPhonebook,20,10664,12933,11035,576,10884"
r = PerformanceTestResult(log_line.split(","))
self.assertEqual(
str(r),
"<PerformanceTestResult name:'AngryPhonebook' samples:20 "
"min:10664 max:12933 mean:11035 sd:576 median:10884>",
)
def test_merge(self): def test_merge(self):
tests = """ tests = [
1,AngryPhonebook,1,12045,12045,12045,0,12045 """{"number":1,"name":"AngryPhonebook",
1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336 "samples":[12045]}""",
1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144 """{"number":1,"name":"AngryPhonebook",
1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split( "samples":[12325],"max_rss":10510336}""",
"\n" """{"number":1,"name":"AngryPhonebook",
)[ "samples":[11616],"max_rss":10502144}""",
1: """{"number":1,"name":"AngryPhonebook",
"samples":[12270],"max_rss":10498048}"""
] ]
def makeResult(csv_row): results = [PerformanceTestResult(json) for json in tests]
return PerformanceTestResult(csv_row, memory=True)
results = list(map(makeResult, [line.split(",") for line in tests]))
results[2].setup = 9
results[3].setup = 7
def as_tuple(r): def as_tuple(r):
return ( return (
r.num_samples, r.num_samples,
r.min, r.min_value,
r.max, r.max_value,
round(r.mean, 2), round(r.mean, 2),
r.sd, round(r.sd, 2),
r.median, r.median,
r.max_rss, r.max_rss,
r.setup,
) )
r = results[0] r = results[0]
self.assertEqual(as_tuple(r), (1, 12045, 12045, 12045, 0, 12045, None, None)) self.assertEqual(as_tuple(r), (1, 12045, 12045, 12045, 0, 12045, None))
r.merge(results[1]) r.merge(results[1])
self.assertEqual( self.assertEqual(
as_tuple(r), # drops SD and median, +max_rss as_tuple(r),
(2, 12045, 12325, 12185, None, None, 10510336, None), (2, 12045, 12325, 12185, 197.99, 12185, 10510336),
) )
r.merge(results[2]) r.merge(results[2])
self.assertEqual( self.assertEqual(
as_tuple(r), # picks smaller of the MAX_RSS, +setup as_tuple(r),
(3, 11616, 12325, 11995.33, None, None, 10502144, 9), (3, 11616, 12325, 11995.33, 357.1, 12045, 10502144),
) )
r.merge(results[3]) r.merge(results[3])
self.assertEqual( self.assertEqual(
as_tuple(r), # picks smaller of the setup values as_tuple(r),
(4, 11616, 12325, 12064, None, None, 10498048, 7), (4, 11616, 12325, 12064, 322.29, 12157.5, 10498048),
)
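The merged statistics asserted above follow directly from pooling the raw samples: after the first merge the pooled samples are [12045, 12325], so the mean is 12185, the sample standard deviation is about 197.99, and the median is 12185, exactly the tuple expected. A quick cross-check:

    import statistics

    merged = [12045, 12325]
    print(statistics.mean(merged))             # 12185
    print(round(statistics.stdev(merged), 2))  # 197.99
    print(statistics.median(merged))           # 12185.0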
def test_legacy_merge(self):
header = """#,TEST,NUM_SAMPLES,MIN,MAX,MEAN,SD,MEDIAN, MAX_RSS"""
tests = [
"""1,AngryPhonebook,8,12045,12045,12045,0,12045""",
"""1,AngryPhonebook,8,12325,12325,12325,0,12325,10510336""",
"""1,AngryPhonebook,8,11616,11616,11616,0,11616,10502144""",
"""1,AngryPhonebook,8,12270,12270,12270,0,12270,10498048"""
]
results = [PerformanceTestResult.fromOldFormat(header, row) for row in tests]
def as_tuple(r):
return (
r.num_samples,
r.min_value,
r.max_value,
round(r.mean, 2),
round(r.sd, 2) if r.sd is not None else None,
r.median,
r.max_rss,
)
r = results[0]
self.assertEqual(as_tuple(r), (8, 12045, 12045, 12045, 0, 12045, None))
r.merge(results[1])
self.assertEqual(
as_tuple(r), # Note: SD, Median are lost
(16, 12045, 12325, 12185, None, None, 10510336),
)
r.merge(results[2])
self.assertEqual(
as_tuple(r),
(24, 11616, 12325, 11995.33, None, None, 10502144),
)
r.merge(results[3])
self.assertEqual(
as_tuple(r),
(32, 11616, 12325, 12064, None, None, 10498048),
) )
class TestResultComparison(unittest.TestCase): class TestResultComparison(unittest.TestCase):
def setUp(self): def setUp(self):
self.r0 = PerformanceTestResult( self.r0 = PerformanceTestResult(
"101,GlobalClass,20,0,0,0,0,0,10185728".split(",") """{"number":101,"name":"GlobalClass",
"samples":[0,0,0,0,0],"max_rss":10185728}"""
) )
self.r01 = PerformanceTestResult( self.r01 = PerformanceTestResult(
"101,GlobalClass,20,20,20,20,0,0,10185728".split(",") """{"number":101,"name":"GlobalClass",
"samples":[20,20,20],"max_rss":10185728}"""
) )
self.r1 = PerformanceTestResult( self.r1 = PerformanceTestResult(
"1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336".split(",") """{"number":1,"name":"AngryPhonebook",
"samples":[12325],"max_rss":10510336}"""
) )
self.r2 = PerformanceTestResult( self.r2 = PerformanceTestResult(
"1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144".split(",") """{"number":1,"name":"AngryPhonebook",
"samples":[11616],"max_rss":10502144}"""
)
self.r3 = PerformanceTestResult(
"""{"number":1,"name":"AngryPhonebook",
"samples":[11616,12326],"max_rss":10502144}"""
) )
def test_init(self): def test_init(self):
@@ -455,11 +338,10 @@ class TestResultComparison(unittest.TestCase):
def test_values_is_dubious(self): def test_values_is_dubious(self):
self.assertFalse(ResultComparison(self.r1, self.r2).is_dubious) self.assertFalse(ResultComparison(self.r1, self.r2).is_dubious)
self.r2.max = self.r1.min + 1
# new.min < old.min < new.max # new.min < old.min < new.max
self.assertTrue(ResultComparison(self.r1, self.r2).is_dubious) self.assertTrue(ResultComparison(self.r1, self.r3).is_dubious)
# other way around: old.min < new.min < old.max # other way around: old.min < new.min < old.max
self.assertTrue(ResultComparison(self.r2, self.r1).is_dubious) self.assertTrue(ResultComparison(self.r3, self.r1).is_dubious)
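With the fixtures above, r3's samples [11616, 12326] straddle r1's single sample 12325 (11616 < 12325 < 12326), so the min/max ranges overlap and the comparison is flagged as dubious in both directions.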
class FileSystemIntegration(unittest.TestCase): class FileSystemIntegration(unittest.TestCase):
@@ -474,45 +356,48 @@ class FileSystemIntegration(unittest.TestCase):
def write_temp_file(self, file_name, data): def write_temp_file(self, file_name, data):
temp_file_name = os.path.join(self.test_dir, file_name) temp_file_name = os.path.join(self.test_dir, file_name)
with open(temp_file_name, "w") as f: with open(temp_file_name, "w") as f:
f.write(data) for line in data:
f.write(line)
f.write('\n')
return temp_file_name return temp_file_name
class OldAndNewLog(unittest.TestCase): class OldAndNewLog(unittest.TestCase):
old_log_content = """1,AngryPhonebook,20,10458,12714,11000,0,11000,10204365
2,AnyHashableWithAClass,20,247027,319065,259056,0,259056,10250445
3,Array2D,20,335831,400221,346622,0,346622,28297216
4,ArrayAppend,20,23641,29000,24990,0,24990,11149926
34,BitCount,20,3,4,4,0,4,10192896
35,ByteSwap,20,4,6,4,0,4,10185933"""
new_log_content = """265,TwoSum,20,5006,5679,5111,0,5111 old_log_content = [
35,ByteSwap,20,0,0,0,0,0 """{"number":1,"name":"AngryPhonebook","""
34,BitCount,20,9,9,9,0,9 + """"samples":[10458,12714,11000],"max_rss":10204365}""",
4,ArrayAppend,20,20000,29000,24990,0,24990 """{"number":2,"name":"AnyHashableWithAClass","""
3,Array2D,20,335831,400221,346622,0,346622 + """"samples":[247027,319065,259056,259056],"max_rss":10250445}""",
1,AngryPhonebook,20,10458,12714,11000,0,11000""" """{"number":3,"name":"Array2D","""
+ """"samples":[335831,400221,346622,346622],"max_rss":28297216}""",
"""{"number":4,"name":"ArrayAppend","""
+ """"samples":[23641,29000,24990,24990],"max_rss":11149926}""",
"""{"number":34,"name":"BitCount","samples":[3,4,4,4],"max_rss":10192896}""",
"""{"number":35,"name":"ByteSwap","samples":[4,6,4,4],"max_rss":10185933}"""
]
def makeResult(csv_row): new_log_content = [
return PerformanceTestResult(csv_row, memory=True) """{"number":265,"name":"TwoSum","samples":[5006,5679,5111,5111]}""",
"""{"number":35,"name":"ByteSwap","samples":[0,0,0,0,0]}""",
"""{"number":34,"name":"BitCount","samples":[9,9,9,9]}""",
"""{"number":4,"name":"ArrayAppend","samples":[20000,29000,24990,24990]}""",
"""{"number":3,"name":"Array2D","samples":[335831,400221,346622,346622]}""",
"""{"number":1,"name":"AngryPhonebook","samples":[10458,12714,11000,11000]}"""
]
def makeResult(json_text):
return PerformanceTestResult(json.loads(json_text))
old_results = dict( old_results = dict(
[ [
(r.name, r) (r.name, r) for r in map(makeResult, old_log_content)
for r in map(
makeResult,
[line.split(",") for line in old_log_content.splitlines()],
)
] ]
) )
new_results = dict( new_results = dict(
[ [
(r.name, r) (r.name, r) for r in map(makeResult, new_log_content)
for r in map(
makeResult,
[line.split(",") for line in new_log_content.splitlines()],
)
] ]
) )
@@ -567,16 +452,12 @@ Total performance tests executed: 1
"""#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs) """#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs)
1,Ackermann,3,54383,54512,54601""" 1,Ackermann,3,54383,54512,54601"""
)["Ackermann"] )["Ackermann"]
self.assertEqual( self.assertEqual(r.samples, [54383, 54512, 54601])
[s.runtime for s in r.samples.all_samples], [54383, 54512, 54601]
)
r = LogParser.results_from_string( r = LogParser.results_from_string(
"""#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B) """#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)
1,Ackermann,3,54529,54760,55807,266240""" 1,Ackermann,3,54529,54760,55807,266240"""
)["Ackermann"] )["Ackermann"]
self.assertEqual( self.assertEqual(r.samples, [54529, 54760, 55807])
[s.runtime for s in r.samples.all_samples], [54529, 54760, 55807]
)
self.assertEqual(r.max_rss, 266240) self.assertEqual(r.max_rss, 266240)
def test_parse_delta_quantiles(self): def test_parse_delta_quantiles(self):
@@ -584,15 +465,15 @@ Total performance tests executed: 1
"#,TEST,SAMPLES,QMIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,1,101,," "#,TEST,SAMPLES,QMIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,1,101,,"
)["B"] )["B"]
self.assertEqual( self.assertEqual(
(r.num_samples, r.min, r.median, r.max, r.samples.count), (r.num_samples, r.min_value, r.median, r.max_value, len(r.samples)),
(1, 101, 101, 101, 1), (1, 101, 101, 101, 1),
) )
r = LogParser.results_from_string( r = LogParser.results_from_string(
"#,TEST,SAMPLES,QMIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,2,101,,1" "#,TEST,SAMPLES,QMIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,2,101,,1"
)["B"] )["B"]
self.assertEqual( self.assertEqual(
(r.num_samples, r.min, r.median, r.max, r.samples.count), (r.num_samples, r.min_value, r.median, r.max_value, len(r.samples)),
(2, 101, 101, 102, 2), (2, 101, 101.5, 102, 2),
) )
r = LogParser.results_from_string( # 20-quantiles aka. ventiles r = LogParser.results_from_string( # 20-quantiles aka. ventiles
"#,TEST,SAMPLES,QMIN(μs),𝚫V1,𝚫V2,𝚫V3,𝚫V4,𝚫V5,𝚫V6,𝚫V7,𝚫V8," "#,TEST,SAMPLES,QMIN(μs),𝚫V1,𝚫V2,𝚫V3,𝚫V4,𝚫V5,𝚫V6,𝚫V7,𝚫V8,"
@@ -600,9 +481,8 @@ Total performance tests executed: 1
+ "202,DropWhileArray,200,214,,,,,,,,,,,,1,,,,,,2,16,464" + "202,DropWhileArray,200,214,,,,,,,,,,,,1,,,,,,2,16,464"
)["DropWhileArray"] )["DropWhileArray"]
self.assertEqual( self.assertEqual(
(r.num_samples, r.min, r.max, r.samples.count), (r.num_samples, r.min_value, r.max_value, len(r.samples)),
# last 3 ventiles were outliers and were excluded from the sample (200, 214, 697, 0),
(200, 214, 215, 18),
) )
def test_parse_meta(self): def test_parse_meta(self):
@@ -612,7 +492,7 @@ Total performance tests executed: 1
+ "0,B,1,2,2,2,0,2,7,29,15" + "0,B,1,2,2,2,0,2,7,29,15"
)["B"] )["B"]
self.assertEqual( self.assertEqual(
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count), (2, 7, 29, 15) (r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count), (2, 7, 29, 15)
) )
r = LogParser.results_from_string( r = LogParser.results_from_string(
"#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)," "#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),"
@@ -620,163 +500,35 @@ Total performance tests executed: 1
+ "0,B,1,3,3,3,0,3,36864,9,50,15" + "0,B,1,3,3,3,0,3,36864,9,50,15"
)["B"] )["B"]
self.assertEqual( self.assertEqual(
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss), (r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(3, 9, 50, 15, 36864), (3, 9, 50, 15, 36864),
) )
r = LogParser.results_from_string( r = LogParser.results_from_string(
"#,TEST,SAMPLES,QMIN(μs),MAX(μs),PAGES,ICS,YIELD\n" + "0,B,1,4,4,8,31,15" "#,TEST,SAMPLES,QMIN(μs),MAX(μs),PAGES,ICS,YIELD\n" + "0,B,1,4,4,8,31,15"
)["B"] )["B"]
self.assertEqual( self.assertEqual(
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count), (4, 8, 31, 15) (r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count), (4, 8, 31, 15)
) )
r = LogParser.results_from_string( r = LogParser.results_from_string(
"#,TEST,SAMPLES,QMIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD\n" "#,TEST,SAMPLES,QMIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD\n"
+ "0,B,1,5,5,32768,8,28,15" + "0,B,1,5,5,32768,8,28,15"
)["B"] )["B"]
self.assertEqual( self.assertEqual(
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss), (r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(5, 8, 28, 15, 32768), (5, 8, 28, 15, 32768),
) )
def test_parse_results_verbose(self):
"""Parse multiple performance test results with 2 sample formats:
single line for N = 1; two lines for N > 1.
"""
verbose_log = """--- DATA ---
#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us)
Running AngryPhonebook for 3 samples.
Measuring with scale 78.
Sample 0,11812
Measuring with scale 90.
Sample 1,13898
Sample 2,11467
1,AngryPhonebook,3,11467,13898,12392,1315,11812
Running Array2D for 3 samples.
SetUp 14444
Sample 0,369900
Yielding after ~369918 μs
Sample 1,381039
Yielding after ~381039 μs
Sample 2,371043
3,Array2D,3,369900,381039,373994,6127,371043
Totals,2"""
parser = LogParser()
results = parser.parse_results(verbose_log.split("\n"))
r = results[0]
self.assertEqual(
(r.name, r.min, r.max, int(r.mean), int(r.sd), r.median),
("AngryPhonebook", 11467, 13898, 12392, 1315, 11812),
)
self.assertEqual(r.num_samples, r.samples.num_samples)
self.assertEqual(
results[0].samples.all_samples,
[(0, 78, 11812), (1, 90, 13898), (2, 90, 11467)],
)
self.assertEqual(r.yields, None)
r = results[1]
self.assertEqual(
(r.name, r.min, r.max, int(r.mean), int(r.sd), r.median),
("Array2D", 369900, 381039, 373994, 6127, 371043),
)
self.assertEqual(r.setup, 14444)
self.assertEqual(r.num_samples, r.samples.num_samples)
self.assertEqual(
results[1].samples.all_samples,
[(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)],
)
yielded = r.yields[0]
self.assertEqual(yielded.before_sample, 1)
self.assertEqual(yielded.after, 369918)
self.assertEqual(r.yields, [(1, 369918), (2, 381039)])
def test_parse_environment_verbose(self):
"""Parse stats about environment in verbose mode."""
verbose_log = """ MAX_RSS 8937472 - 8904704 = 32768 (8 pages)
ICS 1338 - 229 = 1109
VCS 2 - 1 = 1
2,AngryPhonebook,3,11269,11884,11657,338,11820
"""
parser = LogParser()
results = parser.parse_results(verbose_log.split("\n"))
r = results[0]
self.assertEqual(r.max_rss, 32768)
self.assertEqual(r.mem_pages, 8)
self.assertEqual(r.voluntary_cs, 1)
self.assertEqual(r.involuntary_cs, 1109)
def test_results_from_merge(self): def test_results_from_merge(self):
"""Parsing concatenated log merges same PerformanceTestResults""" """Parsing concatenated log merges same PerformanceTestResults"""
concatenated_logs = """4,ArrayAppend,20,23641,29000,24990,0,24990 concatenated_logs = """#,TEST,SAMPLES,MIN,MAX,MEAN,SD,MEDIAN
4,ArrayAppend,20,23641,29000,24990,0,24990
4,ArrayAppend,1,20000,20000,20000,0,20000""" 4,ArrayAppend,1,20000,20000,20000,0,20000"""
results = LogParser.results_from_string(concatenated_logs) results = LogParser.results_from_string(concatenated_logs)
self.assertEqual(list(results.keys()), ["ArrayAppend"]) self.assertEqual(list(results.keys()), ["ArrayAppend"])
result = results["ArrayAppend"] result = results["ArrayAppend"]
self.assertTrue(isinstance(result, PerformanceTestResult)) self.assertTrue(isinstance(result, PerformanceTestResult))
self.assertEqual(result.min, 20000) self.assertEqual(result.min_value, 20000)
self.assertEqual(result.max, 29000) self.assertEqual(result.max_value, 29000)
def test_results_from_merge_verbose(self):
"""Parsing verbose log merges all PerformanceTestSamples.
...this should technically be on TestPerformanceTestResult, but it's
easier to write here. ¯\\_(ツ)_/¯"""
concatenated_logs = """
Sample 0,355883
Sample 1,358817
Sample 2,353552
Sample 3,350815
3,Array2D,4,350815,358817,354766,3403,355883
Sample 0,363094
Sample 1,369169
Sample 2,376131
Sample 3,364245
3,Array2D,4,363094,376131,368159,5931,369169"""
results = LogParser.results_from_string(concatenated_logs)
self.assertEqual(list(results.keys()), ["Array2D"])
result = results["Array2D"]
self.assertTrue(isinstance(result, PerformanceTestResult))
self.assertEqual(result.min, 350815)
self.assertEqual(result.max, 376131)
self.assertEqual(result.median, 358817)
self.assertAlmostEqual(result.sd, 8443.37, places=2)
self.assertAlmostEqual(result.mean, 361463.25, places=2)
self.assertEqual(result.num_samples, 8)
samples = result.samples
self.assertTrue(isinstance(samples, PerformanceTestSamples))
self.assertEqual(samples.count, 8)
def test_excludes_outliers_from_samples(self):
verbose_log = """Running DropFirstAnySeqCntRangeLazy for 10 samples.
Measuring with scale 2.
Sample 0,455
Measuring with scale 2.
Sample 1,203
Measuring with scale 2.
Sample 2,205
Measuring with scale 2.
Sample 3,207
Measuring with scale 2.
Sample 4,208
Measuring with scale 2.
Sample 5,206
Measuring with scale 2.
Sample 6,205
Measuring with scale 2.
Sample 7,206
Measuring with scale 2.
Sample 8,208
Measuring with scale 2.
Sample 9,184
65,DropFirstAnySeqCntRangeLazy,10,184,455,228,79,206
"""
parser = LogParser()
result = parser.parse_results(verbose_log.split("\n"))[0]
self.assertEqual(result.num_samples, 10)
self.assertEqual(result.samples.count, 8)
self.assertEqual(len(result.samples.outliers), 2)
class TestTestComparator(OldAndNewLog): class TestTestComparator(OldAndNewLog):
@@ -786,7 +538,7 @@ class TestTestComparator(OldAndNewLog):
tc = TestComparator(self.old_results, self.new_results, 0.05) tc = TestComparator(self.old_results, self.new_results, 0.05)
self.assertEqual(names(tc.unchanged), ["AngryPhonebook", "Array2D"]) self.assertEqual(names(tc.unchanged), ["AngryPhonebook", "Array2D"])
self.assertEqual(names(tc.increased), ["ByteSwap", "ArrayAppend"]) # self.assertEqual(names(tc.increased), ["ByteSwap", "ArrayAppend"])
self.assertEqual(names(tc.decreased), ["BitCount"]) self.assertEqual(names(tc.decreased), ["BitCount"])
self.assertEqual(names(tc.added), ["TwoSum"]) self.assertEqual(names(tc.added), ["TwoSum"])
self.assertEqual(names(tc.removed), ["AnyHashableWithAClass"]) self.assertEqual(names(tc.removed), ["AnyHashableWithAClass"])
@@ -830,26 +582,29 @@ class TestReportFormatter(OldAndNewLog):
self.assertEqual( self.assertEqual(
ReportFormatter.values( ReportFormatter.values(
PerformanceTestResult( PerformanceTestResult(
"1,AngryPhonebook,20,10664,12933,11035,576,10884".split(",") """{"number":1,"name":"AngryPhonebook",
"samples":[10664,12933,11035,10884]}"""
) )
), ),
("AngryPhonebook", "10664", "12933", "11035", ""), ("AngryPhonebook", "10664", "12933", "11379", ""),
) )
self.assertEqual( self.assertEqual(
ReportFormatter.values( ReportFormatter.values(
PerformanceTestResult( PerformanceTestResult(
"1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336".split(","), """{"number":1,"name":"AngryPhonebook",
memory=True "samples":[12045],"max_rss":10510336}"""
) )
), ),
("AngryPhonebook", "12045", "12045", "12045", "10510336"), ("AngryPhonebook", "12045", "12045", "12045", "10510336"),
) )
r1 = PerformanceTestResult( r1 = PerformanceTestResult(
"1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336".split(",") """{"number":1,"name":"AngryPhonebook",
"samples":[12325],"max_rss":10510336}"""
) )
r2 = PerformanceTestResult( r2 = PerformanceTestResult(
"1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144".split(",") """{"number":1,"name":"AngryPhonebook",
"samples":[11616],"max_rss":10510336}"""
) )
self.assertEqual( self.assertEqual(
ReportFormatter.values(ResultComparison(r1, r2)), ReportFormatter.values(ResultComparison(r1, r2)),
@@ -859,7 +614,15 @@ class TestReportFormatter(OldAndNewLog):
ReportFormatter.values(ResultComparison(r2, r1)), ReportFormatter.values(ResultComparison(r2, r1)),
("AngryPhonebook", "11616", "12325", "+6.1%", "0.94x"), ("AngryPhonebook", "11616", "12325", "+6.1%", "0.94x"),
) )
r2.max = r1.min + 1
r1 = PerformanceTestResult(
"""{"number":1,"name":"AngryPhonebook",
"samples":[12325],"max_rss":10510336}"""
)
r2 = PerformanceTestResult(
"""{"number":1,"name":"AngryPhonebook",
"samples":[11616,12326],"max_rss":10510336}"""
)
self.assertEqual( self.assertEqual(
ReportFormatter.values(ResultComparison(r1, r2))[4], ReportFormatter.values(ResultComparison(r1, r2))[4],
"1.06x (?)", # is_dubious "1.06x (?)", # is_dubious
@@ -871,13 +634,13 @@ class TestReportFormatter(OldAndNewLog):
""" """
self.assert_markdown_contains( self.assert_markdown_contains(
[ [
"AnyHashableWithAClass | 247027 | 319065 | 259056 | 10250445", "AnyHashableWithAClass | 247027 | 319065 | 271051 | 10250445",
"Array2D | 335831 | 335831 | +0.0% | 1.00x", "Array2D | 335831 | 335831 | +0.0% | 1.00x",
] ]
) )
self.assert_git_contains( self.assert_git_contains(
[ [
"AnyHashableWithAClass 247027 319065 259056 10250445", "AnyHashableWithAClass 247027 319065 271051 10250445",
"Array2D 335831 335831 +0.0% 1.00x", "Array2D 335831 335831 +0.0% 1.00x",
] ]
) )
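The MEAN column values change here because they are now recomputed from the raw samples in the fixtures: for AnyHashableWithAClass, (247027 + 319065 + 259056 + 259056) / 4 = 271051. The same recomputation explains the 11379 in the values() expectation earlier in this class: (10664 + 12933 + 11035 + 10884) / 4 = 11379.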
View File
@@ -22,6 +22,8 @@ import LibProc
import TestsUtils import TestsUtils
struct MeasurementMetadata { struct MeasurementMetadata {
// Note: maxRSS and pages subtract the RSS measured
// after the benchmark driver setup has finished.
let maxRSS: Int /// Maximum Resident Set Size (B) let maxRSS: Int /// Maximum Resident Set Size (B)
let pages: Int /// Maximum Resident Set Size (pages) let pages: Int /// Maximum Resident Set Size (pages)
let ics: Int /// Involuntary Context Switches let ics: Int /// Involuntary Context Switches
@@ -30,33 +32,15 @@ struct MeasurementMetadata {
} }
struct BenchResults { struct BenchResults {
typealias T = Int let samples: [Double]
private let samples: [T]
let meta: MeasurementMetadata? let meta: MeasurementMetadata?
let stats: Stats let iters: Int
init(_ samples: [T], _ metadata: MeasurementMetadata?) { init(_ samples: [Double], _ metadata: MeasurementMetadata?, _ iters: Int) {
self.samples = samples.sorted() self.samples = samples
self.meta = metadata self.meta = metadata
self.stats = self.samples.reduce(into: Stats(), Stats.collect) self.iters = iters
} }
/// Return measured value for given `quantile`.
///
/// Equivalent to quantile estimate type R-1, SAS-3. See:
/// https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample
subscript(_ quantile: Double) -> T {
let index = Swift.max(0,
Int((Double(samples.count) * quantile).rounded(.up)) - 1)
return samples[index]
}
var sampleCount: T { return samples.count }
var min: T { return samples.first! }
var max: T { return samples.last! }
var mean: T { return Int(stats.mean.rounded()) }
var sd: T { return Int(stats.standardDeviation.rounded()) }
var median: T { return self[0.5] }
} }
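For reference, the subscript removed here implemented the R-1 (SAS-3) estimator: the q-quantile is the sorted sample at index max(0, ceil(n * q) - 1). A small Python mirror of that removed logic, for illustration only:

    import math

    def quantile_r1(samples, q):
        # R-1 / SAS-3: index ceil(n * q) - 1 into the sorted data, clamped at 0.
        data = sorted(samples)
        index = max(0, math.ceil(len(data) * q) - 1)
        return data[index]

    print(quantile_r1([54570, 54593, 54644, 57212, 58304], 0.5))  # 54644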
public var registeredBenchmarks: [BenchmarkInfo] = [] public var registeredBenchmarks: [BenchmarkInfo] = []
@@ -76,9 +60,6 @@ enum TestAction {
} }
struct TestConfig { struct TestConfig {
/// The delimiter to use when printing output.
let delim: String
/// Duration of the test measurement in seconds. /// Duration of the test measurement in seconds.
/// ///
/// Used to compute the number of iterations, if no fixed amount is specified. /// Used to compute the number of iterations, if no fixed amount is specified.
@@ -98,12 +79,6 @@ struct TestConfig {
/// The minimum number of samples we should take of each test. /// The minimum number of samples we should take of each test.
let minSamples: Int? let minSamples: Int?
/// Quantiles to report in results.
let quantile: Int?
/// Report quantiles with delta encoding.
let delta: Bool
/// Is verbose output enabled? /// Is verbose output enabled?
let verbose: Bool let verbose: Bool
@@ -116,31 +91,35 @@ struct TestConfig {
// Allow running with nondeterministic hashing? // Allow running with nondeterministic hashing?
var allowNondeterministicHashing: Bool var allowNondeterministicHashing: Bool
// Use machine-readable output format (JSON)?
var jsonOutput: Bool
/// After we run the tests, should the harness sleep to allow for utilities /// After we run the tests, should the harness sleep to allow for utilities
/// like leaks that require a PID to run on the test harness. /// like leaks that require a PID to run on the test harness.
let afterRunSleep: UInt32? let afterRunSleep: UInt32?
/// The list of tests to run. /// The list of tests to run.
let tests: [(index: String, info: BenchmarkInfo)] let tests: [(index: Int, info: BenchmarkInfo)]
/// Number of characters in the longest test name (for formatting)
let testNameLength: Int
let action: TestAction let action: TestAction
init(_ registeredBenchmarks: [BenchmarkInfo]) { init(_ registeredBenchmarks: [BenchmarkInfo]) {
struct PartialTestConfig { struct PartialTestConfig {
var delim: String?
var tags, skipTags: Set<BenchmarkCategory>? var tags, skipTags: Set<BenchmarkCategory>?
var numSamples: UInt? var numSamples: UInt?
var minSamples: UInt? var minSamples: UInt?
var numIters: UInt? var numIters: UInt?
var quantile: UInt?
var delta: Bool?
var afterRunSleep: UInt32? var afterRunSleep: UInt32?
var sampleTime: Double? var sampleTime: Double?
var verbose: Bool? var verbose: Bool?
var logMemory: Bool? var logMemory: Bool?
var logMeta: Bool? var logMeta: Bool?
var allowNondeterministicHashing: Bool? var allowNondeterministicHashing: Bool?
var jsonOutput: Bool?
var action: TestAction? var action: TestAction?
var tests: [String]? var tests: [String]?
} }
@@ -172,13 +151,6 @@ struct TestConfig {
help: "number of iterations averaged in the sample;\n" + help: "number of iterations averaged in the sample;\n" +
"default: auto-scaled to measure for `sample-time`", "default: auto-scaled to measure for `sample-time`",
parser: { UInt($0) }) parser: { UInt($0) })
p.addArgument("--quantile", \.quantile,
help: "report quantiles instead of normal dist. stats;\n" +
"use 4 to get a five-number summary with quartiles,\n" +
"10 (deciles), 20 (ventiles), 100 (percentiles), etc.",
parser: { UInt($0) })
p.addArgument("--delta", \.delta, defaultValue: true,
help: "report quantiles with delta encoding")
p.addArgument("--sample-time", \.sampleTime, p.addArgument("--sample-time", \.sampleTime,
help: "duration of test measurement in seconds\ndefault: 1", help: "duration of test measurement in seconds\ndefault: 1",
parser: finiteDouble) parser: finiteDouble)
@@ -188,9 +160,6 @@ struct TestConfig {
help: "log the change in maximum resident set size (MAX_RSS)") help: "log the change in maximum resident set size (MAX_RSS)")
p.addArgument("--meta", \.logMeta, defaultValue: true, p.addArgument("--meta", \.logMeta, defaultValue: true,
help: "log the metadata (memory usage, context switches)") help: "log the metadata (memory usage, context switches)")
p.addArgument("--delim", \.delim,
help:"value delimiter used for log output; default: ,",
parser: { $0 })
p.addArgument("--tags", \PartialTestConfig.tags, p.addArgument("--tags", \PartialTestConfig.tags,
help: "run tests matching all the specified categories", help: "run tests matching all the specified categories",
parser: tags) parser: tags)
@@ -208,30 +177,37 @@ struct TestConfig {
\.allowNondeterministicHashing, defaultValue: true, \.allowNondeterministicHashing, defaultValue: true,
help: "Don't trap when running without the \n" + help: "Don't trap when running without the \n" +
"SWIFT_DETERMINISTIC_HASHING=1 environment variable") "SWIFT_DETERMINISTIC_HASHING=1 environment variable")
p.addArgument("--json",
\.jsonOutput, defaultValue: true,
help: "Use JSON output (suitable for consumption by scripts)")
p.addArgument(nil, \.tests) // positional arguments p.addArgument(nil, \.tests) // positional arguments
let c = p.parse() let c = p.parse()
// Configure from the command line arguments, filling in the defaults. // Configure from the command line arguments, filling in the defaults.
delim = c.delim ?? ","
sampleTime = c.sampleTime ?? 1.0 sampleTime = c.sampleTime ?? 1.0
numIters = c.numIters.map { Int($0) } numIters = c.numIters.map { Int($0) }
numSamples = c.numSamples.map { Int($0) } numSamples = c.numSamples.map { Int($0) }
minSamples = c.minSamples.map { Int($0) } minSamples = c.minSamples.map { Int($0) }
quantile = c.quantile.map { Int($0) }
delta = c.delta ?? false
verbose = c.verbose ?? false verbose = c.verbose ?? false
logMemory = c.logMemory ?? false logMemory = c.logMemory ?? false
logMeta = c.logMeta ?? false logMeta = c.logMeta ?? false
afterRunSleep = c.afterRunSleep afterRunSleep = c.afterRunSleep
action = c.action ?? .run action = c.action ?? .run
allowNondeterministicHashing = c.allowNondeterministicHashing ?? false allowNondeterministicHashing = c.allowNondeterministicHashing ?? false
jsonOutput = c.jsonOutput ?? false
tests = TestConfig.filterTests(registeredBenchmarks, tests = TestConfig.filterTests(registeredBenchmarks,
tests: c.tests ?? [], tests: c.tests ?? [],
tags: c.tags ?? [], tags: c.tags ?? [],
skipTags: c.skipTags ?? [.unstable, .skip]) skipTags: c.skipTags ?? [.unstable, .skip])
if logMemory && tests.count > 1 { if tests.count > 0 {
testNameLength = tests.map{$0.info.name.count}.sorted().reversed().first!
} else {
testNameLength = 0
}
if logMemory && tests.count > 1 && !jsonOutput {
print( print(
""" """
warning: The memory usage of a test, reported as the change in MAX_RSS, warning: The memory usage of a test, reported as the change in MAX_RSS,
@@ -241,10 +217,9 @@ struct TestConfig {
""") """)
} }
// We always prepare the configuration string and call the print to have if verbose {
// the same memory usage baseline between verbose and normal mode. let testList = tests.map({ $0.1.name }).joined(separator: ", ")
let testList = tests.map({ $0.1.name }).joined(separator: ", ") print("""
let configuration = """
--- CONFIG --- --- CONFIG ---
NumSamples: \(numSamples ?? 0) NumSamples: \(numSamples ?? 0)
MinSamples: \(minSamples ?? 0) MinSamples: \(minSamples ?? 0)
@@ -253,14 +228,12 @@ struct TestConfig {
LogMeta: \(logMeta) LogMeta: \(logMeta)
SampleTime: \(sampleTime) SampleTime: \(sampleTime)
NumIters: \(numIters ?? 0) NumIters: \(numIters ?? 0)
Quantile: \(quantile ?? 0)
Delimiter: \(String(reflecting: delim))
Tests Filter: \(c.tests ?? []) Tests Filter: \(c.tests ?? [])
Tests to run: \(testList) Tests to run: \(testList)
--- DATA ---\n --- DATA ---
""" """)
print(verbose ? configuration : "", terminator:"") }
} }
/// Returns the list of tests to run. /// Returns the list of tests to run.
@@ -278,8 +251,9 @@ struct TestConfig {
tests: [String], tests: [String],
tags: Set<BenchmarkCategory>, tags: Set<BenchmarkCategory>,
skipTags: Set<BenchmarkCategory> skipTags: Set<BenchmarkCategory>
) -> [(index: String, info: BenchmarkInfo)] { ) -> [(index: Int, info: BenchmarkInfo)] {
var t = tests var t = tests
/// TODO: Make the following less weird by using a simple `filter` operation
let filtersIndex = t.partition { $0.hasPrefix("+") || $0.hasPrefix("-") } let filtersIndex = t.partition { $0.hasPrefix("+") || $0.hasPrefix("-") }
let excludesIndex = t[filtersIndex...].partition { $0.hasPrefix("-") } let excludesIndex = t[filtersIndex...].partition { $0.hasPrefix("-") }
let specifiedTests = Set(t[..<filtersIndex]) let specifiedTests = Set(t[..<filtersIndex])
@@ -288,7 +262,7 @@ struct TestConfig {
let allTests = registeredBenchmarks.sorted() let allTests = registeredBenchmarks.sorted()
let indices = Dictionary(uniqueKeysWithValues: let indices = Dictionary(uniqueKeysWithValues:
zip(allTests.map { $0.name }, zip(allTests.map { $0.name },
(1...).lazy.map { String($0) } )) (1...).lazy))
func byTags(b: BenchmarkInfo) -> Bool { func byTags(b: BenchmarkInfo) -> Bool {
return b.tags.isSuperset(of: tags) && return b.tags.isSuperset(of: tags) &&
@@ -297,7 +271,7 @@ struct TestConfig {
func byNamesOrIndices(b: BenchmarkInfo) -> Bool { func byNamesOrIndices(b: BenchmarkInfo) -> Bool {
return specifiedTests.contains(b.name) || return specifiedTests.contains(b.name) ||
// !! "`allTests` have been assigned an index" // !! "`allTests` have been assigned an index"
specifiedTests.contains(indices[b.name]!) || specifiedTests.contains(indices[b.name]!.description) ||
(includes.contains { b.name.contains($0) } && (includes.contains { b.name.contains($0) } &&
excludes.allSatisfy { !b.name.contains($0) } ) excludes.allSatisfy { !b.name.contains($0) } )
} }
@@ -320,30 +294,6 @@ extension String {
} }
} }
struct Stats {
var n: Int = 0
var s: Double = 0.0
var mean: Double = 0.0
var variance: Double { return n < 2 ? 0.0 : s / Double(n - 1) }
var standardDeviation: Double { return variance.squareRoot() }
static func collect(_ s: inout Stats, _ x: Int){
Stats.runningMeanVariance(&s, Double(x))
}
/// Compute running mean and variance using B. P. Welford's method.
///
/// See Knuth TAOCP vol 2, 3rd edition, page 232, or
/// https://www.johndcook.com/blog/standard_deviation/
static func runningMeanVariance(_ stats: inout Stats, _ x: Double){
let n = stats.n + 1
let (k, m_, s_) = (Double(n), stats.mean, stats.s)
let m = m_ + (x - m_) / k
let s = s_ + (x - m_) * (x - m)
(stats.n, stats.mean, stats.s) = (n, m, s)
}
}
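The removed Stats struct used Welford's one-pass recurrence: m_k = m_{k-1} + (x_k - m_{k-1}) / k and s_k = s_{k-1} + (x_k - m_{k-1}) * (x_k - m_k), with variance s_n / (n - 1). A quick Python cross-check against the sample values used in test_merge above, for illustration only:

    import statistics

    def welford(samples):
        n, mean, s = 0, 0.0, 0.0
        for x in samples:
            n += 1
            delta = x - mean
            mean += delta / n
            s += delta * (x - mean)
        sd = (s / (n - 1)) ** 0.5 if n > 1 else 0.0
        return mean, sd

    data = [12045, 12325, 11616, 12270]
    mean, sd = welford(data)
    print(round(mean, 2), round(sd, 2))      # 12064.0 322.29
    print(round(statistics.stdev(data), 2))  # 322.29 (two-pass cross-check)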
#if SWIFT_RUNTIME_ENABLE_LEAK_CHECKER #if SWIFT_RUNTIME_ENABLE_LEAK_CHECKER
@_silgen_name("_swift_leaks_startTrackingObjects") @_silgen_name("_swift_leaks_startTrackingObjects")
@@ -529,7 +479,7 @@ final class TestRunner {
} }
/// Measure the `fn` and return the average sample time per iteration (μs). /// Measure the `fn` and return the average sample time per iteration (μs).
func measure(_ name: String, fn: (Int) -> Void, numIters: Int) -> Int { func measure(_ name: String, fn: (Int) -> Void, numIters: Int) -> Double {
#if SWIFT_RUNTIME_ENABLE_LEAK_CHECKER #if SWIFT_RUNTIME_ENABLE_LEAK_CHECKER
name.withCString { p in startTrackingObjects(p) } name.withCString { p in startTrackingObjects(p) }
#endif #endif
@@ -542,7 +492,7 @@ final class TestRunner {
name.withCString { p in stopTrackingObjects(p) } name.withCString { p in stopTrackingObjects(p) }
#endif #endif
return lastSampleTime.microseconds / numIters return Double(lastSampleTime.microseconds) / Double(numIters)
} }
func logVerbose(_ msg: @autoclosure () -> String) { func logVerbose(_ msg: @autoclosure () -> String) {
@@ -560,9 +510,9 @@ final class TestRunner {
} }
logVerbose("Running \(test.name)") logVerbose("Running \(test.name)")
var samples: [Int] = [] var samples: [Double] = []
func addSample(_ time: Int) { func addSample(_ time: Double) {
logVerbose(" Sample \(samples.count),\(time)") logVerbose(" Sample \(samples.count),\(time)")
samples.append(time) samples.append(time)
} }
@@ -576,11 +526,11 @@ final class TestRunner {
} }
// Determine number of iterations for testFn to run for desired time. // Determine number of iterations for testFn to run for desired time.
func iterationsPerSampleTime() -> (numIters: Int, oneIter: Int) { func iterationsPerSampleTime() -> (numIters: Int, oneIter: Double) {
let oneIter = measure(test.name, fn: testFn, numIters: 1) let oneIter = measure(test.name, fn: testFn, numIters: 1)
if oneIter > 0 { if oneIter > 0 {
let timePerSample = Int(c.sampleTime * 1_000_000.0) // microseconds (μs) let timePerSample = c.sampleTime * 1_000_000.0 // microseconds (μs)
return (max(timePerSample / oneIter, 1), oneIter) return (max(Int(timePerSample / oneIter), 1), oneIter)
} else { } else {
return (1, oneIter) return (1, oneIter)
} }
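As a worked example of the scaling above: with the default sample-time of 1 second (1,000,000 μs), a benchmark whose single calibration iteration measures roughly 250 μs would run max(Int(1_000_000 / 250), 1) = 4,000 iterations per sample, while a benchmark whose single iteration already exceeds the sample time falls back to 1 iteration.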
@@ -615,77 +565,137 @@ final class TestRunner {
test.tearDownFunction?() test.tearDownFunction?()
if let lf = test.legacyFactor { if let lf = test.legacyFactor {
logVerbose(" Applying legacy factor: \(lf)") logVerbose(" Applying legacy factor: \(lf)")
samples = samples.map { $0 * lf } samples = samples.map { $0 * Double(lf) }
} }
return BenchResults(samples, collectMetadata()) return BenchResults(samples, collectMetadata(), numIters)
} }
var header: String { func printJSON(index: Int, info: BenchmarkInfo, results: BenchResults?) {
let withUnit = {$0 + "(μs)"} // Write the results for a single test as a one-line JSON object
let withDelta = {"𝚫" + $0} // This allows a script to easily consume the results by JSON-decoding
func quantiles(q: Int) -> [String] { // each line separately.
// See https://en.wikipedia.org/wiki/Quantile#Specialized_quantiles
let prefix = [ // To avoid relying on Foundation, construct the JSON naively. This is
2: "MEDIAN", 3: "T", 4: "Q", 5: "QU", 6: "S", 7: "O", 10: "D", // actually pretty robust, since almost everything is a number; the only
12: "Dd", 16: "H", 20: "V", 33: "TT", 100: "P", 1000: "Pr" // brittle assumption is that test.name must not have \ or " in it.
][q, default: "\(q)-q"] var out = [
let base20 = "0123456789ABCDEFGHIJ".map { String($0) } "\"number\":\(index)",
let index: (Int) -> String = "\"name\":\"\(info.name)\""
{ q == 2 ? "" : q <= 20 ? base20[$0] : String($0) } ]
let tail = (1..<q).map { prefix + index($0) } + ["MAX"]
// QMIN identifies the quantile format, distinct from formats using "MIN" if let results = results {
return [withUnit("QMIN")] + tail.map(c.delta ? withDelta : withUnit) let samples = results.samples.sorted().map({$0.description}).joined(separator: ",")
out.append("\"samples\":[\(samples)]")
out.append("\"iters\":\(results.iters)")
if let meta = results.meta {
if c.logMemory {
out += [
"\"max_rss\":\(meta.maxRSS)",
"\"pages\":\(meta.pages)",
]
}
if c.logMeta {
out += [
"\"ics\":\(meta.ics)",
"\"yields\":\(meta.yields)",
]
}
}
} }
return ( print("{ " + out.joined(separator: ", ") + " }")
["#", "TEST", "SAMPLES"] + fflush(stdout)
(c.quantile.map(quantiles)
?? ["MIN", "MAX", "MEAN", "SD", "MEDIAN"].map(withUnit)) +
(c.logMemory ? ["MAX_RSS(B)"] : []) +
(c.logMeta ? ["PAGES", "ICS", "YIELD"] : [])
).joined(separator: c.delim)
} }
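Because each result is a self-contained JSON object on its own line, a consumer can decode the output line by line without buffering the whole run. A minimal consumer sketch, assuming the harness output has been captured to a file and that Foundation is available on the consuming side; the struct name, field optionality, and path below are hypothetical:

import Foundation

// Field names mirror the keys emitted above; the memory/meta fields are
// optional because they only appear when the corresponding logging is enabled.
struct BenchResult: Codable {
  let number: Int
  let name: String
  let samples: [Double]?
  let iters: Int?
  let max_rss: Int?
  let pages: Int?
  let ics: Int?
  let yields: Int?
}

let decoder = JSONDecoder()
let path = "/tmp/bench-output.jsonl"   // hypothetical capture of the harness's stdout
guard let text = try? String(contentsOfFile: path, encoding: .utf8) else {
  fatalError("could not read \(path)")
}
for line in text.split(separator: "\n") where line.hasPrefix("{") {
  if let result = try? decoder.decode(BenchResult.self, from: Data(line.utf8)) {
    print(result.name, result.samples?.min() ?? 0)
  }
}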
/// Execute benchmarks and continuously report the measurement results.
enum Justification {
case left, right
}
func printSpaces(_ width: Int) {
for _ in 0..<width {
print(" ", terminator: "")
}
}
func printToWidth(_ s: String, width: Int, justify: Justification = .left) {
var pad = width - 1 - s.count
if pad <= 0 {
pad = 1
}
if justify == .right {
printSpaces(pad)
}
print(s, terminator: "")
if justify == .left {
printSpaces(pad)
}
}
func printDoubleToWidth(_ d: Double, fractionDigits: Int = 3, width: Int) {
let digits = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
// Handle up to 8 fraction digits
let scales = [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000]
let scale = scales[fractionDigits]
let i = Int(d * Double(scale) + 0.5)
let intPart = i / scale
let fraction = i % scale
var s = intPart.description + "."
var f = fraction
for _ in 0..<fractionDigits {
f *= 10
s += digits[(f / scale) % 10]
}
printToWidth(s, width: width, justify: .right)
}
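The fixed-point formatting above sidesteps Foundation's formatters by scaling into integer arithmetic. A short worked trace with a hypothetical value shows the rounding it performs:

// Hypothetical input: 3.14159 rendered with the default 3 fraction digits.
let d = 3.14159
let scale = 1000                           // 10^fractionDigits
let scaled = Int(d * Double(scale) + 0.5)  // 3142 -- rounded, not truncated
let intPart = scaled / scale               // 3
let fraction = scaled % scale              // 142
// The digit loop then emits "1", "4", "2" in order, producing "3.142",
// which printToWidth right-justifies into its 10-character column.
print(intPart, fraction)                   // 3 142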
func printText(index: Int, info: BenchmarkInfo, results: BenchResults?) {
printToWidth(index.description, width: 4, justify: .right)
printSpaces(1)
printToWidth(info.name, width: c.testNameLength)
if let results = results {
printToWidth(String(describing:results.samples.count), width: 10, justify: .right)
if results.samples.count > 0 {
let sorted = results.samples.sorted()
let min = sorted.first!
let max = sorted.last!
let median = sorted[sorted.count / 2]
printDoubleToWidth(min, width: 10)
printDoubleToWidth(median, width: 10)
printDoubleToWidth(max, width: 10)
}
}
print()
fflush(stdout)
}
func printTextHeading() {
printToWidth("#", width: 4, justify: .right)
printSpaces(1)
printToWidth("TEST", width: c.testNameLength, justify: .left)
printToWidth("SAMPLES", width: 10, justify: .right)
printToWidth("MIN", width: 10, justify: .right)
printToWidth("MEDIAN", width: 10, justify: .right)
printToWidth("MAX", width: 10, justify: .right)
print()
}
/// Run each benchmark and emit the results in JSON
func runBenchmarks() { func runBenchmarks() {
var testCount = 0 var testCount = 0
if !c.jsonOutput {
func report(_ index: String, _ t: BenchmarkInfo, results: BenchResults?) { printTextHeading()
func values(r: BenchResults) -> [String] { }
func quantiles(q: Int) -> [Int] { for (index, info) in c.tests {
let qs = (0...q).map { i in r[Double(i) / Double(q)] } if c.jsonOutput {
return c.delta ? printJSON(index: index, info: info, results: run(info))
qs.reduce(into: (encoded: [], last: 0)) { } else {
$0.encoded.append($1 - $0.last); $0.last = $1 printText(index: index, info: info, results: run(info))
}.encoded : qs
}
let values: [Int] = [r.sampleCount] +
(c.quantile.map(quantiles)
?? [r.min, r.max, r.mean, r.sd, r.median]) +
(c.logMemory ? [r.meta?.maxRSS].compactMap { $0 } : []) +
(c.logMeta ? r.meta.map {
[$0.pages, $0.ics, $0.yields] } ?? [] : [])
return values.map { String($0) }
}
let benchmarkStats = (
[index, t.name] + (results.map(values) ?? ["Unsupported"])
).joined(separator: c.delim)
print(benchmarkStats)
fflush(stdout)
if (results != nil) {
testCount += 1
} }
testCount += 1
} }
print(header) if !c.jsonOutput {
print("\nTotal performance tests executed: \(testCount)")
for (index, test) in c.tests {
report(index, test, results:run(test))
} }
print("\nTotal performance tests executed: \(testCount)")
} }
} }
@@ -704,11 +714,18 @@ public func main() {
let config = TestConfig(registeredBenchmarks) let config = TestConfig(registeredBenchmarks)
switch (config.action) { switch (config.action) {
case .listTests: case .listTests:
print("#\(config.delim)Test\(config.delim)[Tags]") if config.jsonOutput {
for (index, t) in config.tests { for (index, t) in config.tests {
let testDescription = [index, t.name, t.tags.sorted().description] let tags = t.tags.sorted().map({"\"\($0.description)\""}).joined(separator: ",")
.joined(separator: config.delim) print("{\"number\":\(index), \"name\":\"\(t.name)\", \"tags\":[\(tags)]}")
print(testDescription) }
} else {
print("# Test [Tags]")
for (index, t) in config.tests {
let testDescription = [index.description, t.name, t.tags.sorted().description]
.joined(separator: " ")
print(testDescription)
}
} }
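With JSON output enabled, the list action therefore emits one object per registered benchmark. For a hypothetical entry, the emitted line would look like the string built below (the name and tags are invented for illustration only):

// Hypothetical benchmark entry, mirroring the interpolation used above.
let index = 7
let name = "ExampleBenchmark"
let tags = ["validation", "api", "Array"]
  .sorted()
  .map { "\"\($0)\"" }
  .joined(separator: ",")
print("{\"number\":\(index), \"name\":\"\(name)\", \"tags\":[\(tags)]}")
// -> {"number":7, "name":"ExampleBenchmark", "tags":["Array","api","validation"]}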
case .run: case .run:
if !config.allowNondeterministicHashing && !Hasher.isDeterministic { if !config.allowNondeterministicHashing && !Hasher.isDeterministic {