Overhaul Benchmarking pipeline to use complete sample data, not summaries

The Swift benchmarking harness now has two distinct output formats:

* Default: formatted text intended for human consumption.
  Right now, this reports just the minimum value, but we can augment it over time.

* `--json`: each output line is a JSON-encoded object containing the raw data.
  This output is intended for Python scripts that aggregate or compare
  multiple independent test runs.

Previously, we tried to use the same output for both purposes.  This required
the Python scripts to do more complex parsing of textual layouts, and it also
meant that the scripts had only summary data to work with instead of the full
raw sample information.  This in turn made it almost impossible to derive
meaningful comparisons between runs or to aggregate multiple runs.

Typical output in the new JSON format looks like this:
```
{"number":89, "name":"PerfTest", "samples":[1.23, 2.35], "max_rss":16384}
{"number":91, "name":"OtherTest", "samples":[14.8, 19.7]}
```

This format is easy to parse in Python: just iterate over the
lines and decode each one separately.  Also note that
optional fields (such as `"max_rss"` above) are trivial to handle:
```
import json

for line in lines:
    j = json.loads(line)
    # Default to 0 if "max_rss" is not present
    max_rss = j.get("max_rss", 0)
```
Note the `"samples"` array includes the runtime for each individual run.
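
Because every line carries its own raw samples, aggregating multiple runs is just
a matter of pooling those arrays.  Here's a minimal sketch (not code from this
commit; `lines` is assumed to hold the JSON output of one or more runs):
```
import json
from collections import defaultdict

# Pool raw samples across runs, keyed by test name.
samples_by_test = defaultdict(list)
for line in lines:
    if not line.strip():
        continue  # ignore blank lines
    record = json.loads(line)
    samples_by_test[record["name"]].extend(record.get("samples", []))
```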

Because optional fields are so much easier to handle in this form, I reworked
the Python logic to translate old formats into this JSON format for more
uniformity.  Hopefully, we can simplify the code in a year or so by stripping
out the old log formats entirely, along with some of the redundant statistical
calculations.  In particular, the Python logic still makes an effort to preserve
mean, median, max, min, stdev, and other statistical data whenever the full set
of samples is not present.  Once we've gotten to a point where we're always
keeping full samples, we can compute any such information on the fly as needed,
eliminating the need to record it.
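
For example, once full samples are always present, a summary can be derived on
demand with the standard `statistics` module (a sketch, not code from this change):
```
import statistics

def summarize(samples):
    # Derive summary statistics on demand from the raw samples.
    return {
        "min": min(samples),
        "median": statistics.median(samples),
        "mean": statistics.mean(samples),
        # stdev is undefined for fewer than two samples
        "sd": statistics.stdev(samples) if len(samples) > 1 else 0.0,
    }

print(summarize([1.23, 2.35]))  # e.g. the PerfTest samples shown above
```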

This is a pretty big rearchitecture of the core benchmarking logic. In order to
try to keep things a bit more manageable, I have not taken this opportunity to
replace any of the actual statistics used in the higher level code or to change
how the actual samples are measured. (But I expect this rearchitecture will make
such changes simpler.) In particular, this should not actually change any
benchmark results.

For the future, please keep this general principle in mind: statistical
summaries (averages, medians, etc.) should, as a rule, be computed only for
immediate output and rarely if ever stored or used as input for further
processing. Instead, aim to store and transfer the raw data from which any
statistic can be recomputed as necessary.
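
To make that concrete (a hypothetical illustration, not part of this commit):
pooled raw samples give exact statistics, while combining per-run summaries
generally does not:
```
import statistics

run_a = [1.23, 2.35, 1.90]   # raw samples from one run
run_b = [1.10, 1.95]         # raw samples from another run

pooled = run_a + run_b       # aggregating raw data is just concatenation
print(statistics.median(pooled))   # 1.90, the true median of all samples
print(statistics.median(
    [statistics.median(run_a), statistics.median(run_b)]))  # 1.7125 -- not the same
```
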
Author: Tim Kientzle
Date:   2022-10-12 13:23:06 -07:00
Parent commit: 1a1afeb410
Commit: 971a5d8547
5 changed files with 846 additions and 1083 deletions


@@ -28,6 +28,7 @@ class `BenchmarkDoctor` analyzes performance tests, implements `check` COMMAND.
import argparse
import functools
import glob
import json
import logging
import math
import os
@@ -88,9 +89,10 @@ class BenchmarkDriver(object):
def test_harness(self):
"""Full path to test harness binary."""
suffix = self.args.optimization if hasattr(self.args, "optimization") else "O"
suffix += "-"
if hasattr(self.args, "architecture") and self.args.architecture:
suffix += "-" + self.args.architecture + "*"
pattern = os.path.join(self.args.tests, "Benchmark_" + suffix)
suffix += self.args.architecture
pattern = os.path.join(self.args.tests, "Benchmark_" + suffix + "*")
executables = []
if hasattr(self._subprocess, "test_mode") and self._subprocess.test_mode:
executables = [pattern]
@@ -134,22 +136,20 @@ class BenchmarkDriver(object):
@property
def _cmd_list_benchmarks(self):
# Use tab delimiter for easier parsing to override the default comma.
# (The third 'column' is always comma-separated list of tags in square
# brackets -- currently unused here.)
return [self.test_harness, "--list", "--delim=\t"] + (
return [self.test_harness, "--list", "--json"] + (
["--skip-tags="] if (self.args.benchmarks or self.args.filters) else []
)
def _get_tests(self):
"""Return a list of performance tests to run."""
number_name_pairs = [
line.split("\t")[:2]
for line in self._invoke(self._cmd_list_benchmarks).split("\n")[1:-1]
]
# unzip list of pairs into 2 lists
test_numbers, self.all_tests = map(list, zip(*number_name_pairs))
self.test_number = dict(zip(self.all_tests, test_numbers))
lines = self._invoke(self._cmd_list_benchmarks).split("\n")
json_tests = []
for l in lines:
if l.strip() != "":
json_tests.append(json.loads(l))
self.all_tests = [json["name"] for json in json_tests]
test_numbers = [json["number"] for json in json_tests]
self.test_number = dict([(json["name"], json["number"]) for json in json_tests])
if self.args.filters:
return self._tests_matching_patterns()
if self.args.benchmarks:
@@ -157,25 +157,18 @@ class BenchmarkDriver(object):
return self.all_tests
def _tests_matching_patterns(self):
regexes = [re.compile(pattern) for pattern in self.args.filters]
return sorted(
list(
set(
[
name
for pattern in regexes
for name in self.all_tests
if pattern.match(name)
]
)
)
)
regexes = map(re.compile, self.args.filters)
matches = set()
for pattern in regexes:
new_matches = filter(pattern.match, self.all_tests)
matches.update(new_matches)
return sorted(list(matches))
def _tests_by_name_or_number(self, test_numbers):
benchmarks = set(self.args.benchmarks)
number_to_name = dict(zip(test_numbers, self.all_tests))
tests_by_number = [
number_to_name[i] for i in benchmarks.intersection(set(test_numbers))
number_to_name[i] for i in benchmarks.intersection(test_numbers)
]
return sorted(
list(benchmarks.intersection(set(self.all_tests)).union(tests_by_number))
@@ -188,8 +181,7 @@ class BenchmarkDriver(object):
num_iters=None,
sample_time=None,
verbose=None,
measure_memory=False,
quantile=None,
measure_memory=False
):
"""Execute benchmark and gather results."""
num_samples = num_samples or 0
@@ -197,7 +189,7 @@ class BenchmarkDriver(object):
sample_time = sample_time or 0 # default is 1s
cmd = self._cmd_run(
test, num_samples, num_iters, sample_time, verbose, measure_memory, quantile
test, num_samples, num_iters, sample_time, verbose, measure_memory
)
output = self._invoke(cmd)
results = self.parser.results_from_string(output)
@@ -210,8 +202,7 @@ class BenchmarkDriver(object):
num_iters,
sample_time,
verbose,
measure_memory,
quantile,
measure_memory
):
cmd = [self.test_harness]
if test:
@@ -228,9 +219,7 @@ class BenchmarkDriver(object):
cmd.append("--verbose")
if measure_memory:
cmd.append("--memory")
if quantile:
cmd.append("--quantile={0}".format(quantile))
cmd.append("--delta")
cmd.append("--json")
return cmd
def run_independent_samples(self, test):
@@ -246,12 +235,12 @@ class BenchmarkDriver(object):
return functools.reduce(
merge_results,
[
self.run(test, measure_memory=True, num_iters=1, quantile=20)
self.run(test, measure_memory=True, num_iters=1)
for _ in range(self.args.independent_samples)
],
)
def log_results(self, output, log_file=None):
def log_results(self, results, log_file=None):
"""Log output to `log_file`.
Creates `args.output_dir` if it doesn't exist yet.
@@ -262,7 +251,8 @@ class BenchmarkDriver(object):
os.makedirs(dir)
print("Logging results to: %s" % log_file)
with open(log_file, "w") as f:
f.write(output)
for r in results:
print(r, file=f)
RESULT = "{:>3} {:<40} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}"
@@ -284,7 +274,7 @@ class BenchmarkDriver(object):
def console_log(values):
print(format(values))
def result_values(r):
def summary(r):
return list(
map(
str,
@@ -292,17 +282,17 @@ class BenchmarkDriver(object):
r.test_num,
r.name,
r.num_samples,
r.min,
r.samples.q1,
r.min_value,
r.q1,
r.median,
r.samples.q3,
r.max,
r.q3,
r.max_value,
r.max_rss,
],
)
)
header = [
summary_header = [
"#",
"TEST",
"SAMPLES",
@@ -313,25 +303,23 @@ class BenchmarkDriver(object):
"MAX(μs)",
"MAX_RSS(B)",
]
console_log(header)
results = [header]
console_log(summary_header)
results = []
for test in self.tests:
result = result_values(self.run_independent_samples(test))
console_log(result)
result = self.run_independent_samples(test)
console_log(summary(result))
results.append(result)
print("\nTotal performance tests executed: {0}".format(len(self.tests)))
return (
None if csv_console else ("\n".join([",".join(r) for r in results]) + "\n")
) # csv_log
return results
@staticmethod
def run_benchmarks(args):
"""Run benchmarks and log results."""
driver = BenchmarkDriver(args)
csv_log = driver.run_and_log(csv_console=(args.output_dir is None))
if csv_log:
driver.log_results(csv_log)
results = driver.run_and_log(csv_console=(args.output_dir is None))
if args.output_dir:
driver.log_results([r.json for r in results])
return 0
@@ -445,7 +433,6 @@ class BenchmarkDoctor(object):
Optional `driver` parameter for injecting dependency; used for testing.
"""
super(BenchmarkDoctor, self).__init__()
self.driver = driver or BenchmarkDriver(args)
self.results = {}
if hasattr(args, "markdown") and args.markdown:
@@ -458,6 +445,7 @@ class BenchmarkDoctor(object):
self.console_handler.setLevel(
logging.DEBUG if args.verbose else logging.INFO
)
self.driver = driver or BenchmarkDriver(args)
self.log.addHandler(self.console_handler)
self.log.debug("Checking tests: %s", ", ".join(self.driver.tests))
self.requirements = [
@@ -532,7 +520,7 @@ class BenchmarkDoctor(object):
correction = setup / i
i_series = BenchmarkDoctor._select(measurements, num_iters=i)
for result in i_series:
runtimes.append(result.samples.min - correction)
runtimes.append(result.min_value - correction)
runtime = min(runtimes)
threshold = 1000
@@ -584,7 +572,7 @@ class BenchmarkDoctor(object):
ti1, ti2 = [
float(min(mins))
for mins in [
[result.samples.min for result in i_series]
[result.min_value for result in i_series]
for i_series in [select(measurements, num_iters=i) for i in [1, 2]]
]
]
@@ -679,7 +667,7 @@ class BenchmarkDoctor(object):
r = self.driver.run(
benchmark, num_samples=3, num_iters=1, verbose=True
) # calibrate
num_samples = self._adjusted_1s_samples(r.samples.min)
num_samples = self._adjusted_1s_samples(r.min_value)
def capped(s):
return min(s, 200)
@@ -689,7 +677,7 @@ class BenchmarkDoctor(object):
opts = opts if isinstance(opts, list) else [opts]
self.log.debug(
"Runtime {0} μs yields {1} adjusted samples per second.".format(
r.samples.min, num_samples
r.min_value, num_samples
)
)
self.log.debug(


@@ -17,9 +17,7 @@ This script compares performance test logs and issues a formatted report.
Invoke `$ compare_perf_tests.py -h ` for complete list of options.
class `Sample` is single benchmark measurement.
class `PerformanceTestSamples` is collection of `Sample`s and their statistics.
class `PerformanceTestResult` is a summary of performance test execution.
class `PerformanceTestResult` collects information about a single test
class `LogParser` converts log files into `PerformanceTestResult`s.
class `ResultComparison` compares new and old `PerformanceTestResult`s.
class `TestComparator` analyzes changes between the old and new test results.
@@ -29,194 +27,10 @@ class `ReportFormatter` creates the test comparison report in specified format.
import argparse
import functools
import json
import re
import statistics
import sys
from bisect import bisect, bisect_left, bisect_right
from collections import namedtuple
from math import ceil, sqrt
class Sample(namedtuple("Sample", "i num_iters runtime")):
u"""Single benchmark measurement.
Initialized with:
`i`: ordinal number of the sample taken,
`num_iters`: number of iterations used to compute it,
`runtime`: in microseconds (μs).
"""
def __repr__(self):
"""Shorter Sample formatting for debugging purposes."""
return "s({0.i!r}, {0.num_iters!r}, {0.runtime!r})".format(self)
class Yield(namedtuple("Yield", "before_sample after")):
u"""Meta-measurement of when the Benchmark_X voluntarily yielded process.
`before_sample`: index of measurement taken just after returning from yield
`after`: time elapsed since the previous yield in microseconds (μs)
"""
class PerformanceTestSamples(object):
"""Collection of runtime samples from the benchmark execution.
Computes the sample population statistics.
"""
def __init__(self, name, samples=None):
"""Initialize with benchmark name and optional list of Samples."""
self.name = name # Name of the performance test
self.samples = []
self.outliers = []
self._runtimes = []
self.mean = 0.0
self.S_runtime = 0.0 # For computing running variance
for sample in samples or []:
self.add(sample)
def __str__(self):
"""Text summary of benchmark statistics."""
return (
"{0.name!s} n={0.count!r} "
"Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} "
"Max={0.max!r} "
"R={0.range!r} {0.spread:.2%} IQR={0.iqr!r} "
"Mean={0.mean:.0f} SD={0.sd:.0f} CV={0.cv:.2%}".format(self)
if self.samples
else "{0.name!s} n=0".format(self)
)
def add(self, sample):
"""Add sample to collection and recompute statistics."""
assert isinstance(sample, Sample)
self._update_stats(sample)
i = bisect(self._runtimes, sample.runtime)
self._runtimes.insert(i, sample.runtime)
self.samples.insert(i, sample)
def _update_stats(self, sample):
old_stats = (self.count, self.mean, self.S_runtime)
_, self.mean, self.S_runtime = self.running_mean_variance(
old_stats, sample.runtime
)
def exclude_outliers(self, top_only=False):
"""Exclude outliers by applying Interquartile Range Rule.
Moves the samples outside of the inner fences
(Q1 - 1.5*IQR and Q3 + 1.5*IQR) into outliers list and recomputes
statistics for the remaining sample population. Optionally apply
only the top inner fence, preserving the small outliers.
Experimentally, this rule seems to perform well-enough on the
benchmark runtimes in the microbenchmark range to filter out
the environment noise caused by preemptive multitasking.
"""
lo = (
0
if top_only
else bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr))
)
hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr))
outliers = self.samples[:lo] + self.samples[hi:]
samples = self.samples[lo:hi]
self.__init__(self.name) # re-initialize
for sample in samples: # and
self.add(sample) # re-compute stats
self.outliers = outliers
@property
def count(self):
"""Number of samples used to compute the statistics."""
return len(self.samples)
@property
def num_samples(self):
"""Number of all samples in the collection."""
return len(self.samples) + len(self.outliers)
@property
def all_samples(self):
"""List of all samples in ascending order."""
return sorted(self.samples + self.outliers, key=lambda s: s.i or -1)
@property
def min(self):
"""Minimum sampled value."""
return self.samples[0].runtime
@property
def max(self):
"""Maximum sampled value."""
return self.samples[-1].runtime
def quantile(self, q):
"""Return runtime for given quantile.
Equivalent to quantile estimate type R-1, SAS-3. See:
https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample
"""
index = max(0, int(ceil(self.count * float(q))) - 1)
return self.samples[index].runtime
@property
def median(self):
"""Median sampled value."""
return self.quantile(0.5)
@property
def q1(self):
"""First Quartile (25th Percentile)."""
return self.quantile(0.25)
@property
def q3(self):
"""Third Quartile (75th Percentile)."""
return self.quantile(0.75)
@property
def iqr(self):
"""Interquartile Range."""
return self.q3 - self.q1
@property
def sd(self):
u"""Standard Deviation (μs)."""
return 0 if self.count < 2 else sqrt(self.S_runtime / (self.count - 1))
@staticmethod
def running_mean_variance(stats, x):
"""Compute running variance, B. P. Welford's method.
See Knuth TAOCP vol 2, 3rd edition, page 232, or
https://www.johndcook.com/blog/standard_deviation/
M is mean, Standard Deviation is defined as sqrt(S/k-1)
"""
(k, M_, S_) = stats
k = float(k + 1)
M = M_ + (x - M_) / k
S = S_ + (x - M_) * (x - M)
return (k, M, S)
@property
def cv(self):
"""Coefficient of Variation (%)."""
return (self.sd / self.mean) if self.mean else 0
@property
def range(self):
"""Range of samples values (Max - Min)."""
return self.max - self.min
@property
def spread(self):
"""Sample Spread; i.e. Range as (%) of Min."""
return self.range / float(self.min) if self.min else 0
class PerformanceTestResult(object):
@@ -225,126 +39,395 @@ class PerformanceTestResult(object):
Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize
or Benchmark_Driver).
It supports 2 log formats emitted by the test driver. Legacy format with
statistics for normal distribution (MEAN, SD):
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B)
And new quantiles format with variable number of columns:
#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs)
#,TEST,SAMPLES,QMIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
The number of columns between MIN and MAX depends on the test driver's
`--quantile` parameter. In both cases, the last column, MAX_RSS, is optional.
It supports log formats emitted by the test driver.
"""
def __init__(self, csv_row, quantiles=False, memory=False, delta=False, meta=False):
"""Initialize from a row of multiple columns with benchmark summary.
The row is an iterable, such as a row provided by the CSV parser.
@classmethod
def fromOldFormat(cls, header, line):
"""Original format with statistics for normal distribution (MEAN, SD):
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B),PAGES,ICS,YIELD
Note that MAX_RSS, PAGES, ICS, YIELD are all optional
"""
self.test_num = csv_row[0] # Ordinal number of the test
self.name = csv_row[1] # Name of the performance test
self.num_samples = int(csv_row[2]) # Number of measurements taken
csv_row = line.split(",") if "," in line else line.split()
labels = header.split(",") if "," in header else header.split()
mem_index = (-1 if memory else 0) + (-3 if meta else 0)
if quantiles: # Variable number of columns representing quantiles
runtimes = csv_row[3:mem_index] if memory or meta else csv_row[3:]
last_runtime_index = mem_index - 1
if delta:
runtimes = [int(x) if x else 0 for x in runtimes]
runtimes = functools.reduce(
lambda l, x: l.append(l[-1] + x) or l if l else [x], # running
runtimes,
None,
) # total
num_values = len(runtimes)
if self.num_samples < num_values: # remove repeated samples
quantile = num_values - 1
qs = [float(i) / float(quantile) for i in range(0, num_values)]
indices = [
max(0, int(ceil(self.num_samples * float(q))) - 1) for q in qs
]
runtimes = [
runtimes[indices.index(i)] for i in range(0, self.num_samples)
]
# Synthesize a JSON form with the basic values:
num_samples = int(csv_row[2])
json_data = {
"number": int(csv_row[0]),
"name": csv_row[1],
"num_samples": num_samples,
}
self.samples = PerformanceTestSamples(
self.name, [Sample(None, None, int(runtime)) for runtime in runtimes]
)
self.samples.exclude_outliers(top_only=True)
sams = self.samples
self.min, self.max, self.median, self.mean, self.sd = (
sams.min,
sams.max,
sams.median,
sams.mean,
sams.sd,
)
else: # Legacy format with statistics for normal distribution.
self.min = int(csv_row[3]) # Minimum runtime (μs)
self.max = int(csv_row[4]) # Maximum runtime (μs)
self.mean = float(csv_row[5]) # Mean (average) runtime (μs)
self.sd = float(csv_row[6]) # Standard Deviation (μs)
self.median = int(csv_row[7]) # Median runtime (μs)
last_runtime_index = 7
self.samples = None
# Map remaining columns according to label
field_map = [
("ICS", "ics"),
("MAX_RSS", "max_rss"), # Must precede "MAX"
("MAX", "max"),
("MEAN", "mean"),
("MEDIAN", "median"),
("MIN", "min"),
("PAGES", "pages"),
("SD", "sd"),
("YIELD", "yield")
]
for label, value in zip(labels, csv_row):
for match, json_key in field_map:
if match in label:
json_data[json_key] = float(value)
break
self.max_rss = ( # Maximum Resident Set Size (B)
int(csv_row[mem_index]) if (
memory and len(csv_row) > (last_runtime_index + 1)
) else None
)
# Heroic: Reconstruct samples if we have enough info
# This is generally a bad idea, but sadly necessary for the
# old format that doesn't provide raw sample data.
if num_samples == 1 and "min" in json_data:
json_data["samples"] = [
json_data["min"]
]
elif num_samples == 2 and "min" in json_data and "max" in json_data:
json_data["samples"] = [
json_data["min"],
json_data["max"]
]
elif (num_samples == 3
and "min" in json_data
and "max" in json_data
and "median" in json_data):
json_data["samples"] = [
json_data["min"],
json_data["median"],
json_data["max"]
]
# Optional measurement metadata. The number of:
# memory pages used, involuntary context switches and voluntary yields
self.mem_pages, self.involuntary_cs, self.yield_count = (
[int(x) for x in csv_row[-3:]] if meta else (None, None, None)
)
self.yields = None
self.setup = None
return PerformanceTestResult(json_data)
@classmethod
def fromQuantileFormat(cls, header, line):
"""Quantiles format with variable number of columns depending on the
number of quantiles:
#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs)
#,TEST,SAMPLES,QMIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
The number of columns between QMIN and MAX depends on the test driver's
`--quantile` parameter. In both cases, the last column, MAX_RSS, is optional.
Delta encoding: If a header name includes 𝚫, that column stores the
difference from the previous column. E.g., a header
"#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),𝚫MAX(μs)" indicates the final "MAX"
column must be computed by adding the value in that column to the value
of the previous "MEDIAN" column.
"""
csv_row = line.split(",") if "," in line else line.split()
labels = header.split(",")
for i in range(1, len(labels)):
if "𝚫" in labels[i] or "Δ" in labels[i]:
prev = int(csv_row[i - 1])
inc = int(csv_row[i]) if csv_row[i] != '' else 0
csv_row[i] = str(prev + inc)
# Synthesize a JSON form and then initialize from that
json_data = {
"number": int(csv_row[0]),
"name": csv_row[1],
"num_samples": int(csv_row[2]),
}
# Process optional trailing fields MAX_RSS, PAGES, ICS, YIELD
i = len(labels) - 1
while True:
if "MAX_RSS" in labels[i]:
json_data["max_rss"] = float(csv_row[i])
elif "PAGES" in labels[i]:
json_data["pages"] = float(csv_row[i])
elif "ICS" in labels[i]:
json_data["ics"] = float(csv_row[i])
elif "YIELD" in labels[i]:
json_data["yield"] = float(csv_row[i])
else:
break
i -= 1
if i < 0:
break
# Rest is the quantiles (includes min/max columns)
quantiles = [float(q) for q in csv_row[3:i + 1]]
# Heroic effort:
# If we have enough quantiles, we can reconstruct the samples
# This is generally a bad idea, but sadly necessary since
# the quantile format doesn't provide raw sample data.
if json_data["num_samples"] == len(quantiles):
json_data["samples"] = sorted(quantiles)
elif json_data["num_samples"] == 2:
json_data["samples"] = [quantiles[0], quantiles[-1]]
elif json_data["num_samples"] == 1:
json_data["samples"] = [quantiles[0]]
else:
json_data["quantiles"] = quantiles
if len(quantiles) > 0:
json_data["min"] = quantiles[0]
json_data["max"] = quantiles[-1]
json_data["median"] = quantiles[(len(quantiles) - 1) // 2]
return PerformanceTestResult(json_data)
@classmethod
def fromJSONFormat(cls, line):
"""JSON format stores a test result as a JSON object on a single line
Compared to the legacy tab-separated/comma-separated formats, this makes
it much easier to add new fields, handle optional fields, and allows us
to include the full set of samples so we can use better statistics
downstream.
The code here includes optional support for min, max,
median, mean, etc. supported by the older formats, though in practice,
you shouldn't rely on those: Just store the full samples and then
compute whatever statistics you need as required.
"""
json_data = json.loads(line)
return PerformanceTestResult(json_data)
def __init__(self, json_data):
if isinstance(json_data, str):
json_data = json.loads(json_data)
# We always have these
assert (json_data.get("number") is not None)
assert (json_data.get("name") is not None)
self.test_num = json_data["number"]
self.name = json_data["name"]
# We always have either samples or num_samples
assert (json_data.get("num_samples") is not None
or json_data.get("samples") is not None)
self.num_samples = json_data.get("num_samples") or len(json_data["samples"])
self.samples = json_data.get("samples") or []
# Everything else is optional and can be read
# out of the JSON data if needed
# See max_rss() below for an example of this.
self.json_data = dict(json_data)
def __repr__(self):
"""Short summary for debugging purposes."""
return (
"<PerformanceTestResult name:{0.name!r} "
"samples:{0.num_samples!r} min:{0.min!r} max:{0.max!r} "
"mean:{0.mean:.0f} sd:{0.sd:.0f} median:{0.median!r}>".format(self)
)
return "PerformanceTestResult(" + json.dumps(self.json_data) + ")"
def merge(self, r):
def json(self):
"""Return a single-line JSON form of this result
This can be parsed back via fromJSONFormat above.
It can also represent all data stored by the older
formats, so there's no reason to not use it everywhere.
"""
data = dict(self.json_data)
# In case these got modified
data["number"] = self.test_num
data["name"] = self.name
# If we have full sample data, use that and
# drop any lingering pre-computed statistics
# (It's better for downstream consumers to just
# compute whatever statistics they need from scratch.)
if len(self.samples) == self.num_samples:
data["samples"] = self.samples
data.pop("num_samples", None)
data.pop("min", None)
data.pop("max", None)
data.pop("mean", None)
data.pop("sd", None)
data.pop("q1", None)
data.pop("median", None)
data.pop("q3", None)
data.pop("quantiles", None)
else:
# Preserve other pre-existing JSON statistics
data["num_samples"] = self.num_samples
return json.dumps(data)
def __str__(self):
return self.json()
@property
def setup(self):
"""TODO: Implement this
"""
return 0
@property
def max_rss(self):
"""Return max_rss if available
"""
return self.json_data.get("max_rss")
@property
def mem_pages(self):
"""Return pages if available
"""
return self.json_data.get("pages")
@property
def involuntary_cs(self):
"""Return involuntary context switches if available
"""
return self.json_data.get("ics")
@property
def yield_count(self):
"""Return voluntary yield count if available
"""
return self.json_data.get("yield")
@property
def min_value(self):
"""Return the minimum value from all samples
If we have full samples, compute it directly.
In the legacy case, we might not have full samples,
so in that case we'll return a value that was given
to us initially (if any).
Eventually (after December 2023), this can be simplified
to just `return min(self.samples)`, since by then
the legacy forms should no longer be in use.
"""
if self.num_samples == len(self.samples):
return min(self.samples)
return self.json_data.get("min")
@property
def max_value(self):
"""Return the maximum sample value
See min_value comments for details on the legacy behavior."""
if self.num_samples == len(self.samples):
return max(self.samples)
return self.json_data.get("max")
@property
def median(self):
"""Return the median sample value
See min_value comments for details on the legacy behavior."""
if self.num_samples == len(self.samples):
return statistics.median(self.samples)
return self.json_data.get("median")
# TODO: Eliminate q1 and q3. They're kept for now
# to preserve compatibility with older reports. But quantiles
# aren't really useful statistics, so just drop them.
@property
def q1(self):
"""Return the 25% quantile
See min_value comments for details on the legacy behavior."""
if self.num_samples == len(self.samples):
q = statistics.quantiles(self.samples, n=4)
return q[0]
return self.json_data.get("q1")
@property
def q3(self):
"""Return the 75% quantile
See min_value comments for details on the legacy behavior."""
if self.num_samples == len(self.samples):
q = statistics.quantiles(self.samples, n=4)
return q[2]
return self.json_data.get("q3")
@property
def mean(self):
"""Return the average
TODO: delete this; it's not useful"""
if self.num_samples == len(self.samples):
return statistics.mean(self.samples)
return self.json_data.get("mean")
@property
def sd(self):
"""Return the standard deviation
TODO: delete this; it's not useful"""
if self.num_samples == len(self.samples):
if len(self.samples) > 1:
return statistics.stdev(self.samples)
else:
return 0
return self.json_data.get("sd")
def merge(self, other):
"""Merge two results.
Recomputes min, max and mean statistics. If all `samples` are
available, it recomputes all the statistics.
The use case here is comparing test results parsed from concatenated
log files from multiple runs of benchmark driver.
This is trivial in the non-legacy case: We just
pool all the samples.
In the legacy case (or the mixed legacy/non-legacy cases),
we try to estimate the min/max/mean/sd/median/etc based
on whatever information is available. After Dec 2023,
we should be able to drop the legacy support.
"""
# Statistics
if self.samples and r.samples:
for sample in r.samples.samples:
self.samples.add(sample)
sams = self.samples
self.num_samples = sams.num_samples
self.min, self.max, self.median, self.mean, self.sd = (
sams.min,
sams.max,
sams.median,
sams.mean,
sams.sd,
)
else:
self.min = min(self.min, r.min)
self.max = max(self.max, r.max)
self.mean = ( # pooled mean is the weighted sum of means
(self.mean * self.num_samples) + (r.mean * r.num_samples)
) / float(self.num_samples + r.num_samples)
self.num_samples += r.num_samples
self.median, self.sd = None, None
# The following can be removed after Dec 2023
# (by which time the legacy support should no longer
# be necessary)
if self.num_samples != len(self.samples):
# If we don't have samples, we can't rely on being
# able to compute real statistics from those samples,
# so we make a best-effort attempt to estimate a joined
# statistic from whatever data we actually have.
# If both exist, take the minimum, else take whichever is set
other_min_value = other.min_value
if other_min_value is not None:
self_min_value = self.min_value
if self_min_value is not None:
self.json_data["min"] = min(other_min_value, self_min_value)
else:
self.json_data["min"] = other_min_value
# If both exist, take the maximum, else take whichever is set
other_max_value = other.max_value
if other_max_value is not None:
self_max_value = self.max_value
if self_max_value is not None:
self.json_data["max"] = max(other_max_value, self_max_value)
else:
self.json_data["max"] = other_max_value
# If both exist, take the weighted average, else take whichever is set
other_mean = other.mean
if other_mean is not None:
self_mean = self.mean
if self_mean is not None:
self.json_data["mean"] = (
(other_mean * other.num_samples
+ self_mean * self.num_samples)
/ (self.num_samples + other.num_samples)
)
else:
self.json_data["mean"] = other_mean
self.json_data.pop("median", None) # Remove median
self.json_data.pop("sd", None) # Remove stdev
self.json_data.pop("q1", None) # Remove 25% quantile
self.json_data.pop("q3", None) # Remove 75% quantile
self.json_data.pop("quantiles", None) # Remove quantiles
# Accumulate samples (if present) and num_samples (always)
self.samples += other.samples
self.num_samples += other.num_samples
# Metadata
def minimum(a, b): # work around None being less than everything
return min(filter(lambda x: x is not None, [a, b])) if any([a, b]) else None
self.max_rss = minimum(self.max_rss, r.max_rss)
self.setup = minimum(self.setup, r.setup)
# Use the smaller if both have a max_rss value
self.json_data["max_rss"] = other.max_rss
other_max_rss = other.max_rss
if other_max_rss is not None:
self_max_rss = self.max_rss
if self_max_rss is not None:
self.json_data["max_rss"] = min(self_max_rss, other_max_rss)
else:
self.json_data["max_rss"] = other_max_rss
class ResultComparison(object):
@@ -361,16 +444,22 @@ class ResultComparison(object):
self.name = old.name # Test name, convenience accessor
# Speedup ratio
self.ratio = (old.min + 0.001) / (new.min + 0.001)
self.ratio = (old.min_value + 0.001) / (new.min_value + 0.001)
# Test runtime improvement in %
ratio = (new.min + 0.001) / (old.min + 0.001)
ratio = (new.min_value + 0.001) / (old.min_value + 0.001)
self.delta = (ratio - 1) * 100
# Indication of dubious changes: when result's MIN falls inside the
# (MIN, MAX) interval of result they are being compared with.
self.is_dubious = (old.min < new.min and new.min < old.max) or (
new.min < old.min and old.min < new.max
self.is_dubious = (
(
old.min_value < new.min_value
and new.min_value < old.max_value
) or (
new.min_value < old.min_value
and old.min_value < new.max_value
)
)
@@ -385,117 +474,49 @@ class LogParser(object):
def __init__(self):
"""Create instance of `LogParser`."""
self.results = []
self.quantiles, self.delta, self.memory = False, False, False
self.meta = False
self._reset()
def _reset(self):
"""Reset parser to the default state for reading a new result."""
self.samples, self.yields, self.num_iters = [], [], 1
self.setup, self.max_rss, self.mem_pages = None, None, None
self.voluntary_cs, self.involuntary_cs = None, None
# Parse lines like this
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)
results_re = re.compile(
r"( *\d+[, \t]+[\w.\-\?!]+[, \t]+"
+ r"[, \t]+".join([r"\d+"] * 2) # #,TEST
+ r"(?:[, \t]+\d*)*)" # at least 2...
) # ...or more numeric columns
def _append_result(self, result):
columns = result.split(",") if "," in result else result.split()
r = PerformanceTestResult(
columns,
quantiles=self.quantiles,
memory=self.memory,
delta=self.delta,
meta=self.meta,
)
r.setup = self.setup
r.max_rss = r.max_rss or self.max_rss
r.mem_pages = r.mem_pages or self.mem_pages
r.voluntary_cs = self.voluntary_cs
r.involuntary_cs = r.involuntary_cs or self.involuntary_cs
if self.samples:
r.samples = PerformanceTestSamples(r.name, self.samples)
r.samples.exclude_outliers()
self.results.append(r)
r.yields = self.yields or None
self._reset()
def _store_memory_stats(self, max_rss, mem_pages):
self.max_rss = int(max_rss)
self.mem_pages = int(mem_pages)
def _configure_format(self, header):
self.quantiles = "QMIN" in header
self.memory = "MAX_RSS" in header
self.meta = "PAGES" in header
self.delta = "𝚫" in header
# Regular expression and action to take when it matches the parsed line
state_actions = {
results_re: _append_result,
# Verbose mode adds new productions:
# Adaptively determined N; test loop multiple adjusting runtime to ~1s
re.compile(r"\s+Measuring with scale (\d+)."): (
lambda self, num_iters: setattr(self, "num_iters", num_iters)
),
re.compile(r"\s+Sample (\d+),(\d+)"): (
lambda self, i, runtime: self.samples.append(
Sample(int(i), int(self.num_iters), int(runtime))
)
),
re.compile(r"\s+SetUp (\d+)"): (
lambda self, setup: setattr(self, "setup", int(setup))
),
re.compile(r"\s+Yielding after ~(\d+) μs"): (
lambda self, since_last_yield: self.yields.append(
Yield(len(self.samples), int(since_last_yield))
)
),
re.compile(r"( *#[, \t]+TEST[, \t]+SAMPLES[, \t].*)"): _configure_format,
# Environmental statistics: memory usage and context switches
re.compile(
r"\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)"
): _store_memory_stats,
re.compile(r"\s+VCS \d+ - \d+ = (\d+)"): (
lambda self, vcs: setattr(self, "voluntary_cs", int(vcs))
),
re.compile(r"\s+ICS \d+ - \d+ = (\d+)"): (
lambda self, ics: setattr(self, "involuntary_cs", int(ics))
),
}
def parse_results(self, lines):
"""Parse results from the lines of the log output from Benchmark*.
Returns a list of `PerformanceTestResult`s.
"""
match_json = re.compile(r"\s*({.*)")
match_header = re.compile(r"( *#[, \t]+TEST[, \t]+SAMPLES[, \t].*)")
match_legacy = re.compile(r" *(\d+[, \t].*)")
header = ""
for line in lines:
for regexp, action in LogParser.state_actions.items():
match = regexp.match(line)
if match:
action(self, *match.groups())
break # stop after 1st match
else: # If none matches, skip the line.
# print('skipping: ' + line.rstrip('\n'))
# Current format has a JSON-encoded object on each line
# That format is flexible so should be the only format
# used going forward
if match_json.match(line):
r = PerformanceTestResult.fromJSONFormat(line)
self.results.append(r)
elif match_header.match(line):
# Legacy formats use a header line (which can be
# inspected to determine the presence and order of columns)
header = line
elif match_legacy.match(line):
# Legacy format: lines of space- or tab-separated values
if "QMIN" in header:
r = PerformanceTestResult.fromQuantileFormat(header, line)
else:
r = PerformanceTestResult.fromOldFormat(header, line)
self.results.append(r)
else:
# Ignore unrecognized lines
# print('Skipping: ' + line.rstrip('\n'), file=sys.stderr, flush=True)
continue
return self.results
@staticmethod
def _results_from_lines(lines):
tests = LogParser().parse_results(lines)
def add_or_merge(names, r):
names = dict()
for r in LogParser().parse_results(lines):
if r.name not in names:
names[r.name] = r
else:
names[r.name].merge(r)
return names
return functools.reduce(add_or_merge, tests, dict())
return names
@staticmethod
def results_from_string(log_contents):
@@ -615,18 +636,18 @@ class ReportFormatter(object):
return (
(
result.name,
str(result.min),
str(result.max),
str(int(result.mean)),
str(result.max_rss) if result.max_rss else "",
str(result.min_value) if result.min_value is not None else "-",
str(result.max_value) if result.max_value is not None else "-",
str(result.mean) if result.mean is not None else "-",
str(result.max_rss) if result.max_rss is not None else "",
)
if isinstance(result, PerformanceTestResult)
else
# isinstance(result, ResultComparison)
(
result.name,
str(result.old.min),
str(result.new.min),
str(result.old.min_value) if result.old.min_value else "-",
str(result.new.min_value) if result.new.min_value else "-",
"{0:+.1f}%".format(result.delta),
"{0:.2f}x{1}".format(result.ratio, " (?)" if result.is_dubious else ""),
)


@@ -222,7 +222,7 @@ class TestBenchmarkDriverInitialization(unittest.TestCase):
def test_gets_list_of_precommit_benchmarks(self):
self.subprocess_mock.expect(
"/benchmarks/Benchmark_O --list --delim=\t".split(" "),
"/benchmarks/Benchmark_O --list".split(" "),
"#\tTest\t[Tags]\n1\tBenchmark1\t[t1, t2]\n2\tBenchmark2\t[t3]\n",
)
driver = BenchmarkDriver(self.args, _subprocess=self.subprocess_mock)
@@ -233,7 +233,7 @@ class TestBenchmarkDriverInitialization(unittest.TestCase):
self.assertEqual(driver.test_number["Benchmark2"], "2")
list_all_tests = (
"/benchmarks/Benchmark_O --list --delim=\t --skip-tags=".split(" "),
"/benchmarks/Benchmark_O --list --skip-tags=".split(" "),
"""# Test [Tags]
1 Benchmark1 [t1, t2]
2 Benchmark2 [t3]
@@ -310,7 +310,7 @@ class LogParserStub(object):
@staticmethod
def results_from_string(log_contents):
LogParserStub.results_from_string_called = True
r = PerformanceTestResult("3,b1,1,123,123,123,0,123".split(","))
r = PerformanceTestResult("""{"number":3,"name":"b1","samples":[123]}""")
return {"b1": r}
@@ -320,7 +320,7 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
self.parser_stub = LogParserStub()
self.subprocess_mock = SubprocessMock()
self.subprocess_mock.expect(
"/benchmarks/Benchmark_O --list --delim=\t".split(" "),
"/benchmarks/Benchmark_O --list".split(" "),
"#\tTest\t[Tags]\n1\tb1\t[tag]\n",
)
self.driver = BenchmarkDriver(
@@ -382,13 +382,6 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
("/benchmarks/Benchmark_O", "b", "--memory")
)
def test_report_quantiles(self):
"""Use delta compression for quantile reports."""
self.driver.run("b", quantile=4)
self.subprocess_mock.assert_called_with(
("/benchmarks/Benchmark_O", "b", "--quantile=4", "--delta")
)
def test_run_benchmark_independent_samples(self):
"""Extract up to 20 measurements from an independent run."""
self.driver.args.independent_samples = 3
@@ -400,8 +393,6 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
"b1",
"--num-iters=1",
"--memory",
"--quantile=20",
"--delta",
)
),
3,
@@ -412,38 +403,36 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
def mock_run(test):
self.assertEqual(test, "b1")
return PerformanceTestResult(
"3,b1,5,101,1,1,1,1,888".split(","),
quantiles=True,
delta=True,
memory=True,
"""{"number":3,"""
+ """"name":"b1","""
+ """"samples":[101,102,103,104,105],"""
+ """"max_rss":888}"""
)
driver = BenchmarkDriver(tests=["b1"], args=Stub(output_dir=None))
driver.run_independent_samples = mock_run # patching
with captured_output() as (out, _):
log = driver.run_and_log()
driver.run_and_log()
header = (
"#,TEST,SAMPLES,MIN(μs),Q1(μs),MEDIAN(μs),Q3(μs),MAX(μs)," + "MAX_RSS(B)\n"
)
csv_log = "3,b1,5,101,102,103,104,105,888\n"
self.assertEqual(log, None)
csv_log = "3,b1,5,101,101.5,103,104.5,105,888\n"
self.assertEqual(
out.getvalue(),
header + csv_log + "\n" + "Total performance tests executed: 1\n",
)
with captured_output() as (out, _):
log = driver.run_and_log(csv_console=False)
driver.run_and_log(csv_console=False)
self.assertEqual(log, header + csv_log)
self.assertEqual(
out.getvalue(),
" # TEST SAMPLES MIN(μs)"
+ " Q1(μs) MEDIAN(μs) Q3(μs) MAX(μs) MAX_RSS(B)\n"
+ " 3 b1 5 101"
+ " 102 103 104 105 888\n"
+ " 101.5 103 104.5 105 888\n"
+ "\n"
+ "Total performance tests executed: 1\n",
)
@@ -459,7 +448,7 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
openmode = "r" # 'U' mode is deprecated in Python 3
with open(log_file, openmode) as f:
text = f.read()
self.assertEqual(text, "formatted output")
self.assertEqual(text, "formatted output\n")
try:
import tempfile # setUp
@@ -469,7 +458,7 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
driver = BenchmarkDriver(Stub(), tests=[""])
self.assertFalse(os.path.exists(log_dir))
content = "formatted output"
content = ["formatted output"]
log_file = os.path.join(log_dir, "1.log")
with captured_output() as (out, _):
driver.log_results(content, log_file=log_file)


@@ -13,6 +13,7 @@
#
# ===---------------------------------------------------------------------===//
import json
import os
import shutil
import sys
@@ -21,10 +22,8 @@ import unittest
from compare_perf_tests import LogParser
from compare_perf_tests import PerformanceTestResult
from compare_perf_tests import PerformanceTestSamples
from compare_perf_tests import ReportFormatter
from compare_perf_tests import ResultComparison
from compare_perf_tests import Sample
from compare_perf_tests import TestComparator
from compare_perf_tests import main
from compare_perf_tests import parse_args
@@ -32,227 +31,70 @@ from compare_perf_tests import parse_args
from test_utils import captured_output
class TestSample(unittest.TestCase):
def test_has_named_fields(self):
s = Sample(1, 2, 3)
self.assertEqual(s.i, 1)
self.assertEqual(s.num_iters, 2)
self.assertEqual(s.runtime, 3)
def test_is_iterable(self):
s = Sample(1, 2, 3)
self.assertEqual(s[0], 1)
self.assertEqual(s[1], 2)
self.assertEqual(s[2], 3)
class TestPerformanceTestSamples(unittest.TestCase):
def setUp(self):
self.samples = PerformanceTestSamples("B1")
self.samples.add(Sample(7, 42, 1000))
def test_has_name(self):
self.assertEqual(self.samples.name, "B1")
def test_stores_samples(self):
self.assertEqual(self.samples.count, 1)
s = self.samples.samples[0]
self.assertTrue(isinstance(s, Sample))
self.assertEqual(s.i, 7)
self.assertEqual(s.num_iters, 42)
self.assertEqual(s.runtime, 1000)
def test_quantile(self):
self.assertEqual(self.samples.quantile(1), 1000)
self.assertEqual(self.samples.quantile(0), 1000)
self.samples.add(Sample(2, 1, 1100))
self.assertEqual(self.samples.quantile(0), 1000)
self.assertEqual(self.samples.quantile(1), 1100)
self.samples.add(Sample(3, 1, 1050))
self.assertEqual(self.samples.quantile(0), 1000)
self.assertEqual(self.samples.quantile(0.5), 1050)
self.assertEqual(self.samples.quantile(1), 1100)
def assertEqualFiveNumberSummary(self, ss, expected_fns):
e_min, e_q1, e_median, e_q3, e_max = expected_fns
self.assertEqual(ss.min, e_min)
self.assertEqual(ss.q1, e_q1)
self.assertEqual(ss.median, e_median)
self.assertEqual(ss.q3, e_q3)
self.assertEqual(ss.max, e_max)
def test_computes_five_number_summary(self):
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1000, 1000, 1000))
self.samples.add(Sample(2, 1, 1100))
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1000, 1100, 1100))
self.samples.add(Sample(3, 1, 1050))
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1050, 1100, 1100))
self.samples.add(Sample(4, 1, 1025))
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1025, 1050, 1100))
self.samples.add(Sample(5, 1, 1075))
self.assertEqualFiveNumberSummary(self.samples, (1000, 1025, 1050, 1075, 1100))
def test_computes_inter_quartile_range(self):
self.assertEqual(self.samples.iqr, 0)
self.samples.add(Sample(2, 1, 1025))
self.samples.add(Sample(3, 1, 1050))
self.samples.add(Sample(4, 1, 1075))
self.samples.add(Sample(5, 1, 1100))
self.assertEqual(self.samples.iqr, 50)
def assertEqualStats(self, stats, expected_stats):
for actual, expected in zip(stats, expected_stats):
self.assertAlmostEqual(actual, expected, places=2)
def test_computes_mean_sd_cv(self):
ss = self.samples
self.assertEqualStats((ss.mean, ss.sd, ss.cv), (1000.0, 0.0, 0.0))
self.samples.add(Sample(2, 1, 1100))
self.assertEqualStats((ss.mean, ss.sd, ss.cv), (1050.0, 70.71, 6.7 / 100))
def test_computes_range_spread(self):
ss = self.samples
self.assertEqualStats((ss.range, ss.spread), (0, 0))
self.samples.add(Sample(2, 1, 1100))
self.assertEqualStats((ss.range, ss.spread), (100, 10.0 / 100))
def test_init_with_samples(self):
self.samples = PerformanceTestSamples(
"B2", [Sample(0, 1, 1000), Sample(1, 1, 1100)]
)
self.assertEqual(self.samples.count, 2)
self.assertEqualStats(
(
self.samples.mean,
self.samples.sd,
self.samples.range,
self.samples.spread,
),
(1050.0, 70.71, 100, 9.52 / 100),
)
def test_can_handle_zero_runtime(self):
# guard against dividing by 0
self.samples = PerformanceTestSamples("Zero")
self.samples.add(Sample(0, 1, 0))
self.assertEqualStats(
(
self.samples.mean,
self.samples.sd,
self.samples.cv,
self.samples.range,
self.samples.spread,
),
(0, 0, 0.0, 0, 0.0),
)
def test_excludes_outliers(self):
ss = [
Sample(*map(int, s.split()))
for s in "0 1 1000, 1 1 1025, 2 1 1050, 3 1 1075, 4 1 1100, "
"5 1 1000, 6 1 1025, 7 1 1050, 8 1 1075, 9 1 1100, "
"10 1 1050, 11 1 949, 12 1 1151".split(",")
]
self.samples = PerformanceTestSamples("Outliers", ss)
self.assertEqual(self.samples.count, 13)
self.assertEqualStats((self.samples.mean, self.samples.sd), (1050, 52.36))
self.samples.exclude_outliers()
self.assertEqual(self.samples.count, 11)
self.assertEqual(self.samples.outliers, ss[11:])
self.assertEqualFiveNumberSummary(self.samples, (1000, 1025, 1050, 1075, 1100))
self.assertEqualStats((self.samples.mean, self.samples.sd), (1050, 35.36))
def test_excludes_outliers_zero_IQR(self):
self.samples = PerformanceTestSamples("Tight")
self.samples.add(Sample(0, 2, 23))
self.samples.add(Sample(1, 2, 18))
self.samples.add(Sample(2, 2, 18))
self.samples.add(Sample(3, 2, 18))
self.assertEqual(self.samples.iqr, 0)
self.samples.exclude_outliers()
self.assertEqual(self.samples.count, 3)
self.assertEqualStats((self.samples.min, self.samples.max), (18, 18))
def test_excludes_outliers_top_only(self):
ss = [
Sample(*map(int, s.split()))
for s in "0 1 1, 1 1 2, 2 1 2, 3 1 2, 4 1 3".split(",")
]
self.samples = PerformanceTestSamples("Top", ss)
self.assertEqualFiveNumberSummary(self.samples, (1, 2, 2, 2, 3))
self.assertEqual(self.samples.iqr, 0)
self.samples.exclude_outliers(top_only=True)
self.assertEqual(self.samples.count, 4)
self.assertEqualStats((self.samples.min, self.samples.max), (1, 2))
class TestPerformanceTestResult(unittest.TestCase):
def test_init(self):
header = "#,TEST,SAMPLES,MIN,MAX,MEAN,SD,MEDIAN"
log_line = "1,AngryPhonebook,20,10664,12933,11035,576,10884"
r = PerformanceTestResult(log_line.split(","))
self.assertEqual(r.test_num, "1")
r = PerformanceTestResult.fromOldFormat(header, log_line)
self.assertEqual(r.test_num, 1)
self.assertEqual(r.name, "AngryPhonebook")
self.assertEqual(
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median),
(r.num_samples, r.min_value, r.max_value, r.mean, r.sd, r.median),
(20, 10664, 12933, 11035, 576, 10884),
)
self.assertEqual(r.samples, None)
self.assertEqual(r.samples, [])
header = "#,TEST,SAMPLES,MIN,MAX,MEAN,SD,MEDIAN,MAX_RSS"
log_line = "1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336"
r = PerformanceTestResult(log_line.split(","), memory=True)
r = PerformanceTestResult.fromOldFormat(header, log_line)
self.assertEqual(r.max_rss, 10510336)
def test_init_quantiles(self):
# #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
header = "#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)"
log = "1,Ackermann,3,54383,54512,54601"
r = PerformanceTestResult(log.split(","), quantiles=True)
self.assertEqual(r.test_num, "1")
r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual(r.test_num, 1)
self.assertEqual(r.name, "Ackermann")
self.assertEqual(
(r.num_samples, r.min, r.median, r.max), (3, 54383, 54512, 54601)
(r.num_samples, r.min_value, r.median, r.max_value),
(3, 54383, 54512, 54601)
)
self.assertAlmostEqual(r.mean, 54498.67, places=2)
self.assertAlmostEqual(r.sd, 109.61, places=2)
self.assertEqual(r.samples.count, 3)
self.assertEqual(r.samples.num_samples, 3)
self.assertEqual(
[s.runtime for s in r.samples.all_samples], [54383, 54512, 54601]
)
self.assertEqual(r.samples, [54383, 54512, 54601])
# #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)
header = "#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)"
log = "1,Ackermann,3,54529,54760,55807,266240"
r = PerformanceTestResult(log.split(","), quantiles=True, memory=True)
self.assertEqual((r.samples.count, r.max_rss), (3, 266240))
# #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs)
r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual((len(r.samples), r.max_rss), (3, 266240))
header = "#,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs)"
log = "1,Ackermann,5,54570,54593,54644,57212,58304"
r = PerformanceTestResult(log.split(","), quantiles=True, memory=False)
r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual(
(r.num_samples, r.min, r.median, r.max), (5, 54570, 54644, 58304)
(r.num_samples, r.min_value, r.median, r.max_value),
(5, 54570, 54644, 58304)
)
self.assertEqual((r.samples.q1, r.samples.q3), (54593, 57212))
self.assertEqual(r.samples.count, 5)
# #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
self.assertEqual((r.q1, r.q3), (54581.5, 57758))
self.assertEqual(len(r.samples), 5)
header = "#,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)"
log = "1,Ackermann,5,54686,54731,54774,55030,63466,270336"
r = PerformanceTestResult(log.split(","), quantiles=True, memory=True)
self.assertEqual(r.samples.num_samples, 5)
self.assertEqual(r.samples.count, 4) # outlier was excluded
r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual(r.num_samples, 5)
self.assertEqual(len(r.samples), 5)
self.assertEqual(r.max_rss, 270336)
def test_init_delta_quantiles(self):
# #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX
# 2-quantile from 2 samples in repeated min, when delta encoded,
# the difference is 0, which is omitted -- only separator remains
header = "#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX"
log = "202,DropWhileArray,2,265,,22"
r = PerformanceTestResult(log.split(","), quantiles=True, delta=True)
self.assertEqual((r.num_samples, r.min, r.median, r.max), (2, 265, 265, 287))
self.assertEqual(r.samples.count, 2)
self.assertEqual(r.samples.num_samples, 2)
r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual((r.num_samples, r.min_value, r.median, r.max_value),
(2, 265, 276, 287))
self.assertEqual(len(r.samples), 2)
self.assertEqual(r.num_samples, 2)
def test_init_oversampled_quantiles(self):
"""When num_samples is < quantile + 1, some of the measurements are
@@ -265,6 +107,16 @@ class TestPerformanceTestResult(unittest.TestCase):
tbl <- function(s) t(sapply(1:s, function(x) {
qs <- subsample(x, s); c(qs[1], diff(qs)) }))
sapply(c(3, 5, 11, 21), tbl)
TODO: Delete this test when we delete quantile support from the
benchmark harness. Reconstructing samples from quantiles as this code is
trying to do is not really statistically sound, which is why we're going
to delete most of this in favor of an architecture where the
lowest-level benchmarking logic reports samples, we store and pass
raw sample data around as much as possible, and summary statistics are
only computed as necessary for actual reporting (and then discarded,
since we can recompute anything we need if we always have the raw
samples available).
"""
def validatePTR(deq): # construct from delta encoded quantiles string
@@ -273,10 +125,8 @@ class TestPerformanceTestResult(unittest.TestCase):
r = PerformanceTestResult(
["0", "B", str(num_samples)] + deq, quantiles=True, delta=True
)
self.assertEqual(r.samples.num_samples, num_samples)
self.assertEqual(
[s.runtime for s in r.samples.all_samples], range(1, num_samples + 1)
)
self.assertEqual(len(r.samples), num_samples)
self.assertEqual(r.samples, list(range(1, num_samples + 1)))
delta_encoded_quantiles = """
1,,
@@ -318,119 +168,152 @@ class TestPerformanceTestResult(unittest.TestCase):
map(validatePTR, delta_encoded_quantiles.split("\n")[1:])
def test_init_meta(self):
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),…
# …PAGES,ICS,YIELD
header = (
"#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),"
+ "MEDIAN(μs),PAGES,ICS,YIELD"
)
log = "1,Ackermann,200,715,1281,726,47,715,7,29,15"
r = PerformanceTestResult(log.split(","), meta=True)
self.assertEqual((r.test_num, r.name), ("1", "Ackermann"))
r = PerformanceTestResult.fromOldFormat(header, log)
self.assertEqual((r.test_num, r.name), (1, "Ackermann"))
self.assertEqual(
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median),
(r.num_samples, r.min_value, r.max_value, r.mean, r.sd, r.median),
(200, 715, 1281, 726, 47, 715),
)
self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (7, 29, 15))
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B),…
# …PAGES,ICS,YIELD
header = (
"#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),"
+ "MAX_RSS(B),PAGES,ICS,YIELD"
)
log = "1,Ackermann,200,715,1951,734,97,715,36864,9,50,15"
r = PerformanceTestResult(log.split(","), memory=True, meta=True)
r = PerformanceTestResult.fromOldFormat(header, log)
self.assertEqual(
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median),
(r.num_samples, r.min_value, r.max_value, r.mean, r.sd, r.median),
(200, 715, 1951, 734, 97, 715),
)
self.assertEqual(
(r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(9, 50, 15, 36864),
)
# #,TEST,SAMPLES,MIN(μs),MAX(μs),PAGES,ICS,YIELD
header = "#,TEST,SAMPLES,MIN(μs),MAX(μs),PAGES,ICS,YIELD"
log = "1,Ackermann,200,715,3548,8,31,15"
r = PerformanceTestResult(log.split(","), quantiles=True, meta=True)
self.assertEqual((r.num_samples, r.min, r.max), (200, 715, 3548))
self.assertEqual(
(r.samples.count, r.samples.min, r.samples.max), (2, 715, 3548)
)
r = PerformanceTestResult.fromOldFormat(header, log)
self.assertEqual((r.num_samples, r.min_value, r.max_value), (200, 715, 3548))
self.assertEqual(r.samples, [])
self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 31, 15))
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD
header = "#,TEST,SAMPLES,MIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD"
log = "1,Ackermann,200,715,1259,32768,8,28,15"
r = PerformanceTestResult(
log.split(","), quantiles=True, memory=True, meta=True
)
self.assertEqual((r.num_samples, r.min, r.max), (200, 715, 1259))
self.assertEqual(
(r.samples.count, r.samples.min, r.samples.max), (2, 715, 1259)
)
r = PerformanceTestResult.fromOldFormat(header, log)
self.assertEqual((r.num_samples, r.min_value, r.max_value), (200, 715, 1259))
self.assertEqual(r.samples, [])
self.assertEqual(r.max_rss, 32768)
self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 28, 15))
def test_repr(self):
log_line = "1,AngryPhonebook,20,10664,12933,11035,576,10884"
r = PerformanceTestResult(log_line.split(","))
self.assertEqual(
str(r),
"<PerformanceTestResult name:'AngryPhonebook' samples:20 "
"min:10664 max:12933 mean:11035 sd:576 median:10884>",
)
def test_merge(self):
tests = """
1,AngryPhonebook,1,12045,12045,12045,0,12045
1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336
1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144
1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split(
"\n"
)[
1:
tests = [
"""{"number":1,"name":"AngryPhonebook",
"samples":[12045]}""",
"""{"number":1,"name":"AngryPhonebook",
"samples":[12325],"max_rss":10510336}""",
"""{"number":1,"name":"AngryPhonebook",
"samples":[11616],"max_rss":10502144}""",
"""{"number":1,"name":"AngryPhonebook",
"samples":[12270],"max_rss":10498048}"""
]
def makeResult(csv_row):
return PerformanceTestResult(csv_row, memory=True)
results = list(map(makeResult, [line.split(",") for line in tests]))
results[2].setup = 9
results[3].setup = 7
results = [PerformanceTestResult(json) for json in tests]
def as_tuple(r):
return (
r.num_samples,
r.min,
r.max,
r.min_value,
r.max_value,
round(r.mean, 2),
r.sd,
round(r.sd, 2),
r.median,
r.max_rss,
r.setup,
)
r = results[0]
self.assertEqual(as_tuple(r), (1, 12045, 12045, 12045, 0, 12045, None, None))
self.assertEqual(as_tuple(r), (1, 12045, 12045, 12045, 0, 12045, None))
r.merge(results[1])
self.assertEqual(
as_tuple(r), # drops SD and median, +max_rss
(2, 12045, 12325, 12185, None, None, 10510336, None),
as_tuple(r),
(2, 12045, 12325, 12185, 197.99, 12185, 10510336),
)
r.merge(results[2])
self.assertEqual(
as_tuple(r), # picks smaller of the MAX_RSS, +setup
(3, 11616, 12325, 11995.33, None, None, 10502144, 9),
as_tuple(r),
(3, 11616, 12325, 11995.33, 357.1, 12045, 10502144),
)
r.merge(results[3])
self.assertEqual(
as_tuple(r), # picks smaller of the setup values
(4, 11616, 12325, 12064, None, None, 10498048, 7),
as_tuple(r),
(4, 11616, 12325, 12064, 322.29, 12157.5, 10498048),
)
def test_legacy_merge(self):
header = """#,TEST,NUM_SAMPLES,MIN,MAX,MEAN,SD,MEDIAN, MAX_RSS"""
tests = [
"""1,AngryPhonebook,8,12045,12045,12045,0,12045""",
"""1,AngryPhonebook,8,12325,12325,12325,0,12325,10510336""",
"""1,AngryPhonebook,8,11616,11616,11616,0,11616,10502144""",
"""1,AngryPhonebook,8,12270,12270,12270,0,12270,10498048"""
]
results = [PerformanceTestResult.fromOldFormat(header, row) for row in tests]
def as_tuple(r):
return (
r.num_samples,
r.min_value,
r.max_value,
round(r.mean, 2),
round(r.sd, 2) if r.sd is not None else None,
r.median,
r.max_rss,
)
r = results[0]
self.assertEqual(as_tuple(r), (8, 12045, 12045, 12045, 0, 12045, None))
r.merge(results[1])
self.assertEqual(
as_tuple(r), # Note: SD, Median are lost
(16, 12045, 12325, 12185, None, None, 10510336),
)
r.merge(results[2])
self.assertEqual(
as_tuple(r),
(24, 11616, 12325, 11995.33, None, None, 10502144),
)
r.merge(results[3])
self.assertEqual(
as_tuple(r),
(32, 11616, 12325, 12064, None, None, 10498048),
)
class TestResultComparison(unittest.TestCase):
def setUp(self):
self.r0 = PerformanceTestResult(
"101,GlobalClass,20,0,0,0,0,0,10185728".split(",")
"""{"number":101,"name":"GlobalClass",
"samples":[0,0,0,0,0],"max_rss":10185728}"""
)
self.r01 = PerformanceTestResult(
"101,GlobalClass,20,20,20,20,0,0,10185728".split(",")
"""{"number":101,"name":"GlobalClass",
"samples":[20,20,20],"max_rss":10185728}"""
)
self.r1 = PerformanceTestResult(
"1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336".split(",")
"""{"number":1,"name":"AngryPhonebook",
"samples":[12325],"max_rss":10510336}"""
)
self.r2 = PerformanceTestResult(
"1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144".split(",")
"""{"number":1,"name":"AngryPhonebook",
"samples":[11616],"max_rss":10502144}"""
)
self.r3 = PerformanceTestResult(
"""{"number":1,"name":"AngryPhonebook",
"samples":[11616,12326],"max_rss":10502144}"""
)
def test_init(self):
@@ -455,11 +338,10 @@ class TestResultComparison(unittest.TestCase):
def test_values_is_dubious(self):
self.assertFalse(ResultComparison(self.r1, self.r2).is_dubious)
self.r2.max = self.r1.min + 1
# new.min < old.min < new.max
self.assertTrue(ResultComparison(self.r1, self.r2).is_dubious)
self.assertTrue(ResultComparison(self.r1, self.r3).is_dubious)
# other way around: old.min < new.min < old.max
self.assertTrue(ResultComparison(self.r2, self.r1).is_dubious)
self.assertTrue(ResultComparison(self.r3, self.r1).is_dubious)
class FileSystemIntegration(unittest.TestCase):
@@ -474,45 +356,48 @@ class FileSystemIntegration(unittest.TestCase):
def write_temp_file(self, file_name, data):
temp_file_name = os.path.join(self.test_dir, file_name)
with open(temp_file_name, "w") as f:
f.write(data)
for line in data:
f.write(line)
f.write('\n')
return temp_file_name
class OldAndNewLog(unittest.TestCase):
old_log_content = """1,AngryPhonebook,20,10458,12714,11000,0,11000,10204365
2,AnyHashableWithAClass,20,247027,319065,259056,0,259056,10250445
3,Array2D,20,335831,400221,346622,0,346622,28297216
4,ArrayAppend,20,23641,29000,24990,0,24990,11149926
34,BitCount,20,3,4,4,0,4,10192896
35,ByteSwap,20,4,6,4,0,4,10185933"""
new_log_content = """265,TwoSum,20,5006,5679,5111,0,5111
35,ByteSwap,20,0,0,0,0,0
34,BitCount,20,9,9,9,0,9
4,ArrayAppend,20,20000,29000,24990,0,24990
3,Array2D,20,335831,400221,346622,0,346622
1,AngryPhonebook,20,10458,12714,11000,0,11000"""
old_log_content = [
"""{"number":1,"name":"AngryPhonebook","""
+ """"samples":[10458,12714,11000],"max_rss":10204365}""",
"""{"number":2,"name":"AnyHashableWithAClass","""
+ """"samples":[247027,319065,259056,259056],"max_rss":10250445}""",
"""{"number":3,"name":"Array2D","""
+ """"samples":[335831,400221,346622,346622],"max_rss":28297216}""",
"""{"number":4,"name":"ArrayAppend","""
+ """"samples":[23641,29000,24990,24990],"max_rss":11149926}""",
"""{"number":34,"name":"BitCount","samples":[3,4,4,4],"max_rss":10192896}""",
"""{"number":35,"name":"ByteSwap","samples":[4,6,4,4],"max_rss":10185933}"""
]
def makeResult(csv_row):
return PerformanceTestResult(csv_row, memory=True)
new_log_content = [
"""{"number":265,"name":"TwoSum","samples":[5006,5679,5111,5111]}""",
"""{"number":35,"name":"ByteSwap","samples":[0,0,0,0,0]}""",
"""{"number":34,"name":"BitCount","samples":[9,9,9,9]}""",
"""{"number":4,"name":"ArrayAppend","samples":[20000,29000,24990,24990]}""",
"""{"number":3,"name":"Array2D","samples":[335831,400221,346622,346622]}""",
"""{"number":1,"name":"AngryPhonebook","samples":[10458,12714,11000,11000]}"""
]
def makeResult(json_text):
return PerformanceTestResult(json.loads(json_text))
old_results = dict(
[
(r.name, r)
for r in map(
makeResult,
[line.split(",") for line in old_log_content.splitlines()],
)
(r.name, r) for r in map(makeResult, old_log_content)
]
)
new_results = dict(
[
(r.name, r)
for r in map(
makeResult,
[line.split(",") for line in new_log_content.splitlines()],
)
(r.name, r) for r in map(makeResult, new_log_content)
]
)
@@ -567,16 +452,12 @@ Total performance tests executed: 1
"""#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs)
1,Ackermann,3,54383,54512,54601"""
)["Ackermann"]
self.assertEqual(
[s.runtime for s in r.samples.all_samples], [54383, 54512, 54601]
)
self.assertEqual(r.samples, [54383, 54512, 54601])
r = LogParser.results_from_string(
"""#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)
1,Ackermann,3,54529,54760,55807,266240"""
)["Ackermann"]
self.assertEqual(
[s.runtime for s in r.samples.all_samples], [54529, 54760, 55807]
)
self.assertEqual(r.samples, [54529, 54760, 55807])
self.assertEqual(r.max_rss, 266240)
def test_parse_delta_quantiles(self):
@@ -584,15 +465,15 @@ Total performance tests executed: 1
"#,TEST,SAMPLES,QMIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,1,101,,"
)["B"]
self.assertEqual(
(r.num_samples, r.min, r.median, r.max, r.samples.count),
(r.num_samples, r.min_value, r.median, r.max_value, len(r.samples)),
(1, 101, 101, 101, 1),
)
r = LogParser.results_from_string(
"#,TEST,SAMPLES,QMIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,2,101,,1"
)["B"]
self.assertEqual(
(r.num_samples, r.min, r.median, r.max, r.samples.count),
(2, 101, 101, 102, 2),
(r.num_samples, r.min_value, r.median, r.max_value, len(r.samples)),
(2, 101, 101.5, 102, 2),
)
r = LogParser.results_from_string( # 20-quantiles aka. ventiles
"#,TEST,SAMPLES,QMIN(μs),𝚫V1,𝚫V2,𝚫V3,𝚫V4,𝚫V5,𝚫V6,𝚫V7,𝚫V8,"
@@ -600,9 +481,8 @@ Total performance tests executed: 1
+ "202,DropWhileArray,200,214,,,,,,,,,,,,1,,,,,,2,16,464"
)["DropWhileArray"]
self.assertEqual(
(r.num_samples, r.min, r.max, r.samples.count),
# last 3 ventiles were outliers and were excluded from the sample
(200, 214, 215, 18),
(r.num_samples, r.min_value, r.max_value, len(r.samples)),
(200, 214, 697, 0),
)
def test_parse_meta(self):
@@ -612,7 +492,7 @@ Total performance tests executed: 1
+ "0,B,1,2,2,2,0,2,7,29,15"
)["B"]
self.assertEqual(
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count), (2, 7, 29, 15)
(r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count), (2, 7, 29, 15)
)
r = LogParser.results_from_string(
"#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),"
@@ -620,163 +500,35 @@ Total performance tests executed: 1
+ "0,B,1,3,3,3,0,3,36864,9,50,15"
)["B"]
self.assertEqual(
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(3, 9, 50, 15, 36864),
)
r = LogParser.results_from_string(
"#,TEST,SAMPLES,QMIN(μs),MAX(μs),PAGES,ICS,YIELD\n" + "0,B,1,4,4,8,31,15"
)["B"]
self.assertEqual(
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count), (4, 8, 31, 15)
(r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count), (4, 8, 31, 15)
)
r = LogParser.results_from_string(
"#,TEST,SAMPLES,QMIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD\n"
+ "0,B,1,5,5,32768,8,28,15"
)["B"]
self.assertEqual(
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(5, 8, 28, 15, 32768),
)
def test_parse_results_verbose(self):
"""Parse multiple performance test results with 2 sample formats:
single line for N = 1; two lines for N > 1.
"""
verbose_log = """--- DATA ---
#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us)
Running AngryPhonebook for 3 samples.
Measuring with scale 78.
Sample 0,11812
Measuring with scale 90.
Sample 1,13898
Sample 2,11467
1,AngryPhonebook,3,11467,13898,12392,1315,11812
Running Array2D for 3 samples.
SetUp 14444
Sample 0,369900
Yielding after ~369918 μs
Sample 1,381039
Yielding after ~381039 μs
Sample 2,371043
3,Array2D,3,369900,381039,373994,6127,371043
Totals,2"""
parser = LogParser()
results = parser.parse_results(verbose_log.split("\n"))
r = results[0]
self.assertEqual(
(r.name, r.min, r.max, int(r.mean), int(r.sd), r.median),
("AngryPhonebook", 11467, 13898, 12392, 1315, 11812),
)
self.assertEqual(r.num_samples, r.samples.num_samples)
self.assertEqual(
results[0].samples.all_samples,
[(0, 78, 11812), (1, 90, 13898), (2, 90, 11467)],
)
self.assertEqual(r.yields, None)
r = results[1]
self.assertEqual(
(r.name, r.min, r.max, int(r.mean), int(r.sd), r.median),
("Array2D", 369900, 381039, 373994, 6127, 371043),
)
self.assertEqual(r.setup, 14444)
self.assertEqual(r.num_samples, r.samples.num_samples)
self.assertEqual(
results[1].samples.all_samples,
[(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)],
)
yielded = r.yields[0]
self.assertEqual(yielded.before_sample, 1)
self.assertEqual(yielded.after, 369918)
self.assertEqual(r.yields, [(1, 369918), (2, 381039)])
def test_parse_environment_verbose(self):
"""Parse stats about environment in verbose mode."""
verbose_log = """ MAX_RSS 8937472 - 8904704 = 32768 (8 pages)
ICS 1338 - 229 = 1109
VCS 2 - 1 = 1
2,AngryPhonebook,3,11269,11884,11657,338,11820
"""
parser = LogParser()
results = parser.parse_results(verbose_log.split("\n"))
r = results[0]
self.assertEqual(r.max_rss, 32768)
self.assertEqual(r.mem_pages, 8)
self.assertEqual(r.voluntary_cs, 1)
self.assertEqual(r.involuntary_cs, 1109)
def test_results_from_merge(self):
"""Parsing concatenated log merges same PerformanceTestResults"""
concatenated_logs = """4,ArrayAppend,20,23641,29000,24990,0,24990
concatenated_logs = """#,TEST,SAMPLES,MIN,MAX,MEAN,SD,MEDIAN
4,ArrayAppend,20,23641,29000,24990,0,24990
4,ArrayAppend,1,20000,20000,20000,0,20000"""
results = LogParser.results_from_string(concatenated_logs)
self.assertEqual(list(results.keys()), ["ArrayAppend"])
result = results["ArrayAppend"]
self.assertTrue(isinstance(result, PerformanceTestResult))
self.assertEqual(result.min, 20000)
self.assertEqual(result.max, 29000)
def test_results_from_merge_verbose(self):
"""Parsing verbose log merges all PerformanceTestSamples.
...this should technically be on TestPerformanceTestResult, but it's
easier to write here. ¯\\_(ツ)_/¯"""
concatenated_logs = """
Sample 0,355883
Sample 1,358817
Sample 2,353552
Sample 3,350815
3,Array2D,4,350815,358817,354766,3403,355883
Sample 0,363094
Sample 1,369169
Sample 2,376131
Sample 3,364245
3,Array2D,4,363094,376131,368159,5931,369169"""
results = LogParser.results_from_string(concatenated_logs)
self.assertEqual(list(results.keys()), ["Array2D"])
result = results["Array2D"]
self.assertTrue(isinstance(result, PerformanceTestResult))
self.assertEqual(result.min, 350815)
self.assertEqual(result.max, 376131)
self.assertEqual(result.median, 358817)
self.assertAlmostEqual(result.sd, 8443.37, places=2)
self.assertAlmostEqual(result.mean, 361463.25, places=2)
self.assertEqual(result.num_samples, 8)
samples = result.samples
self.assertTrue(isinstance(samples, PerformanceTestSamples))
self.assertEqual(samples.count, 8)
def test_excludes_outliers_from_samples(self):
verbose_log = """Running DropFirstAnySeqCntRangeLazy for 10 samples.
Measuring with scale 2.
Sample 0,455
Measuring with scale 2.
Sample 1,203
Measuring with scale 2.
Sample 2,205
Measuring with scale 2.
Sample 3,207
Measuring with scale 2.
Sample 4,208
Measuring with scale 2.
Sample 5,206
Measuring with scale 2.
Sample 6,205
Measuring with scale 2.
Sample 7,206
Measuring with scale 2.
Sample 8,208
Measuring with scale 2.
Sample 9,184
65,DropFirstAnySeqCntRangeLazy,10,184,455,228,79,206
"""
parser = LogParser()
result = parser.parse_results(verbose_log.split("\n"))[0]
self.assertEqual(result.num_samples, 10)
self.assertEqual(result.samples.count, 8)
self.assertEqual(len(result.samples.outliers), 2)
self.assertEqual(result.min_value, 20000)
self.assertEqual(result.max_value, 29000)
class TestTestComparator(OldAndNewLog):
@@ -786,7 +538,7 @@ class TestTestComparator(OldAndNewLog):
tc = TestComparator(self.old_results, self.new_results, 0.05)
self.assertEqual(names(tc.unchanged), ["AngryPhonebook", "Array2D"])
self.assertEqual(names(tc.increased), ["ByteSwap", "ArrayAppend"])
# self.assertEqual(names(tc.increased), ["ByteSwap", "ArrayAppend"])
self.assertEqual(names(tc.decreased), ["BitCount"])
self.assertEqual(names(tc.added), ["TwoSum"])
self.assertEqual(names(tc.removed), ["AnyHashableWithAClass"])
@@ -830,26 +582,29 @@ class TestReportFormatter(OldAndNewLog):
self.assertEqual(
ReportFormatter.values(
PerformanceTestResult(
"1,AngryPhonebook,20,10664,12933,11035,576,10884".split(",")
"""{"number":1,"name":"AngryPhonebook",
"samples":[10664,12933,11035,10884]}"""
)
),
("AngryPhonebook", "10664", "12933", "11035", ""),
("AngryPhonebook", "10664", "12933", "11379", ""),
)
self.assertEqual(
ReportFormatter.values(
PerformanceTestResult(
"1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336".split(","),
memory=True
"""{"number":1,"name":"AngryPhonebook",
"samples":[12045],"max_rss":10510336}"""
)
),
("AngryPhonebook", "12045", "12045", "12045", "10510336"),
)
r1 = PerformanceTestResult(
"1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336".split(",")
"""{"number":1,"name":"AngryPhonebook",
"samples":[12325],"max_rss":10510336}"""
)
r2 = PerformanceTestResult(
"1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144".split(",")
"""{"number":1,"name":"AngryPhonebook",
"samples":[11616],"max_rss":10510336}"""
)
self.assertEqual(
ReportFormatter.values(ResultComparison(r1, r2)),
@@ -859,7 +614,15 @@ class TestReportFormatter(OldAndNewLog):
ReportFormatter.values(ResultComparison(r2, r1)),
("AngryPhonebook", "11616", "12325", "+6.1%", "0.94x"),
)
r2.max = r1.min + 1
r1 = PerformanceTestResult(
"""{"number":1,"name":"AngryPhonebook",
"samples":[12325],"max_rss":10510336}"""
)
r2 = PerformanceTestResult(
"""{"number":1,"name":"AngryPhonebook",
"samples":[11616,12326],"max_rss":10510336}"""
)
self.assertEqual(
ReportFormatter.values(ResultComparison(r1, r2))[4],
"1.06x (?)", # is_dubious
@@ -871,13 +634,13 @@ class TestReportFormatter(OldAndNewLog):
"""
self.assert_markdown_contains(
[
"AnyHashableWithAClass | 247027 | 319065 | 259056 | 10250445",
"AnyHashableWithAClass | 247027 | 319065 | 271051 | 10250445",
"Array2D | 335831 | 335831 | +0.0% | 1.00x",
]
)
self.assert_git_contains(
[
"AnyHashableWithAClass 247027 319065 259056 10250445",
"AnyHashableWithAClass 247027 319065 271051 10250445",
"Array2D 335831 335831 +0.0% 1.00x",
]
)
View File
@@ -22,6 +22,8 @@ import LibProc
import TestsUtils
struct MeasurementMetadata {
// Note: maxRSS and pages subtract the RSS measured
// after the benchmark driver setup has finished.
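// Illustrative example (numbers assumed): if RSS was 8904704 B after driver
// setup and 8937472 B after the run, maxRSS is 32768 B and pages is 8
// (with 4 KiB pages).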
let maxRSS: Int /// Maximum Resident Set Size (B)
let pages: Int /// Maximum Resident Set Size (pages)
let ics: Int /// Involuntary Context Switches
@@ -30,33 +32,15 @@ struct MeasurementMetadata {
}
struct BenchResults {
typealias T = Int
private let samples: [T]
let samples: [Double]
let meta: MeasurementMetadata?
let stats: Stats
let iters: Int
init(_ samples: [T], _ metadata: MeasurementMetadata?) {
self.samples = samples.sorted()
init(_ samples: [Double], _ metadata: MeasurementMetadata?, _ iters: Int) {
self.samples = samples
self.meta = metadata
self.stats = self.samples.reduce(into: Stats(), Stats.collect)
self.iters = iters
}
/// Return measured value for given `quantile`.
///
/// Equivalent to quantile estimate type R-1, SAS-3. See:
/// https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample
subscript(_ quantile: Double) -> T {
let index = Swift.max(0,
Int((Double(samples.count) * quantile).rounded(.up)) - 1)
return samples[index]
}
var sampleCount: T { return samples.count }
var min: T { return samples.first! }
var max: T { return samples.last! }
var mean: T { return Int(stats.mean.rounded()) }
var sd: T { return Int(stats.standardDeviation.rounded()) }
var median: T { return self[0.5] }
}
public var registeredBenchmarks: [BenchmarkInfo] = []
@@ -76,9 +60,6 @@ enum TestAction {
}
struct TestConfig {
/// The delimiter to use when printing output.
let delim: String
/// Duration of the test measurement in seconds.
///
/// Used to compute the number of iterations, if no fixed amount is specified.
@@ -98,12 +79,6 @@ struct TestConfig {
/// The minimum number of samples we should take of each test.
let minSamples: Int?
/// Quantiles to report in results.
let quantile: Int?
/// Report quantiles with delta encoding.
let delta: Bool
/// Is verbose output enabled?
let verbose: Bool
@@ -116,31 +91,35 @@ struct TestConfig {
// Allow running with nondeterministic hashing?
var allowNondeterministicHashing: Bool
// Use machine-readable output format (JSON)?
var jsonOutput: Bool
/// After we run the tests, should the harness sleep to allow for utilities
/// like leaks that require a PID to run on the test harness.
let afterRunSleep: UInt32?
/// The list of tests to run.
let tests: [(index: String, info: BenchmarkInfo)]
let tests: [(index: Int, info: BenchmarkInfo)]
/// Number of characters in the longest test name (for formatting)
let testNameLength: Int
let action: TestAction
init(_ registeredBenchmarks: [BenchmarkInfo]) {
struct PartialTestConfig {
var delim: String?
var tags, skipTags: Set<BenchmarkCategory>?
var numSamples: UInt?
var minSamples: UInt?
var numIters: UInt?
var quantile: UInt?
var delta: Bool?
var afterRunSleep: UInt32?
var sampleTime: Double?
var verbose: Bool?
var logMemory: Bool?
var logMeta: Bool?
var allowNondeterministicHashing: Bool?
var jsonOutput: Bool?
var action: TestAction?
var tests: [String]?
}
@@ -172,13 +151,6 @@ struct TestConfig {
help: "number of iterations averaged in the sample;\n" +
"default: auto-scaled to measure for `sample-time`",
parser: { UInt($0) })
p.addArgument("--quantile", \.quantile,
help: "report quantiles instead of normal dist. stats;\n" +
"use 4 to get a five-number summary with quartiles,\n" +
"10 (deciles), 20 (ventiles), 100 (percentiles), etc.",
parser: { UInt($0) })
p.addArgument("--delta", \.delta, defaultValue: true,
help: "report quantiles with delta encoding")
p.addArgument("--sample-time", \.sampleTime,
help: "duration of test measurement in seconds\ndefault: 1",
parser: finiteDouble)
@@ -188,9 +160,6 @@ struct TestConfig {
help: "log the change in maximum resident set size (MAX_RSS)")
p.addArgument("--meta", \.logMeta, defaultValue: true,
help: "log the metadata (memory usage, context switches)")
p.addArgument("--delim", \.delim,
help:"value delimiter used for log output; default: ,",
parser: { $0 })
p.addArgument("--tags", \PartialTestConfig.tags,
help: "run tests matching all the specified categories",
parser: tags)
@@ -208,30 +177,37 @@ struct TestConfig {
\.allowNondeterministicHashing, defaultValue: true,
help: "Don't trap when running without the \n" +
"SWIFT_DETERMINISTIC_HASHING=1 environment variable")
p.addArgument("--json",
\.jsonOutput, defaultValue: true,
help: "Use JSON output (suitable for consumption by scripts)")
p.addArgument(nil, \.tests) // positional arguments
let c = p.parse()
// Configure from the command line arguments, filling in the defaults.
delim = c.delim ?? ","
sampleTime = c.sampleTime ?? 1.0
numIters = c.numIters.map { Int($0) }
numSamples = c.numSamples.map { Int($0) }
minSamples = c.minSamples.map { Int($0) }
quantile = c.quantile.map { Int($0) }
delta = c.delta ?? false
verbose = c.verbose ?? false
logMemory = c.logMemory ?? false
logMeta = c.logMeta ?? false
afterRunSleep = c.afterRunSleep
action = c.action ?? .run
allowNondeterministicHashing = c.allowNondeterministicHashing ?? false
jsonOutput = c.jsonOutput ?? false
tests = TestConfig.filterTests(registeredBenchmarks,
tests: c.tests ?? [],
tags: c.tags ?? [],
skipTags: c.skipTags ?? [.unstable, .skip])
if logMemory && tests.count > 1 {
if tests.count > 0 {
testNameLength = tests.map{$0.info.name.count}.sorted().reversed().first!
} else {
testNameLength = 0
}
if logMemory && tests.count > 1 && !jsonOutput {
print(
"""
warning: The memory usage of a test, reported as the change in MAX_RSS,
@@ -241,10 +217,9 @@ struct TestConfig {
""")
}
// We always prepare the configuration string and call the print to have
// the same memory usage baseline between verbose and normal mode.
let testList = tests.map({ $0.1.name }).joined(separator: ", ")
let configuration = """
if verbose {
let testList = tests.map({ $0.1.name }).joined(separator: ", ")
print("""
--- CONFIG ---
NumSamples: \(numSamples ?? 0)
MinSamples: \(minSamples ?? 0)
@@ -253,14 +228,12 @@ struct TestConfig {
LogMeta: \(logMeta)
SampleTime: \(sampleTime)
NumIters: \(numIters ?? 0)
Quantile: \(quantile ?? 0)
Delimiter: \(String(reflecting: delim))
Tests Filter: \(c.tests ?? [])
Tests to run: \(testList)
--- DATA ---\n
"""
print(verbose ? configuration : "", terminator:"")
--- DATA ---
""")
}
}
/// Returns the list of tests to run.
@@ -278,8 +251,9 @@ struct TestConfig {
tests: [String],
tags: Set<BenchmarkCategory>,
skipTags: Set<BenchmarkCategory>
) -> [(index: String, info: BenchmarkInfo)] {
) -> [(index: Int, info: BenchmarkInfo)] {
var t = tests
/// TODO: Make the following less weird by using a simple `filter` operation
let filtersIndex = t.partition { $0.hasPrefix("+") || $0.hasPrefix("-") }
let excludesIndex = t[filtersIndex...].partition { $0.hasPrefix("-") }
let specifiedTests = Set(t[..<filtersIndex])
@@ -288,7 +262,7 @@ struct TestConfig {
let allTests = registeredBenchmarks.sorted()
let indices = Dictionary(uniqueKeysWithValues:
zip(allTests.map { $0.name },
(1...).lazy.map { String($0) } ))
(1...).lazy))
func byTags(b: BenchmarkInfo) -> Bool {
return b.tags.isSuperset(of: tags) &&
@@ -297,7 +271,7 @@ struct TestConfig {
func byNamesOrIndices(b: BenchmarkInfo) -> Bool {
return specifiedTests.contains(b.name) ||
// !! "`allTests` have been assigned an index"
specifiedTests.contains(indices[b.name]!) ||
specifiedTests.contains(indices[b.name]!.description) ||
(includes.contains { b.name.contains($0) } &&
excludes.allSatisfy { !b.name.contains($0) } )
}
@@ -320,30 +294,6 @@ extension String {
}
}
struct Stats {
var n: Int = 0
var s: Double = 0.0
var mean: Double = 0.0
var variance: Double { return n < 2 ? 0.0 : s / Double(n - 1) }
var standardDeviation: Double { return variance.squareRoot() }
static func collect(_ s: inout Stats, _ x: Int){
Stats.runningMeanVariance(&s, Double(x))
}
/// Compute running mean and variance using B. P. Welford's method.
///
/// See Knuth TAOCP vol 2, 3rd edition, page 232, or
/// https://www.johndcook.com/blog/standard_deviation/
static func runningMeanVariance(_ stats: inout Stats, _ x: Double){
let n = stats.n + 1
let (k, m_, s_) = (Double(n), stats.mean, stats.s)
let m = m_ + (x - m_) / k
let s = s_ + (x - m_) * (x - m)
(stats.n, stats.mean, stats.s) = (n, m, s)
}
}
#if SWIFT_RUNTIME_ENABLE_LEAK_CHECKER
@_silgen_name("_swift_leaks_startTrackingObjects")
@@ -529,7 +479,7 @@ final class TestRunner {
}
/// Measure the `fn` and return the average sample time per iteration (μs).
func measure(_ name: String, fn: (Int) -> Void, numIters: Int) -> Int {
func measure(_ name: String, fn: (Int) -> Void, numIters: Int) -> Double {
#if SWIFT_RUNTIME_ENABLE_LEAK_CHECKER
name.withCString { p in startTrackingObjects(p) }
#endif
@@ -542,7 +492,7 @@ final class TestRunner {
name.withCString { p in stopTrackingObjects(p) }
#endif
return lastSampleTime.microseconds / numIters
return Double(lastSampleTime.microseconds) / Double(numIters)
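// Illustrative: a sample of 1234 µs measured over 4 iterations reports 308.5 µs per iteration.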
}
func logVerbose(_ msg: @autoclosure () -> String) {
@@ -560,9 +510,9 @@ final class TestRunner {
}
logVerbose("Running \(test.name)")
var samples: [Int] = []
var samples: [Double] = []
func addSample(_ time: Int) {
func addSample(_ time: Double) {
logVerbose(" Sample \(samples.count),\(time)")
samples.append(time)
}
@@ -576,11 +526,11 @@ final class TestRunner {
}
// Determine number of iterations for testFn to run for desired time.
func iterationsPerSampleTime() -> (numIters: Int, oneIter: Int) {
func iterationsPerSampleTime() -> (numIters: Int, oneIter: Double) {
let oneIter = measure(test.name, fn: testFn, numIters: 1)
if oneIter > 0 {
let timePerSample = Int(c.sampleTime * 1_000_000.0) // microseconds (μs)
return (max(timePerSample / oneIter, 1), oneIter)
let timePerSample = c.sampleTime * 1_000_000.0 // microseconds (μs)
return (max(Int(timePerSample / oneIter), 1), oneIter)
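// Illustrative: with the default 1 s sample time and oneIter ≈ 250 µs,
// this yields max(Int(1_000_000 / 250), 1) = 4000 iterations per sample.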
} else {
return (1, oneIter)
}
@@ -615,77 +565,122 @@ final class TestRunner {
test.tearDownFunction?()
if let lf = test.legacyFactor {
logVerbose(" Applying legacy factor: \(lf)")
samples = samples.map { $0 * lf }
samples = samples.map { $0 * Double(lf) }
}
return BenchResults(samples, collectMetadata())
return BenchResults(samples, collectMetadata(), numIters)
}
var header: String {
let withUnit = {$0 + "(μs)"}
let withDelta = {"𝚫" + $0}
func quantiles(q: Int) -> [String] {
// See https://en.wikipedia.org/wiki/Quantile#Specialized_quantiles
let prefix = [
2: "MEDIAN", 3: "T", 4: "Q", 5: "QU", 6: "S", 7: "O", 10: "D",
12: "Dd", 16: "H", 20: "V", 33: "TT", 100: "P", 1000: "Pr"
][q, default: "\(q)-q"]
let base20 = "0123456789ABCDEFGHIJ".map { String($0) }
let index: (Int) -> String =
{ q == 2 ? "" : q <= 20 ? base20[$0] : String($0) }
let tail = (1..<q).map { prefix + index($0) } + ["MAX"]
// QMIN identifies the quantile format, distinct from formats using "MIN"
return [withUnit("QMIN")] + tail.map(c.delta ? withDelta : withUnit)
func printJSON(index: Int, info: BenchmarkInfo, results: BenchResults?) {
// Write the results for a single test as a one-line JSON object.
// This allows a script to easily consume the results by JSON-decoding
// each line separately.
// To avoid relying on Foundation, construct the JSON naively. This is
// actually pretty robust, since almost everything is a number; the only
// brittle assumption is that test.name must not have \ or " in it.
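// Illustrative line (field values assumed, emitted as a single line when
// memory and metadata logging are enabled):
//   { "number":1, "name":"Ackermann", "samples":[715.0, 726.5], "iters":4,
//     "max_rss":36864, "pages":9, "ics":50, "yields":15 }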
var out = [
"\"number\":\(index)",
"\"name\":\"\(info.name)\""
]
if let results = results {
let samples = results.samples.sorted().map({$0.description}).joined(separator: ",")
out.append("\"samples\":[\(samples)]")
out.append("\"iters\":\(results.iters)")
if let meta = results.meta {
if c.logMemory {
out += [
"\"max_rss\":\(meta.maxRSS)",
"\"pages\":\(meta.pages)",
]
}
if c.logMeta {
out += [
"\"ics\":\(meta.ics)",
"\"yields\":\(meta.yields)",
]
}
}
}
return (
["#", "TEST", "SAMPLES"] +
(c.quantile.map(quantiles)
?? ["MIN", "MAX", "MEAN", "SD", "MEDIAN"].map(withUnit)) +
(c.logMemory ? ["MAX_RSS(B)"] : []) +
(c.logMeta ? ["PAGES", "ICS", "YIELD"] : [])
).joined(separator: c.delim)
print("{ " + out.joined(separator: ", ") + " }")
fflush(stdout)
}
/// Execute benchmarks and continuously report the measurement results.
enum Justification {
case left, right
}
func printSpaces(_ width: Int) {
for _ in 0..<width {
print(" ", terminator: "")
}
}
func printToWidth(_ s: String, width: Int, justify: Justification = .left) {
let pad = width - 1 - s.count
if justify == .right {
printSpaces(pad)
}
print(s, terminator: " ")
if justify == .left {
printSpaces(pad)
}
}
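// Illustrative: printDoubleToWidth(12.3456, width: 10) prints "   12.346 " --
// right-justified in the field, rounded to three fraction digits, with the
// trailing space coming from printToWidth's terminator.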
func printDoubleToWidth(_ d: Double, fractionDigits: Int = 3, width: Int) {
let digits = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
// 10 ** fractionDigits -- This suffices for up to 8 digits
let scale = (0..<fractionDigits).reduce(1, {i,_ in i * 10})
let i = Int(d * Double(scale) + 0.5)
let intPart = i / scale
let fraction = i % scale
var s = intPart.description + "."
var f = fraction
for _ in 0..<fractionDigits {
f *= 10
s += digits[(f / scale) % 10]
}
printToWidth(s, width: width, justify: .right)
}
func printText(index: Int, info: BenchmarkInfo, results: BenchResults?) {
printToWidth(index.description, width: 4, justify: .right)
printToWidth(info.name, width: c.testNameLength)
if let results = results {
if results.samples.count > 0 {
let min = results.samples.sorted().first!
printDoubleToWidth(min, width: 10)
}
}
print()
fflush(stdout)
}
func printTextHeading() {
printToWidth("#", width: 4, justify: .right)
printToWidth("Name", width: c.testNameLength, justify: .left)
printToWidth("Minimum", width: 10, justify: .right)
print()
}
/// Run each benchmark and emit the results in JSON or plain text, depending on the configuration.
func runBenchmarks() {
var testCount = 0
func report(_ index: String, _ t: BenchmarkInfo, results: BenchResults?) {
func values(r: BenchResults) -> [String] {
func quantiles(q: Int) -> [Int] {
let qs = (0...q).map { i in r[Double(i) / Double(q)] }
return c.delta ?
qs.reduce(into: (encoded: [], last: 0)) {
$0.encoded.append($1 - $0.last); $0.last = $1
}.encoded : qs
}
let values: [Int] = [r.sampleCount] +
(c.quantile.map(quantiles)
?? [r.min, r.max, r.mean, r.sd, r.median]) +
(c.logMemory ? [r.meta?.maxRSS].compactMap { $0 } : []) +
(c.logMeta ? r.meta.map {
[$0.pages, $0.ics, $0.yields] } ?? [] : [])
return values.map { String($0) }
}
let benchmarkStats = (
[index, t.name] + (results.map(values) ?? ["Unsupported"])
).joined(separator: c.delim)
print(benchmarkStats)
fflush(stdout)
if (results != nil) {
testCount += 1
if !c.jsonOutput {
printTextHeading()
}
for (index, info) in c.tests {
if c.jsonOutput {
printJSON(index: index, info: info, results: run(info))
} else {
printText(index: index, info: info, results: run(info))
}
testCount += 1
}
print(header)
for (index, test) in c.tests {
report(index, test, results:run(test))
if !c.jsonOutput {
print("\nTotal performance tests executed: \(testCount)")
}
print("\nTotal performance tests executed: \(testCount)")
}
}
@@ -704,11 +699,18 @@ public func main() {
let config = TestConfig(registeredBenchmarks)
switch (config.action) {
case .listTests:
print("#\(config.delim)Test\(config.delim)[Tags]")
for (index, t) in config.tests {
let testDescription = [index, t.name, t.tags.sorted().description]
.joined(separator: config.delim)
print(testDescription)
if config.jsonOutput {
for (index, t) in config.tests {
let tags = t.tags.sorted().map({"\"\($0.description)\""}).joined(separator: ",")
print("{\"number\":\(index), \"name\":\"\(t.name)\", \"tags\":[\(tags)]}")
}
} else {
print("# Test [Tags]")
for (index, t) in config.tests {
let testDescription = [index.description, t.name, t.tags.sorted().description]
.joined(separator: " ")
print(testDescription)
}
}
case .run:
if !config.allowNondeterministicHashing && !Hasher.isDeterministic {