Merge pull request #19910 from palimondo/fluctuation-of-the-pupil

[benchmark] More Robust Benchmark_Driver
eeckstein authored on 2018-10-29 15:02:07 -07:00, committed by GitHub
7 changed files with 409 additions and 116 deletions


@@ -665,11 +665,11 @@ function(swift_benchmark_compile)
COMMAND "${swift-bin-dir}/Benchmark_Driver" "run"
"-o" "O" "--output-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs"
"--swift-repo" "${SWIFT_SOURCE_DIR}"
"--iterations" "${SWIFT_BENCHMARK_NUM_O_ITERATIONS}"
"--independent-samples" "${SWIFT_BENCHMARK_NUM_O_ITERATIONS}"
COMMAND "${swift-bin-dir}/Benchmark_Driver" "run"
"-o" "Onone" "--output-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs"
"--swift-repo" "${SWIFT_SOURCE_DIR}"
"--iterations" "${SWIFT_BENCHMARK_NUM_ONONE_ITERATIONS}"
"--independent-samples" "${SWIFT_BENCHMARK_NUM_ONONE_ITERATIONS}"
COMMAND "${swift-bin-dir}/Benchmark_Driver" "compare"
"--log-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs"
"--swift-repo" "${SWIFT_SOURCE_DIR}"


@@ -133,20 +133,21 @@ class BenchmarkDriver(object):
benchmarks.intersection(set(self.all_tests)).union(indexed_names)))
def run(self, test, num_samples=None, num_iters=None,
verbose=None, measure_memory=False):
verbose=None, measure_memory=False, quantile=None):
"""Execute benchmark and gather results."""
num_samples = num_samples or 1
num_samples = num_samples or 0
num_iters = num_iters or 0 # automatically determine N to run for 1s
cmd = self._cmd_run(
test, num_samples, num_iters, verbose, measure_memory)
test, num_samples, num_iters, verbose, measure_memory, quantile)
output = self._invoke(cmd)
result = self.parser.results_from_string(output).items()[0][1]
return result
def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory):
def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory,
quantile):
cmd = [self.test_harness, test]
if num_samples > 1:
if num_samples > 0:
cmd.append('--num-samples={0}'.format(num_samples))
if num_iters > 0:
cmd.append('--num-iters={0}'.format(num_iters))
@@ -154,6 +155,9 @@ class BenchmarkDriver(object):
cmd.append('--verbose')
if measure_memory:
cmd.append('--memory')
if quantile:
cmd.append('--quantile={0}'.format(quantile))
cmd.append('--delta')
return cmd
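
As a standalone sketch of the flag assembly above (a simplified model, not the driver's actual method; the harness path and test name are borrowed from the unit tests later in this diff):

```python
def build_cmd(harness, test, num_samples=0, num_iters=0,
              verbose=False, measure_memory=False, quantile=None):
    """Simplified model of BenchmarkDriver._cmd_run."""
    cmd = [harness, test]
    if num_samples > 0:
        cmd.append('--num-samples={0}'.format(num_samples))
    if num_iters > 0:
        cmd.append('--num-iters={0}'.format(num_iters))
    if verbose:
        cmd.append('--verbose')
    if measure_memory:
        cmd.append('--memory')
    if quantile:
        cmd.append('--quantile={0}'.format(quantile))
        cmd.append('--delta')  # quantile reports use delta compression
    return cmd

# Matches the expectation in TestBenchmarkDriverRunningTests below.
assert build_cmd('/benchmarks/Benchmark_O', 'b1', num_iters=1,
                 measure_memory=True, quantile=20) == [
    '/benchmarks/Benchmark_O', 'b1', '--num-iters=1', '--memory',
    '--quantile=20', '--delta']
```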
def run_independent_samples(self, test):
@@ -166,8 +170,9 @@ class BenchmarkDriver(object):
return a
return reduce(merge_results,
[self.run(test, measure_memory=True)
for _ in range(self.args.iterations)])
[self.run(test, measure_memory=True,
num_iters=1, quantile=20)
for _ in range(self.args.independent_samples)])
def log_results(self, output, log_file=None):
"""Log output to `log_file`.
@@ -182,7 +187,7 @@ class BenchmarkDriver(object):
with open(log_file, 'w') as f:
f.write(output)
RESULT = '{:>3} {:<25} {:>7} {:>7} {:>7} {:>8} {:>6} {:>10} {:>10}'
RESULT = '{:>3} {:<25} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}'
def run_and_log(self, csv_console=True):
"""Run benchmarks and continuously log results to the console.
@@ -201,14 +206,15 @@ class BenchmarkDriver(object):
def console_log(values):
print(format(values))
console_log(['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'MAX(μs)', # header
'MEAN(μs)', 'SD(μs)', 'MEDIAN(μs)', 'MAX_RSS(B)'])
def result_values(r):
return map(str, [r.test_num, r.name, r.num_samples, r.min, r.max,
int(r.mean), int(r.sd), r.median, r.max_rss])
return map(str, [r.test_num, r.name, r.num_samples, r.min,
r.samples.q1, r.median, r.samples.q3, r.max,
r.max_rss])
results = []
header = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'Q1(μs)', 'MEDIAN(μs)',
'Q3(μs)', 'MAX(μs)', 'MAX_RSS(B)']
console_log(header)
results = [header]
for test in self.tests:
result = result_values(self.run_independent_samples(test))
console_log(result)
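
For reference, the new `RESULT` template lays the quantile columns out under the same names as the CSV header. A quick standalone sketch (header and row values are taken from the run_and_log unit test later in this diff):

```python
# Standalone sketch: fixed-width console layout for quantile results.
RESULT = '{:>3} {:<25} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}'
header = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'Q1(μs)', 'MEDIAN(μs)',
          'Q3(μs)', 'MAX(μs)', 'MAX_RSS(B)']
row = ['3', 'b1', '5', '101', '102', '103', '104', '105', '888']
print(RESULT.format(*header))  # aligned header, as asserted in the test below
print(RESULT.format(*row))     # right-aligned row: 3 b1 5 101 ... 888
```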
@@ -290,6 +296,7 @@ class BenchmarkDoctor(object):
self._name_matches_capital_words_convention,
self._name_is_at_most_40_chars_long,
self._no_setup_overhead,
self._reasonable_setup_time,
self._optimized_runtime_in_range,
self._constant_memory_use
]
@@ -341,15 +348,20 @@ class BenchmarkDoctor(object):
for correction in [(setup / i) for i in [1, 2]]
] for result in i_series])
if 2500 < runtime:
log = (BenchmarkDoctor.log_runtime.warning if runtime < 500000 else
threshold = 1000
if threshold < runtime:
log = (BenchmarkDoctor.log_runtime.warning if runtime < 10000 else
BenchmarkDoctor.log_runtime.error)
caveat = '' if setup == 0 else ' (excluding the setup overhead)'
log("'%s' execution took at least %d μs%s.", name, runtime, caveat)
factor = int(pow(2, math.ceil(math.log(runtime / 2500.0, 2))))
def factor(base): # suitable divisor that's an integer power of base
return int(pow(base, math.ceil(
math.log(runtime / float(threshold), base))))
BenchmarkDoctor.log_runtime.info(
"Decrease the workload of '%s' by a factor of %d, "
"to be less than 2500 μs.", name, factor)
"Decrease the workload of '%s' by a factor of %d (%d), to be "
"less than %d μs.", name, factor(2), factor(10), threshold)
@staticmethod
def _setup_overhead(measurements):
@@ -372,6 +384,17 @@ class BenchmarkDoctor(object):
'Move initialization of benchmark data to the `setUpFunction` '
'registered in `BenchmarkInfo`.')
@staticmethod
def _reasonable_setup_time(measurements):
setup = min([result.setup
for result in BenchmarkDoctor._select(measurements)])
if 200000 < setup: # 200 ms
BenchmarkDoctor.log_runtime.error(
"'%s' setup took at least %d μs.",
measurements['name'], setup)
BenchmarkDoctor.log_runtime.info(
'The `setUpFunction` should take no more than 200 ms.')
@staticmethod
def _constant_memory_use(measurements):
select = BenchmarkDoctor._select
@@ -603,7 +626,7 @@ def parse_args(args):
help='Run benchmarks and output results to stdout',
parents=[shared_benchmarks_parser])
run_parser.add_argument(
'-i', '--iterations',
'-i', '--independent-samples',
help='number of times to run each test (default: 1)',
type=positive_int, default=1)
run_parser.add_argument(


@@ -51,6 +51,14 @@ class Sample(namedtuple('Sample', 'i num_iters runtime')):
return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self)
class Yield(namedtuple('Yield', 'before_sample after')):
u"""Meta-measurement of when the Benchmark_X voluntarily yielded process.
`before_sample`: index of measurement taken just after returning from yield
`after`: time elapsed since the previous yield in microseconds (μs)
"""
class PerformanceTestSamples(object):
"""Collection of runtime samples from the benchmark execution.
@@ -69,7 +77,7 @@ class PerformanceTestSamples(object):
self.add(sample)
def __str__(self):
"""Text summary of benchmark statisctics."""
"""Text summary of benchmark statistics."""
return (
'{0.name!s} n={0.count!r} '
'Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} '
@@ -211,23 +219,50 @@ class PerformanceTestResult(object):
Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize
or Benchmark_Driver).
It depends on the log format emitted by the test driver in the form:
It supports 2 log formats emitted by the test driver. Legacy format with
statistics for normal distribution (MEAN, SD):
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B)
The last column, MAX_RSS, is emitted only for runs instrumented by the
Benchmark_Driver to measure rough memory use during the execution of the
benchmark.
And the new quantiles format with a variable number of columns:
#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
#,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
The number of columns between MIN and MAX depends on the test driver's
`--quantile` parameter. In both cases, the last column, MAX_RSS, is optional.
"""
def __init__(self, csv_row):
"""Initialize from a row with 8 or 9 columns with benchmark summary.
def __init__(self, csv_row, quantiles=False, memory=False, delta=False):
"""Initialize from a row of multiple columns with benchmark summary.
The row is an iterable, such as a row provided by the CSV parser.
"""
self.test_num = csv_row[0] # Ordinal number of the test
self.name = csv_row[1] # Name of the performance test
self.num_samples = ( # Number of measurement samples taken
int(csv_row[2]))
self.num_samples = int(csv_row[2]) # Number of measurements taken
if quantiles: # Variable number of columns representing quantiles
runtimes = csv_row[3:-1] if memory else csv_row[3:]
if delta:
runtimes = [int(x) if x else 0 for x in runtimes]
runtimes = reduce(lambda l, x: l.append(l[-1] + x) or # running
l if l else [x], runtimes, None) # total
num_values = len(runtimes)
if self.num_samples < num_values: # remove repeated samples
quantile = num_values - 1
qs = [float(i) / float(quantile) for i in range(0, num_values)]
indices = [max(0, int(ceil(self.num_samples * float(q))) - 1)
for q in qs]
runtimes = [runtimes[indices.index(i)]
for i in range(0, self.num_samples)]
self.samples = PerformanceTestSamples(
self.name,
[Sample(None, None, int(runtime)) for runtime in runtimes])
self.samples.exclude_outliers(top_only=True)
sams = self.samples
self.min, self.max, self.median, self.mean, self.sd = \
sams.min, sams.max, sams.median, sams.mean, sams.sd
self.max_rss = ( # Maximum Resident Set Size (B)
int(csv_row[-1]) if memory else None)
else: # Legacy format with statistics for normal distribution.
self.min = int(csv_row[3]) # Minimum runtime (μs)
self.max = int(csv_row[4]) # Maximum runtime (μs)
self.mean = float(csv_row[5]) # Mean (average) runtime (μs)
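
A worked example of the quantile decoding above, using the delta-encoded 2-quantile row from the parser tests later in this diff (a standalone restatement of the same arithmetic, not the class itself):

```python
import math

# Row '202,DropWhileArray,2,265,,22' with quantiles=True, delta=True
# (columns: #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX).
row = ['202', 'DropWhileArray', '2', '265', '', '22']
num_samples = int(row[2])                        # 2
deltas = [int(x) if x else 0 for x in row[3:]]   # [265, 0, 22]

# Undo delta compression with a running total.
runtimes = []
for d in deltas:
    runtimes.append((runtimes[-1] if runtimes else 0) + d)
assert runtimes == [265, 265, 287]

# 2 samples produced 3 quantile values, so repeated estimates are dropped.
if num_samples < len(runtimes):
    q = len(runtimes) - 1
    indices = [max(0, int(math.ceil(num_samples * (i / float(q)))) - 1)
               for i in range(len(runtimes))]    # [0, 0, 1]
    runtimes = [runtimes[indices.index(i)] for i in range(num_samples)]
assert runtimes == [265, 287]  # min=265, median=265, max=287 in the tests below
```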
@@ -236,6 +271,8 @@ class PerformanceTestResult(object):
self.max_rss = ( # Maximum Resident Set Size (B)
int(csv_row[8]) if len(csv_row) > 8 else None)
self.samples = None
self.yields = None
self.setup = None
def __repr__(self):
"""Short summary for debugging purposes."""
@@ -253,6 +290,7 @@ class PerformanceTestResult(object):
The use case here is comparing test results parsed from concatenated
log files from multiple runs of benchmark driver.
"""
# Statistics
if self.samples and r.samples:
map(self.samples.add, r.samples.samples)
sams = self.samples
@@ -266,8 +304,14 @@ class PerformanceTestResult(object):
(self.mean * self.num_samples) + (r.mean * r.num_samples)
) / float(self.num_samples + r.num_samples)
self.num_samples += r.num_samples
self.max_rss = min(self.max_rss, r.max_rss)
self.median, self.sd = 0, 0
self.median, self.sd = None, None
# Metadata
def minimum(a, b): # work around None being less than everything
return (min(filter(lambda x: x is not None, [a, b])) if any([a, b])
else None)
self.max_rss = minimum(self.max_rss, r.max_rss)
self.setup = minimum(self.setup, r.setup)
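
The None-aware `minimum` helper keeps metadata from merged runs without letting a missing value win. A minimal restatement with the values used in the merge test later in this diff:

```python
def minimum(a, b):
    # Work around None comparing as smaller than every number.
    return (min(x for x in (a, b) if x is not None)
            if any([a, b]) else None)

assert minimum(None, None) is None     # neither run measured setup
assert minimum(None, 9) == 9           # keep the only known setup time
assert minimum(9, 7) == 7              # keep the smaller of the two
assert minimum(10510336, 10502144) == 10502144   # smaller MAX_RSS wins
```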
class ResultComparison(object):
@@ -307,27 +351,29 @@ class LogParser(object):
def __init__(self):
"""Create instance of `LogParser`."""
self.results = []
self.quantiles, self.delta, self.memory = False, False, False
self._reset()
def _reset(self):
"""Reset parser to the default state for reading a new result."""
self.samples, self.num_iters = [], 1
self.max_rss, self.mem_pages = None, None
self.samples, self.yields, self.num_iters = [], [], 1
self.setup, self.max_rss, self.mem_pages = None, None, None
self.voluntary_cs, self.involuntary_cs = None, None
# Parse lines like this
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)
results_re = re.compile(r'( *\d+[, \t]*[\w.]+[, \t]*' +
r'[, \t]*'.join([r'[\d.]+'] * 6) +
r'[, \t]*[\d.]*)') # optional MAX_RSS(B)
results_re = re.compile(
r'( *\d+[, \t]+[\w.]+[, \t]+' + # #,TEST
r'[, \t]+'.join([r'\d+'] * 2) + # at least 2...
r'(?:[, \t]+\d*)*)') # ...or more numeric columns
def _append_result(self, result):
columns = result.split(',')
if len(columns) < 8:
columns = result.split()
r = PerformanceTestResult(columns)
if self.max_rss:
r.max_rss = self.max_rss
columns = result.split(',') if ',' in result else result.split()
r = PerformanceTestResult(
columns, quantiles=self.quantiles, memory=self.memory,
delta=self.delta)
r.setup = self.setup
r.max_rss = r.max_rss or self.max_rss
r.mem_pages = self.mem_pages
r.voluntary_cs = self.voluntary_cs
r.involuntary_cs = self.involuntary_cs
@@ -335,12 +381,18 @@ class LogParser(object):
r.samples = PerformanceTestSamples(r.name, self.samples)
r.samples.exclude_outliers()
self.results.append(r)
r.yields = self.yields or None
self._reset()
def _store_memory_stats(self, max_rss, mem_pages):
self.max_rss = int(max_rss)
self.mem_pages = int(mem_pages)
def _configure_format(self, header):
self.quantiles = 'MEAN' not in header
self.memory = 'MAX_RSS' in header
self.delta = '𝚫' in header
# Regular expression and action to take when it matches the parsed line
state_actions = {
results_re: _append_result,
@@ -355,6 +407,17 @@ class LogParser(object):
self.samples.append(
Sample(int(i), int(self.num_iters), int(runtime)))),
re.compile(r'\s+SetUp (\d+)'):
(lambda self, setup: setattr(self, 'setup', int(setup))),
re.compile(r'\s+Yielding after ~(\d+) μs'):
(lambda self, since_last_yield:
self.yields.append(
Yield(len(self.samples), int(since_last_yield)))),
re.compile(r'( *#[, \t]+TEST[, \t]+SAMPLES[, \t]+MIN.*)'):
_configure_format,
# Environmental statistics: memory usage and context switches
re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'):
_store_memory_stats,
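
Putting the header detection and delta decoding together, `LogParser.results_from_string` can now ingest a quantile report directly. A short usage sketch with the delta-encoded report from the parser tests below (assuming the script is importable as `compare_perf_tests`, as its unit tests do):

```python
from compare_perf_tests import LogParser

# Parsing a delta-encoded 2-quantile (median) report.
report = ('#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n'
          '0,B,2,101,,1')
r = LogParser.results_from_string(report)['B']
# The header sets quantiles=True, delta=True, memory=False; decoding gives:
assert (r.num_samples, r.min, r.median, r.max) == (2, 101, 101, 102)
```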


@@ -96,15 +96,16 @@ class Test_parse_args(unittest.TestCase):
"(choose from 'O', 'Onone', 'Osize')"],
err.getvalue())
def test_iterations(self):
self.assertEquals(parse_args(['run']).iterations, 1)
self.assertEquals(parse_args(['run', '-i', '3']).iterations, 3)
def test_independent_samples(self):
self.assertEquals(parse_args(['run']).independent_samples, 1)
self.assertEquals(parse_args(['run', '-i', '3']).independent_samples,
3)
with captured_output() as (out, err):
self.assertRaises(SystemExit,
parse_args, ['run', '-i', '-3'])
self.assert_contains(
['error:',
"argument -i/--iterations: invalid positive_int value: '-3'"],
['error:', "argument -i/--independent-samples: " +
"invalid positive_int value: '-3'"],
err.getvalue())
def test_output_dir(self):
@@ -278,30 +279,40 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
self.subprocess_mock.assert_called_with(
('/benchmarks/Benchmark_O', 'b', '--memory'))
def test_report_quantiles(self):
"""Use delta compression for quantile reports."""
self.driver.run('b', quantile=4)
self.subprocess_mock.assert_called_with(
('/benchmarks/Benchmark_O', 'b', '--quantile=4', '--delta'))
def test_run_benchmark_independent_samples(self):
self.driver.args.iterations = 3
"""Extract up to 20 measurements from an independent run."""
self.driver.args.independent_samples = 3
r = self.driver.run_independent_samples('b1')
self.assertEquals(self.subprocess_mock.calls.count(
('/benchmarks/Benchmark_O', 'b1', '--memory')), 3)
('/benchmarks/Benchmark_O', 'b1', '--num-iters=1', '--memory',
'--quantile=20', '--delta')), 3)
self.assertEquals(r.num_samples, 3) # results are merged
def test_run_and_log(self):
def mock_run(test):
self.assertEquals(test, 'b1')
return PerformanceTestResult(
'3,b1,1,123,123,123,0,123,888'.split(','))
'3,b1,5,101,1,1,1,1,888'.split(','),
quantiles=True, delta=True, memory=True)
driver = BenchmarkDriver(tests=['b1'], args=Stub(output_dir=None))
driver.run_independent_samples = mock_run # patching
with captured_output() as (out, _):
log = driver.run_and_log()
csv_log = '3,b1,1,123,123,123,0,123,888\n'
header = '#,TEST,SAMPLES,MIN(μs),Q1(μs),MEDIAN(μs),Q3(μs),MAX(μs),' +\
'MAX_RSS(B)\n'
csv_log = '3,b1,5,101,102,103,104,105,888\n'
self.assertEquals(log, None)
self.assertEquals(
out.getvalue(),
'#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),' +
'MAX_RSS(B)\n' +
header +
csv_log +
'\n' +
'Total performance tests executed: 1\n')
@@ -309,13 +320,13 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
with captured_output() as (out, _):
log = driver.run_and_log(csv_console=False)
self.assertEquals(log, csv_log)
self.assertEquals(log, header + csv_log)
self.assertEquals(
out.getvalue(),
' # TEST SAMPLES MIN(μs) MAX(μs)' +
' MEAN(μs) SD(μs) MEDIAN(μs) MAX_RSS(B)\n' +
' 3 b1 1 123 123' +
' 123 0 123 888\n' +
' # TEST SAMPLES MIN(μs) Q1(μs)' +
' MEDIAN(μs) Q3(μs) MAX(μs) MAX_RSS(B)\n' +
' 3 b1 5 101 102' +
' 103 104 105 888\n' +
'\n' +
'Total performance tests executed: 1\n')
@@ -410,9 +421,9 @@ class TestLoggingReportFormatter(unittest.TestCase):
self.assertEquals(f.format(lr), 'INFO Hi!')
def _PTR(min=700, mem_pages=1000):
def _PTR(min=700, mem_pages=1000, setup=None):
"""Create PerformanceTestResult Stub."""
return Stub(min=min, mem_pages=mem_pages)
return Stub(min=min, mem_pages=mem_pages, setup=setup)
def _run(test, num_samples=None, num_iters=None, verbose=None,
@@ -537,13 +548,18 @@ class TestBenchmarkDoctor(unittest.TestCase):
self.logs['info'])
def test_benchmark_runtime_range(self):
"""Optimized benchmark should run in less then 2500 μs.
"""Optimized benchmark should run in less then 1000 μs.
With runtimes less than 2500 μs there is better than 1:4 chance of
being interrupted in the middle of measurement due to elapsed 10 ms
quantum used by macos scheduler.
Even on a calm machine, a benchmark with a runtime of 2500 μs has a 1:4
chance of being interrupted in the middle of a measurement due to the
elapsed 10 ms quantum used by the macOS scheduler. The Linux scheduler's
quantum is 6 ms. Having the driver yield the process before the 10 ms
quantum elapses helped a lot, but benchmarks with runtimes under 1 ms
usually exhibit a strong mode, which is best for accurate performance
characterization.
To minimize the number of involuntary context switches that corrupt our
measurements, we should strive to stay in the microbenchmark range.
Warn about longer runtime. Runtimes over half a second are an error.
Warn about longer runtimes. Runtimes over 10 ms are an error.
"""
def measurements(name, runtime):
return {'name': name,
@@ -553,7 +569,7 @@ class TestBenchmarkDoctor(unittest.TestCase):
with captured_output() as (out, _):
doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([]))
doctor.analyze(measurements('Cheetah', 200))
doctor.analyze(measurements('Hare', 2501))
doctor.analyze(measurements('Hare', 1001))
doctor.analyze(measurements('Tortoise', 500000))
doctor.analyze({'name': 'OverheadTurtle',
'OverheadTurtle O i1a': _PTR(min=800000),
@@ -562,17 +578,17 @@ class TestBenchmarkDoctor(unittest.TestCase):
self.assertIn('runtime: ', output)
self.assertNotIn('Cheetah', output)
self.assert_contains(["'Hare' execution took at least 2501 μs."],
self.assert_contains(["'Hare' execution took at least 1001 μs."],
self.logs['warning'])
self.assert_contains(
["Decrease the workload of 'Hare' by a factor of 2, "
"to be less than 2500 μs."], self.logs['info'])
["Decrease the workload of 'Hare' by a factor of 2 (10), "
"to be less than 1000 μs."], self.logs['info'])
self.assert_contains(
["'Tortoise' execution took at least 500000 μs."],
self.logs['error'])
self.assert_contains(
["Decrease the workload of 'Tortoise' by a factor of 256, "
"to be less than 2500 μs."], self.logs['info'])
["Decrease the workload of 'Tortoise' by a factor of 512 (1000), "
"to be less than 1000 μs."], self.logs['info'])
self.assert_contains(
["'OverheadTurtle' execution took at least 600000 μs"
" (excluding the setup overhead)."],
@@ -605,6 +621,29 @@ class TestBenchmarkDoctor(unittest.TestCase):
["Move initialization of benchmark data to the `setUpFunction` "
"registered in `BenchmarkInfo`."], self.logs['info'])
def test_benchmark_setup_takes_reasonable_time(self):
"""Setup < 200 ms (20% extra on top of the typical 1 s measurement)"""
with captured_output() as (out, _):
doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([]))
doctor.analyze({
'name': 'NormalSetup',
'NormalSetup O i1a': _PTR(setup=199999),
'NormalSetup O i2a': _PTR(setup=200001)})
doctor.analyze({
'name': 'LongSetup',
'LongSetup O i1a': _PTR(setup=200001),
'LongSetup O i2a': _PTR(setup=200002)})
output = out.getvalue()
self.assertIn('runtime: ', output)
self.assertNotIn('NormalSetup', output)
self.assert_contains(
["'LongSetup' setup took at least 200001 μs."],
self.logs['error'])
self.assert_contains(
["The `setUpFunction` should take no more than 200 ms."],
self.logs['info'])
def test_benchmark_has_constant_memory_use(self):
"""Benchmark's memory footprint must not vary with num-iters."""
with captured_output() as (out, _):


@@ -204,6 +204,110 @@ class TestPerformanceTestResult(unittest.TestCase):
r = PerformanceTestResult(log_line.split(','))
self.assertEquals(r.max_rss, 10510336)
def test_init_quantiles(self):
# #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
log = '1,Ackermann,3,54383,54512,54601'
r = PerformanceTestResult(log.split(','), quantiles=True)
self.assertEquals(r.test_num, '1')
self.assertEquals(r.name, 'Ackermann')
self.assertEquals((r.num_samples, r.min, r.median, r.max),
(3, 54383, 54512, 54601))
self.assertAlmostEquals(r.mean, 54498.67, places=2)
self.assertAlmostEquals(r.sd, 109.61, places=2)
self.assertEquals(r.samples.count, 3)
self.assertEquals(r.samples.num_samples, 3)
self.assertEquals([s.runtime for s in r.samples.all_samples],
[54383, 54512, 54601])
# #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)
log = '1,Ackermann,3,54529,54760,55807,266240'
r = PerformanceTestResult(log.split(','), quantiles=True, memory=True)
self.assertEquals((r.samples.count, r.max_rss), (3, 266240))
# #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs)
log = '1,Ackermann,5,54570,54593,54644,57212,58304'
r = PerformanceTestResult(log.split(','), quantiles=True, memory=False)
self.assertEquals((r.num_samples, r.min, r.median, r.max),
(5, 54570, 54644, 58304))
self.assertEquals((r.samples.q1, r.samples.q3), (54593, 57212))
self.assertEquals(r.samples.count, 5)
# #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
log = '1,Ackermann,5,54686,54731,54774,55030,63466,270336'
r = PerformanceTestResult(log.split(','), quantiles=True, memory=True)
self.assertEquals(r.samples.num_samples, 5)
self.assertEquals(r.samples.count, 4) # outlier was excluded
self.assertEquals(r.max_rss, 270336)
def test_init_delta_quantiles(self):
# #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX
# 2-quantile from 2 samples with a repeated min; when delta encoded,
# the difference is 0, which is omitted -- only the separator remains
log = '202,DropWhileArray,2,265,,22'
r = PerformanceTestResult(log.split(','), quantiles=True, delta=True)
self.assertEquals((r.num_samples, r.min, r.median, r.max),
(2, 265, 265, 287))
self.assertEquals(r.samples.count, 2)
self.assertEquals(r.samples.num_samples, 2)
def test_init_oversampled_quantiles(self):
"""When num_samples is < quantile + 1, some of the measurements are
repeated in the report summary. Samples should contain only true
values, discarding the repeated artifacts from quantile estimation.
The test string is slightly massaged output of the following R script:
subsample <- function(x, q) {
quantile(1:x, probs=((0:(q-1))/(q-1)), type=1)}
tbl <- function(s) t(sapply(1:s, function(x) {
qs <- subsample(x, s); c(qs[1], diff(qs)) }))
sapply(c(3, 5, 11, 21), tbl)
"""
def validatePTR(deq): # construct from delta encoded quantiles string
deq = deq.split(',')
num_samples = deq.count('1')
r = PerformanceTestResult(['0', 'B', str(num_samples)] + deq,
quantiles=True, delta=True)
self.assertEquals(r.samples.num_samples, num_samples)
self.assertEquals([s.runtime for s in r.samples.all_samples],
range(1, num_samples + 1))
delta_encoded_quantiles = """
1,,
1,,1
1,,,,
1,,,1,
1,,1,1,
1,,1,1,1
1,,,,,,,,,,
1,,,,,,1,,,,
1,,,,1,,,1,,,
1,,,1,,,1,,1,,
1,,,1,,1,,1,,1,
1,,1,,1,,1,1,,1,
1,,1,1,,1,1,,1,1,
1,,1,1,1,,1,1,1,1,
1,,1,1,1,1,1,1,1,1,
1,,1,1,1,1,1,1,1,1,1
1,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,1,,,,,,,,,
1,,,,,,,1,,,,,,,1,,,,,,
1,,,,,,1,,,,,1,,,,,1,,,,
1,,,,,1,,,,1,,,,1,,,,1,,,
1,,,,1,,,1,,,,1,,,1,,,1,,,
1,,,1,,,1,,,1,,,1,,,1,,,1,,
1,,,1,,,1,,1,,,1,,1,,,1,,1,,
1,,,1,,1,,1,,1,,,1,,1,,1,,1,,
1,,,1,,1,,1,,1,,1,,1,,1,,1,,1,
1,,1,,1,,1,,1,,1,1,,1,,1,,1,,1,
1,,1,,1,,1,1,,1,,1,1,,1,,1,1,,1,
1,,1,,1,1,,1,1,,1,1,,1,1,,1,1,,1,
1,,1,1,,1,1,,1,1,,1,1,1,,1,1,,1,1,
1,,1,1,,1,1,1,,1,1,1,,1,1,1,,1,1,1,
1,,1,1,1,,1,1,1,1,,1,1,1,1,,1,1,1,1,
1,,1,1,1,1,1,,1,1,1,1,1,1,,1,1,1,1,1,
1,,1,1,1,1,1,1,1,1,,1,1,1,1,1,1,1,1,1,
1,,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1"""
map(validatePTR, delta_encoded_quantiles.split('\n')[1:])
def test_repr(self):
log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884'
r = PerformanceTestResult(log_line.split(','))
@@ -214,29 +318,32 @@ class TestPerformanceTestResult(unittest.TestCase):
)
def test_merge(self):
tests = """1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336
tests = """
1,AngryPhonebook,1,12045,12045,12045,0,12045
1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336
1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144
1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split('\n')
1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split('\n')[1:]
results = map(PerformanceTestResult,
[line.split(',') for line in tests])
results[2].setup = 9
results[3].setup = 7
def as_tuple(r):
return (r.num_samples, r.min, r.max, round(r.mean, 2),
r.sd, r.median, r.max_rss)
r.sd, r.median, r.max_rss, r.setup)
r = results[0]
self.assertEquals(as_tuple(r),
(1, 12045, 12045, 12045, 0, 12045, 10510336))
(1, 12045, 12045, 12045, 0, 12045, None, None))
r.merge(results[1])
self.assertEquals(as_tuple(r), # drops SD and median
(2, 12045, 12325, 12185, 0, 0, 10510336))
self.assertEquals(as_tuple(r), # drops SD and median, +max_rss
(2, 12045, 12325, 12185, None, None, 10510336, None))
r.merge(results[2])
self.assertEquals(as_tuple(r), # picks smaller of the MAX_RSS
(3, 11616, 12325, 11995.33, 0, 0, 10502144))
self.assertEquals(as_tuple(r), # picks smaller of the MAX_RSS, +setup
(3, 11616, 12325, 11995.33, None, None, 10502144, 9))
r.merge(results[3])
self.assertEquals(as_tuple(r),
(4, 11616, 12325, 12064, 0, 0, 10498048))
self.assertEquals(as_tuple(r), # picks smaller of the setup values
(4, 11616, 12325, 12064, None, None, 10498048, 7))
class TestResultComparison(unittest.TestCase):
@@ -336,7 +443,7 @@ class OldAndNewLog(unittest.TestCase):
class TestLogParser(unittest.TestCase):
def test_parse_results_csv(self):
"""Ignores header row, empty lines and Totals row."""
"""Ignores uknown lines, extracts data from supported formats."""
log = """#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us)
34,BitCount,20,3,4,4,0,4
@@ -369,6 +476,41 @@ Total performance tests executed: 1
self.assertEquals(r.name, 'Array2D')
self.assertEquals(r.max_rss, 20915200)
def test_parse_quantiles(self):
"""Gathers samples from reported quantiles. Handles optional memory."""
r = LogParser.results_from_string(
"""#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
1,Ackermann,3,54383,54512,54601""")['Ackermann']
self.assertEquals([s.runtime for s in r.samples.all_samples],
[54383, 54512, 54601])
r = LogParser.results_from_string(
"""#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)
1,Ackermann,3,54529,54760,55807,266240""")['Ackermann']
self.assertEquals([s.runtime for s in r.samples.all_samples],
[54529, 54760, 55807])
self.assertEquals(r.max_rss, 266240)
def test_parse_delta_quantiles(self):
r = LogParser.results_from_string( # 2-quantile aka. median
'#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,1,101,,')['B']
self.assertEquals(
(r.num_samples, r.min, r.median, r.max, r.samples.count),
(1, 101, 101, 101, 1))
r = LogParser.results_from_string(
'#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,2,101,,1')['B']
self.assertEquals(
(r.num_samples, r.min, r.median, r.max, r.samples.count),
(2, 101, 101, 102, 2))
r = LogParser.results_from_string( # 20-quantiles aka. ventiles
'#,TEST,SAMPLES,MIN(μs),𝚫V1,𝚫V2,𝚫V3,𝚫V4,𝚫V5,𝚫V6,𝚫V7,𝚫V8,' +
'𝚫V9,𝚫VA,𝚫VB,𝚫VC,𝚫VD,𝚫VE,𝚫VF,𝚫VG,𝚫VH,𝚫VI,𝚫VJ,𝚫MAX\n' +
'202,DropWhileArray,200,214,,,,,,,,,,,,1,,,,,,2,16,464'
)['DropWhileArray']
self.assertEquals(
(r.num_samples, r.min, r.max, r.samples.count),
# last 3 ventiles were outliers and were excluded from the sample
(200, 214, 215, 18))
def test_parse_results_verbose(self):
"""Parse multiple performance test results with 2 sample formats:
single line for N = 1; two lines for N > 1.
@@ -383,8 +525,11 @@ Running AngryPhonebook for 3 samples.
Sample 2,11467
1,AngryPhonebook,3,11467,13898,12392,1315,11812
Running Array2D for 3 samples.
SetUp 14444
Sample 0,369900
Yielding after ~369918 μs
Sample 1,381039
Yielding after ~381039 μs
Sample 2,371043
3,Array2D,3,369900,381039,373994,6127,371043
@@ -400,15 +545,21 @@ Totals,2"""
self.assertEquals(r.num_samples, r.samples.num_samples)
self.assertEquals(results[0].samples.all_samples,
[(0, 78, 11812), (1, 90, 13898), (2, 90, 11467)])
self.assertEquals(r.yields, None)
r = results[1]
self.assertEquals(
(r.name, r.min, r.max, int(r.mean), int(r.sd), r.median),
('Array2D', 369900, 381039, 373994, 6127, 371043)
)
self.assertEquals(r.setup, 14444)
self.assertEquals(r.num_samples, r.samples.num_samples)
self.assertEquals(results[1].samples.all_samples,
[(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)])
yielded = r.yields[0]
self.assertEquals(yielded.before_sample, 1)
self.assertEquals(yielded.after, 369918)
self.assertEquals(r.yields, [(1, 369918), (2, 381039)])
def test_parse_environment_verbose(self):
"""Parse stats about environment in verbose mode."""


@@ -74,7 +74,7 @@ struct TestConfig {
let numIters: Int?
/// The number of samples we should take of each test.
let numSamples: Int
let numSamples: Int?
/// Quantiles to report in results.
let quantile: Int?
@@ -130,7 +130,9 @@ struct TestConfig {
// Configure the command line argument parser
let p = ArgumentParser(into: PartialTestConfig())
p.addArgument("--num-samples", \.numSamples,
help: "number of samples to take per benchmark; default: 1",
help: "number of samples to take per benchmark;\n" +
"default: 1 or auto-scaled to measure for\n" +
"`sample-time` if num-iters is also specified\n",
parser: { UInt($0) })
p.addArgument("--num-iters", \.numIters,
help: "number of iterations averaged in the sample;\n" +
@@ -174,7 +176,7 @@ struct TestConfig {
delim = c.delim ?? ","
sampleTime = c.sampleTime ?? 1.0
numIters = c.numIters.map { Int($0) }
numSamples = Int(c.numSamples ?? 1)
numSamples = c.numSamples.map { Int($0) }
quantile = c.quantile.map { Int($0) }
delta = c.delta ?? false
verbose = c.verbose ?? false
@@ -200,7 +202,7 @@ struct TestConfig {
let testList = tests.map({ $0.1.name }).joined(separator: ", ")
print("""
--- CONFIG ---
NumSamples: \(numSamples)
NumSamples: \(numSamples ?? 0)
Verbose: \(verbose)
LogMemory: \(logMemory)
SampleTime: \(sampleTime)
@@ -470,10 +472,9 @@ final class TestRunner {
logVerbose("Skipping unsupported benchmark \(test.name)!")
return nil
}
logVerbose("Running \(test.name) for \(c.numSamples) samples.")
logVerbose("Running \(test.name)")
var samples: [Int] = []
samples.reserveCapacity(c.numSamples)
func addSample(_ time: Int) {
logVerbose(" Sample \(samples.count),\(time)")
@@ -481,7 +482,12 @@ final class TestRunner {
}
resetMeasurements()
test.setUpFunction?()
if let setUp = test.setUpFunction {
setUp()
stopMeasurement()
logVerbose(" SetUp \(lastSampleTime.microseconds)")
resetMeasurements()
}
// Determine number of iterations for testFn to run for desired time.
func iterationsPerSampleTime() -> (numIters: Int, oneIter: Int) {
@@ -494,17 +500,26 @@ final class TestRunner {
}
}
let numIters = min( // Cap to prevent overflow on 32-bit systems when scaled
Int.max / 10_000, // by the inner loop multiplier inside the `testFn`.
c.numIters ?? {
// Determine the scale of measurements. Re-use the calibration result if
// it is just one measurement.
func calibrateMeasurements() -> Int {
let (numIters, oneIter) = iterationsPerSampleTime()
if numIters == 1 { addSample(oneIter) }
else { resetMeasurements() } // for accurate yielding reports
return numIters
}())
}
let numIters = min( // Cap to prevent overflow on 32-bit systems when scaled
Int.max / 10_000, // by the inner loop multiplier inside the `testFn`.
c.numIters ?? calibrateMeasurements())
let numSamples = c.numSamples ?? min(2000, // Cap the number of samples
c.numIters == nil ? 1 : calibrateMeasurements())
samples.reserveCapacity(numSamples)
logVerbose(" Collecting \(numSamples) samples.")
logVerbose(" Measuring with scale \(numIters).")
for _ in samples.count..<c.numSamples {
for _ in samples.count..<numSamples {
addSample(measure(test.name, fn: testFn, numIters: numIters))
}
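
The interplay between `--num-iters`, `--num-samples`, and auto-scaling can be summarized in a small model (a simplified Python restatement of the Swift control flow above, not the actual implementation; the overflow cap and the reuse of the calibration sample are omitted):

```python
def plan_measurement(num_samples=None, num_iters=None, calibrated=512):
    """Simplified model of how TestRunner picks scale and sample count.

    `calibrated` stands in for iterationsPerSampleTime(): the number of
    iterations that fit into one `sample-time` for this benchmark.
    """
    iters = num_iters if num_iters is not None else calibrated
    if num_samples is not None:
        samples = num_samples
    else:
        # With an explicit --num-iters, take as many samples as fit into
        # sample-time (capped at 2000); otherwise take a single sample.
        samples = min(2000, calibrated) if num_iters is not None else 1
    return iters, samples

assert plan_measurement() == (512, 1)             # default: 1 auto-scaled sample
assert plan_measurement(num_iters=1) == (1, 512)  # Benchmark_Driver's new mode
assert plan_measurement(num_samples=20) == (512, 20)
assert plan_measurement(num_samples=3, num_iters=2) == (2, 3)
```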


@@ -110,7 +110,7 @@ RUN: --check-prefix LOGHEADER \
RUN: --check-prefix LOGBENCH
LOGHEADER-LABEL: #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)
LOGBENCH: {{[0-9]+}},
NUMITERS1: AngryPhonebook,1
NUMITERS1: AngryPhonebook,{{[0-9]+}}
NUMITERS1-NOT: 0,0,0,0,0
LOGBENCH-SAME: ,{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}}
````
@@ -153,7 +153,8 @@ CONFIG: NumSamples: 2
CONFIG: Tests Filter: ["1", "Ackermann", "1", "AngryPhonebook"]
CONFIG: Tests to run: Ackermann, AngryPhonebook
LOGFORMAT: #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX,MAX_RSS(B)
LOGVERBOSE-LABEL: Running Ackermann for 2 samples.
LOGVERBOSE-LABEL: Running Ackermann
LOGVERBOSE: Collecting 2 samples.
LOGVERBOSE: Measuring with scale {{[0-9]+}}.
LOGVERBOSE: Sample 0,{{[0-9]+}}
LOGVERBOSE: Sample 1,{{[0-9]+}}
@@ -163,7 +164,8 @@ MEASUREENV: VCS {{[0-9]+}} - {{[0-9]+}} = {{[0-9]+}}
RUNJUSTONCE-LABEL: 1,Ackermann
RUNJUSTONCE-NOT: 1,Ackermann
LOGFORMAT: ,{{[0-9]+}},{{[0-9]+}},,{{[0-9]+}},{{[0-9]+}}
LOGVERBOSE-LABEL: Running AngryPhonebook for 2 samples.
LOGVERBOSE-LABEL: Running AngryPhonebook
LOGVERBOSE: Collecting 2 samples.
````
Verify the specified delimiter is used when logging to console. The non-verbose