[benchmark] Introduced PerformanceTestSamples

* Moved the functionality to compute median, standard deviation and related statistics from `PerformanceTestResult` into `PerformanceTestSamples`.
* Fixed wrong unit in comments
Pavol Vaskovic
2018-08-16 13:44:02 +02:00
parent bea35cb7c1
commit 91077e3289
2 changed files with 235 additions and 69 deletions
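For orientation, a minimal sketch of how the refactored API fits together (class and property names are taken from the diff below; the import path matches the test file's imports, and the sample values are invented):

```python
from compare_perf_tests import PerformanceTestSamples, Sample

# Collect raw measurements; statistics are recomputed on every add().
samples = PerformanceTestSamples(
    'AngryPhonebook',
    [Sample(0, 1, 11812), Sample(1, 1, 13898)])
samples.add(Sample(2, 1, 11467))
stats = (samples.count, samples.min, samples.max,
         samples.median, samples.mean, samples.sd, samples.cv)
```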

benchmark/scripts/compare_perf_tests.py

@@ -23,6 +23,90 @@ import argparse
import re
import sys
from math import sqrt
from collections import namedtuple
class Sample(namedtuple('Sample', 'i num_iters runtime')):
u"""Single benchmark measurement.
Initialized with:
`i`: ordinal number of the sample taken,
`num_iters`: number of iterations used to compute it,
`runtime`: in microseconds (μs).
"""
def __repr__(self):
"""Shorter Sample formating for debugging purposes."""
return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self)
class PerformanceTestSamples(object):
"""Collection of runtime samples from the benchmark execution.
Computes the sample population statistics.
"""
def __init__(self, name, samples=None):
"""Initialized with benchmark name and optional list of Samples."""
self.name = name # Name of the performance test
self.samples = []
self.mean = 0.0
self.S_runtime = 0.0 # For computing running variance
for sample in samples or []:
self.add(sample)
def add(self, sample):
"""Add sample to collection and recompute statistics."""
assert isinstance(sample, Sample)
state = (self.count, self.mean, self.S_runtime)
state = self.running_mean_variance(state, sample.runtime)
_, self.mean, self.S_runtime = state
self.samples.append(sample)
self.samples.sort(key=lambda s: s.runtime)
@property
def count(self):
"""Number of samples used to compute the statistics."""
return len(self.samples)
@property
def min(self):
"""Minimum sampled value."""
return self.samples[0].runtime
@property
def max(self):
"""Maximum sampled value."""
return self.samples[-1].runtime
@property
def median(self):
"""Median sampled value."""
return self.samples[self.count / 2].runtime
@property
def sd(self):
u"""Standard Deviation (μs)."""
return (0 if self.count < 2 else
sqrt(self.S_runtime / (self.count - 1)))
@staticmethod
def running_mean_variance((k, M_, S_), x):
"""Compute running variance, B. P. Welford's method.
See Knuth TAOCP vol 2, 3rd edition, page 232, or
https://www.johndcook.com/blog/standard_deviation/
M is mean, Standard Deviation is defined as sqrt(S/(k-1))
"""
k = float(k + 1)
M = M_ + (x - M_) / k
S = S_ + (x - M_) * (x - M)
return (k, M, S)
@property
def cv(self):
"""Coeficient of Variation (%)."""
return (self.sd / self.mean) if self.mean else 0
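A quick way to convince yourself that `running_mean_variance` above is Welford's update: replay it by hand against the direct two-pass formulas (a standalone sketch, not part of the diff; the sample values are invented):

```python
from math import sqrt

xs = [208, 216, 208, 197]

# Welford's update, replayed exactly as in running_mean_variance().
k, M, S = 0.0, 0.0, 0.0
for x in xs:
    k += 1
    M_prev = M
    M = M_prev + (x - M_prev) / k     # running mean
    S = S + (x - M_prev) * (x - M)    # running sum of squared deviations

# Direct two-pass computation for comparison.
mean = sum(xs) / float(len(xs))
sd = sqrt(sum((x - mean) ** 2 for x in xs) / (len(xs) - 1))

assert abs(M - mean) < 1e-9 and abs(sqrt(S / (k - 1)) - sd) < 1e-9
```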
class PerformanceTestResult(object):
@@ -46,67 +130,48 @@ class PerformanceTestResult(object):
"""
# csv_row[0] is just an ordinal number of the test - skip that
self.name = csv_row[1] # Name of the performance test
self.samples = int(csv_row[2]) # Number of measurement samples taken
self.min = int(csv_row[3]) # Minimum runtime (ms)
self.max = int(csv_row[4]) # Maximum runtime (ms)
self.mean = int(csv_row[5]) # Mean (average) runtime (ms)
sd = int(csv_row[6]) # Standard Deviation (ms)
# For computing running variance
self.S_runtime = (0 if self.samples < 2 else
(sd * sd) * (self.samples - 1))
self.median = int(csv_row[7]) # Median runtime (ms)
self.num_samples = ( # Number of measurement samples taken
int(csv_row[2]))
self.min = int(csv_row[3]) # Minimum runtime (μs)
self.max = int(csv_row[4]) # Maximum runtime (μs)
self.mean = float(csv_row[5]) # Mean (average) runtime (μs)
self.sd = float(csv_row[6]) # Standard Deviation (μs)
self.median = int(csv_row[7]) # Median runtime (μs)
self.max_rss = ( # Maximum Resident Set Size (B)
int(csv_row[8]) if len(csv_row) > 8 else None)
# Sample lists for statistical analysis of measured results
self.all_samples = None
self.samples = None
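For reference, the summary row being parsed here looks like the line used by `test_init` in the test file below (column meanings per the comments above):

```python
# Columns: ordinal, name, num_samples, min, max, mean, sd, median[, max_rss]
row = '1,AngryPhonebook,20,10664,12933,11035,576,10884'.split(',')
result = PerformanceTestResult(row)  # assumes the class above is importable
```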
def __repr__(self):
"""Short summary for debugging purposes."""
return (
'<PerformanceTestResult name:{0.name!r} '
'samples:{0.samples!r} min:{0.min!r} max:{0.max!r} '
'mean:{0.mean!r} sd:{0.sd!r} median:{0.median!r}>'.format(self))
@property
def sd(self):
"""Standard Deviation (ms)"""
return (0 if self.samples < 2 else
sqrt(self.S_runtime / (self.samples - 1)))
@staticmethod
def running_mean_variance((k, M_, S_), x):
"""
Compute running variance, B. P. Welford's method
See Knuth TAOCP vol 2, 3rd edition, page 232, or
https://www.johndcook.com/blog/standard_deviation/
M is mean, Standard Deviation is defined as sqrt(S/(k-1))
"""
k = float(k + 1)
M = M_ + (x - M_) / k
S = S_ + (x - M_) * (x - M)
return (k, M, S)
'samples:{0.num_samples!r} min:{0.min!r} max:{0.max!r} '
'mean:{0.mean:.0f} sd:{0.sd:.0f} median:{0.median!r}>'
.format(self))
def merge(self, r):
"""Merging test results recomputes min and max.
It attempts to recompute mean and standard deviation when all_samples
are available. There is no correct way to compute these values from
test results that are summaries from more than 3 samples.
"""Merge two results.
The use case here is comparing tests results parsed from concatenated
Recomputes min, max and mean statistics. If all `samples` are
avaliable, it recomputes all the statistics.
The use case here is comparing test results parsed from concatenated
log files from multiple runs of benchmark driver.
"""
if self.samples and r.samples:
map(self.samples.add, r.samples.samples)
sams = self.samples
self.num_samples = sams.count
self.min, self.max, self.median, self.mean, self.sd = \
sams.min, sams.max, sams.median, sams.mean, sams.sd
else:
self.min = min(self.min, r.min)
self.max = max(self.max, r.max)
# self.median = None # unclear what to do here
def push(x):
state = (self.samples, self.mean, self.S_runtime)
state = self.running_mean_variance(state, x)
(self.samples, self.mean, self.S_runtime) = state
# Merging test results with up to 3 samples is exact
values = [r.min, r.max, r.median][:min(r.samples, 3)]
map(push, values)
self.mean = ( # pooled mean is the weighted sum of means
(self.mean * self.num_samples) + (r.mean * r.num_samples)
) / float(self.num_samples + r.num_samples)
self.num_samples += r.num_samples
self.max_rss = min(self.max_rss, r.max_rss)
self.median, self.sd = 0, 0
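When per-sample data isn't available, the `else` branch can only pool the summaries: the new mean is the count-weighted average of the two means, and SD and median are zeroed because they can't be reconstructed from summaries alone. Checking with the numbers from `test_merge` in the test file below:

```python
# Merging two single-sample results with means 12045 and 12325:
n1, m1 = 1, 12045.0
n2, m2 = 1, 12325.0
pooled = ((m1 * n1) + (m2 * n2)) / float(n1 + n2)
assert pooled == 12185.0  # as asserted after the first merge below
```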
class ResultComparison(object):
@@ -119,7 +184,7 @@ class ResultComparison(object):
"""Initialize with old and new `PerformanceTestResult`s to compare."""
self.old = old
self.new = new
assert(old.name == new.name)
assert old.name == new.name
self.name = old.name # Test name, convenience accessor
# Speedup ratio
@@ -171,7 +236,7 @@ class LogParser(object):
r.voluntary_cs = self.voluntary_cs
r.involuntary_cs = self.involuntary_cs
if self.samples:
r.all_samples = self.samples
r.samples = PerformanceTestSamples(r.name, self.samples)
self.results.append(r)
self._reset()
@@ -190,8 +255,8 @@ class LogParser(object):
re.compile(r'\s+Sample (\d+),(\d+)'):
(lambda self, i, runtime:
self.samples.append((int(i), int(self.num_iters), int(runtime)))
),
self.samples.append(
Sample(int(i), int(self.num_iters), int(runtime)))),
# Environmental statistics: memory usage and context switches
re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'):

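The `Sample (\d+),(\d+)` rule above picks per-sample measurements out of verbose driver logs (the test file below contains a full excerpt). A minimal illustration of that regex on one such line:

```python
import re

line = '    Sample 0,355883'  # sample index, runtime in microseconds
match = re.compile(r'\s+Sample (\d+),(\d+)').match(line)
i, runtime = match.groups()   # ('0', '355883')
```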
benchmark/scripts/test_compare_perf_tests.py

@@ -21,8 +21,10 @@ import unittest
from compare_perf_tests import LogParser
from compare_perf_tests import PerformanceTestResult
from compare_perf_tests import PerformanceTestSamples
from compare_perf_tests import ReportFormatter
from compare_perf_tests import ResultComparison
from compare_perf_tests import Sample
from compare_perf_tests import TestComparator
from compare_perf_tests import main
from compare_perf_tests import parse_args
@@ -30,14 +32,82 @@ from compare_perf_tests import parse_args
from test_utils import captured_output
class TestPerformanceTestResult(unittest.TestCase):
class TestSample(unittest.TestCase):
def test_has_named_fields(self):
s = Sample(1, 2, 3)
self.assertEquals(s.i, 1)
self.assertEquals(s.num_iters, 2)
self.assertEquals(s.runtime, 3)
def test_is_iterable(self):
s = Sample(1, 2, 3)
self.assertEquals(s[0], 1)
self.assertEquals(s[1], 2)
self.assertEquals(s[2], 3)
class TestPerformanceTestSamples(unittest.TestCase):
def setUp(self):
self.rs = [Sample(*map(int, line.split())) for line in
'0 316 233,' # this is an anomalous sample - max
'1 4417 208, 2 4745 216, 3 4867 208, 4 4934 197,'
'5 5209 205, 6 4271 204, 7 4971 208, 8 5276 206,'
'9 4596 221, 10 5278 198'.split(',')]
self.samples = PerformanceTestSamples('DropFirstAnyCollection')
self.samples.add(self.rs[1])
def test_has_name(self):
self.assertEquals(self.samples.name, 'DropFirstAnyCollection')
def test_stores_samples(self):
self.assertEquals(self.samples.count, 1)
s = self.samples.samples[0]
self.assertTrue(isinstance(s, Sample))
self.assertEquals(s.i, 1)
self.assertEquals(s.num_iters, 4417)
self.assertEquals(s.runtime, 208)
def test_computes_min_max_median(self):
self.assertEquals(self.samples.min, 208)
self.assertEquals(self.samples.max, 208)
self.assertEquals(self.samples.median, 208)
self.samples.add(self.rs[2])
self.assertEquals(self.samples.min, 208)
self.assertEquals(self.samples.max, 216)
self.assertEquals(self.samples.median, 216)
self.samples.add(self.rs[4])
self.assertEquals(self.samples.min, 197)
self.assertEquals(self.samples.max, 216)
self.assertEquals(self.samples.median, 208)
def assertEqualStats(self, expected_stats):
stats = (self.samples.mean, self.samples.sd, self.samples.cv)
for actual, expected in zip(stats, expected_stats):
self.assertAlmostEquals(actual, expected, places=2)
def test_computes_mean_sd_cv(self):
self.assertEqualStats((208.0, 0.0, 0.0))
self.samples.add(self.rs[2])
self.assertEqualStats((212.0, 5.66, 2.67 / 100))
self.samples.add(self.rs[3])
self.assertEqualStats((210.67, 4.62, 2.19 / 100))
def test_init_with_samples(self):
ss = PerformanceTestSamples('Lots', self.rs[1:])
self.assertEquals(ss.count, 10)
self.samples = ss
self.assertEqualStats((207.10, 7.26, 3.51 / 100))
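The expected statistics in `test_computes_mean_sd_cv` follow directly from the definitions; checking the two-sample case (runtimes 208 and 216) by hand:

```python
from math import sqrt

runtimes = [208, 216]                        # rs[1] and rs[2] above
mean = sum(runtimes) / float(len(runtimes))              # 212.0
sd = sqrt(sum((x - mean) ** 2 for x in runtimes)
          / (len(runtimes) - 1))                         # sqrt(32) ~ 5.66
cv = sd / mean                                           # ~ 2.67 / 100
```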
class TestPerformanceTestResult(unittest.TestCase):
def test_init(self):
log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884'
r = PerformanceTestResult(log_line.split(','))
self.assertEquals(r.name, 'AngryPhonebook')
self.assertEquals((r.samples, r.min, r.max, r.mean, r.sd, r.median),
self.assertEquals(
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median),
(20, 10664, 12933, 11035, 576, 10884))
self.assertEquals(r.samples, None)
log_line = '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336'
r = PerformanceTestResult(log_line.split(','))
@@ -49,7 +119,7 @@ class TestPerformanceTestResult(unittest.TestCase):
self.assertEquals(
str(r),
'<PerformanceTestResult name:\'AngryPhonebook\' samples:20 '
'min:10664 max:12933 mean:11035 sd:576.0 median:10884>'
'min:10664 max:12933 mean:11035 sd:576 median:10884>'
)
def test_merge(self):
@@ -61,21 +131,21 @@ class TestPerformanceTestResult(unittest.TestCase):
[line.split(',') for line in tests])
def as_tuple(r):
return (r.min, r.max, round(r.mean, 2), round(r.sd, 2), r.median,
r.max_rss)
return (r.num_samples, r.min, r.max, round(r.mean, 2),
r.sd, r.median, r.max_rss)
r = results[0]
self.assertEquals(as_tuple(r),
(12045, 12045, 12045, 0, 12045, 10510336))
(1, 12045, 12045, 12045, 0, 12045, 10510336))
r.merge(results[1])
self.assertEquals(as_tuple(r),
(12045, 12325, 12185, 197.99, 12045, 10510336))
self.assertEquals(as_tuple(r), # drops SD and median
(2, 12045, 12325, 12185, 0, 0, 10510336))
r.merge(results[2])
self.assertEquals(as_tuple(r),
(11616, 12325, 11995.33, 357.10, 12045, 10510336))
self.assertEquals(as_tuple(r), # picks smaller of the MAX_RSS
(3, 11616, 12325, 11995.33, 0, 0, 10502144))
r.merge(results[3])
self.assertEquals(as_tuple(r),
(11616, 12325, 12064, 322.29, 12045, 10510336))
(4, 11616, 12325, 12064, 0, 0, 10498048))
class TestResultComparison(unittest.TestCase):
@@ -235,8 +305,9 @@ Totals,2"""
(r.name, r.min, r.max, int(r.mean), int(r.sd), r.median),
('AngryPhonebook', 11467, 13898, 12392, 1315, 11812)
)
self.assertEquals(r.samples, len(r.all_samples))
self.assertEquals(results[0].all_samples,
self.assertEquals(r.num_samples, r.samples.count)
self.assertEquals(sorted(results[0].samples.samples,
key=lambda s: s.i),
[(0, 78, 11812), (1, 90, 13898), (2, 91, 11467)])
r = results[1]
@@ -244,8 +315,9 @@ Totals,2"""
(r.name, r.min, r.max, int(r.mean), int(r.sd), r.median),
('Array2D', 369900, 381039, 373994, 6127, 371043)
)
self.assertEquals(r.samples, len(r.all_samples))
self.assertEquals(results[1].all_samples,
self.assertEquals(r.num_samples, r.samples.count)
self.assertEquals(sorted(results[1].samples.samples,
key=lambda s: s.i),
[(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)])
def test_parse_environment_verbose(self):
@@ -275,6 +347,35 @@ Totals,2"""
self.assertEquals(result.min, 20000)
self.assertEquals(result.max, 29000)
def test_results_from_merge_verbose(self):
"""Parsing verbose log merges all PerformanceTestSamples.
...this should technically be on TestPerformanceTestResult, but it's
easier to write here. ¯\_(ツ)_/¯"""
concatenated_logs = """
Sample 0,355883
Sample 1,358817
Sample 2,353552
Sample 3,350815
3,Array2D,4,350815,358817,354766,3403,355883
Sample 0,363094
Sample 1,369169
Sample 2,376131
Sample 3,364245
3,Array2D,4,363094,376131,368159,5931,369169"""
results = LogParser.results_from_string(concatenated_logs)
self.assertEquals(results.keys(), ['Array2D'])
result = results['Array2D']
self.assertTrue(isinstance(result, PerformanceTestResult))
self.assertEquals(result.min, 350815)
self.assertEquals(result.max, 376131)
self.assertEquals(result.median, 363094)
self.assertAlmostEquals(result.sd, 8443.37, places=2)
self.assertAlmostEquals(result.mean, 361463.25, places=2)
self.assertEquals(result.num_samples, 8)
samples = result.samples
self.assertTrue(isinstance(samples, PerformanceTestSamples))
self.assertEquals(samples.count, 8)
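The asserted values can be verified from the eight raw samples in the concatenated log above: the runtimes sum to 2891706, so the mean is 361463.25, and the element at index count / 2 = 4 of the sorted runtimes is 363094:

```python
runs = [355883, 358817, 353552, 350815,   # first Array2D run
        363094, 369169, 376131, 364245]   # second Array2D run
assert sum(runs) / float(len(runs)) == 361463.25  # mean
assert sorted(runs)[len(runs) // 2] == 363094     # median, count / 2 rule
```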
class TestTestComparator(OldAndNewLog):
def test_init(self):