[benchmark] Measure and analyze benchmark runtimes

`BenchmarkDoctor` measures benchmark execution (using `BenchmarkDriver`) and verifies that each benchmark's runtime stays under 2500 microseconds.
Pavol Vaskovic
2018-08-16 20:40:28 +02:00
parent ab16999e20
commit 7725c0096e
2 changed files with 178 additions and 14 deletions
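
For orientation, a rough sketch (not part of the commit) of the flow that `check()` and `run_check()` below implement; the `args` object is assumed to come from the driver's existing argument parsing:

doctor = BenchmarkDoctor(args)
for test in doctor.driver.tests:
    measurements = doctor.measure(test)   # 5 x i1 and 5 x i2 series per optimization level
    doctor.analyze(measurements)          # naming and runtime requirements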


@@ -16,6 +16,7 @@
import argparse
import glob
import logging
import math
import os
import re
import subprocess
@@ -185,6 +186,11 @@ class BenchmarkDoctor(object):
LoggingReportFormatter(use_color=sys.stdout.isatty()))
self.log.addHandler(self.console_handler)
self.log.debug('Checking tests: %s', ', '.join(self.driver.tests))
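# Requirements are static rules; analyze() applies each of them to the
# measurements dict produced by measure().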
self.requirements = [
self._name_matches_capital_words_convention,
self._name_is_at_most_40_chars_long,
self._optimized_runtime_in_range,
]
def __del__(self):
"""Unregister handler on exit."""
@@ -193,10 +199,11 @@ class BenchmarkDoctor(object):
capital_words_re = re.compile('[A-Z][a-zA-Z0-9]+')
@staticmethod
def _name_matches_capital_words_convention(test):
name = test
def _name_matches_capital_words_convention(measurements):
name = measurements['name']
match = BenchmarkDoctor.capital_words_re.match(name)
matched = match.group(0) if match else ''
if name != matched:
BenchmarkDoctor.log_naming.error(
"'%s' name doesn't conform to UpperCamelCase convention.",
@@ -205,24 +212,101 @@ class BenchmarkDoctor(object):
'See http://bit.ly/UpperCamelCase')
@staticmethod
def _name_is_at_most_40_chars_long(test):
if len(test) > 40:
BenchmarkDoctor.log_naming.warning(
"'%s' name is longer than 40 characters.", test)
def _name_is_at_most_40_chars_long(measurements):
name = measurements['name']
if len(name) > 40:
BenchmarkDoctor.log_naming.error(
"'%s' name is %d characters long.", name, len(name))
BenchmarkDoctor.log_naming.info(
'Benchmark name should not be longer than 40 characters.')
@staticmethod
def _select(measurements, num_iters=None, opt_level='O'):
prefix = measurements['name'] + ' ' + opt_level
prefix += '' if num_iters is None else (' i' + str(num_iters))
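# For example, with the defaults the prefix for 'B1' is 'B1 O', which
# matches all measured series 'B1 O i1a' ... 'B1 O i2e' from measure().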
return [series for name, series in measurements.items()
if name.startswith(prefix)]
@staticmethod
def _optimized_runtime_in_range(measurements):
name = measurements['name']
runtime = min(BenchmarkDoctor._select(measurements),
key=lambda r: r.min).min
if 2500 < runtime < 500000:
BenchmarkDoctor.log_runtime.warning(
"'%s' execution takes at least %d μs.", name, runtime)
elif 500000 <= runtime:
BenchmarkDoctor.log_runtime.error(
"'%s' execution takes at least %d μs.", name, runtime)
if 2500 < runtime:
factor = int(pow(2, math.ceil(math.log(runtime / 2500.0, 2))))
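# For example, a 2501 μs runtime suggests a factor of 2; 500000 μs gives
# ceil(log2(200)) = 8, i.e. a factor of 256 (see the tests below).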
BenchmarkDoctor.log_runtime.info(
"Decrease the workload of '%s' by a factor of %d, "
"to be less than 2500 μs.", name, factor)
@staticmethod
def _adjusted_1s_samples(runtime):
u"""Return sample count that can be taken in approximately 1 second.
Based on the runtime (μs) of one sample taken with num-iters=1.
"""
if runtime == 0:
return 2
s = 1000000 / float(runtime) # samples for 1s run
s = int(pow(2, round(math.log(s, 2)))) # rounding to power of 2
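# For example, a 300 μs runtime gives 1000000/300 ≈ 3333 samples;
# round(log2(3333)) = 12, i.e. 4096 adjusted samples (see the tests below).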
return s if s > 2 else 2 # always take at least 2 samples
def measure(self, benchmark):
"""Measure benchmark with varying iterations and optimization levels.
Returns a dictionary with benchmark name and `PerformanceTestResult`s.
"""
self.log.debug('Calibrating num-samples for {0}:'.format(benchmark))
r = self.driver.run(benchmark, num_samples=3, num_iters=1) # calibrate
num_samples = self._adjusted_1s_samples(r.min)
def capped(s):
return min(s, 2048)
run_args = [(capped(num_samples), 1), (capped(num_samples / 2), 2)]
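# The i2 runs use half the samples, since each sample does twice the
# iterations, so both series still take roughly 1 s (capped at 2048).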
opts = self.driver.args.optimization
opts = opts if isinstance(opts, list) else [opts]
self.log.debug(
'Runtime {0} μs yields {1} adjusted samples per second.'.format(
r.min, num_samples))
self.log.debug(
'Measuring {0}, 5 x i1 ({1} samples), 5 x i2 ({2} samples)'.format(
benchmark, run_args[0][0], run_args[1][0]))
measurements = dict(
[('{0} {1} i{2}{3}'.format(benchmark, o, i, suffix),
self.driver.run(benchmark, num_samples=s, num_iters=i,
measure_memory=True))
for o in opts
for s, i in run_args
for suffix in list('abcde')
]
)
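# Keys have the form '<benchmark> <opt> i<iters><series letter>',
# e.g. 'B1 O i1a'; the 'name' entry below identifies the benchmark.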
measurements['name'] = benchmark
return measurements
def analyze(self, benchmark_measurements):
"""Analyze whether benchmark fullfills all requirtements."""
self.log.debug('Analyzing %s', benchmark_measurements['name'])
for rule in self.requirements:
rule(benchmark_measurements)
def check(self):
checks = [
self._name_matches_capital_words_convention,
self._name_is_at_most_40_chars_long,
]
"""Measure and analyse all enabled tests."""
for test in self.driver.tests:
for check in checks:
check(test)
self.analyze(self.measure(test))
@staticmethod
def run_check(args):
doctor = BenchmarkDoctor(args)
doctor.check()
# TODO non-zero error code when errors are logged
# See https://stackoverflow.com/a/31142078/41307
return 0


@@ -290,7 +290,7 @@ class BenchmarkDriverMock(Mock):
verbose, measure_memory):
args = (test, num_samples, num_iters, verbose, measure_memory)
self.calls.append(args)
return self.respond.get(args, '')
return self.respond.get(args, Stub(min=700))
class TestLoggingReportFormatter(unittest.TestCase):
@@ -326,6 +326,13 @@ class TestLoggingReportFormatter(unittest.TestCase):
self.assertEquals(f.format(lr), 'INFO Hi!')
def _run(test, num_samples=None, num_iters=None, verbose=None,
measure_memory=False):
"""Helper function that constructs tuple with arguments for run method."""
return (
test, num_samples, num_iters, verbose, measure_memory)
class TestBenchmarkDoctor(unittest.TestCase):
@classmethod
def setUpClass(cls):
@@ -367,6 +374,41 @@ class TestBenchmarkDoctor(unittest.TestCase):
self.assertTrue(isinstance(console_handler.formatter,
LoggingReportFormatter))
def test_measure_10_independent_1s_benchmark_series(self):
"""Measurement strategy takes 5 i2 and 5 i1 series.
Num-samples for Benchmark Driver are calibrated to be powers of two,
take measurements for approximately 1s
based on short initial runtime sampling. Capped at 2k samples.
"""
driver = BenchmarkDriverMock(tests=['B1'], responses=([
# calibration run, returns a stand-in for PerformanceTestResult
(_run('B1', num_samples=3, num_iters=1), Stub(min=300))] +
# 5x i1 series; a 300 μs runtime allows 4096
# samples/s, but it is capped at 2k
([(_run('B1', num_samples=2048, num_iters=1,
measure_memory=True), Stub(min=300))] * 5) +
# 5x i2 series
([(_run('B1', num_samples=2048, num_iters=2,
measure_memory=True), Stub(min=300))] * 5)
))
doctor = BenchmarkDoctor(self.args, driver)
with captured_output() as (out, _):
measurements = doctor.measure('B1')
driver.assert_called_all_expected()
self.assert_contains(
['name',
'B1 O i1a', 'B1 O i1b', 'B1 O i1c', 'B1 O i1d', 'B1 O i1e',
'B1 O i2a', 'B1 O i2b', 'B1 O i2c', 'B1 O i2d', 'B1 O i2e'],
measurements.keys())
self.assertEquals(measurements['name'], 'B1')
self.assert_contains(
['Calibrating num-samples for B1:',
'Runtime 300 μs yields 4096 adjusted samples per second.',
'Measuring B1, 5 x i1 (2048 samples), 5 x i2 (2048 samples)'],
self.logs['debug'])
def test_benchmark_name_matches_capital_words_conventions(self):
driver = BenchmarkDriverMock(tests=[
'BenchmarkName', 'CapitalWordsConvention', 'ABBRName',
@@ -400,7 +442,45 @@ class TestBenchmarkDoctor(unittest.TestCase):
self.assertNotIn('BenchmarkName', output)
self.assert_contains(
["'ThisTestNameIsTooLongAndCausesOverflowsInReports' name is "
"longer than 40 characters."], output)
"48 characters long."], self.logs['error'])
self.assert_contains(
["Benchmark name should not be longer than 40 characters."],
self.logs['info'])
def test_benchmark_runtime_range(self):
"""Optimized benchmark should run in less then 2500 μs.
With runtimes less than 2500 μs there is better than 1:4 chance of
being interrupted in the middle of measurement due to elapsed 10 ms
quantum used by macos scheduler.
Warn about longer runtime. Runtimes over half a second are an error.
"""
def measurements(name, runtime):
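# Stub two series; the i2 series has the lower min, so the doctor
# reports `runtime` itself as the benchmark's best time.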
return {'name': name,
name + ' O i1a': Stub(min=runtime + 2),
name + ' O i2a': Stub(min=runtime)}
with captured_output() as (out, _):
doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([]))
doctor.analyze(measurements('Cheetah', 200))
doctor.analyze(measurements('Hare', 2501))
doctor.analyze(measurements('Tortoise', 500000))
output = out.getvalue()
self.assertIn('runtime: ', output)
self.assertNotIn('Cheetah', output)
self.assert_contains(["'Hare' execution takes at least 2501 μs."],
self.logs['warning'])
self.assert_contains(
["Decrease the workload of 'Hare' by a factor of 2, "
"to be less than 2500 μs."], self.logs['info'])
self.assert_contains(
["'Tortoise' execution takes at least 500000 μs."],
self.logs['error'])
self.assert_contains(
["Decrease the workload of 'Tortoise' by a factor of 256, "
"to be less than 2500 μs."], self.logs['info'])
if __name__ == '__main__':