[benchmark] Quantiles in Benchmark_Driver

Switch the measurement technique from gathering `i` independent samples characterized by their mean values to a finer-grained characterization of these measurements using quantiles. The distribution of benchmark measurements is non-normal, with outliers that significantly inflate the mean and standard deviation due to the presence of an uncontrolled variable: the system load. Therefore the MEAN and SD were not appropriate statistics for characterizing the benchmark measurements. Benchmark_Driver now gathers more individual measurements from Benchmark_O. It is executed with `--num-iters=1`, because we don’t want to average the runtimes; we want raw data. This collects a variable number of measurements, gathered in about 1 second. Using `--quantile=20`, we get up to 20 measured values that properly characterize the empirical distribution of the benchmark from each independent run. The measurements from `i` independent executions are combined to form the final empirical distribution, which is reported as a five-number summary (MIN, Q1, MEDIAN, Q3, MAX).
2025-12-21 12:14:44 +01:00 · 2018-10-10 20:06:39 +02:00
parent 0438c45e2d
commit a04edd1d47
2 changed files with 34 additions and 19 deletions
--- a/benchmark/scripts/test_Benchmark_Driver.py
+++ b/benchmark/scripts/test_Benchmark_Driver.py
@@ -279,29 +279,38 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
        self.subprocess_mock.assert_called_with(
            ('/benchmarks/Benchmark_O', 'b', '--memory'))

+    def test_report_quantiles(self):
+        """Use delta compression for quantile reports."""
+        self.driver.run('b', quantile=4)
+        self.subprocess_mock.assert_called_with(
+            ('/benchmarks/Benchmark_O', 'b', '--quantile=4', '--delta'))
+
    def test_run_benchmark_independent_samples(self):
+        """Extract up to 20 measurements from an independent run."""
        self.driver.args.independent_samples = 3
        r = self.driver.run_independent_samples('b1')
        self.assertEquals(self.subprocess_mock.calls.count(
-            ('/benchmarks/Benchmark_O', 'b1', '--memory')), 3)
+            ('/benchmarks/Benchmark_O', 'b1', '--num-iters=1', '--memory',
+             '--quantile=20', '--delta')), 3)
        self.assertEquals(r.num_samples, 3)  # results are merged

    def test_run_and_log(self):
        def mock_run(test):
            self.assertEquals(test, 'b1')
            return PerformanceTestResult(
-                '3,b1,1,123,123,123,0,123,888'.split(','))
+                '3,b1,5,101,1,1,1,1,888'.split(','),
+                quantiles=True, delta=True, memory=True)
        driver = BenchmarkDriver(tests=['b1'], args=Stub(output_dir=None))
        driver.run_independent_samples = mock_run  # patching

        with captured_output() as (out, _):
            log = driver.run_and_log()

-        csv_log = '3,b1,1,123,123,123,0,123,888\n'
+        csv_log = '3,b1,5,101,102,103,104,105,888\n'
        self.assertEquals(log, None)
        self.assertEquals(
            out.getvalue(),
-            '#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),' +
+            '#,TEST,SAMPLES,MIN(μs),Q1(μs),MEDIAN(μs),Q3(μs),MAX(μs),' +
            'MAX_RSS(B)\n' +
            csv_log +
            '\n' +
@@ -313,10 +322,10 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
        self.assertEquals(log, csv_log)
        self.assertEquals(
            out.getvalue(),
-            '  # TEST                      SAMPLES MIN(μs) MAX(μs)' +
-            ' MEAN(μs) SD(μs) MEDIAN(μs) MAX_RSS(B)\n' +
-            '  3 b1                              1     123     123' +
-            '      123      0        123        888\n' +
+            '  # TEST                      SAMPLES MIN(μs) Q1(μs)' +
+            ' MEDIAN(μs) Q3(μs) MAX(μs) MAX_RSS(B)\n' +
+            '  3 b1                              5     101    102' +
+            '        103    104     105        888\n' +
            '\n' +
            'Total performance tests executed: 1\n')