[benchmark] Exclude outliers from samples

Introduce algorithm for excluding of outliers after collecting all samples using the Interquartile Range rule. The `exclude_outliers` method uses 1st and 3rd Quartile to compute Interquartile Range, then uses inner fences at Q1 - 1.5*IQR and Q3 + 1.5*IQR to remove samples outside this fence. Based on experiments with collecting hundreads and thousands of samples (`num_samples`) per test with low iteration count (`num_iters`) with ~1s runtime, this rule is very effective in providing much better quality of sample population, effectively removing short environmental fluctuations that were previously averaged into the overall result (by the adaptively determined `num_iters` to run for ~1s), enlarging the reported result with these measurement errors. This technique can be used for some benchmarks, to get more stable results faster than before. This outlier filering is employed when parsing `--verbose` test results.
2025-12-21 12:14:44 +01:00 · 2018-08-17 00:39:29 +02:00
parent 91077e3289
commit 27cc77c590
2 changed files with 216 additions and 47 deletions
--- a/benchmark/scripts/compare_perf_tests.py
+++ b/benchmark/scripts/compare_perf_tests.py
@@ -22,8 +22,9 @@ from __future__ import print_function
 import argparse
 import re
 import sys
-from math import sqrt
+from bisect import bisect, bisect_left, bisect_right
 from collections import namedtuple
+from math import sqrt


 class Sample(namedtuple('Sample', 'i num_iters runtime')):
@@ -50,25 +51,75 @@ class PerformanceTestSamples(object):
        """Initialized with benchmark name and optional list of Samples."""
        self.name = name  # Name of the performance test
        self.samples = []
+        self.outliers = []
+        self._runtimes = []
        self.mean = 0.0
        self.S_runtime = 0.0  # For computing running variance
        for sample in samples or []:
            self.add(sample)

+    def __str__(self):
+        """Text summary of benchmark statisctics."""
+        return (
+            '{0.name!s} n={0.count!r} '
+            'Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} '
+            'Max={0.max!r} '
+            'R={0.range!r} {0.spread:.2%} IQR={0.iqr!r} '
+            'Mean={0.mean:.0f} SD={0.sd:.0f} CV={0.cv:.2%}'
+            .format(self) if self.samples else
+            '{0.name!s} n=0'.format(self))
+
    def add(self, sample):
        """Add sample to collection and recompute statistics."""
        assert isinstance(sample, Sample)
-        state = (self.count, self.mean, self.S_runtime)
-        state = self.running_mean_variance(state, sample.runtime)
-        _, self.mean, self.S_runtime = state
-        self.samples.append(sample)
-        self.samples.sort(key=lambda s: s.runtime)
+        self._update_stats(sample)
+        i = bisect(self._runtimes, sample.runtime)
+        self._runtimes.insert(i, sample.runtime)
+        self.samples.insert(i, sample)
+
+    def _update_stats(self, sample):
+        old_stats = (self.count, self.mean, self.S_runtime)
+        _, self.mean, self.S_runtime = (
+            self.running_mean_variance(old_stats, sample.runtime))
+
+    def exclude_outliers(self):
+        """Exclude outliers by applying Interquartile Range Rule.
+
+        Moves the samples outside of the inner fences
+        (Q1 - 1.5*IQR and Q3 + 1.5*IQR) into outliers list and recomputes
+        statistics for the remaining sample population. Optionally apply
+        only the top inner fence, preserving the small outliers.
+
+        Experimentally, this rule seems to perform well-enough on the
+        benchmark runtimes in the microbenchmark range to filter out
+        the environment noise caused by preemtive multitasking.
+        """
+        lo = bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr))
+        hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr))
+
+        outliers = self.samples[:lo] + self.samples[hi:]
+        samples = self.samples[lo:hi]
+
+        self.__init__(self.name)  # re-initialize
+        for sample in samples:  # and
+            self.add(sample)  # re-compute stats
+        self.outliers = outliers

    @property
    def count(self):
        """Number of samples used to compute the statistics."""
        return len(self.samples)

+    @property
+    def num_samples(self):
+        """Number of all samples in the collection."""
+        return len(self.samples) + len(self.outliers)
+
+    @property
+    def all_samples(self):
+        """List of all samples in ascending order."""
+        return sorted(self.samples + self.outliers, key=lambda s: s.i)
+
    @property
    def min(self):
        """Minimum sampled value."""
@@ -84,6 +135,21 @@ class PerformanceTestSamples(object):
        """Median sampled value."""
        return self.samples[self.count / 2].runtime

+    @property
+    def q1(self):
+        """First Quartile (25th Percentile)."""
+        return self.samples[self.count / 4].runtime
+
+    @property
+    def q3(self):
+        """Third Quartile (75th Percentile)."""
+        return self.samples[(self.count / 2) + (self.count / 4)].runtime
+
+    @property
+    def iqr(self):
+        """Interquartile Range."""
+        return self.q3 - self.q1
+
    @property
    def sd(self):
        u"""Standard Deviation (μs)."""
@@ -108,6 +174,16 @@ class PerformanceTestSamples(object):
        """Coeficient of Variation (%)."""
        return (self.sd / self.mean) if self.mean else 0

+    @property
+    def range(self):
+        """Range of samples values (Max - Min)."""
+        return self.max - self.min
+
+    @property
+    def spread(self):
+        """Sample Spread; i.e. Range as (%) of Min."""
+        return self.range / float(self.min) if self.min else 0
+

 class PerformanceTestResult(object):
    u"""Result from executing an individual Swift Benchmark Suite benchmark.
@@ -160,7 +236,7 @@ class PerformanceTestResult(object):
        if self.samples and r.samples:
            map(self.samples.add, r.samples.samples)
            sams = self.samples
-            self.num_samples = sams.count
+            self.num_samples = sams.num_samples
            self.min, self.max, self.median, self.mean, self.sd = \
                sams.min, sams.max, sams.median, sams.mean, sams.sd
        else:
@@ -237,6 +313,7 @@ class LogParser(object):
            r.involuntary_cs = self.involuntary_cs
        if self.samples:
            r.samples = PerformanceTestSamples(r.name, self.samples)
+            r.samples.exclude_outliers()
        self.results.append(r)
        self._reset()