Merge pull request #19910 from palimondo/fluctuation-of-the-pupil

[benchmark] More Robust Benchmark_Driver
eeckstein authored on 2018-10-29 15:02:07 -07:00, committed by GitHub
7 changed files with 409 additions and 116 deletions


@@ -665,11 +665,11 @@ function(swift_benchmark_compile)
COMMAND "${swift-bin-dir}/Benchmark_Driver" "run"
"-o" "O" "--output-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs"
"--swift-repo" "${SWIFT_SOURCE_DIR}"
"--iterations" "${SWIFT_BENCHMARK_NUM_O_ITERATIONS}"
"--independent-samples" "${SWIFT_BENCHMARK_NUM_O_ITERATIONS}"
COMMAND "${swift-bin-dir}/Benchmark_Driver" "run"
"-o" "Onone" "--output-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs"
"--swift-repo" "${SWIFT_SOURCE_DIR}"
"--iterations" "${SWIFT_BENCHMARK_NUM_ONONE_ITERATIONS}"
"--independent-samples" "${SWIFT_BENCHMARK_NUM_ONONE_ITERATIONS}"
COMMAND "${swift-bin-dir}/Benchmark_Driver" "compare"
"--log-dir" "${CMAKE_CURRENT_BINARY_DIR}/logs"
"--swift-repo" "${SWIFT_SOURCE_DIR}"


@@ -133,20 +133,21 @@ class BenchmarkDriver(object):
benchmarks.intersection(set(self.all_tests)).union(indexed_names)))
def run(self, test, num_samples=None, num_iters=None,
verbose=None, measure_memory=False):
verbose=None, measure_memory=False, quantile=None):
"""Execute benchmark and gather results."""
num_samples = num_samples or 1
num_samples = num_samples or 0
num_iters = num_iters or 0 # automatically determine N to run for 1s
cmd = self._cmd_run(
test, num_samples, num_iters, verbose, measure_memory)
test, num_samples, num_iters, verbose, measure_memory, quantile)
output = self._invoke(cmd)
result = self.parser.results_from_string(output).items()[0][1]
return result
def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory):
def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory,
quantile):
cmd = [self.test_harness, test]
if num_samples > 1:
if num_samples > 0:
cmd.append('--num-samples={0}'.format(num_samples))
if num_iters > 0:
cmd.append('--num-iters={0}'.format(num_iters))
@@ -154,6 +155,9 @@ class BenchmarkDriver(object):
cmd.append('--verbose')
if measure_memory:
cmd.append('--memory')
if quantile:
cmd.append('--quantile={0}'.format(quantile))
cmd.append('--delta')
return cmd
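
As a standalone sketch of the flag assembly above (a simplified model, not the driver's actual method; the harness path and test name are borrowed from the unit tests later in this diff):

```python
def build_cmd(harness, test, num_samples=0, num_iters=0,
              verbose=False, measure_memory=False, quantile=None):
    """Simplified model of BenchmarkDriver._cmd_run."""
    cmd = [harness, test]
    if num_samples > 0:
        cmd.append('--num-samples={0}'.format(num_samples))
    if num_iters > 0:
        cmd.append('--num-iters={0}'.format(num_iters))
    if verbose:
        cmd.append('--verbose')
    if measure_memory:
        cmd.append('--memory')
    if quantile:
        cmd.append('--quantile={0}'.format(quantile))
        cmd.append('--delta')  # quantile reports use delta compression
    return cmd

# Matches the expectation in TestBenchmarkDriverRunningTests below.
assert build_cmd('/benchmarks/Benchmark_O', 'b1', num_iters=1,
                 measure_memory=True, quantile=20) == [
    '/benchmarks/Benchmark_O', 'b1', '--num-iters=1', '--memory',
    '--quantile=20', '--delta']
```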
def run_independent_samples(self, test):
@@ -166,8 +170,9 @@ class BenchmarkDriver(object):
return a
return reduce(merge_results,
[self.run(test, measure_memory=True)
for _ in range(self.args.iterations)])
[self.run(test, measure_memory=True,
num_iters=1, quantile=20)
for _ in range(self.args.independent_samples)])
def log_results(self, output, log_file=None):
"""Log output to `log_file`.
@@ -182,7 +187,7 @@ class BenchmarkDriver(object):
with open(log_file, 'w') as f:
f.write(output)
RESULT = '{:>3} {:<25} {:>7} {:>7} {:>7} {:>8} {:>6} {:>10} {:>10}'
RESULT = '{:>3} {:<25} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}'
def run_and_log(self, csv_console=True):
"""Run benchmarks and continuously log results to the console.
@@ -201,14 +206,15 @@ class BenchmarkDriver(object):
def console_log(values):
print(format(values))
console_log(['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'MAX(μs)', # header
'MEAN(μs)', 'SD(μs)', 'MEDIAN(μs)', 'MAX_RSS(B)'])
def result_values(r):
return map(str, [r.test_num, r.name, r.num_samples, r.min, r.max,
int(r.mean), int(r.sd), r.median, r.max_rss])
return map(str, [r.test_num, r.name, r.num_samples, r.min,
r.samples.q1, r.median, r.samples.q3, r.max,
r.max_rss])
results = []
header = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'Q1(μs)', 'MEDIAN(μs)',
'Q3(μs)', 'MAX(μs)', 'MAX_RSS(B)']
console_log(header)
results = [header]
for test in self.tests:
result = result_values(self.run_independent_samples(test))
console_log(result)
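
For reference, the new `RESULT` template lays the quantile columns out under the same names as the CSV header. A quick standalone sketch (header and row values are taken from the run_and_log unit test later in this diff):

```python
# Standalone sketch: fixed-width console layout for quantile results.
RESULT = '{:>3} {:<25} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}'
header = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'Q1(μs)', 'MEDIAN(μs)',
          'Q3(μs)', 'MAX(μs)', 'MAX_RSS(B)']
row = ['3', 'b1', '5', '101', '102', '103', '104', '105', '888']
print(RESULT.format(*header))  # aligned header, as asserted in the test below
print(RESULT.format(*row))     # right-aligned row: 3 b1 5 101 ... 888
```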
@@ -290,6 +296,7 @@ class BenchmarkDoctor(object):
self._name_matches_capital_words_convention,
self._name_is_at_most_40_chars_long,
self._no_setup_overhead,
self._reasonable_setup_time,
self._optimized_runtime_in_range,
self._constant_memory_use
]
@@ -341,15 +348,20 @@ class BenchmarkDoctor(object):
for correction in [(setup / i) for i in [1, 2]]
] for result in i_series])
if 2500 < runtime:
log = (BenchmarkDoctor.log_runtime.warning if runtime < 500000 else
threshold = 1000
if threshold < runtime:
log = (BenchmarkDoctor.log_runtime.warning if runtime < 10000 else
BenchmarkDoctor.log_runtime.error)
caveat = '' if setup == 0 else ' (excluding the setup overhead)'
log("'%s' execution took at least %d μs%s.", name, runtime, caveat)
factor = int(pow(2, math.ceil(math.log(runtime / 2500.0, 2))))
def factor(base): # suitable divisor that's an integer power of base
return int(pow(base, math.ceil(
math.log(runtime / float(threshold), base))))
BenchmarkDoctor.log_runtime.info(
"Decrease the workload of '%s' by a factor of %d, "
"to be less than 2500 μs.", name, factor)
"Decrease the workload of '%s' by a factor of %d (%d), to be "
"less than %d μs.", name, factor(2), factor(10), threshold)
@staticmethod
def _setup_overhead(measurements):
@@ -372,6 +384,17 @@ class BenchmarkDoctor(object):
'Move initialization of benchmark data to the `setUpFunction` '
'registered in `BenchmarkInfo`.')
@staticmethod
def _reasonable_setup_time(measurements):
setup = min([result.setup
for result in BenchmarkDoctor._select(measurements)])
if 200000 < setup: # 200 ms
BenchmarkDoctor.log_runtime.error(
"'%s' setup took at least %d μs.",
measurements['name'], setup)
BenchmarkDoctor.log_runtime.info(
'The `setUpFunction` should take no more than 200 ms.')
@staticmethod
def _constant_memory_use(measurements):
select = BenchmarkDoctor._select
@@ -603,7 +626,7 @@ def parse_args(args):
help='Run benchmarks and output results to stdout',
parents=[shared_benchmarks_parser])
run_parser.add_argument(
'-i', '--iterations',
'-i', '--independent-samples',
help='number of times to run each test (default: 1)',
type=positive_int, default=1)
run_parser.add_argument(


@@ -51,6 +51,14 @@ class Sample(namedtuple('Sample', 'i num_iters runtime')):
return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self)
class Yield(namedtuple('Yield', 'before_sample after')):
u"""Meta-measurement of when the Benchmark_X voluntarily yielded process.
`before_sample`: index of measurement taken just after returning from yield
`after`: time elapsed since the previous yield in microseconds (μs)
"""
class PerformanceTestSamples(object):
"""Collection of runtime samples from the benchmark execution.
@@ -69,7 +77,7 @@ class PerformanceTestSamples(object):
self.add(sample)
def __str__(self):
"""Text summary of benchmark statisctics."""
"""Text summary of benchmark statistics."""
return (
'{0.name!s} n={0.count!r} '
'Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} '
@@ -211,23 +219,50 @@ class PerformanceTestResult(object):
Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize
or Benchmark_Driver).
It depends on the log format emitted by the test driver in the form:
It supports 2 log formats emitted by the test driver. Legacy format with
statistics for normal distribution (MEAN, SD):
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B)
The last column, MAX_RSS, is emitted only for runs instrumented by the
Benchmark_Driver to measure rough memory use during the execution of the
benchmark.
And the new quantiles format with a variable number of columns:
#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
#,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
The number of columns between MIN and MAX depends on the test driver's
`--quantile` parameter. In both cases, the last column, MAX_RSS, is optional.
"""
def __init__(self, csv_row):
"""Initialize from a row with 8 or 9 columns with benchmark summary.
def __init__(self, csv_row, quantiles=False, memory=False, delta=False):
"""Initialize from a row of multiple columns with benchmark summary.
The row is an iterable, such as a row provided by the CSV parser.
"""
self.test_num = csv_row[0] # Ordinal number of the test
self.name = csv_row[1] # Name of the performance test
self.num_samples = ( # Number of measurement samples taken
int(csv_row[2]))
self.num_samples = int(csv_row[2]) # Number of measurements taken
if quantiles: # Variable number of columns representing quantiles
runtimes = csv_row[3:-1] if memory else csv_row[3:]
if delta:
runtimes = [int(x) if x else 0 for x in runtimes]
runtimes = reduce(lambda l, x: l.append(l[-1] + x) or # running
l if l else [x], runtimes, None) # total
num_values = len(runtimes)
if self.num_samples < num_values: # remove repeated samples
quantile = num_values - 1
qs = [float(i) / float(quantile) for i in range(0, num_values)]
indices = [max(0, int(ceil(self.num_samples * float(q))) - 1)
for q in qs]
runtimes = [runtimes[indices.index(i)]
for i in range(0, self.num_samples)]
self.samples = PerformanceTestSamples(
self.name,
[Sample(None, None, int(runtime)) for runtime in runtimes])
self.samples.exclude_outliers(top_only=True)
sams = self.samples
self.min, self.max, self.median, self.mean, self.sd = \
sams.min, sams.max, sams.median, sams.mean, sams.sd
self.max_rss = ( # Maximum Resident Set Size (B)
int(csv_row[-1]) if memory else None)
else: # Legacy format with statistics for normal distribution.
self.min = int(csv_row[3]) # Minimum runtime (μs)
self.max = int(csv_row[4]) # Maximum runtime (μs)
self.mean = float(csv_row[5]) # Mean (average) runtime (μs)
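
A worked example of the quantile decoding above, using the delta-encoded 2-quantile row from the parser tests later in this diff (a standalone restatement of the same arithmetic, not the class itself):

```python
import math

# Row '202,DropWhileArray,2,265,,22' with quantiles=True, delta=True
# (columns: #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX).
row = ['202', 'DropWhileArray', '2', '265', '', '22']
num_samples = int(row[2])                        # 2
deltas = [int(x) if x else 0 for x in row[3:]]   # [265, 0, 22]

# Undo delta compression with a running total.
runtimes = []
for d in deltas:
    runtimes.append((runtimes[-1] if runtimes else 0) + d)
assert runtimes == [265, 265, 287]

# 2 samples produced 3 quantile values, so repeated estimates are dropped.
if num_samples < len(runtimes):
    q = len(runtimes) - 1
    indices = [max(0, int(math.ceil(num_samples * (i / float(q)))) - 1)
               for i in range(len(runtimes))]    # [0, 0, 1]
    runtimes = [runtimes[indices.index(i)] for i in range(num_samples)]
assert runtimes == [265, 287]  # min=265, median=265, max=287 in the tests below
```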
@@ -236,6 +271,8 @@ class PerformanceTestResult(object):
self.max_rss = ( # Maximum Resident Set Size (B)
int(csv_row[8]) if len(csv_row) > 8 else None)
self.samples = None
self.yields = None
self.setup = None
def __repr__(self):
"""Short summary for debugging purposes."""
@@ -253,6 +290,7 @@ class PerformanceTestResult(object):
The use case here is comparing test results parsed from concatenated
log files from multiple runs of benchmark driver.
"""
# Statistics
if self.samples and r.samples:
map(self.samples.add, r.samples.samples)
sams = self.samples
@@ -266,8 +304,14 @@ class PerformanceTestResult(object):
(self.mean * self.num_samples) + (r.mean * r.num_samples)
) / float(self.num_samples + r.num_samples)
self.num_samples += r.num_samples
self.max_rss = min(self.max_rss, r.max_rss)
self.median, self.sd = 0, 0
self.median, self.sd = None, None
# Metadata
def minimum(a, b): # work around None being less than everything
return (min(filter(lambda x: x is not None, [a, b])) if any([a, b])
else None)
self.max_rss = minimum(self.max_rss, r.max_rss)
self.setup = minimum(self.setup, r.setup)
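
The None-aware `minimum` helper keeps metadata from merged runs without letting a missing value win. A minimal restatement with the values used in the merge test later in this diff:

```python
def minimum(a, b):
    # Work around None comparing as smaller than every number.
    return (min(x for x in (a, b) if x is not None)
            if any([a, b]) else None)

assert minimum(None, None) is None     # neither run measured setup
assert minimum(None, 9) == 9           # keep the only known setup time
assert minimum(9, 7) == 7              # keep the smaller of the two
assert minimum(10510336, 10502144) == 10502144   # smaller MAX_RSS wins
```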
class ResultComparison(object):
@@ -307,27 +351,29 @@ class LogParser(object):
def __init__(self):
"""Create instance of `LogParser`."""
self.results = []
self.quantiles, self.delta, self.memory = False, False, False
self._reset()
def _reset(self):
"""Reset parser to the default state for reading a new result."""
self.samples, self.num_iters = [], 1
self.max_rss, self.mem_pages = None, None
self.samples, self.yields, self.num_iters = [], [], 1
self.setup, self.max_rss, self.mem_pages = None, None, None
self.voluntary_cs, self.involuntary_cs = None, None
# Parse lines like this
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)
results_re = re.compile(r'( *\d+[, \t]*[\w.]+[, \t]*' +
r'[, \t]*'.join([r'[\d.]+'] * 6) +
r'[, \t]*[\d.]*)') # optional MAX_RSS(B)
results_re = re.compile(
r'( *\d+[, \t]+[\w.]+[, \t]+' + # #,TEST
r'[, \t]+'.join([r'\d+'] * 2) + # at least 2...
r'(?:[, \t]+\d*)*)') # ...or more numeric columns
def _append_result(self, result):
columns = result.split(',')
if len(columns) < 8:
columns = result.split()
r = PerformanceTestResult(columns)
if self.max_rss:
r.max_rss = self.max_rss
columns = result.split(',') if ',' in result else result.split()
r = PerformanceTestResult(
columns, quantiles=self.quantiles, memory=self.memory,
delta=self.delta)
r.setup = self.setup
r.max_rss = r.max_rss or self.max_rss
r.mem_pages = self.mem_pages
r.voluntary_cs = self.voluntary_cs
r.involuntary_cs = self.involuntary_cs
@@ -335,12 +381,18 @@ class LogParser(object):
r.samples = PerformanceTestSamples(r.name, self.samples)
r.samples.exclude_outliers()
self.results.append(r)
r.yields = self.yields or None
self._reset()
def _store_memory_stats(self, max_rss, mem_pages):
self.max_rss = int(max_rss)
self.mem_pages = int(mem_pages)
def _configure_format(self, header):
self.quantiles = 'MEAN' not in header
self.memory = 'MAX_RSS' in header
self.delta = '𝚫' in header
# Regular expression and action to take when it matches the parsed line
state_actions = {
results_re: _append_result,
@@ -355,6 +407,17 @@ class LogParser(object):
self.samples.append(
Sample(int(i), int(self.num_iters), int(runtime)))),
re.compile(r'\s+SetUp (\d+)'):
(lambda self, setup: setattr(self, 'setup', int(setup))),
re.compile(r'\s+Yielding after ~(\d+) μs'):
(lambda self, since_last_yield:
self.yields.append(
Yield(len(self.samples), int(since_last_yield)))),
re.compile(r'( *#[, \t]+TEST[, \t]+SAMPLES[, \t]+MIN.*)'):
_configure_format,
# Environmental statistics: memory usage and context switches
re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'):
_store_memory_stats,
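
Putting the header detection and delta decoding together, `LogParser.results_from_string` can now ingest a quantile report directly. A short usage sketch with the delta-encoded report from the parser tests below (assuming the script is importable as `compare_perf_tests`, as its unit tests do):

```python
from compare_perf_tests import LogParser

# Parsing a delta-encoded 2-quantile (median) report.
report = ('#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n'
          '0,B,2,101,,1')
r = LogParser.results_from_string(report)['B']
# The header sets quantiles=True, delta=True, memory=False; decoding gives:
assert (r.num_samples, r.min, r.median, r.max) == (2, 101, 101, 102)
```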


@@ -96,15 +96,16 @@ class Test_parse_args(unittest.TestCase):
"(choose from 'O', 'Onone', 'Osize')"],
err.getvalue())
def test_iterations(self):
self.assertEquals(parse_args(['run']).iterations, 1)
self.assertEquals(parse_args(['run', '-i', '3']).iterations, 3)
def test_independent_samples(self):
self.assertEquals(parse_args(['run']).independent_samples, 1)
self.assertEquals(parse_args(['run', '-i', '3']).independent_samples,
3)
with captured_output() as (out, err):
self.assertRaises(SystemExit,
parse_args, ['run', '-i', '-3'])
self.assert_contains(
['error:',
"argument -i/--iterations: invalid positive_int value: '-3'"],
['error:', "argument -i/--independent-samples: " +
"invalid positive_int value: '-3'"],
err.getvalue())
def test_output_dir(self):
@@ -278,30 +279,40 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
self.subprocess_mock.assert_called_with(
('/benchmarks/Benchmark_O', 'b', '--memory'))
def test_report_quantiles(self):
"""Use delta compression for quantile reports."""
self.driver.run('b', quantile=4)
self.subprocess_mock.assert_called_with(
('/benchmarks/Benchmark_O', 'b', '--quantile=4', '--delta'))
def test_run_benchmark_independent_samples(self):
self.driver.args.iterations = 3
"""Extract up to 20 measurements from an independent run."""
self.driver.args.independent_samples = 3
r = self.driver.run_independent_samples('b1')
self.assertEquals(self.subprocess_mock.calls.count(
('/benchmarks/Benchmark_O', 'b1', '--memory')), 3)
('/benchmarks/Benchmark_O', 'b1', '--num-iters=1', '--memory',
'--quantile=20', '--delta')), 3)
self.assertEquals(r.num_samples, 3) # results are merged
def test_run_and_log(self):
def mock_run(test):
self.assertEquals(test, 'b1')
return PerformanceTestResult(
'3,b1,1,123,123,123,0,123,888'.split(','))
'3,b1,5,101,1,1,1,1,888'.split(','),
quantiles=True, delta=True, memory=True)
driver = BenchmarkDriver(tests=['b1'], args=Stub(output_dir=None))
driver.run_independent_samples = mock_run # patching
with captured_output() as (out, _):
log = driver.run_and_log()
csv_log = '3,b1,1,123,123,123,0,123,888\n'
header = '#,TEST,SAMPLES,MIN(μs),Q1(μs),MEDIAN(μs),Q3(μs),MAX(μs),' +\
'MAX_RSS(B)\n'
csv_log = '3,b1,5,101,102,103,104,105,888\n'
self.assertEquals(log, None)
self.assertEquals(
out.getvalue(),
'#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),' +
'MAX_RSS(B)\n' +
header +
csv_log +
'\n' +
'Total performance tests executed: 1\n')
@@ -309,13 +320,13 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
with captured_output() as (out, _):
log = driver.run_and_log(csv_console=False)
self.assertEquals(log, csv_log)
self.assertEquals(log, header + csv_log)
self.assertEquals(
out.getvalue(),
' # TEST SAMPLES MIN(μs) MAX(μs)' +
' MEAN(μs) SD(μs) MEDIAN(μs) MAX_RSS(B)\n' +
' 3 b1 1 123 123' +
' 123 0 123 888\n' +
' # TEST SAMPLES MIN(μs) Q1(μs)' +
' MEDIAN(μs) Q3(μs) MAX(μs) MAX_RSS(B)\n' +
' 3 b1 5 101 102' +
' 103 104 105 888\n' +
'\n' +
'Total performance tests executed: 1\n')
@@ -410,9 +421,9 @@ class TestLoggingReportFormatter(unittest.TestCase):
self.assertEquals(f.format(lr), 'INFO Hi!')
def _PTR(min=700, mem_pages=1000):
def _PTR(min=700, mem_pages=1000, setup=None):
"""Create PerformanceTestResult Stub."""
return Stub(min=min, mem_pages=mem_pages)
return Stub(min=min, mem_pages=mem_pages, setup=setup)
def _run(test, num_samples=None, num_iters=None, verbose=None,
@@ -537,13 +548,18 @@ class TestBenchmarkDoctor(unittest.TestCase):
self.logs['info'])
def test_benchmark_runtime_range(self):
"""Optimized benchmark should run in less then 2500 μs.
"""Optimized benchmark should run in less then 1000 μs.
With runtimes less than 2500 μs there is better than 1:4 chance of
being interrupted in the middle of measurement due to elapsed 10 ms
quantum used by macos scheduler.
Even on a calm machine, a benchmark with a runtime of 2500 μs has a 1:4
chance of being interrupted in the middle of a measurement due to the
elapsed 10 ms quantum used by the macOS scheduler. The Linux scheduler's
quantum is 6 ms. Having the driver yield the process before the 10 ms
quantum elapses helped a lot, but benchmarks with runtimes under 1 ms
usually exhibit a strong mode, which is best for accurate performance
characterization.
To minimize the number of involuntary context switches that corrupt our
measurements, we should strive to stay in the microbenchmark range.
Warn about longer runtime. Runtimes over half a second are an error.
Warn about longer runtimes. Runtimes over 10 ms are an error.
"""
def measurements(name, runtime):
return {'name': name,
@@ -553,7 +569,7 @@ class TestBenchmarkDoctor(unittest.TestCase):
with captured_output() as (out, _):
doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([]))
doctor.analyze(measurements('Cheetah', 200))
doctor.analyze(measurements('Hare', 2501))
doctor.analyze(measurements('Hare', 1001))
doctor.analyze(measurements('Tortoise', 500000))
doctor.analyze({'name': 'OverheadTurtle',
'OverheadTurtle O i1a': _PTR(min=800000),
@@ -562,17 +578,17 @@ class TestBenchmarkDoctor(unittest.TestCase):
self.assertIn('runtime: ', output)
self.assertNotIn('Cheetah', output)
self.assert_contains(["'Hare' execution took at least 2501 μs."],
self.assert_contains(["'Hare' execution took at least 1001 μs."],
self.logs['warning'])
self.assert_contains(
["Decrease the workload of 'Hare' by a factor of 2, "
"to be less than 2500 μs."], self.logs['info'])
["Decrease the workload of 'Hare' by a factor of 2 (10), "
"to be less than 1000 μs."], self.logs['info'])
self.assert_contains(
["'Tortoise' execution took at least 500000 μs."],
self.logs['error'])
self.assert_contains(
["Decrease the workload of 'Tortoise' by a factor of 256, "
"to be less than 2500 μs."], self.logs['info'])
["Decrease the workload of 'Tortoise' by a factor of 512 (1000), "
"to be less than 1000 μs."], self.logs['info'])
self.assert_contains(
["'OverheadTurtle' execution took at least 600000 μs"
" (excluding the setup overhead)."],
@@ -605,6 +621,29 @@ class TestBenchmarkDoctor(unittest.TestCase):
["Move initialization of benchmark data to the `setUpFunction` "
"registered in `BenchmarkInfo`."], self.logs['info'])
def test_benchmark_setup_takes_reasonable_time(self):
"""Setup < 200 ms (20% extra on top of the typical 1 s measurement)"""
with captured_output() as (out, _):
doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([]))
doctor.analyze({
'name': 'NormalSetup',
'NormalSetup O i1a': _PTR(setup=199999),
'NormalSetup O i2a': _PTR(setup=200001)})
doctor.analyze({
'name': 'LongSetup',
'LongSetup O i1a': _PTR(setup=200001),
'LongSetup O i2a': _PTR(setup=200002)})
output = out.getvalue()
self.assertIn('runtime: ', output)
self.assertNotIn('NormalSetup', output)
self.assert_contains(
["'LongSetup' setup took at least 200001 μs."],
self.logs['error'])
self.assert_contains(
["The `setUpFunction` should take no more than 200 ms."],
self.logs['info'])
def test_benchmark_has_constant_memory_use(self):
"""Benchmark's memory footprint must not vary with num-iters."""
with captured_output() as (out, _):


@@ -204,6 +204,110 @@ class TestPerformanceTestResult(unittest.TestCase):
r = PerformanceTestResult(log_line.split(','))
self.assertEquals(r.max_rss, 10510336)
def test_init_quantiles(self):
# #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
log = '1,Ackermann,3,54383,54512,54601'
r = PerformanceTestResult(log.split(','), quantiles=True)
self.assertEquals(r.test_num, '1')
self.assertEquals(r.name, 'Ackermann')
self.assertEquals((r.num_samples, r.min, r.median, r.max),
(3, 54383, 54512, 54601))
self.assertAlmostEquals(r.mean, 54498.67, places=2)
self.assertAlmostEquals(r.sd, 109.61, places=2)
self.assertEquals(r.samples.count, 3)
self.assertEquals(r.samples.num_samples, 3)
self.assertEquals([s.runtime for s in r.samples.all_samples],
[54383, 54512, 54601])
# #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)
log = '1,Ackermann,3,54529,54760,55807,266240'
r = PerformanceTestResult(log.split(','), quantiles=True, memory=True)
self.assertEquals((r.samples.count, r.max_rss), (3, 266240))
# #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs)
log = '1,Ackermann,5,54570,54593,54644,57212,58304'
r = PerformanceTestResult(log.split(','), quantiles=True, memory=False)
self.assertEquals((r.num_samples, r.min, r.median, r.max),
(5, 54570, 54644, 58304))
self.assertEquals((r.samples.q1, r.samples.q3), (54593, 57212))
self.assertEquals(r.samples.count, 5)
# #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
log = '1,Ackermann,5,54686,54731,54774,55030,63466,270336'
r = PerformanceTestResult(log.split(','), quantiles=True, memory=True)
self.assertEquals(r.samples.num_samples, 5)
self.assertEquals(r.samples.count, 4) # outlier was excluded
self.assertEquals(r.max_rss, 270336)
def test_init_delta_quantiles(self):
# #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX
# 2-quantile from 2 samples with a repeated min; when delta encoded,
# the difference is 0, which is omitted -- only the separator remains
log = '202,DropWhileArray,2,265,,22'
r = PerformanceTestResult(log.split(','), quantiles=True, delta=True)
self.assertEquals((r.num_samples, r.min, r.median, r.max),
(2, 265, 265, 287))
self.assertEquals(r.samples.count, 2)
self.assertEquals(r.samples.num_samples, 2)
def test_init_oversampled_quantiles(self):
"""When num_samples is < quantile + 1, some of the measurements are
repeated in the report summary. Samples should contain only true
values, discarding the repeated artifacts from quantile estimation.
The test string is slightly massaged output of the following R script:
subsample <- function(x, q) {
quantile(1:x, probs=((0:(q-1))/(q-1)), type=1)}
tbl <- function(s) t(sapply(1:s, function(x) {
qs <- subsample(x, s); c(qs[1], diff(qs)) }))
sapply(c(3, 5, 11, 21), tbl)
"""
def validatePTR(deq): # construct from delta encoded quantiles string
deq = deq.split(',')
num_samples = deq.count('1')
r = PerformanceTestResult(['0', 'B', str(num_samples)] + deq,
quantiles=True, delta=True)
self.assertEquals(r.samples.num_samples, num_samples)
self.assertEquals([s.runtime for s in r.samples.all_samples],
range(1, num_samples + 1))
delta_encoded_quantiles = """
1,,
1,,1
1,,,,
1,,,1,
1,,1,1,
1,,1,1,1
1,,,,,,,,,,
1,,,,,,1,,,,
1,,,,1,,,1,,,
1,,,1,,,1,,1,,
1,,,1,,1,,1,,1,
1,,1,,1,,1,1,,1,
1,,1,1,,1,1,,1,1,
1,,1,1,1,,1,1,1,1,
1,,1,1,1,1,1,1,1,1,
1,,1,1,1,1,1,1,1,1,1
1,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,1,,,,,,,,,
1,,,,,,,1,,,,,,,1,,,,,,
1,,,,,,1,,,,,1,,,,,1,,,,
1,,,,,1,,,,1,,,,1,,,,1,,,
1,,,,1,,,1,,,,1,,,1,,,1,,,
1,,,1,,,1,,,1,,,1,,,1,,,1,,
1,,,1,,,1,,1,,,1,,1,,,1,,1,,
1,,,1,,1,,1,,1,,,1,,1,,1,,1,,
1,,,1,,1,,1,,1,,1,,1,,1,,1,,1,
1,,1,,1,,1,,1,,1,1,,1,,1,,1,,1,
1,,1,,1,,1,1,,1,,1,1,,1,,1,1,,1,
1,,1,,1,1,,1,1,,1,1,,1,1,,1,1,,1,
1,,1,1,,1,1,,1,1,,1,1,1,,1,1,,1,1,
1,,1,1,,1,1,1,,1,1,1,,1,1,1,,1,1,1,
1,,1,1,1,,1,1,1,1,,1,1,1,1,,1,1,1,1,
1,,1,1,1,1,1,,1,1,1,1,1,1,,1,1,1,1,1,
1,,1,1,1,1,1,1,1,1,,1,1,1,1,1,1,1,1,1,
1,,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1"""
map(validatePTR, delta_encoded_quantiles.split('\n')[1:])
def test_repr(self):
log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884'
r = PerformanceTestResult(log_line.split(','))
@@ -214,29 +318,32 @@ class TestPerformanceTestResult(unittest.TestCase):
)
def test_merge(self):
tests = """1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336
tests = """
1,AngryPhonebook,1,12045,12045,12045,0,12045
1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336
1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144
1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split('\n')
1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split('\n')[1:]
results = map(PerformanceTestResult,
[line.split(',') for line in tests])
results[2].setup = 9
results[3].setup = 7
def as_tuple(r):
return (r.num_samples, r.min, r.max, round(r.mean, 2),
r.sd, r.median, r.max_rss)
r.sd, r.median, r.max_rss, r.setup)
r = results[0]
self.assertEquals(as_tuple(r),
(1, 12045, 12045, 12045, 0, 12045, 10510336))
(1, 12045, 12045, 12045, 0, 12045, None, None))
r.merge(results[1])
self.assertEquals(as_tuple(r), # drops SD and median
(2, 12045, 12325, 12185, 0, 0, 10510336))
self.assertEquals(as_tuple(r), # drops SD and median, +max_rss
(2, 12045, 12325, 12185, None, None, 10510336, None))
r.merge(results[2])
self.assertEquals(as_tuple(r), # picks smaller of the MAX_RSS
(3, 11616, 12325, 11995.33, 0, 0, 10502144))
self.assertEquals(as_tuple(r), # picks smaller of the MAX_RSS, +setup
(3, 11616, 12325, 11995.33, None, None, 10502144, 9))
r.merge(results[3])
self.assertEquals(as_tuple(r),
(4, 11616, 12325, 12064, 0, 0, 10498048))
self.assertEquals(as_tuple(r), # picks smaller of the setup values
(4, 11616, 12325, 12064, None, None, 10498048, 7))
class TestResultComparison(unittest.TestCase):
@@ -336,7 +443,7 @@ class OldAndNewLog(unittest.TestCase):
class TestLogParser(unittest.TestCase):
def test_parse_results_csv(self):
"""Ignores header row, empty lines and Totals row."""
"""Ignores uknown lines, extracts data from supported formats."""
log = """#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us)
34,BitCount,20,3,4,4,0,4
@@ -369,6 +476,41 @@ Total performance tests executed: 1
self.assertEquals(r.name, 'Array2D')
self.assertEquals(r.max_rss, 20915200)
def test_parse_quantiles(self):
"""Gathers samples from reported quantiles. Handles optional memory."""
r = LogParser.results_from_string(
"""#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
1,Ackermann,3,54383,54512,54601""")['Ackermann']
self.assertEquals([s.runtime for s in r.samples.all_samples],
[54383, 54512, 54601])
r = LogParser.results_from_string(
"""#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)
1,Ackermann,3,54529,54760,55807,266240""")['Ackermann']
self.assertEquals([s.runtime for s in r.samples.all_samples],
[54529, 54760, 55807])
self.assertEquals(r.max_rss, 266240)
def test_parse_delta_quantiles(self):
r = LogParser.results_from_string( # 2-quantile aka. median
'#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,1,101,,')['B']
self.assertEquals(
(r.num_samples, r.min, r.median, r.max, r.samples.count),
(1, 101, 101, 101, 1))
r = LogParser.results_from_string(
'#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,2,101,,1')['B']
self.assertEquals(
(r.num_samples, r.min, r.median, r.max, r.samples.count),
(2, 101, 101, 102, 2))
r = LogParser.results_from_string( # 20-quantiles aka. ventiles
'#,TEST,SAMPLES,MIN(μs),𝚫V1,𝚫V2,𝚫V3,𝚫V4,𝚫V5,𝚫V6,𝚫V7,𝚫V8,' +
'𝚫V9,𝚫VA,𝚫VB,𝚫VC,𝚫VD,𝚫VE,𝚫VF,𝚫VG,𝚫VH,𝚫VI,𝚫VJ,𝚫MAX\n' +
'202,DropWhileArray,200,214,,,,,,,,,,,,1,,,,,,2,16,464'
)['DropWhileArray']
self.assertEquals(
(r.num_samples, r.min, r.max, r.samples.count),
# last 3 ventiles were outliers and were excluded from the sample
(200, 214, 215, 18))
def test_parse_results_verbose(self):
"""Parse multiple performance test results with 2 sample formats:
single line for N = 1; two lines for N > 1.
@@ -383,8 +525,11 @@ Running AngryPhonebook for 3 samples.
Sample 2,11467
1,AngryPhonebook,3,11467,13898,12392,1315,11812
Running Array2D for 3 samples.
SetUp 14444
Sample 0,369900
Yielding after ~369918 μs
Sample 1,381039
Yielding after ~381039 μs
Sample 2,371043
3,Array2D,3,369900,381039,373994,6127,371043
@@ -400,15 +545,21 @@ Totals,2"""
self.assertEquals(r.num_samples, r.samples.num_samples)
self.assertEquals(results[0].samples.all_samples,
[(0, 78, 11812), (1, 90, 13898), (2, 90, 11467)])
self.assertEquals(r.yields, None)
r = results[1]
self.assertEquals(
(r.name, r.min, r.max, int(r.mean), int(r.sd), r.median),
('Array2D', 369900, 381039, 373994, 6127, 371043)
)
self.assertEquals(r.setup, 14444)
self.assertEquals(r.num_samples, r.samples.num_samples)
self.assertEquals(results[1].samples.all_samples,
[(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)])
yielded = r.yields[0]
self.assertEquals(yielded.before_sample, 1)
self.assertEquals(yielded.after, 369918)
self.assertEquals(r.yields, [(1, 369918), (2, 381039)])
def test_parse_environment_verbose(self):
"""Parse stats about environment in verbose mode."""


@@ -74,7 +74,7 @@ struct TestConfig {
let numIters: Int?
/// The number of samples we should take of each test.
let numSamples: Int
let numSamples: Int?
/// Quantiles to report in results.
let quantile: Int?
@@ -130,7 +130,9 @@ struct TestConfig {
// Configure the command line argument parser
let p = ArgumentParser(into: PartialTestConfig())
p.addArgument("--num-samples", \.numSamples,
help: "number of samples to take per benchmark; default: 1",
help: "number of samples to take per benchmark;\n" +
"default: 1 or auto-scaled to measure for\n" +
"`sample-time` if num-iters is also specified\n",
parser: { UInt($0) })
p.addArgument("--num-iters", \.numIters,
help: "number of iterations averaged in the sample;\n" +
@@ -174,7 +176,7 @@ struct TestConfig {
delim = c.delim ?? ","
sampleTime = c.sampleTime ?? 1.0
numIters = c.numIters.map { Int($0) }
numSamples = Int(c.numSamples ?? 1)
numSamples = c.numSamples.map { Int($0) }
quantile = c.quantile.map { Int($0) }
delta = c.delta ?? false
verbose = c.verbose ?? false
@@ -200,7 +202,7 @@ struct TestConfig {
let testList = tests.map({ $0.1.name }).joined(separator: ", ")
print("""
--- CONFIG ---
NumSamples: \(numSamples)
NumSamples: \(numSamples ?? 0)
Verbose: \(verbose)
LogMemory: \(logMemory)
SampleTime: \(sampleTime)
@@ -470,10 +472,9 @@ final class TestRunner {
logVerbose("Skipping unsupported benchmark \(test.name)!")
return nil
}
logVerbose("Running \(test.name) for \(c.numSamples) samples.")
logVerbose("Running \(test.name)")
var samples: [Int] = []
samples.reserveCapacity(c.numSamples)
func addSample(_ time: Int) {
logVerbose(" Sample \(samples.count),\(time)")
@@ -481,7 +482,12 @@ final class TestRunner {
}
resetMeasurements()
test.setUpFunction?()
if let setUp = test.setUpFunction {
setUp()
stopMeasurement()
logVerbose(" SetUp \(lastSampleTime.microseconds)")
resetMeasurements()
}
// Determine number of iterations for testFn to run for desired time.
func iterationsPerSampleTime() -> (numIters: Int, oneIter: Int) {
@@ -494,17 +500,26 @@ final class TestRunner {
}
}
let numIters = min( // Cap to prevent overflow on 32-bit systems when scaled
Int.max / 10_000, // by the inner loop multiplier inside the `testFn`.
c.numIters ?? {
// Determine the scale of measurements. Re-use the calibration result if
// it is just one measurement.
func calibrateMeasurements() -> Int {
let (numIters, oneIter) = iterationsPerSampleTime()
if numIters == 1 { addSample(oneIter) }
else { resetMeasurements() } // for accurate yielding reports
return numIters
}())
}
let numIters = min( // Cap to prevent overflow on 32-bit systems when scaled
Int.max / 10_000, // by the inner loop multiplier inside the `testFn`.
c.numIters ?? calibrateMeasurements())
let numSamples = c.numSamples ?? min(2000, // Cap the number of samples
c.numIters == nil ? 1 : calibrateMeasurements())
samples.reserveCapacity(numSamples)
logVerbose(" Collecting \(numSamples) samples.")
logVerbose(" Measuring with scale \(numIters).")
for _ in samples.count..<c.numSamples {
for _ in samples.count..<numSamples {
addSample(measure(test.name, fn: testFn, numIters: numIters))
}
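
The interplay between `--num-iters`, `--num-samples`, and auto-scaling can be summarized in a small model (a simplified Python restatement of the Swift control flow above, not the actual implementation; the overflow cap and the reuse of the calibration sample are omitted):

```python
def plan_measurement(num_samples=None, num_iters=None, calibrated=512):
    """Simplified model of how TestRunner picks scale and sample count.

    `calibrated` stands in for iterationsPerSampleTime(): the number of
    iterations that fit into one `sample-time` for this benchmark.
    """
    iters = num_iters if num_iters is not None else calibrated
    if num_samples is not None:
        samples = num_samples
    else:
        # With an explicit --num-iters, take as many samples as fit into
        # sample-time (capped at 2000); otherwise take a single sample.
        samples = min(2000, calibrated) if num_iters is not None else 1
    return iters, samples

assert plan_measurement() == (512, 1)             # default: 1 auto-scaled sample
assert plan_measurement(num_iters=1) == (1, 512)  # Benchmark_Driver's new mode
assert plan_measurement(num_samples=20) == (512, 20)
assert plan_measurement(num_samples=3, num_iters=2) == (2, 3)
```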


@@ -110,7 +110,7 @@ RUN: --check-prefix LOGHEADER \
RUN: --check-prefix LOGBENCH
LOGHEADER-LABEL: #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)
LOGBENCH: {{[0-9]+}},
NUMITERS1: AngryPhonebook,1
NUMITERS1: AngryPhonebook,{{[0-9]+}}
NUMITERS1-NOT: 0,0,0,0,0
LOGBENCH-SAME: ,{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}},{{[0-9]+}}
````
@@ -153,7 +153,8 @@ CONFIG: NumSamples: 2
CONFIG: Tests Filter: ["1", "Ackermann", "1", "AngryPhonebook"]
CONFIG: Tests to run: Ackermann, AngryPhonebook
LOGFORMAT: #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX,MAX_RSS(B)
LOGVERBOSE-LABEL: Running Ackermann for 2 samples.
LOGVERBOSE-LABEL: Running Ackermann
LOGVERBOSE: Collecting 2 samples.
LOGVERBOSE: Measuring with scale {{[0-9]+}}.
LOGVERBOSE: Sample 0,{{[0-9]+}}
LOGVERBOSE: Sample 1,{{[0-9]+}}
@@ -163,7 +164,8 @@ MEASUREENV: VCS {{[0-9]+}} - {{[0-9]+}} = {{[0-9]+}}
RUNJUSTONCE-LABEL: 1,Ackermann
RUNJUSTONCE-NOT: 1,Ackermann
LOGFORMAT: ,{{[0-9]+}},{{[0-9]+}},,{{[0-9]+}},{{[0-9]+}}
LOGVERBOSE-LABEL: Running AngryPhonebook for 2 samples.
LOGVERBOSE-LABEL: Running AngryPhonebook
LOGVERBOSE: Collecting 2 samples.
````
Verify the specified delimiter is used when logging to console. The non-verbose