Overhaul Benchmarking pipeline to use complete sample data, not summaries

The Swift benchmarking harness now has two distinct output formats:

* Default: formatted text intended for human consumption.
  Right now, this reports just the minimum value, but we can augment it over time.

* `--json`: each output line is a JSON-encoded object containing the raw data.
  This output is intended for Python scripts that aggregate or compare
  multiple independent test runs.

Previously, we tried to use the same output for both purposes.  This required
the Python scripts to do more complex parsing of textual layouts, and it also
meant that the scripts had only summary data to work with instead of the full
raw sample information.  This in turn made it almost impossible to derive
meaningful comparisons between runs or to aggregate multiple runs.

Typical output in the new JSON format looks like this:
```
{"number":89, "name":"PerfTest", "samples":[1.23, 2.35], "max_rss":16384}
{"number":91, "name":"OtherTest", "samples":[14.8, 19.7]}
```

This format is easy to parse in Python: just iterate over the
lines and decode each one separately.  Also note that
optional fields (such as `"max_rss"` above) are trivial to handle:
```
import json

for line in lines:
    j = json.loads(line)
    # Default to 0 if "max_rss" is not present
    max_rss = j.get("max_rss", 0)
```
Note the `"samples"` array includes the runtime for each individual run.
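
Because every line carries its own raw samples, aggregating multiple runs is just
a matter of pooling those arrays.  Here's a minimal sketch (not code from this
commit; `lines` is assumed to hold the JSON output of one or more runs):
```
import json
from collections import defaultdict

# Pool raw samples across runs, keyed by test name.
samples_by_test = defaultdict(list)
for line in lines:
    if not line.strip():
        continue  # ignore blank lines
    record = json.loads(line)
    samples_by_test[record["name"]].extend(record.get("samples", []))
```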

Because optional fields are so much easier to handle in this form, I reworked
the Python logic to translate old formats into this JSON format for more
uniformity.  Hopefully, we can simplify the code in a year or so by stripping
out the old log formats entirely, along with some of the redundant statistical
calculations.  In particular, the Python logic still makes an effort to preserve
mean, median, max, min, stdev, and other statistical data whenever the full set
of samples is not present.  Once we've gotten to a point where we're always
keeping full samples, we can compute any such information on the fly as needed,
eliminating the need to record it.
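
For example, once full samples are always present, a summary can be derived on
demand with the standard `statistics` module (a sketch, not code from this change):
```
import statistics

def summarize(samples):
    # Derive summary statistics on demand from the raw samples.
    return {
        "min": min(samples),
        "median": statistics.median(samples),
        "mean": statistics.mean(samples),
        # stdev is undefined for fewer than two samples
        "sd": statistics.stdev(samples) if len(samples) > 1 else 0.0,
    }

print(summarize([1.23, 2.35]))  # e.g. the PerfTest samples shown above
```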

This is a pretty big rearchitecture of the core benchmarking logic. In order to
try to keep things a bit more manageable, I have not taken this opportunity to
replace any of the actual statistics used in the higher level code or to change
how the actual samples are measured. (But I expect this rearchitecture will make
such changes simpler.) In particular, this should not actually change any
benchmark results.

For the future, please keep this general principle in mind: statistical
summaries (averages, medians, etc.) should, as a rule, be computed only for
immediate output and rarely if ever stored or used as input for further
processing. Instead, aim to store and transfer the raw data from which any
statistic can be recomputed as necessary.
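
To make that concrete (a hypothetical illustration, not part of this commit):
pooled raw samples give exact statistics, while combining per-run summaries
generally does not:
```
import statistics

run_a = [1.23, 2.35, 1.90]   # raw samples from one run
run_b = [1.10, 1.95]         # raw samples from another run

pooled = run_a + run_b       # aggregating raw data is just concatenation
print(statistics.median(pooled))   # 1.90, the true median of all samples
print(statistics.median(
    [statistics.median(run_a), statistics.median(run_b)]))  # 1.7125 -- not the same
```
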
Author: Tim Kientzle
Date:   2022-10-12 13:23:06 -07:00
Parent commit: 1a1afeb410
Commit: 971a5d8547
5 changed files with 846 additions and 1083 deletions


@@ -28,6 +28,7 @@ class `BenchmarkDoctor` analyzes performance tests, implements `check` COMMAND.
import argparse
import functools
import glob
import json
import logging
import math
import os
@@ -88,9 +89,10 @@ class BenchmarkDriver(object):
def test_harness(self):
"""Full path to test harness binary."""
suffix = self.args.optimization if hasattr(self.args, "optimization") else "O"
suffix += "-"
if hasattr(self.args, "architecture") and self.args.architecture:
suffix += "-" + self.args.architecture + "*"
pattern = os.path.join(self.args.tests, "Benchmark_" + suffix)
suffix += self.args.architecture
pattern = os.path.join(self.args.tests, "Benchmark_" + suffix + "*")
executables = []
if hasattr(self._subprocess, "test_mode") and self._subprocess.test_mode:
executables = [pattern]
@@ -134,22 +136,20 @@ class BenchmarkDriver(object):
@property
def _cmd_list_benchmarks(self):
# Use tab delimiter for easier parsing to override the default comma.
# (The third 'column' is always comma-separated list of tags in square
# brackets -- currently unused here.)
return [self.test_harness, "--list", "--delim=\t"] + (
return [self.test_harness, "--list", "--json"] + (
["--skip-tags="] if (self.args.benchmarks or self.args.filters) else []
)
def _get_tests(self):
"""Return a list of performance tests to run."""
number_name_pairs = [
line.split("\t")[:2]
for line in self._invoke(self._cmd_list_benchmarks).split("\n")[1:-1]
]
# unzip list of pairs into 2 lists
test_numbers, self.all_tests = map(list, zip(*number_name_pairs))
self.test_number = dict(zip(self.all_tests, test_numbers))
lines = self._invoke(self._cmd_list_benchmarks).split("\n")
json_tests = []
for l in lines:
if l.strip() != "":
json_tests.append(json.loads(l))
self.all_tests = [json["name"] for json in json_tests]
test_numbers = [json["number"] for json in json_tests]
self.test_number = dict([(json["name"], json["number"]) for json in json_tests])
if self.args.filters:
return self._tests_matching_patterns()
if self.args.benchmarks:
@@ -157,25 +157,18 @@ class BenchmarkDriver(object):
return self.all_tests
def _tests_matching_patterns(self):
regexes = [re.compile(pattern) for pattern in self.args.filters]
return sorted(
list(
set(
[
name
for pattern in regexes
for name in self.all_tests
if pattern.match(name)
]
)
)
)
regexes = map(re.compile, self.args.filters)
matches = set()
for pattern in regexes:
new_matches = filter(pattern.match, self.all_tests)
matches.update(new_matches)
return sorted(list(matches))
def _tests_by_name_or_number(self, test_numbers):
benchmarks = set(self.args.benchmarks)
number_to_name = dict(zip(test_numbers, self.all_tests))
tests_by_number = [
number_to_name[i] for i in benchmarks.intersection(set(test_numbers))
number_to_name[i] for i in benchmarks.intersection(test_numbers)
]
return sorted(
list(benchmarks.intersection(set(self.all_tests)).union(tests_by_number))
@@ -188,8 +181,7 @@ class BenchmarkDriver(object):
num_iters=None,
sample_time=None,
verbose=None,
measure_memory=False,
quantile=None,
measure_memory=False
):
"""Execute benchmark and gather results."""
num_samples = num_samples or 0
@@ -197,7 +189,7 @@ class BenchmarkDriver(object):
sample_time = sample_time or 0 # default is 1s
cmd = self._cmd_run(
test, num_samples, num_iters, sample_time, verbose, measure_memory, quantile
test, num_samples, num_iters, sample_time, verbose, measure_memory
)
output = self._invoke(cmd)
results = self.parser.results_from_string(output)
@@ -210,8 +202,7 @@ class BenchmarkDriver(object):
num_iters,
sample_time,
verbose,
measure_memory,
quantile,
measure_memory
):
cmd = [self.test_harness]
if test:
@@ -228,9 +219,7 @@ class BenchmarkDriver(object):
cmd.append("--verbose")
if measure_memory:
cmd.append("--memory")
if quantile:
cmd.append("--quantile={0}".format(quantile))
cmd.append("--delta")
cmd.append("--json")
return cmd
def run_independent_samples(self, test):
@@ -246,12 +235,12 @@ class BenchmarkDriver(object):
return functools.reduce(
merge_results,
[
self.run(test, measure_memory=True, num_iters=1, quantile=20)
self.run(test, measure_memory=True, num_iters=1)
for _ in range(self.args.independent_samples)
],
)
def log_results(self, output, log_file=None):
def log_results(self, results, log_file=None):
"""Log output to `log_file`.
Creates `args.output_dir` if it doesn't exist yet.
@@ -262,7 +251,8 @@ class BenchmarkDriver(object):
os.makedirs(dir)
print("Logging results to: %s" % log_file)
with open(log_file, "w") as f:
f.write(output)
for r in results:
print(r, file=f)
RESULT = "{:>3} {:<40} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}"
@@ -284,7 +274,7 @@ class BenchmarkDriver(object):
def console_log(values):
print(format(values))
def result_values(r):
def summary(r):
return list(
map(
str,
@@ -292,17 +282,17 @@ class BenchmarkDriver(object):
r.test_num,
r.name,
r.num_samples,
r.min,
r.samples.q1,
r.min_value,
r.q1,
r.median,
r.samples.q3,
r.max,
r.q3,
r.max_value,
r.max_rss,
],
)
)
header = [
summary_header = [
"#",
"TEST",
"SAMPLES",
@@ -313,25 +303,23 @@ class BenchmarkDriver(object):
"MAX(μs)",
"MAX_RSS(B)",
]
console_log(header)
results = [header]
console_log(summary_header)
results = []
for test in self.tests:
result = result_values(self.run_independent_samples(test))
console_log(result)
result = self.run_independent_samples(test)
console_log(summary(result))
results.append(result)
print("\nTotal performance tests executed: {0}".format(len(self.tests)))
return (
None if csv_console else ("\n".join([",".join(r) for r in results]) + "\n")
) # csv_log
return results
@staticmethod
def run_benchmarks(args):
"""Run benchmarks and log results."""
driver = BenchmarkDriver(args)
csv_log = driver.run_and_log(csv_console=(args.output_dir is None))
if csv_log:
driver.log_results(csv_log)
results = driver.run_and_log(csv_console=(args.output_dir is None))
if args.output_dir:
driver.log_results([r.json for r in results])
return 0
@@ -445,7 +433,6 @@ class BenchmarkDoctor(object):
Optional `driver` parameter for injecting dependency; used for testing.
"""
super(BenchmarkDoctor, self).__init__()
self.driver = driver or BenchmarkDriver(args)
self.results = {}
if hasattr(args, "markdown") and args.markdown:
@@ -458,6 +445,7 @@ class BenchmarkDoctor(object):
self.console_handler.setLevel(
logging.DEBUG if args.verbose else logging.INFO
)
self.driver = driver or BenchmarkDriver(args)
self.log.addHandler(self.console_handler)
self.log.debug("Checking tests: %s", ", ".join(self.driver.tests))
self.requirements = [
@@ -532,7 +520,7 @@ class BenchmarkDoctor(object):
correction = setup / i
i_series = BenchmarkDoctor._select(measurements, num_iters=i)
for result in i_series:
runtimes.append(result.samples.min - correction)
runtimes.append(result.min_value - correction)
runtime = min(runtimes)
threshold = 1000
@@ -584,7 +572,7 @@ class BenchmarkDoctor(object):
ti1, ti2 = [
float(min(mins))
for mins in [
[result.samples.min for result in i_series]
[result.min_value for result in i_series]
for i_series in [select(measurements, num_iters=i) for i in [1, 2]]
]
]
@@ -679,7 +667,7 @@ class BenchmarkDoctor(object):
r = self.driver.run(
benchmark, num_samples=3, num_iters=1, verbose=True
) # calibrate
num_samples = self._adjusted_1s_samples(r.samples.min)
num_samples = self._adjusted_1s_samples(r.min_value)
def capped(s):
return min(s, 200)
@@ -689,7 +677,7 @@ class BenchmarkDoctor(object):
opts = opts if isinstance(opts, list) else [opts]
self.log.debug(
"Runtime {0} μs yields {1} adjusted samples per second.".format(
r.samples.min, num_samples
r.min_value, num_samples
)
)
self.log.debug(


@@ -17,9 +17,7 @@ This script compares performance test logs and issues a formatted report.
Invoke `$ compare_perf_tests.py -h ` for complete list of options.
class `Sample` is single benchmark measurement.
class `PerformanceTestSamples` is collection of `Sample`s and their statistics.
class `PerformanceTestResult` is a summary of performance test execution.
class `PerformanceTestResult` collects information about a single test
class `LogParser` converts log files into `PerformanceTestResult`s.
class `ResultComparison` compares new and old `PerformanceTestResult`s.
class `TestComparator` analyzes changes between the old and new test results.
@@ -29,194 +27,10 @@ class `ReportFormatter` creates the test comparison report in specified format.
import argparse
import functools
import json
import re
import statistics
import sys
from bisect import bisect, bisect_left, bisect_right
from collections import namedtuple
from math import ceil, sqrt
class Sample(namedtuple("Sample", "i num_iters runtime")):
u"""Single benchmark measurement.
Initialized with:
`i`: ordinal number of the sample taken,
`num_iters`: number of iterations used to compute it,
`runtime`: in microseconds (μs).
"""
def __repr__(self):
"""Shorter Sample formatting for debugging purposes."""
return "s({0.i!r}, {0.num_iters!r}, {0.runtime!r})".format(self)
class Yield(namedtuple("Yield", "before_sample after")):
u"""Meta-measurement of when the Benchmark_X voluntarily yielded process.
`before_sample`: index of measurement taken just after returning from yield
`after`: time elapsed since the previous yield in microseconds (μs)
"""
class PerformanceTestSamples(object):
"""Collection of runtime samples from the benchmark execution.
Computes the sample population statistics.
"""
def __init__(self, name, samples=None):
"""Initialize with benchmark name and optional list of Samples."""
self.name = name # Name of the performance test
self.samples = []
self.outliers = []
self._runtimes = []
self.mean = 0.0
self.S_runtime = 0.0 # For computing running variance
for sample in samples or []:
self.add(sample)
def __str__(self):
"""Text summary of benchmark statistics."""
return (
"{0.name!s} n={0.count!r} "
"Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} "
"Max={0.max!r} "
"R={0.range!r} {0.spread:.2%} IQR={0.iqr!r} "
"Mean={0.mean:.0f} SD={0.sd:.0f} CV={0.cv:.2%}".format(self)
if self.samples
else "{0.name!s} n=0".format(self)
)
def add(self, sample):
"""Add sample to collection and recompute statistics."""
assert isinstance(sample, Sample)
self._update_stats(sample)
i = bisect(self._runtimes, sample.runtime)
self._runtimes.insert(i, sample.runtime)
self.samples.insert(i, sample)
def _update_stats(self, sample):
old_stats = (self.count, self.mean, self.S_runtime)
_, self.mean, self.S_runtime = self.running_mean_variance(
old_stats, sample.runtime
)
def exclude_outliers(self, top_only=False):
"""Exclude outliers by applying Interquartile Range Rule.
Moves the samples outside of the inner fences
(Q1 - 1.5*IQR and Q3 + 1.5*IQR) into outliers list and recomputes
statistics for the remaining sample population. Optionally apply
only the top inner fence, preserving the small outliers.
Experimentally, this rule seems to perform well-enough on the
benchmark runtimes in the microbenchmark range to filter out
the environment noise caused by preemptive multitasking.
"""
lo = (
0
if top_only
else bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr))
)
hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr))
outliers = self.samples[:lo] + self.samples[hi:]
samples = self.samples[lo:hi]
self.__init__(self.name) # re-initialize
for sample in samples: # and
self.add(sample) # re-compute stats
self.outliers = outliers
@property
def count(self):
"""Number of samples used to compute the statistics."""
return len(self.samples)
@property
def num_samples(self):
"""Number of all samples in the collection."""
return len(self.samples) + len(self.outliers)
@property
def all_samples(self):
"""List of all samples in ascending order."""
return sorted(self.samples + self.outliers, key=lambda s: s.i or -1)
@property
def min(self):
"""Minimum sampled value."""
return self.samples[0].runtime
@property
def max(self):
"""Maximum sampled value."""
return self.samples[-1].runtime
def quantile(self, q):
"""Return runtime for given quantile.
Equivalent to quantile estimate type R-1, SAS-3. See:
https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample
"""
index = max(0, int(ceil(self.count * float(q))) - 1)
return self.samples[index].runtime
@property
def median(self):
"""Median sampled value."""
return self.quantile(0.5)
@property
def q1(self):
"""First Quartile (25th Percentile)."""
return self.quantile(0.25)
@property
def q3(self):
"""Third Quartile (75th Percentile)."""
return self.quantile(0.75)
@property
def iqr(self):
"""Interquartile Range."""
return self.q3 - self.q1
@property
def sd(self):
u"""Standard Deviation (μs)."""
return 0 if self.count < 2 else sqrt(self.S_runtime / (self.count - 1))
@staticmethod
def running_mean_variance(stats, x):
"""Compute running variance, B. P. Welford's method.
See Knuth TAOCP vol 2, 3rd edition, page 232, or
https://www.johndcook.com/blog/standard_deviation/
M is mean, Standard Deviation is defined as sqrt(S/k-1)
"""
(k, M_, S_) = stats
k = float(k + 1)
M = M_ + (x - M_) / k
S = S_ + (x - M_) * (x - M)
return (k, M, S)
@property
def cv(self):
"""Coefficient of Variation (%)."""
return (self.sd / self.mean) if self.mean else 0
@property
def range(self):
"""Range of samples values (Max - Min)."""
return self.max - self.min
@property
def spread(self):
"""Sample Spread; i.e. Range as (%) of Min."""
return self.range / float(self.min) if self.min else 0
class PerformanceTestResult(object):
@@ -225,126 +39,395 @@ class PerformanceTestResult(object):
Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize
or Benchmark_Driver).
It supports 2 log formats emitted by the test driver. Legacy format with
statistics for normal distribution (MEAN, SD):
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B)
And new quantiles format with variable number of columns:
#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs)
#,TEST,SAMPLES,QMIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
The number of columns between MIN and MAX depends on the test driver's
`--quantile` parameter. In both cases, the last column, MAX_RSS, is optional.
It supports log formats emitted by the test driver.
"""
def __init__(self, csv_row, quantiles=False, memory=False, delta=False, meta=False):
"""Initialize from a row of multiple columns with benchmark summary.
The row is an iterable, such as a row provided by the CSV parser.
@classmethod
def fromOldFormat(cls, header, line):
"""Original format with statistics for normal distribution (MEAN, SD):
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B),PAGES,ICS,YIELD
Note that MAX_RSS, PAGES, ICS, YIELD are all optional
"""
self.test_num = csv_row[0] # Ordinal number of the test
self.name = csv_row[1] # Name of the performance test
self.num_samples = int(csv_row[2]) # Number of measurements taken
csv_row = line.split(",") if "," in line else line.split()
labels = header.split(",") if "," in header else header.split()
mem_index = (-1 if memory else 0) + (-3 if meta else 0)
if quantiles: # Variable number of columns representing quantiles
runtimes = csv_row[3:mem_index] if memory or meta else csv_row[3:]
last_runtime_index = mem_index - 1
if delta:
runtimes = [int(x) if x else 0 for x in runtimes]
runtimes = functools.reduce(
lambda l, x: l.append(l[-1] + x) or l if l else [x], # running
runtimes,
None,
) # total
num_values = len(runtimes)
if self.num_samples < num_values: # remove repeated samples
quantile = num_values - 1
qs = [float(i) / float(quantile) for i in range(0, num_values)]
indices = [
max(0, int(ceil(self.num_samples * float(q))) - 1) for q in qs
]
runtimes = [
runtimes[indices.index(i)] for i in range(0, self.num_samples)
]
# Synthesize a JSON form with the basic values:
num_samples = int(csv_row[2])
json_data = {
"number": int(csv_row[0]),
"name": csv_row[1],
"num_samples": num_samples,
}
self.samples = PerformanceTestSamples(
self.name, [Sample(None, None, int(runtime)) for runtime in runtimes]
)
self.samples.exclude_outliers(top_only=True)
sams = self.samples
self.min, self.max, self.median, self.mean, self.sd = (
sams.min,
sams.max,
sams.median,
sams.mean,
sams.sd,
)
else: # Legacy format with statistics for normal distribution.
self.min = int(csv_row[3]) # Minimum runtime (μs)
self.max = int(csv_row[4]) # Maximum runtime (μs)
self.mean = float(csv_row[5]) # Mean (average) runtime (μs)
self.sd = float(csv_row[6]) # Standard Deviation (μs)
self.median = int(csv_row[7]) # Median runtime (μs)
last_runtime_index = 7
self.samples = None
# Map remaining columns according to label
field_map = [
("ICS", "ics"),
("MAX_RSS", "max_rss"), # Must precede "MAX"
("MAX", "max"),
("MEAN", "mean"),
("MEDIAN", "median"),
("MIN", "min"),
("PAGES", "pages"),
("SD", "sd"),
("YIELD", "yield")
]
for label, value in zip(labels, csv_row):
for match, json_key in field_map:
if match in label:
json_data[json_key] = float(value)
break
self.max_rss = ( # Maximum Resident Set Size (B)
int(csv_row[mem_index]) if (
memory and len(csv_row) > (last_runtime_index + 1)
) else None
)
# Heroic: Reconstruct samples if we have enough info
# This is generally a bad idea, but sadly necessary for the
# old format that doesn't provide raw sample data.
if num_samples == 1 and "min" in json_data:
json_data["samples"] = [
json_data["min"]
]
elif num_samples == 2 and "min" in json_data and "max" in json_data:
json_data["samples"] = [
json_data["min"],
json_data["max"]
]
elif (num_samples == 3
and "min" in json_data
and "max" in json_data
and "median" in json_data):
json_data["samples"] = [
json_data["min"],
json_data["median"],
json_data["max"]
]
# Optional measurement metadata. The number of:
# memory pages used, involuntary context switches and voluntary yields
self.mem_pages, self.involuntary_cs, self.yield_count = (
[int(x) for x in csv_row[-3:]] if meta else (None, None, None)
)
self.yields = None
self.setup = None
return PerformanceTestResult(json_data)
@classmethod
def fromQuantileFormat(cls, header, line):
"""Quantiles format with variable number of columns depending on the
number of quantiles:
#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs)
#,TEST,SAMPLES,QMIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
The number of columns between QMIN and MAX depends on the test driver's
`--quantile` parameter. In both cases, the last column, MAX_RSS, is optional.
Delta encoding: If a header name includes 𝚫, that column stores the
difference from the previous column. E.g., a header
"#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),𝚫MAX(μs)" indicates the final "MAX"
column must be computed by adding the value in that column to the value
of the previous "MEDIAN" column.
"""
csv_row = line.split(",") if "," in line else line.split()
labels = header.split(",")
for i in range(1, len(labels)):
if "𝚫" in labels[i] or "Δ" in labels[i]:
prev = int(csv_row[i - 1])
inc = int(csv_row[i]) if csv_row[i] != '' else 0
csv_row[i] = str(prev + inc)
# Synthesize a JSON form and then initialize from that
json_data = {
"number": int(csv_row[0]),
"name": csv_row[1],
"num_samples": int(csv_row[2]),
}
# Process optional trailing fields MAX_RSS, PAGES, ICS, YIELD
i = len(labels) - 1
while True:
if "MAX_RSS" in labels[i]:
json_data["max_rss"] = float(csv_row[i])
elif "PAGES" in labels[i]:
json_data["pages"] = float(csv_row[i])
elif "ICS" in labels[i]:
json_data["ics"] = float(csv_row[i])
elif "YIELD" in labels[i]:
json_data["yield"] = float(csv_row[i])
else:
break
i -= 1
if i < 0:
break
# Rest is the quantiles (includes min/max columns)
quantiles = [float(q) for q in csv_row[3:i + 1]]
# Heroic effort:
# If we have enough quantiles, we can reconstruct the samples
# This is generally a bad idea, but sadly necessary since
# the quantile format doesn't provide raw sample data.
if json_data["num_samples"] == len(quantiles):
json_data["samples"] = sorted(quantiles)
elif json_data["num_samples"] == 2:
json_data["samples"] = [quantiles[0], quantiles[-1]]
elif json_data["num_samples"] == 1:
json_data["samples"] = [quantiles[0]]
else:
json_data["quantiles"] = quantiles
if len(quantiles) > 0:
json_data["min"] = quantiles[0]
json_data["max"] = quantiles[-1]
json_data["median"] = quantiles[(len(quantiles) - 1) // 2]
return PerformanceTestResult(json_data)
@classmethod
def fromJSONFormat(cls, line):
"""JSON format stores a test result as a JSON object on a single line
Compared to the legacy tab-separated/comma-separated formats, this makes
it much easier to add new fields, handle optional fields, and allows us
to include the full set of samples so we can use better statistics
downstream.
The code here includes optional support for min, max,
median, mean, etc. supported by the older formats, though in practice,
you shouldn't rely on those: Just store the full samples and then
compute whatever statistics you need as required.
"""
json_data = json.loads(line)
return PerformanceTestResult(json_data)
def __init__(self, json_data):
if isinstance(json_data, str):
json_data = json.loads(json_data)
# We always have these
assert (json_data.get("number") is not None)
assert (json_data.get("name") is not None)
self.test_num = json_data["number"]
self.name = json_data["name"]
# We always have either samples or num_samples
assert (json_data.get("num_samples") is not None
or json_data.get("samples") is not None)
self.num_samples = json_data.get("num_samples") or len(json_data["samples"])
self.samples = json_data.get("samples") or []
# Everything else is optional and can be read
# out of the JSON data if needed
# See max_rss() below for an example of this.
self.json_data = dict(json_data)
def __repr__(self):
"""Short summary for debugging purposes."""
return (
"<PerformanceTestResult name:{0.name!r} "
"samples:{0.num_samples!r} min:{0.min!r} max:{0.max!r} "
"mean:{0.mean:.0f} sd:{0.sd:.0f} median:{0.median!r}>".format(self)
)
return "PerformanceTestResult(" + json.dumps(self.json_data) + ")"
def merge(self, r):
def json(self):
"""Return a single-line JSON form of this result
This can be parsed back via fromJSONFormat above.
It can also represent all data stored by the older
formats, so there's no reason to not use it everywhere.
"""
data = dict(self.json_data)
# In case these got modified
data["number"] = self.test_num
data["name"] = self.name
# If we have full sample data, use that and
# drop any lingering pre-computed statistics
# (It's better for downstream consumers to just
# compute whatever statistics they need from scratch.)
if len(self.samples) == self.num_samples:
data["samples"] = self.samples
data.pop("num_samples", None)
data.pop("min", None)
data.pop("max", None)
data.pop("mean", None)
data.pop("sd", None)
data.pop("q1", None)
data.pop("median", None)
data.pop("q3", None)
data.pop("quantiles", None)
else:
# Preserve other pre-existing JSON statistics
data["num_samples"] = self.num_samples
return json.dumps(data)
def __str__(self):
return self.json()
@property
def setup(self):
"""TODO: Implement this
"""
return 0
@property
def max_rss(self):
"""Return max_rss if available
"""
return self.json_data.get("max_rss")
@property
def mem_pages(self):
"""Return pages if available
"""
return self.json_data.get("pages")
@property
def involuntary_cs(self):
"""Return involuntary context switches if available
"""
return self.json_data.get("ics")
@property
def yield_count(self):
"""Return voluntary yield count if available
"""
return self.json_data.get("yield")
@property
def min_value(self):
"""Return the minimum value from all samples
If we have full samples, compute it directly.
In the legacy case, we might not have full samples,
so in that case we'll return a value that was given
to us initially (if any).
Eventually (after December 2023), this can be simplified
to just `return min(self.samples)`, since by then
the legacy forms should no longer be in use.
"""
if self.num_samples == len(self.samples):
return min(self.samples)
return self.json_data.get("min")
@property
def max_value(self):
"""Return the maximum sample value
See min_value comments for details on the legacy behavior."""
if self.num_samples == len(self.samples):
return max(self.samples)
return self.json_data.get("max")
@property
def median(self):
"""Return the median sample value
See min_value comments for details on the legacy behavior."""
if self.num_samples == len(self.samples):
return statistics.median(self.samples)
return self.json_data.get("median")
# TODO: Eliminate q1 and q3. They're kept for now
# to preserve compatibility with older reports. But quantiles
# aren't really useful statistics, so just drop them.
@property
def q1(self):
"""Return the 25% quantile
See min_value comments for details on the legacy behavior."""
if self.num_samples == len(self.samples):
q = statistics.quantiles(self.samples, n=4)
return q[0]
return self.json_data.get("q1")
@property
def q3(self):
"""Return the 75% quantile
See min_value comments for details on the legacy behavior."""
if self.num_samples == len(self.samples):
q = statistics.quantiles(self.samples, n=4)
return q[2]
return self.json_data.get("q3")
@property
def mean(self):
"""Return the average
TODO: delete this; it's not useful"""
if self.num_samples == len(self.samples):
return statistics.mean(self.samples)
return self.json_data.get("mean")
@property
def sd(self):
"""Return the standard deviation
TODO: delete this; it's not useful"""
if self.num_samples == len(self.samples):
if len(self.samples) > 1:
return statistics.stdev(self.samples)
else:
return 0
return self.json_data.get("sd")
def merge(self, other):
"""Merge two results.
Recomputes min, max and mean statistics. If all `samples` are
available, it recomputes all the statistics.
The use case here is comparing test results parsed from concatenated
log files from multiple runs of benchmark driver.
This is trivial in the non-legacy case: We just
pool all the samples.
In the legacy case (or the mixed legacy/non-legacy cases),
we try to estimate the min/max/mean/sd/median/etc based
on whatever information is available. After Dec 2023,
we should be able to drop the legacy support.
"""
# Statistics
if self.samples and r.samples:
for sample in r.samples.samples:
self.samples.add(sample)
sams = self.samples
self.num_samples = sams.num_samples
self.min, self.max, self.median, self.mean, self.sd = (
sams.min,
sams.max,
sams.median,
sams.mean,
sams.sd,
)
else:
self.min = min(self.min, r.min)
self.max = max(self.max, r.max)
self.mean = ( # pooled mean is the weighted sum of means
(self.mean * self.num_samples) + (r.mean * r.num_samples)
) / float(self.num_samples + r.num_samples)
self.num_samples += r.num_samples
self.median, self.sd = None, None
# The following can be removed after Dec 2023
# (by which time the legacy support should no longer
# be necessary)
if self.num_samples != len(self.samples):
# If we don't have samples, we can't rely on being
# able to compute real statistics from those samples,
# so we make a best-effort attempt to estimate a joined
# statistic from whatever data we actually have.
# If both exist, take the minimum, else take whichever is set
other_min_value = other.min_value
if other_min_value is not None:
self_min_value = self.min_value
if self_min_value is not None:
self.json_data["min"] = min(other_min_value, self_min_value)
else:
self.json_data["min"] = other_min_value
# If both exist, take the maximum, else take whichever is set
other_max_value = other.max_value
if other_max_value is not None:
self_max_value = self.max_value
if self_max_value is not None:
self.json_data["max"] = max(other_max_value, self_max_value)
else:
self.json_data["max"] = other_max_value
# If both exist, take the weighted average, else take whichever is set
other_mean = other.mean
if other_mean is not None:
self_mean = self.mean
if self_mean is not None:
self.json_data["mean"] = (
(other_mean * other.num_samples
+ self_mean * self.num_samples)
/ (self.num_samples + other.num_samples)
)
else:
self.json_data["mean"] = other_mean
self.json_data.pop("median", None) # Remove median
self.json_data.pop("sd", None) # Remove stdev
self.json_data.pop("q1", None) # Remove 25% quantile
self.json_data.pop("q3", None) # Remove 75% quantile
self.json_data.pop("quantiles", None) # Remove quantiles
# Accumulate samples (if present) and num_samples (always)
self.samples += other.samples
self.num_samples += other.num_samples
# Metadata
def minimum(a, b): # work around None being less than everything
return min(filter(lambda x: x is not None, [a, b])) if any([a, b]) else None
self.max_rss = minimum(self.max_rss, r.max_rss)
self.setup = minimum(self.setup, r.setup)
# Use the smaller if both have a max_rss value
self.json_data["max_rss"] = other.max_rss
other_max_rss = other.max_rss
if other_max_rss is not None:
self_max_rss = self.max_rss
if self_max_rss is not None:
self.json_data["max_rss"] = min(self_max_rss, other_max_rss)
else:
self.json_data["max_rss"] = other_max_rss
class ResultComparison(object):
@@ -361,16 +444,22 @@ class ResultComparison(object):
self.name = old.name # Test name, convenience accessor
# Speedup ratio
self.ratio = (old.min + 0.001) / (new.min + 0.001)
self.ratio = (old.min_value + 0.001) / (new.min_value + 0.001)
# Test runtime improvement in %
ratio = (new.min + 0.001) / (old.min + 0.001)
ratio = (new.min_value + 0.001) / (old.min_value + 0.001)
self.delta = (ratio - 1) * 100
# Indication of dubious changes: when result's MIN falls inside the
# (MIN, MAX) interval of result they are being compared with.
self.is_dubious = (old.min < new.min and new.min < old.max) or (
new.min < old.min and old.min < new.max
self.is_dubious = (
(
old.min_value < new.min_value
and new.min_value < old.max_value
) or (
new.min_value < old.min_value
and old.min_value < new.max_value
)
)
@@ -385,117 +474,49 @@ class LogParser(object):
def __init__(self):
"""Create instance of `LogParser`."""
self.results = []
self.quantiles, self.delta, self.memory = False, False, False
self.meta = False
self._reset()
def _reset(self):
"""Reset parser to the default state for reading a new result."""
self.samples, self.yields, self.num_iters = [], [], 1
self.setup, self.max_rss, self.mem_pages = None, None, None
self.voluntary_cs, self.involuntary_cs = None, None
# Parse lines like this
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)
results_re = re.compile(
r"( *\d+[, \t]+[\w.\-\?!]+[, \t]+"
+ r"[, \t]+".join([r"\d+"] * 2) # #,TEST
+ r"(?:[, \t]+\d*)*)" # at least 2...
) # ...or more numeric columns
def _append_result(self, result):
columns = result.split(",") if "," in result else result.split()
r = PerformanceTestResult(
columns,
quantiles=self.quantiles,
memory=self.memory,
delta=self.delta,
meta=self.meta,
)
r.setup = self.setup
r.max_rss = r.max_rss or self.max_rss
r.mem_pages = r.mem_pages or self.mem_pages
r.voluntary_cs = self.voluntary_cs
r.involuntary_cs = r.involuntary_cs or self.involuntary_cs
if self.samples:
r.samples = PerformanceTestSamples(r.name, self.samples)
r.samples.exclude_outliers()
self.results.append(r)
r.yields = self.yields or None
self._reset()
def _store_memory_stats(self, max_rss, mem_pages):
self.max_rss = int(max_rss)
self.mem_pages = int(mem_pages)
def _configure_format(self, header):
self.quantiles = "QMIN" in header
self.memory = "MAX_RSS" in header
self.meta = "PAGES" in header
self.delta = "𝚫" in header
# Regular expression and action to take when it matches the parsed line
state_actions = {
results_re: _append_result,
# Verbose mode adds new productions:
# Adaptively determined N; test loop multiple adjusting runtime to ~1s
re.compile(r"\s+Measuring with scale (\d+)."): (
lambda self, num_iters: setattr(self, "num_iters", num_iters)
),
re.compile(r"\s+Sample (\d+),(\d+)"): (
lambda self, i, runtime: self.samples.append(
Sample(int(i), int(self.num_iters), int(runtime))
)
),
re.compile(r"\s+SetUp (\d+)"): (
lambda self, setup: setattr(self, "setup", int(setup))
),
re.compile(r"\s+Yielding after ~(\d+) μs"): (
lambda self, since_last_yield: self.yields.append(
Yield(len(self.samples), int(since_last_yield))
)
),
re.compile(r"( *#[, \t]+TEST[, \t]+SAMPLES[, \t].*)"): _configure_format,
# Environmental statistics: memory usage and context switches
re.compile(
r"\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)"
): _store_memory_stats,
re.compile(r"\s+VCS \d+ - \d+ = (\d+)"): (
lambda self, vcs: setattr(self, "voluntary_cs", int(vcs))
),
re.compile(r"\s+ICS \d+ - \d+ = (\d+)"): (
lambda self, ics: setattr(self, "involuntary_cs", int(ics))
),
}
def parse_results(self, lines):
"""Parse results from the lines of the log output from Benchmark*.
Returns a list of `PerformanceTestResult`s.
"""
match_json = re.compile(r"\s*({.*)")
match_header = re.compile(r"( *#[, \t]+TEST[, \t]+SAMPLES[, \t].*)")
match_legacy = re.compile(r" *(\d+[, \t].*)")
header = ""
for line in lines:
for regexp, action in LogParser.state_actions.items():
match = regexp.match(line)
if match:
action(self, *match.groups())
break # stop after 1st match
else: # If none matches, skip the line.
# print('skipping: ' + line.rstrip('\n'))
# Current format has a JSON-encoded object on each line
# That format is flexible so should be the only format
# used going forward
if match_json.match(line):
r = PerformanceTestResult.fromJSONFormat(line)
self.results.append(r)
elif match_header.match(line):
# Legacy formats use a header line (which can be
# inspected to determine the presence and order of columns)
header = line
elif match_legacy.match(line):
# Legacy format: lines of space- or tab-separated values
if "QMIN" in header:
r = PerformanceTestResult.fromQuantileFormat(header, line)
else:
r = PerformanceTestResult.fromOldFormat(header, line)
self.results.append(r)
else:
# Ignore unrecognized lines
# print('Skipping: ' + line.rstrip('\n'), file=sys.stderr, flush=True)
continue
return self.results
@staticmethod
def _results_from_lines(lines):
tests = LogParser().parse_results(lines)
def add_or_merge(names, r):
names = dict()
for r in LogParser().parse_results(lines):
if r.name not in names:
names[r.name] = r
else:
names[r.name].merge(r)
return names
return functools.reduce(add_or_merge, tests, dict())
return names
@staticmethod
def results_from_string(log_contents):
@@ -615,18 +636,18 @@ class ReportFormatter(object):
return (
(
result.name,
str(result.min),
str(result.max),
str(int(result.mean)),
str(result.max_rss) if result.max_rss else "",
str(result.min_value) if result.min_value is not None else "-",
str(result.max_value) if result.max_value is not None else "-",
str(result.mean) if result.mean is not None else "-",
str(result.max_rss) if result.max_rss is not None else "",
)
if isinstance(result, PerformanceTestResult)
else
# isinstance(result, ResultComparison)
(
result.name,
str(result.old.min),
str(result.new.min),
str(result.old.min_value) if result.old.min_value else "-",
str(result.new.min_value) if result.new.min_value else "-",
"{0:+.1f}%".format(result.delta),
"{0:.2f}x{1}".format(result.ratio, " (?)" if result.is_dubious else ""),
)


@@ -222,7 +222,7 @@ class TestBenchmarkDriverInitialization(unittest.TestCase):
def test_gets_list_of_precommit_benchmarks(self):
self.subprocess_mock.expect(
"/benchmarks/Benchmark_O --list --delim=\t".split(" "),
"/benchmarks/Benchmark_O --list".split(" "),
"#\tTest\t[Tags]\n1\tBenchmark1\t[t1, t2]\n2\tBenchmark2\t[t3]\n",
)
driver = BenchmarkDriver(self.args, _subprocess=self.subprocess_mock)
@@ -233,7 +233,7 @@ class TestBenchmarkDriverInitialization(unittest.TestCase):
self.assertEqual(driver.test_number["Benchmark2"], "2")
list_all_tests = (
"/benchmarks/Benchmark_O --list --delim=\t --skip-tags=".split(" "),
"/benchmarks/Benchmark_O --list --skip-tags=".split(" "),
"""# Test [Tags]
1 Benchmark1 [t1, t2]
2 Benchmark2 [t3]
@@ -310,7 +310,7 @@ class LogParserStub(object):
@staticmethod
def results_from_string(log_contents):
LogParserStub.results_from_string_called = True
r = PerformanceTestResult("3,b1,1,123,123,123,0,123".split(","))
r = PerformanceTestResult("""{"number":3,"name":"b1","samples":[123]}""")
return {"b1": r}
@@ -320,7 +320,7 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
self.parser_stub = LogParserStub()
self.subprocess_mock = SubprocessMock()
self.subprocess_mock.expect(
"/benchmarks/Benchmark_O --list --delim=\t".split(" "),
"/benchmarks/Benchmark_O --list".split(" "),
"#\tTest\t[Tags]\n1\tb1\t[tag]\n",
)
self.driver = BenchmarkDriver(
@@ -382,13 +382,6 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
("/benchmarks/Benchmark_O", "b", "--memory")
)
def test_report_quantiles(self):
"""Use delta compression for quantile reports."""
self.driver.run("b", quantile=4)
self.subprocess_mock.assert_called_with(
("/benchmarks/Benchmark_O", "b", "--quantile=4", "--delta")
)
def test_run_benchmark_independent_samples(self):
"""Extract up to 20 measurements from an independent run."""
self.driver.args.independent_samples = 3
@@ -400,8 +393,6 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
"b1",
"--num-iters=1",
"--memory",
"--quantile=20",
"--delta",
)
),
3,
@@ -412,38 +403,36 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
def mock_run(test):
self.assertEqual(test, "b1")
return PerformanceTestResult(
"3,b1,5,101,1,1,1,1,888".split(","),
quantiles=True,
delta=True,
memory=True,
"""{"number":3,"""
+ """"name":"b1","""
+ """"samples":[101,102,103,104,105],"""
+ """"max_rss":888}"""
)
driver = BenchmarkDriver(tests=["b1"], args=Stub(output_dir=None))
driver.run_independent_samples = mock_run # patching
with captured_output() as (out, _):
log = driver.run_and_log()
driver.run_and_log()
header = (
"#,TEST,SAMPLES,MIN(μs),Q1(μs),MEDIAN(μs),Q3(μs),MAX(μs)," + "MAX_RSS(B)\n"
)
csv_log = "3,b1,5,101,102,103,104,105,888\n"
self.assertEqual(log, None)
csv_log = "3,b1,5,101,101.5,103,104.5,105,888\n"
self.assertEqual(
out.getvalue(),
header + csv_log + "\n" + "Total performance tests executed: 1\n",
)
with captured_output() as (out, _):
log = driver.run_and_log(csv_console=False)
driver.run_and_log(csv_console=False)
self.assertEqual(log, header + csv_log)
self.assertEqual(
out.getvalue(),
" # TEST SAMPLES MIN(μs)"
+ " Q1(μs) MEDIAN(μs) Q3(μs) MAX(μs) MAX_RSS(B)\n"
+ " 3 b1 5 101"
+ " 102 103 104 105 888\n"
+ " 101.5 103 104.5 105 888\n"
+ "\n"
+ "Total performance tests executed: 1\n",
)
@@ -459,7 +448,7 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
openmode = "r" # 'U' mode is deprecated in Python 3
with open(log_file, openmode) as f:
text = f.read()
self.assertEqual(text, "formatted output")
self.assertEqual(text, "formatted output\n")
try:
import tempfile # setUp
@@ -469,7 +458,7 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
driver = BenchmarkDriver(Stub(), tests=[""])
self.assertFalse(os.path.exists(log_dir))
content = "formatted output"
content = ["formatted output"]
log_file = os.path.join(log_dir, "1.log")
with captured_output() as (out, _):
driver.log_results(content, log_file=log_file)


@@ -13,6 +13,7 @@
#
# ===---------------------------------------------------------------------===//
import json
import os
import shutil
import sys
@@ -21,10 +22,8 @@ import unittest
from compare_perf_tests import LogParser
from compare_perf_tests import PerformanceTestResult
from compare_perf_tests import PerformanceTestSamples
from compare_perf_tests import ReportFormatter
from compare_perf_tests import ResultComparison
from compare_perf_tests import Sample
from compare_perf_tests import TestComparator
from compare_perf_tests import main
from compare_perf_tests import parse_args
@@ -32,227 +31,70 @@ from compare_perf_tests import parse_args
from test_utils import captured_output
class TestSample(unittest.TestCase):
def test_has_named_fields(self):
s = Sample(1, 2, 3)
self.assertEqual(s.i, 1)
self.assertEqual(s.num_iters, 2)
self.assertEqual(s.runtime, 3)
def test_is_iterable(self):
s = Sample(1, 2, 3)
self.assertEqual(s[0], 1)
self.assertEqual(s[1], 2)
self.assertEqual(s[2], 3)
class TestPerformanceTestSamples(unittest.TestCase):
def setUp(self):
self.samples = PerformanceTestSamples("B1")
self.samples.add(Sample(7, 42, 1000))
def test_has_name(self):
self.assertEqual(self.samples.name, "B1")
def test_stores_samples(self):
self.assertEqual(self.samples.count, 1)
s = self.samples.samples[0]
self.assertTrue(isinstance(s, Sample))
self.assertEqual(s.i, 7)
self.assertEqual(s.num_iters, 42)
self.assertEqual(s.runtime, 1000)
def test_quantile(self):
self.assertEqual(self.samples.quantile(1), 1000)
self.assertEqual(self.samples.quantile(0), 1000)
self.samples.add(Sample(2, 1, 1100))
self.assertEqual(self.samples.quantile(0), 1000)
self.assertEqual(self.samples.quantile(1), 1100)
self.samples.add(Sample(3, 1, 1050))
self.assertEqual(self.samples.quantile(0), 1000)
self.assertEqual(self.samples.quantile(0.5), 1050)
self.assertEqual(self.samples.quantile(1), 1100)
def assertEqualFiveNumberSummary(self, ss, expected_fns):
e_min, e_q1, e_median, e_q3, e_max = expected_fns
self.assertEqual(ss.min, e_min)
self.assertEqual(ss.q1, e_q1)
self.assertEqual(ss.median, e_median)
self.assertEqual(ss.q3, e_q3)
self.assertEqual(ss.max, e_max)
def test_computes_five_number_summary(self):
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1000, 1000, 1000))
self.samples.add(Sample(2, 1, 1100))
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1000, 1100, 1100))
self.samples.add(Sample(3, 1, 1050))
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1050, 1100, 1100))
self.samples.add(Sample(4, 1, 1025))
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1025, 1050, 1100))
self.samples.add(Sample(5, 1, 1075))
self.assertEqualFiveNumberSummary(self.samples, (1000, 1025, 1050, 1075, 1100))
def test_computes_inter_quartile_range(self):
self.assertEqual(self.samples.iqr, 0)
self.samples.add(Sample(2, 1, 1025))
self.samples.add(Sample(3, 1, 1050))
self.samples.add(Sample(4, 1, 1075))
self.samples.add(Sample(5, 1, 1100))
self.assertEqual(self.samples.iqr, 50)
def assertEqualStats(self, stats, expected_stats):
for actual, expected in zip(stats, expected_stats):
self.assertAlmostEqual(actual, expected, places=2)
def test_computes_mean_sd_cv(self):
ss = self.samples
self.assertEqualStats((ss.mean, ss.sd, ss.cv), (1000.0, 0.0, 0.0))
self.samples.add(Sample(2, 1, 1100))
self.assertEqualStats((ss.mean, ss.sd, ss.cv), (1050.0, 70.71, 6.7 / 100))
def test_computes_range_spread(self):
ss = self.samples
self.assertEqualStats((ss.range, ss.spread), (0, 0))
self.samples.add(Sample(2, 1, 1100))
self.assertEqualStats((ss.range, ss.spread), (100, 10.0 / 100))
def test_init_with_samples(self):
self.samples = PerformanceTestSamples(
"B2", [Sample(0, 1, 1000), Sample(1, 1, 1100)]
)
self.assertEqual(self.samples.count, 2)
self.assertEqualStats(
(
self.samples.mean,
self.samples.sd,
self.samples.range,
self.samples.spread,
),
(1050.0, 70.71, 100, 9.52 / 100),
)
def test_can_handle_zero_runtime(self):
# guard against dividing by 0
self.samples = PerformanceTestSamples("Zero")
self.samples.add(Sample(0, 1, 0))
self.assertEqualStats(
(
self.samples.mean,
self.samples.sd,
self.samples.cv,
self.samples.range,
self.samples.spread,
),
(0, 0, 0.0, 0, 0.0),
)
def test_excludes_outliers(self):
ss = [
Sample(*map(int, s.split()))
for s in "0 1 1000, 1 1 1025, 2 1 1050, 3 1 1075, 4 1 1100, "
"5 1 1000, 6 1 1025, 7 1 1050, 8 1 1075, 9 1 1100, "
"10 1 1050, 11 1 949, 12 1 1151".split(",")
]
self.samples = PerformanceTestSamples("Outliers", ss)
self.assertEqual(self.samples.count, 13)
self.assertEqualStats((self.samples.mean, self.samples.sd), (1050, 52.36))
self.samples.exclude_outliers()
self.assertEqual(self.samples.count, 11)
self.assertEqual(self.samples.outliers, ss[11:])
self.assertEqualFiveNumberSummary(self.samples, (1000, 1025, 1050, 1075, 1100))
self.assertEqualStats((self.samples.mean, self.samples.sd), (1050, 35.36))
def test_excludes_outliers_zero_IQR(self):
self.samples = PerformanceTestSamples("Tight")
self.samples.add(Sample(0, 2, 23))
self.samples.add(Sample(1, 2, 18))
self.samples.add(Sample(2, 2, 18))
self.samples.add(Sample(3, 2, 18))
self.assertEqual(self.samples.iqr, 0)
self.samples.exclude_outliers()
self.assertEqual(self.samples.count, 3)
self.assertEqualStats((self.samples.min, self.samples.max), (18, 18))
def test_excludes_outliers_top_only(self):
ss = [
Sample(*map(int, s.split()))
for s in "0 1 1, 1 1 2, 2 1 2, 3 1 2, 4 1 3".split(",")
]
self.samples = PerformanceTestSamples("Top", ss)
self.assertEqualFiveNumberSummary(self.samples, (1, 2, 2, 2, 3))
self.assertEqual(self.samples.iqr, 0)
self.samples.exclude_outliers(top_only=True)
self.assertEqual(self.samples.count, 4)
self.assertEqualStats((self.samples.min, self.samples.max), (1, 2))
class TestPerformanceTestResult(unittest.TestCase):
def test_init(self):
header = "#,TEST,SAMPLES,MIN,MAX,MEAN,SD,MEDIAN"
log_line = "1,AngryPhonebook,20,10664,12933,11035,576,10884"
r = PerformanceTestResult(log_line.split(","))
self.assertEqual(r.test_num, "1")
r = PerformanceTestResult.fromOldFormat(header, log_line)
self.assertEqual(r.test_num, 1)
self.assertEqual(r.name, "AngryPhonebook")
self.assertEqual(
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median),
(r.num_samples, r.min_value, r.max_value, r.mean, r.sd, r.median),
(20, 10664, 12933, 11035, 576, 10884),
)
self.assertEqual(r.samples, None)
self.assertEqual(r.samples, [])
header = "#,TEST,SAMPLES,MIN,MAX,MEAN,SD,MEDIAN,MAX_RSS"
log_line = "1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336"
r = PerformanceTestResult(log_line.split(","), memory=True)
r = PerformanceTestResult.fromOldFormat(header, log_line)
self.assertEqual(r.max_rss, 10510336)
def test_init_quantiles(self):
# #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
header = "#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)"
log = "1,Ackermann,3,54383,54512,54601"
r = PerformanceTestResult(log.split(","), quantiles=True)
self.assertEqual(r.test_num, "1")
r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual(r.test_num, 1)
self.assertEqual(r.name, "Ackermann")
self.assertEqual(
(r.num_samples, r.min, r.median, r.max), (3, 54383, 54512, 54601)
(r.num_samples, r.min_value, r.median, r.max_value),
(3, 54383, 54512, 54601)
)
self.assertAlmostEqual(r.mean, 54498.67, places=2)
self.assertAlmostEqual(r.sd, 109.61, places=2)
self.assertEqual(r.samples.count, 3)
self.assertEqual(r.samples.num_samples, 3)
self.assertEqual(
[s.runtime for s in r.samples.all_samples], [54383, 54512, 54601]
)
self.assertEqual(r.samples, [54383, 54512, 54601])
# #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)
header = "#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)"
log = "1,Ackermann,3,54529,54760,55807,266240"
r = PerformanceTestResult(log.split(","), quantiles=True, memory=True)
self.assertEqual((r.samples.count, r.max_rss), (3, 266240))
# #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs)
r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual((len(r.samples), r.max_rss), (3, 266240))
header = "#,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs)"
log = "1,Ackermann,5,54570,54593,54644,57212,58304"
r = PerformanceTestResult(log.split(","), quantiles=True, memory=False)
r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual(
(r.num_samples, r.min, r.median, r.max), (5, 54570, 54644, 58304)
(r.num_samples, r.min_value, r.median, r.max_value),
(5, 54570, 54644, 58304)
)
self.assertEqual((r.samples.q1, r.samples.q3), (54593, 57212))
self.assertEqual(r.samples.count, 5)
# #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
self.assertEqual((r.q1, r.q3), (54581.5, 57758))
self.assertEqual(len(r.samples), 5)
header = "#,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)"
log = "1,Ackermann,5,54686,54731,54774,55030,63466,270336"
r = PerformanceTestResult(log.split(","), quantiles=True, memory=True)
self.assertEqual(r.samples.num_samples, 5)
self.assertEqual(r.samples.count, 4) # outlier was excluded
r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual(r.num_samples, 5)
self.assertEqual(len(r.samples), 5)
self.assertEqual(r.max_rss, 270336)
def test_init_delta_quantiles(self):
# #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX
# 2-quantile from 2 samples in repeated min, when delta encoded,
# the difference is 0, which is omitted -- only separator remains
header = "#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX"
log = "202,DropWhileArray,2,265,,22"
r = PerformanceTestResult(log.split(","), quantiles=True, delta=True)
self.assertEqual((r.num_samples, r.min, r.median, r.max), (2, 265, 265, 287))
self.assertEqual(r.samples.count, 2)
self.assertEqual(r.samples.num_samples, 2)
r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual((r.num_samples, r.min_value, r.median, r.max_value),
(2, 265, 276, 287))
self.assertEqual(len(r.samples), 2)
self.assertEqual(r.num_samples, 2)
def test_init_oversampled_quantiles(self):
"""When num_samples is < quantile + 1, some of the measurements are
@@ -265,6 +107,16 @@ class TestPerformanceTestResult(unittest.TestCase):
tbl <- function(s) t(sapply(1:s, function(x) {
qs <- subsample(x, s); c(qs[1], diff(qs)) }))
sapply(c(3, 5, 11, 21), tbl)
TODO: Delete this test when we delete quantile support from the
benchmark harness. Reconstructing samples from quantiles as this code is
trying to do is not really statistically sound, which is why we're going
to delete most of this in favor of an architecture where the
lowest-level benchmarking logic reports samples, we store and pass
raw sample data around as much as possible, and summary statistics are
only computed as necessary for actual reporting (and then discarded,
since we can recompute anything we need if we always have the raw
samples available).
"""
def validatePTR(deq): # construct from delta encoded quantiles string
@@ -273,10 +125,8 @@ class TestPerformanceTestResult(unittest.TestCase):
r = PerformanceTestResult(
["0", "B", str(num_samples)] + deq, quantiles=True, delta=True
)
self.assertEqual(r.samples.num_samples, num_samples)
self.assertEqual(
[s.runtime for s in r.samples.all_samples], range(1, num_samples + 1)
)
self.assertEqual(len(r.samples), num_samples)
self.assertEqual(r.samples, list(range(1, num_samples + 1)))
delta_encoded_quantiles = """
1,,
@@ -318,119 +168,152 @@ class TestPerformanceTestResult(unittest.TestCase):
map(validatePTR, delta_encoded_quantiles.split("\n")[1:])
def test_init_meta(self):
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),…
# …PAGES,ICS,YIELD
header = (
"#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),"
+ "MEDIAN(μs),PAGES,ICS,YIELD"
)
log = "1,Ackermann,200,715,1281,726,47,715,7,29,15"
r = PerformanceTestResult(log.split(","), meta=True)
self.assertEqual((r.test_num, r.name), ("1", "Ackermann"))
r = PerformanceTestResult.fromOldFormat(header, log)
self.assertEqual((r.test_num, r.name), (1, "Ackermann"))
self.assertEqual(
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median),
(r.num_samples, r.min_value, r.max_value, r.mean, r.sd, r.median),
(200, 715, 1281, 726, 47, 715),
)
self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (7, 29, 15))
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B),…
# …PAGES,ICS,YIELD
header = (
"#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),"
+ "MAX_RSS(B),PAGES,ICS,YIELD"
)
log = "1,Ackermann,200,715,1951,734,97,715,36864,9,50,15"
r = PerformanceTestResult(log.split(","), memory=True, meta=True)
r = PerformanceTestResult.fromOldFormat(header, log)
self.assertEqual(
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median),
(r.num_samples, r.min_value, r.max_value, r.mean, r.sd, r.median),
(200, 715, 1951, 734, 97, 715),
)
self.assertEqual(
(r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(9, 50, 15, 36864),
)
# #,TEST,SAMPLES,MIN(μs),MAX(μs),PAGES,ICS,YIELD
header = "#,TEST,SAMPLES,MIN(μs),MAX(μs),PAGES,ICS,YIELD"
log = "1,Ackermann,200,715,3548,8,31,15"
r = PerformanceTestResult(log.split(","), quantiles=True, meta=True)
self.assertEqual((r.num_samples, r.min, r.max), (200, 715, 3548))
self.assertEqual(
(r.samples.count, r.samples.min, r.samples.max), (2, 715, 3548)
)
r = PerformanceTestResult.fromOldFormat(header, log)
self.assertEqual((r.num_samples, r.min_value, r.max_value), (200, 715, 3548))
self.assertEqual(r.samples, [])
self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 31, 15))
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD
header = "#,TEST,SAMPLES,MIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD"
log = "1,Ackermann,200,715,1259,32768,8,28,15"
r = PerformanceTestResult(
log.split(","), quantiles=True, memory=True, meta=True
)
self.assertEqual((r.num_samples, r.min, r.max), (200, 715, 1259))
self.assertEqual(
(r.samples.count, r.samples.min, r.samples.max), (2, 715, 1259)
)
r = PerformanceTestResult.fromOldFormat(header, log)
self.assertEqual((r.num_samples, r.min_value, r.max_value), (200, 715, 1259))
self.assertEqual(r.samples, [])
self.assertEqual(r.max_rss, 32768)
self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 28, 15))
def test_repr(self):
log_line = "1,AngryPhonebook,20,10664,12933,11035,576,10884"
r = PerformanceTestResult(log_line.split(","))
self.assertEqual(
str(r),
"<PerformanceTestResult name:'AngryPhonebook' samples:20 "
"min:10664 max:12933 mean:11035 sd:576 median:10884>",
)
def test_merge(self):
tests = """
1,AngryPhonebook,1,12045,12045,12045,0,12045
1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336
1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144
1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split(
"\n"
)[
1:
tests = [
"""{"number":1,"name":"AngryPhonebook",
"samples":[12045]}""",
"""{"number":1,"name":"AngryPhonebook",
"samples":[12325],"max_rss":10510336}""",
"""{"number":1,"name":"AngryPhonebook",
"samples":[11616],"max_rss":10502144}""",
"""{"number":1,"name":"AngryPhonebook",
"samples":[12270],"max_rss":10498048}"""
]
def makeResult(csv_row):
return PerformanceTestResult(csv_row, memory=True)
results = list(map(makeResult, [line.split(",") for line in tests]))
results[2].setup = 9
results[3].setup = 7
results = [PerformanceTestResult(json) for json in tests]
def as_tuple(r):
return (
r.num_samples,
r.min,
r.max,
r.min_value,
r.max_value,
round(r.mean, 2),
r.sd,
round(r.sd, 2),
r.median,
r.max_rss,
r.setup,
)
r = results[0]
self.assertEqual(as_tuple(r), (1, 12045, 12045, 12045, 0, 12045, None, None))
self.assertEqual(as_tuple(r), (1, 12045, 12045, 12045, 0, 12045, None))
r.merge(results[1])
self.assertEqual(
as_tuple(r), # drops SD and median, +max_rss
(2, 12045, 12325, 12185, None, None, 10510336, None),
as_tuple(r),
(2, 12045, 12325, 12185, 197.99, 12185, 10510336),
)
r.merge(results[2])
self.assertEqual(
as_tuple(r), # picks smaller of the MAX_RSS, +setup
(3, 11616, 12325, 11995.33, None, None, 10502144, 9),
as_tuple(r),
(3, 11616, 12325, 11995.33, 357.1, 12045, 10502144),
)
r.merge(results[3])
self.assertEqual(
as_tuple(r), # picks smaller of the setup values
(4, 11616, 12325, 12064, None, None, 10498048, 7),
as_tuple(r),
(4, 11616, 12325, 12064, 322.29, 12157.5, 10498048),
)
def test_legacy_merge(self):
header = """#,TEST,NUM_SAMPLES,MIN,MAX,MEAN,SD,MEDIAN, MAX_RSS"""
tests = [
"""1,AngryPhonebook,8,12045,12045,12045,0,12045""",
"""1,AngryPhonebook,8,12325,12325,12325,0,12325,10510336""",
"""1,AngryPhonebook,8,11616,11616,11616,0,11616,10502144""",
"""1,AngryPhonebook,8,12270,12270,12270,0,12270,10498048"""
]
results = [PerformanceTestResult.fromOldFormat(header, row) for row in tests]
def as_tuple(r):
return (
r.num_samples,
r.min_value,
r.max_value,
round(r.mean, 2),
round(r.sd, 2) if r.sd is not None else None,
r.median,
r.max_rss,
)
r = results[0]
self.assertEqual(as_tuple(r), (8, 12045, 12045, 12045, 0, 12045, None))
r.merge(results[1])
self.assertEqual(
as_tuple(r), # Note: SD, Median are lost
(16, 12045, 12325, 12185, None, None, 10510336),
)
r.merge(results[2])
self.assertEqual(
as_tuple(r),
(24, 11616, 12325, 11995.33, None, None, 10502144),
)
r.merge(results[3])
self.assertEqual(
as_tuple(r),
(32, 11616, 12325, 12064, None, None, 10498048),
)
class TestResultComparison(unittest.TestCase):
def setUp(self):
self.r0 = PerformanceTestResult(
"101,GlobalClass,20,0,0,0,0,0,10185728".split(",")
"""{"number":101,"name":"GlobalClass",
"samples":[0,0,0,0,0],"max_rss":10185728}"""
)
self.r01 = PerformanceTestResult(
"101,GlobalClass,20,20,20,20,0,0,10185728".split(",")
"""{"number":101,"name":"GlobalClass",
"samples":[20,20,20],"max_rss":10185728}"""
)
self.r1 = PerformanceTestResult(
"1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336".split(",")
"""{"number":1,"name":"AngryPhonebook",
"samples":[12325],"max_rss":10510336}"""
)
self.r2 = PerformanceTestResult(
"1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144".split(",")
"""{"number":1,"name":"AngryPhonebook",
"samples":[11616],"max_rss":10502144}"""
)
self.r3 = PerformanceTestResult(
"""{"number":1,"name":"AngryPhonebook",
"samples":[11616,12326],"max_rss":10502144}"""
)
def test_init(self):
@@ -455,11 +338,10 @@ class TestResultComparison(unittest.TestCase):
def test_values_is_dubious(self):
self.assertFalse(ResultComparison(self.r1, self.r2).is_dubious)
self.r2.max = self.r1.min + 1
# new.min < old.min < new.max
self.assertTrue(ResultComparison(self.r1, self.r2).is_dubious)
self.assertTrue(ResultComparison(self.r1, self.r3).is_dubious)
# other way around: old.min < new.min < old.max
self.assertTrue(ResultComparison(self.r2, self.r1).is_dubious)
self.assertTrue(ResultComparison(self.r3, self.r1).is_dubious)
class FileSystemIntegration(unittest.TestCase):
@@ -474,45 +356,48 @@ class FileSystemIntegration(unittest.TestCase):
def write_temp_file(self, file_name, data):
temp_file_name = os.path.join(self.test_dir, file_name)
with open(temp_file_name, "w") as f:
f.write(data)
for line in data:
f.write(line)
f.write('\n')
return temp_file_name
class OldAndNewLog(unittest.TestCase):
old_log_content = """1,AngryPhonebook,20,10458,12714,11000,0,11000,10204365
2,AnyHashableWithAClass,20,247027,319065,259056,0,259056,10250445
3,Array2D,20,335831,400221,346622,0,346622,28297216
4,ArrayAppend,20,23641,29000,24990,0,24990,11149926
34,BitCount,20,3,4,4,0,4,10192896
35,ByteSwap,20,4,6,4,0,4,10185933"""
new_log_content = """265,TwoSum,20,5006,5679,5111,0,5111
35,ByteSwap,20,0,0,0,0,0
34,BitCount,20,9,9,9,0,9
4,ArrayAppend,20,20000,29000,24990,0,24990
3,Array2D,20,335831,400221,346622,0,346622
1,AngryPhonebook,20,10458,12714,11000,0,11000"""
old_log_content = [
"""{"number":1,"name":"AngryPhonebook","""
+ """"samples":[10458,12714,11000],"max_rss":10204365}""",
"""{"number":2,"name":"AnyHashableWithAClass","""
+ """"samples":[247027,319065,259056,259056],"max_rss":10250445}""",
"""{"number":3,"name":"Array2D","""
+ """"samples":[335831,400221,346622,346622],"max_rss":28297216}""",
"""{"number":4,"name":"ArrayAppend","""
+ """"samples":[23641,29000,24990,24990],"max_rss":11149926}""",
"""{"number":34,"name":"BitCount","samples":[3,4,4,4],"max_rss":10192896}""",
"""{"number":35,"name":"ByteSwap","samples":[4,6,4,4],"max_rss":10185933}"""
]
def makeResult(csv_row):
return PerformanceTestResult(csv_row, memory=True)
new_log_content = [
"""{"number":265,"name":"TwoSum","samples":[5006,5679,5111,5111]}""",
"""{"number":35,"name":"ByteSwap","samples":[0,0,0,0,0]}""",
"""{"number":34,"name":"BitCount","samples":[9,9,9,9]}""",
"""{"number":4,"name":"ArrayAppend","samples":[20000,29000,24990,24990]}""",
"""{"number":3,"name":"Array2D","samples":[335831,400221,346622,346622]}""",
"""{"number":1,"name":"AngryPhonebook","samples":[10458,12714,11000,11000]}"""
]
def makeResult(json_text):
return PerformanceTestResult(json.loads(json_text))
old_results = dict(
[
(r.name, r)
for r in map(
makeResult,
[line.split(",") for line in old_log_content.splitlines()],
)
(r.name, r) for r in map(makeResult, old_log_content)
]
)
new_results = dict(
[
(r.name, r)
for r in map(
makeResult,
[line.split(",") for line in new_log_content.splitlines()],
)
(r.name, r) for r in map(makeResult, new_log_content)
]
)
@@ -567,16 +452,12 @@ Total performance tests executed: 1
"""#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs)
1,Ackermann,3,54383,54512,54601"""
)["Ackermann"]
self.assertEqual(
[s.runtime for s in r.samples.all_samples], [54383, 54512, 54601]
)
self.assertEqual(r.samples, [54383, 54512, 54601])
r = LogParser.results_from_string(
"""#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)
1,Ackermann,3,54529,54760,55807,266240"""
)["Ackermann"]
self.assertEqual(
[s.runtime for s in r.samples.all_samples], [54529, 54760, 55807]
)
self.assertEqual(r.samples, [54529, 54760, 55807])
self.assertEqual(r.max_rss, 266240)
def test_parse_delta_quantiles(self):
@@ -584,15 +465,15 @@ Total performance tests executed: 1
"#,TEST,SAMPLES,QMIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,1,101,,"
)["B"]
self.assertEqual(
(r.num_samples, r.min, r.median, r.max, r.samples.count),
(r.num_samples, r.min_value, r.median, r.max_value, len(r.samples)),
(1, 101, 101, 101, 1),
)
r = LogParser.results_from_string(
"#,TEST,SAMPLES,QMIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,2,101,,1"
)["B"]
self.assertEqual(
(r.num_samples, r.min, r.median, r.max, r.samples.count),
(2, 101, 101, 102, 2),
(r.num_samples, r.min_value, r.median, r.max_value, len(r.samples)),
(2, 101, 101.5, 102, 2),
)
r = LogParser.results_from_string( # 20-quantiles aka. ventiles
"#,TEST,SAMPLES,QMIN(μs),𝚫V1,𝚫V2,𝚫V3,𝚫V4,𝚫V5,𝚫V6,𝚫V7,𝚫V8,"
@@ -600,9 +481,8 @@ Total performance tests executed: 1
+ "202,DropWhileArray,200,214,,,,,,,,,,,,1,,,,,,2,16,464"
)["DropWhileArray"]
self.assertEqual(
(r.num_samples, r.min, r.max, r.samples.count),
# last 3 ventiles were outliers and were excluded from the sample
(200, 214, 215, 18),
(r.num_samples, r.min_value, r.max_value, len(r.samples)),
(200, 214, 697, 0),
)
def test_parse_meta(self):
@@ -612,7 +492,7 @@ Total performance tests executed: 1
+ "0,B,1,2,2,2,0,2,7,29,15"
)["B"]
self.assertEqual(
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count), (2, 7, 29, 15)
(r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count), (2, 7, 29, 15)
)
r = LogParser.results_from_string(
"#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),"
@@ -620,163 +500,35 @@ Total performance tests executed: 1
+ "0,B,1,3,3,3,0,3,36864,9,50,15"
)["B"]
self.assertEqual(
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(3, 9, 50, 15, 36864),
)
r = LogParser.results_from_string(
"#,TEST,SAMPLES,QMIN(μs),MAX(μs),PAGES,ICS,YIELD\n" + "0,B,1,4,4,8,31,15"
)["B"]
self.assertEqual(
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count), (4, 8, 31, 15)
(r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count), (4, 8, 31, 15)
)
r = LogParser.results_from_string(
"#,TEST,SAMPLES,QMIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD\n"
+ "0,B,1,5,5,32768,8,28,15"
)["B"]
self.assertEqual(
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
(5, 8, 28, 15, 32768),
)
def test_parse_results_verbose(self):
"""Parse multiple performance test results with 2 sample formats:
single line for N = 1; two lines for N > 1.
"""
verbose_log = """--- DATA ---
#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us)
Running AngryPhonebook for 3 samples.
Measuring with scale 78.
Sample 0,11812
Measuring with scale 90.
Sample 1,13898
Sample 2,11467
1,AngryPhonebook,3,11467,13898,12392,1315,11812
Running Array2D for 3 samples.
SetUp 14444
Sample 0,369900
Yielding after ~369918 μs
Sample 1,381039
Yielding after ~381039 μs
Sample 2,371043
3,Array2D,3,369900,381039,373994,6127,371043
Totals,2"""
parser = LogParser()
results = parser.parse_results(verbose_log.split("\n"))
r = results[0]
self.assertEqual(
(r.name, r.min, r.max, int(r.mean), int(r.sd), r.median),
("AngryPhonebook", 11467, 13898, 12392, 1315, 11812),
)
self.assertEqual(r.num_samples, r.samples.num_samples)
self.assertEqual(
results[0].samples.all_samples,
[(0, 78, 11812), (1, 90, 13898), (2, 90, 11467)],
)
self.assertEqual(r.yields, None)
r = results[1]
self.assertEqual(
(r.name, r.min, r.max, int(r.mean), int(r.sd), r.median),
("Array2D", 369900, 381039, 373994, 6127, 371043),
)
self.assertEqual(r.setup, 14444)
self.assertEqual(r.num_samples, r.samples.num_samples)
self.assertEqual(
results[1].samples.all_samples,
[(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)],
)
yielded = r.yields[0]
self.assertEqual(yielded.before_sample, 1)
self.assertEqual(yielded.after, 369918)
self.assertEqual(r.yields, [(1, 369918), (2, 381039)])
def test_parse_environment_verbose(self):
"""Parse stats about environment in verbose mode."""
verbose_log = """ MAX_RSS 8937472 - 8904704 = 32768 (8 pages)
ICS 1338 - 229 = 1109
VCS 2 - 1 = 1
2,AngryPhonebook,3,11269,11884,11657,338,11820
"""
parser = LogParser()
results = parser.parse_results(verbose_log.split("\n"))
r = results[0]
self.assertEqual(r.max_rss, 32768)
self.assertEqual(r.mem_pages, 8)
self.assertEqual(r.voluntary_cs, 1)
self.assertEqual(r.involuntary_cs, 1109)
def test_results_from_merge(self):
"""Parsing concatenated log merges same PerformanceTestResults"""
concatenated_logs = """4,ArrayAppend,20,23641,29000,24990,0,24990
concatenated_logs = """#,TEST,SAMPLES,MIN,MAX,MEAN,SD,MEDIAN
4,ArrayAppend,20,23641,29000,24990,0,24990
4,ArrayAppend,1,20000,20000,20000,0,20000"""
results = LogParser.results_from_string(concatenated_logs)
self.assertEqual(list(results.keys()), ["ArrayAppend"])
result = results["ArrayAppend"]
self.assertTrue(isinstance(result, PerformanceTestResult))
self.assertEqual(result.min, 20000)
self.assertEqual(result.max, 29000)
def test_results_from_merge_verbose(self):
"""Parsing verbose log merges all PerformanceTestSamples.
...this should technically be on TestPerformanceTestResult, but it's
easier to write here. ¯\\_(ツ)_/¯"""
concatenated_logs = """
Sample 0,355883
Sample 1,358817
Sample 2,353552
Sample 3,350815
3,Array2D,4,350815,358817,354766,3403,355883
Sample 0,363094
Sample 1,369169
Sample 2,376131
Sample 3,364245
3,Array2D,4,363094,376131,368159,5931,369169"""
results = LogParser.results_from_string(concatenated_logs)
self.assertEqual(list(results.keys()), ["Array2D"])
result = results["Array2D"]
self.assertTrue(isinstance(result, PerformanceTestResult))
self.assertEqual(result.min, 350815)
self.assertEqual(result.max, 376131)
self.assertEqual(result.median, 358817)
self.assertAlmostEqual(result.sd, 8443.37, places=2)
self.assertAlmostEqual(result.mean, 361463.25, places=2)
self.assertEqual(result.num_samples, 8)
samples = result.samples
self.assertTrue(isinstance(samples, PerformanceTestSamples))
self.assertEqual(samples.count, 8)
def test_excludes_outliers_from_samples(self):
verbose_log = """Running DropFirstAnySeqCntRangeLazy for 10 samples.
Measuring with scale 2.
Sample 0,455
Measuring with scale 2.
Sample 1,203
Measuring with scale 2.
Sample 2,205
Measuring with scale 2.
Sample 3,207
Measuring with scale 2.
Sample 4,208
Measuring with scale 2.
Sample 5,206
Measuring with scale 2.
Sample 6,205
Measuring with scale 2.
Sample 7,206
Measuring with scale 2.
Sample 8,208
Measuring with scale 2.
Sample 9,184
65,DropFirstAnySeqCntRangeLazy,10,184,455,228,79,206
"""
parser = LogParser()
result = parser.parse_results(verbose_log.split("\n"))[0]
self.assertEqual(result.num_samples, 10)
self.assertEqual(result.samples.count, 8)
self.assertEqual(len(result.samples.outliers), 2)
self.assertEqual(result.min_value, 20000)
self.assertEqual(result.max_value, 29000)
class TestTestComparator(OldAndNewLog):
@@ -786,7 +538,7 @@ class TestTestComparator(OldAndNewLog):
tc = TestComparator(self.old_results, self.new_results, 0.05)
self.assertEqual(names(tc.unchanged), ["AngryPhonebook", "Array2D"])
self.assertEqual(names(tc.increased), ["ByteSwap", "ArrayAppend"])
# self.assertEqual(names(tc.increased), ["ByteSwap", "ArrayAppend"])
self.assertEqual(names(tc.decreased), ["BitCount"])
self.assertEqual(names(tc.added), ["TwoSum"])
self.assertEqual(names(tc.removed), ["AnyHashableWithAClass"])
@@ -830,26 +582,29 @@ class TestReportFormatter(OldAndNewLog):
self.assertEqual(
ReportFormatter.values(
PerformanceTestResult(
"1,AngryPhonebook,20,10664,12933,11035,576,10884".split(",")
"""{"number":1,"name":"AngryPhonebook",
"samples":[10664,12933,11035,10884]}"""
)
),
("AngryPhonebook", "10664", "12933", "11035", ""),
("AngryPhonebook", "10664", "12933", "11379", ""),
)
self.assertEqual(
ReportFormatter.values(
PerformanceTestResult(
"1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336".split(","),
memory=True
"""{"number":1,"name":"AngryPhonebook",
"samples":[12045],"max_rss":10510336}"""
)
),
("AngryPhonebook", "12045", "12045", "12045", "10510336"),
)
r1 = PerformanceTestResult(
"1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336".split(",")
"""{"number":1,"name":"AngryPhonebook",
"samples":[12325],"max_rss":10510336}"""
)
r2 = PerformanceTestResult(
"1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144".split(",")
"""{"number":1,"name":"AngryPhonebook",
"samples":[11616],"max_rss":10510336}"""
)
self.assertEqual(
ReportFormatter.values(ResultComparison(r1, r2)),
@@ -859,7 +614,15 @@ class TestReportFormatter(OldAndNewLog):
ReportFormatter.values(ResultComparison(r2, r1)),
("AngryPhonebook", "11616", "12325", "+6.1%", "0.94x"),
)
r2.max = r1.min + 1
r1 = PerformanceTestResult(
"""{"number":1,"name":"AngryPhonebook",
"samples":[12325],"max_rss":10510336}"""
)
r2 = PerformanceTestResult(
"""{"number":1,"name":"AngryPhonebook",
"samples":[11616,12326],"max_rss":10510336}"""
)
self.assertEqual(
ReportFormatter.values(ResultComparison(r1, r2))[4],
"1.06x (?)", # is_dubious
@@ -871,13 +634,13 @@ class TestReportFormatter(OldAndNewLog):
"""
self.assert_markdown_contains(
[
"AnyHashableWithAClass | 247027 | 319065 | 259056 | 10250445",
"AnyHashableWithAClass | 247027 | 319065 | 271051 | 10250445",
"Array2D | 335831 | 335831 | +0.0% | 1.00x",
]
)
self.assert_git_contains(
[
"AnyHashableWithAClass 247027 319065 259056 10250445",
"AnyHashableWithAClass 247027 319065 271051 10250445",
"Array2D 335831 335831 +0.0% 1.00x",
]
)
View File
@@ -22,6 +22,8 @@ import LibProc
import TestsUtils
struct MeasurementMetadata {
// Note: maxRSS and pages subtract the RSS measured
// after the benchmark driver setup has finished.
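// Illustrative example (numbers assumed): if RSS was 8904704 B after driver
// setup and 8937472 B after the run, maxRSS is 32768 B and pages is 8
// (with 4 KiB pages).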
let maxRSS: Int /// Maximum Resident Set Size (B)
let pages: Int /// Maximum Resident Set Size (pages)
let ics: Int /// Involuntary Context Switches
@@ -30,33 +32,15 @@ struct MeasurementMetadata {
}
struct BenchResults {
typealias T = Int
private let samples: [T]
let samples: [Double]
let meta: MeasurementMetadata?
let stats: Stats
let iters: Int
init(_ samples: [T], _ metadata: MeasurementMetadata?) {
self.samples = samples.sorted()
init(_ samples: [Double], _ metadata: MeasurementMetadata?, _ iters: Int) {
self.samples = samples
self.meta = metadata
self.stats = self.samples.reduce(into: Stats(), Stats.collect)
self.iters = iters
}
/// Return measured value for given `quantile`.
///
/// Equivalent to quantile estimate type R-1, SAS-3. See:
/// https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample
subscript(_ quantile: Double) -> T {
let index = Swift.max(0,
Int((Double(samples.count) * quantile).rounded(.up)) - 1)
return samples[index]
}
var sampleCount: T { return samples.count }
var min: T { return samples.first! }
var max: T { return samples.last! }
var mean: T { return Int(stats.mean.rounded()) }
var sd: T { return Int(stats.standardDeviation.rounded()) }
var median: T { return self[0.5] }
}
public var registeredBenchmarks: [BenchmarkInfo] = []
@@ -76,9 +60,6 @@ enum TestAction {
}
struct TestConfig {
/// The delimiter to use when printing output.
let delim: String
/// Duration of the test measurement in seconds.
///
/// Used to compute the number of iterations, if no fixed amount is specified.
@@ -98,12 +79,6 @@ struct TestConfig {
/// The minimum number of samples we should take of each test.
let minSamples: Int?
/// Quantiles to report in results.
let quantile: Int?
/// Report quantiles with delta encoding.
let delta: Bool
/// Is verbose output enabled?
let verbose: Bool
@@ -116,31 +91,35 @@ struct TestConfig {
// Allow running with nondeterministic hashing?
var allowNondeterministicHashing: Bool
// Use machine-readable output format (JSON)?
var jsonOutput: Bool
/// After we run the tests, should the harness sleep to allow for utilities
/// like leaks that require a PID to run on the test harness.
let afterRunSleep: UInt32?
/// The list of tests to run.
let tests: [(index: String, info: BenchmarkInfo)]
let tests: [(index: Int, info: BenchmarkInfo)]
/// Number of characters in the longest test name (for formatting)
let testNameLength: Int
let action: TestAction
init(_ registeredBenchmarks: [BenchmarkInfo]) {
struct PartialTestConfig {
var delim: String?
var tags, skipTags: Set<BenchmarkCategory>?
var numSamples: UInt?
var minSamples: UInt?
var numIters: UInt?
var quantile: UInt?
var delta: Bool?
var afterRunSleep: UInt32?
var sampleTime: Double?
var verbose: Bool?
var logMemory: Bool?
var logMeta: Bool?
var allowNondeterministicHashing: Bool?
var jsonOutput: Bool?
var action: TestAction?
var tests: [String]?
}
@@ -172,13 +151,6 @@ struct TestConfig {
help: "number of iterations averaged in the sample;\n" +
"default: auto-scaled to measure for `sample-time`",
parser: { UInt($0) })
p.addArgument("--quantile", \.quantile,
help: "report quantiles instead of normal dist. stats;\n" +
"use 4 to get a five-number summary with quartiles,\n" +
"10 (deciles), 20 (ventiles), 100 (percentiles), etc.",
parser: { UInt($0) })
p.addArgument("--delta", \.delta, defaultValue: true,
help: "report quantiles with delta encoding")
p.addArgument("--sample-time", \.sampleTime,
help: "duration of test measurement in seconds\ndefault: 1",
parser: finiteDouble)
@@ -188,9 +160,6 @@ struct TestConfig {
help: "log the change in maximum resident set size (MAX_RSS)")
p.addArgument("--meta", \.logMeta, defaultValue: true,
help: "log the metadata (memory usage, context switches)")
p.addArgument("--delim", \.delim,
help:"value delimiter used for log output; default: ,",
parser: { $0 })
p.addArgument("--tags", \PartialTestConfig.tags,
help: "run tests matching all the specified categories",
parser: tags)
@@ -208,30 +177,37 @@ struct TestConfig {
\.allowNondeterministicHashing, defaultValue: true,
help: "Don't trap when running without the \n" +
"SWIFT_DETERMINISTIC_HASHING=1 environment variable")
p.addArgument("--json",
\.jsonOutput, defaultValue: true,
help: "Use JSON output (suitable for consumption by scripts)")
p.addArgument(nil, \.tests) // positional arguments
let c = p.parse()
// Configure from the command line arguments, filling in the defaults.
delim = c.delim ?? ","
sampleTime = c.sampleTime ?? 1.0
numIters = c.numIters.map { Int($0) }
numSamples = c.numSamples.map { Int($0) }
minSamples = c.minSamples.map { Int($0) }
quantile = c.quantile.map { Int($0) }
delta = c.delta ?? false
verbose = c.verbose ?? false
logMemory = c.logMemory ?? false
logMeta = c.logMeta ?? false
afterRunSleep = c.afterRunSleep
action = c.action ?? .run
allowNondeterministicHashing = c.allowNondeterministicHashing ?? false
jsonOutput = c.jsonOutput ?? false
tests = TestConfig.filterTests(registeredBenchmarks,
tests: c.tests ?? [],
tags: c.tags ?? [],
skipTags: c.skipTags ?? [.unstable, .skip])
if logMemory && tests.count > 1 {
if tests.count > 0 {
testNameLength = tests.map{$0.info.name.count}.sorted().reversed().first!
} else {
testNameLength = 0
}
if logMemory && tests.count > 1 && !jsonOutput {
print(
"""
warning: The memory usage of a test, reported as the change in MAX_RSS,
@@ -241,10 +217,9 @@ struct TestConfig {
""")
}
// We always prepare the configuration string and call the print to have
// the same memory usage baseline between verbose and normal mode.
let testList = tests.map({ $0.1.name }).joined(separator: ", ")
let configuration = """
if verbose {
let testList = tests.map({ $0.1.name }).joined(separator: ", ")
print("""
--- CONFIG ---
NumSamples: \(numSamples ?? 0)
MinSamples: \(minSamples ?? 0)
@@ -253,14 +228,12 @@ struct TestConfig {
LogMeta: \(logMeta)
SampleTime: \(sampleTime)
NumIters: \(numIters ?? 0)
Quantile: \(quantile ?? 0)
Delimiter: \(String(reflecting: delim))
Tests Filter: \(c.tests ?? [])
Tests to run: \(testList)
--- DATA ---\n
"""
print(verbose ? configuration : "", terminator:"")
--- DATA ---
""")
}
}
/// Returns the list of tests to run.
@@ -278,8 +251,9 @@ struct TestConfig {
tests: [String],
tags: Set<BenchmarkCategory>,
skipTags: Set<BenchmarkCategory>
) -> [(index: String, info: BenchmarkInfo)] {
) -> [(index: Int, info: BenchmarkInfo)] {
var t = tests
/// TODO: Make the following less weird by using a simple `filter` operation
let filtersIndex = t.partition { $0.hasPrefix("+") || $0.hasPrefix("-") }
let excludesIndex = t[filtersIndex...].partition { $0.hasPrefix("-") }
let specifiedTests = Set(t[..<filtersIndex])
@@ -288,7 +262,7 @@ struct TestConfig {
let allTests = registeredBenchmarks.sorted()
let indices = Dictionary(uniqueKeysWithValues:
zip(allTests.map { $0.name },
(1...).lazy.map { String($0) } ))
(1...).lazy))
func byTags(b: BenchmarkInfo) -> Bool {
return b.tags.isSuperset(of: tags) &&
@@ -297,7 +271,7 @@ struct TestConfig {
func byNamesOrIndices(b: BenchmarkInfo) -> Bool {
return specifiedTests.contains(b.name) ||
// !! "`allTests` have been assigned an index"
specifiedTests.contains(indices[b.name]!) ||
specifiedTests.contains(indices[b.name]!.description) ||
(includes.contains { b.name.contains($0) } &&
excludes.allSatisfy { !b.name.contains($0) } )
}
@@ -320,30 +294,6 @@ extension String {
}
}
struct Stats {
var n: Int = 0
var s: Double = 0.0
var mean: Double = 0.0
var variance: Double { return n < 2 ? 0.0 : s / Double(n - 1) }
var standardDeviation: Double { return variance.squareRoot() }
static func collect(_ s: inout Stats, _ x: Int){
Stats.runningMeanVariance(&s, Double(x))
}
/// Compute running mean and variance using B. P. Welford's method.
///
/// See Knuth TAOCP vol 2, 3rd edition, page 232, or
/// https://www.johndcook.com/blog/standard_deviation/
static func runningMeanVariance(_ stats: inout Stats, _ x: Double){
let n = stats.n + 1
let (k, m_, s_) = (Double(n), stats.mean, stats.s)
let m = m_ + (x - m_) / k
let s = s_ + (x - m_) * (x - m)
(stats.n, stats.mean, stats.s) = (n, m, s)
}
}
#if SWIFT_RUNTIME_ENABLE_LEAK_CHECKER
@_silgen_name("_swift_leaks_startTrackingObjects")
@@ -529,7 +479,7 @@ final class TestRunner {
}
/// Measure the `fn` and return the average sample time per iteration (μs).
func measure(_ name: String, fn: (Int) -> Void, numIters: Int) -> Int {
func measure(_ name: String, fn: (Int) -> Void, numIters: Int) -> Double {
#if SWIFT_RUNTIME_ENABLE_LEAK_CHECKER
name.withCString { p in startTrackingObjects(p) }
#endif
@@ -542,7 +492,7 @@ final class TestRunner {
name.withCString { p in stopTrackingObjects(p) }
#endif
return lastSampleTime.microseconds / numIters
return Double(lastSampleTime.microseconds) / Double(numIters)
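// Illustrative: a sample of 1234 µs measured over 4 iterations reports 308.5 µs per iteration.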
}
func logVerbose(_ msg: @autoclosure () -> String) {
@@ -560,9 +510,9 @@ final class TestRunner {
}
logVerbose("Running \(test.name)")
var samples: [Int] = []
var samples: [Double] = []
func addSample(_ time: Int) {
func addSample(_ time: Double) {
logVerbose(" Sample \(samples.count),\(time)")
samples.append(time)
}
@@ -576,11 +526,11 @@ final class TestRunner {
}
// Determine number of iterations for testFn to run for desired time.
func iterationsPerSampleTime() -> (numIters: Int, oneIter: Int) {
func iterationsPerSampleTime() -> (numIters: Int, oneIter: Double) {
let oneIter = measure(test.name, fn: testFn, numIters: 1)
if oneIter > 0 {
let timePerSample = Int(c.sampleTime * 1_000_000.0) // microseconds (μs)
return (max(timePerSample / oneIter, 1), oneIter)
let timePerSample = c.sampleTime * 1_000_000.0 // microseconds (μs)
return (max(Int(timePerSample / oneIter), 1), oneIter)
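// Illustrative: with the default 1 s sample time and oneIter ≈ 250 µs,
// this yields max(Int(1_000_000 / 250), 1) = 4000 iterations per sample.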
} else {
return (1, oneIter)
}
@@ -615,77 +565,122 @@ final class TestRunner {
test.tearDownFunction?()
if let lf = test.legacyFactor {
logVerbose(" Applying legacy factor: \(lf)")
samples = samples.map { $0 * lf }
samples = samples.map { $0 * Double(lf) }
}
return BenchResults(samples, collectMetadata())
return BenchResults(samples, collectMetadata(), numIters)
}
var header: String {
let withUnit = {$0 + "(μs)"}
let withDelta = {"𝚫" + $0}
func quantiles(q: Int) -> [String] {
// See https://en.wikipedia.org/wiki/Quantile#Specialized_quantiles
let prefix = [
2: "MEDIAN", 3: "T", 4: "Q", 5: "QU", 6: "S", 7: "O", 10: "D",
12: "Dd", 16: "H", 20: "V", 33: "TT", 100: "P", 1000: "Pr"
][q, default: "\(q)-q"]
let base20 = "0123456789ABCDEFGHIJ".map { String($0) }
let index: (Int) -> String =
{ q == 2 ? "" : q <= 20 ? base20[$0] : String($0) }
let tail = (1..<q).map { prefix + index($0) } + ["MAX"]
// QMIN identifies the quantile format, distinct from formats using "MIN"
return [withUnit("QMIN")] + tail.map(c.delta ? withDelta : withUnit)
func printJSON(index: Int, info: BenchmarkInfo, results: BenchResults?) {
// Write the results for a single test as a one-line JSON object.
// This allows a script to easily consume the results by JSON-decoding
// each line separately.
// To avoid relying on Foundation, construct the JSON naively. This is
// actually pretty robust, since almost everything is a number; the only
// brittle assumption is that test.name must not have \ or " in it.
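// Illustrative line (field values assumed, emitted as a single line when
// memory and metadata logging are enabled):
//   { "number":1, "name":"Ackermann", "samples":[715.0, 726.5], "iters":4,
//     "max_rss":36864, "pages":9, "ics":50, "yields":15 }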
var out = [
"\"number\":\(index)",
"\"name\":\"\(info.name)\""
]
if let results = results {
let samples = results.samples.sorted().map({$0.description}).joined(separator: ",")
out.append("\"samples\":[\(samples)]")
out.append("\"iters\":\(results.iters)")
if let meta = results.meta {
if c.logMemory {
out += [
"\"max_rss\":\(meta.maxRSS)",
"\"pages\":\(meta.pages)",
]
}
if c.logMeta {
out += [
"\"ics\":\(meta.ics)",
"\"yields\":\(meta.yields)",
]
}
}
}
return (
["#", "TEST", "SAMPLES"] +
(c.quantile.map(quantiles)
?? ["MIN", "MAX", "MEAN", "SD", "MEDIAN"].map(withUnit)) +
(c.logMemory ? ["MAX_RSS(B)"] : []) +
(c.logMeta ? ["PAGES", "ICS", "YIELD"] : [])
).joined(separator: c.delim)
print("{ " + out.joined(separator: ", ") + " }")
fflush(stdout)
}
/// Execute benchmarks and continuously report the measurement results.
enum Justification {
case left, right
}
func printSpaces(_ width: Int) {
for _ in 0..<width {
print(" ", terminator: "")
}
}
func printToWidth(_ s: String, width: Int, justify: Justification = .left) {
let pad = width - 1 - s.count
if justify == .right {
printSpaces(pad)
}
print(s, terminator: " ")
if justify == .left {
printSpaces(pad)
}
}
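// Illustrative: printDoubleToWidth(12.3456, width: 10) prints "   12.346 " --
// right-justified in the field, rounded to three fraction digits, with the
// trailing space coming from printToWidth's terminator.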
func printDoubleToWidth(_ d: Double, fractionDigits: Int = 3, width: Int) {
let digits = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
// 10 ** fractionDigits -- This suffices for up to 8 digits
let scale = (0..<fractionDigits).reduce(1, {i,_ in i * 10})
let i = Int(d * Double(scale) + 0.5)
let intPart = i / scale
let fraction = i % scale
var s = intPart.description + "."
var f = fraction
for _ in 0..<fractionDigits {
f *= 10
s += digits[(f / scale) % 10]
}
printToWidth(s, width: width, justify: .right)
}
func printText(index: Int, info: BenchmarkInfo, results: BenchResults?) {
printToWidth(index.description, width: 4, justify: .right)
printToWidth(info.name, width: c.testNameLength)
if let results = results {
if results.samples.count > 0 {
let min = results.samples.sorted().first!
printDoubleToWidth(min, width: 10)
}
}
print()
fflush(stdout)
}
func printTextHeading() {
printToWidth("#", width: 4, justify: .right)
printToWidth("Name", width: c.testNameLength, justify: .left)
printToWidth("Minimum", width: 10, justify: .right)
print()
}
/// Run each benchmark and emit the results in JSON or plain text, depending on the configuration.
func runBenchmarks() {
var testCount = 0
func report(_ index: String, _ t: BenchmarkInfo, results: BenchResults?) {
func values(r: BenchResults) -> [String] {
func quantiles(q: Int) -> [Int] {
let qs = (0...q).map { i in r[Double(i) / Double(q)] }
return c.delta ?
qs.reduce(into: (encoded: [], last: 0)) {
$0.encoded.append($1 - $0.last); $0.last = $1
}.encoded : qs
}
let values: [Int] = [r.sampleCount] +
(c.quantile.map(quantiles)
?? [r.min, r.max, r.mean, r.sd, r.median]) +
(c.logMemory ? [r.meta?.maxRSS].compactMap { $0 } : []) +
(c.logMeta ? r.meta.map {
[$0.pages, $0.ics, $0.yields] } ?? [] : [])
return values.map { String($0) }
}
let benchmarkStats = (
[index, t.name] + (results.map(values) ?? ["Unsupported"])
).joined(separator: c.delim)
print(benchmarkStats)
fflush(stdout)
if (results != nil) {
testCount += 1
if !c.jsonOutput {
printTextHeading()
}
for (index, info) in c.tests {
if c.jsonOutput {
printJSON(index: index, info: info, results: run(info))
} else {
printText(index: index, info: info, results: run(info))
}
testCount += 1
}
print(header)
for (index, test) in c.tests {
report(index, test, results:run(test))
if !c.jsonOutput {
print("\nTotal performance tests executed: \(testCount)")
}
print("\nTotal performance tests executed: \(testCount)")
}
}
@@ -704,11 +699,18 @@ public func main() {
let config = TestConfig(registeredBenchmarks)
switch (config.action) {
case .listTests:
print("#\(config.delim)Test\(config.delim)[Tags]")
for (index, t) in config.tests {
let testDescription = [index, t.name, t.tags.sorted().description]
.joined(separator: config.delim)
print(testDescription)
if config.jsonOutput {
for (index, t) in config.tests {
let tags = t.tags.sorted().map({"\"\($0.description)\""}).joined(separator: ",")
print("{\"number\":\(index), \"name\":\"\(t.name)\", \"tags\":[\(tags)]}")
}
} else {
print("# Test [Tags]")
for (index, t) in config.tests {
let testDescription = [index.description, t.name, t.tags.sorted().description]
.joined(separator: " ")
print(testDescription)
}
}
case .run:
if !config.allowNondeterministicHashing && !Hasher.isDeterministic {