Merge pull request #61559 from tbkka/tbkka-benchmarking
Overhaul Benchmarking pipeline to use complete sample data, not summaries
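With this change, each benchmark result is reported as a single-line JSON object carrying the full list of samples instead of pre-computed summary statistics. A minimal sketch of consuming such a line, assuming the field names read by the updated compare_perf_tests.py parser (the concrete values are illustrative only):

    import json
    import statistics

    # Hypothetical result line in the new JSON-lines format:
    line = '{"number": 3, "name": "b1", "samples": [101, 102, 103, 104, 105], "max_rss": 888}'

    result = json.loads(line)
    samples = result["samples"]
    # Downstream consumers compute whatever statistics they need from the raw samples:
    print(min(samples), statistics.median(samples), max(samples))  # 101 103 105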
@@ -88,9 +88,10 @@ class BenchmarkDriver(object):
|
||||
def test_harness(self):
|
||||
"""Full path to test harness binary."""
|
||||
suffix = self.args.optimization if hasattr(self.args, "optimization") else "O"
|
||||
suffix += "-"
|
||||
if hasattr(self.args, "architecture") and self.args.architecture:
|
||||
suffix += "-" + self.args.architecture + "*"
|
||||
pattern = os.path.join(self.args.tests, "Benchmark_" + suffix)
|
||||
suffix += self.args.architecture
|
||||
pattern = os.path.join(self.args.tests, "Benchmark_" + suffix + "*")
|
||||
executables = []
|
||||
if hasattr(self._subprocess, "test_mode") and self._subprocess.test_mode:
|
||||
executables = [pattern]
|
||||
@@ -134,22 +135,32 @@ class BenchmarkDriver(object):
|
||||
|
||||
@property
|
||||
def _cmd_list_benchmarks(self):
|
||||
# Use tab delimiter for easier parsing to override the default comma.
|
||||
# (The third 'column' is always comma-separated list of tags in square
|
||||
# brackets -- currently unused here.)
|
||||
return [self.test_harness, "--list", "--delim=\t"] + (
|
||||
# TODO: Switch to JSON format: add "--json" here
|
||||
return [self.test_harness, "--list"] + (
|
||||
["--skip-tags="] if (self.args.benchmarks or self.args.filters) else []
|
||||
)
|
||||
|
||||
def _get_tests(self):
|
||||
"""Return a list of performance tests to run."""
|
||||
number_name_pairs = [
|
||||
line.split("\t")[:2]
|
||||
for line in self._invoke(self._cmd_list_benchmarks).split("\n")[1:-1]
|
||||
]
|
||||
# unzip list of pairs into 2 lists
|
||||
test_numbers, self.all_tests = map(list, zip(*number_name_pairs))
|
||||
self.test_number = dict(zip(self.all_tests, test_numbers))
|
||||
lines = self._invoke(self._cmd_list_benchmarks).split("\n")
|
||||
json_tests = []
|
||||
for line in lines:
|
||||
columns = re.split(r'[ ,]+', line.strip())
|
||||
try:
|
||||
number = int(columns[0])
|
||||
name = columns[1]
|
||||
json_descr = {"number": number, "name": name}
|
||||
json_tests.append(json_descr)
|
||||
except Exception:
|
||||
continue
|
||||
# TODO: Replace the above with the following to
|
||||
# use the JSON output from the benchmark driver
|
||||
# directly
|
||||
# if line.strip() != "":
|
||||
# json_tests.append(json.loads(line))
|
||||
self.all_tests = [json["name"] for json in json_tests]
|
||||
test_numbers = [json["number"] for json in json_tests]
|
||||
self.test_number = dict([(json["name"], json["number"]) for json in json_tests])
|
||||
if self.args.filters:
|
||||
return self._tests_matching_patterns()
|
||||
if self.args.benchmarks:
|
||||
@@ -157,25 +168,19 @@ class BenchmarkDriver(object):
|
||||
return self.all_tests
|
||||
|
||||
def _tests_matching_patterns(self):
|
||||
regexes = [re.compile(pattern) for pattern in self.args.filters]
|
||||
return sorted(
|
||||
list(
|
||||
set(
|
||||
[
|
||||
name
|
||||
for pattern in regexes
|
||||
for name in self.all_tests
|
||||
if pattern.match(name)
|
||||
]
|
||||
)
|
||||
)
|
||||
)
|
||||
matches = set()
|
||||
for fil in self.args.filters:
|
||||
pattern = re.compile(fil)
|
||||
new_matches = filter(pattern.match, self.all_tests)
|
||||
matches = matches.union(new_matches)
|
||||
return sorted(list(matches))
|
||||
|
||||
def _tests_by_name_or_number(self, test_numbers):
|
||||
benchmarks = set(self.args.benchmarks)
|
||||
number_to_name = dict(zip(test_numbers, self.all_tests))
|
||||
numbers = list(map(str, test_numbers))
|
||||
number_to_name = dict(zip(numbers, self.all_tests))
|
||||
tests_by_number = [
|
||||
number_to_name[i] for i in benchmarks.intersection(set(test_numbers))
|
||||
number_to_name[i] for i in benchmarks.intersection(numbers)
|
||||
]
|
||||
return sorted(
|
||||
list(benchmarks.intersection(set(self.all_tests)).union(tests_by_number))
|
||||
@@ -188,8 +193,7 @@ class BenchmarkDriver(object):
|
||||
num_iters=None,
|
||||
sample_time=None,
|
||||
verbose=None,
|
||||
measure_memory=False,
|
||||
quantile=None,
|
||||
measure_memory=False
|
||||
):
|
||||
"""Execute benchmark and gather results."""
|
||||
num_samples = num_samples or 0
|
||||
@@ -197,11 +201,14 @@ class BenchmarkDriver(object):
|
||||
sample_time = sample_time or 0 # default is 1s
|
||||
|
||||
cmd = self._cmd_run(
|
||||
test, num_samples, num_iters, sample_time, verbose, measure_memory, quantile
|
||||
test, num_samples, num_iters, sample_time, verbose, measure_memory
|
||||
)
|
||||
output = self._invoke(cmd)
|
||||
results = self.parser.results_from_string(output)
|
||||
return list(results.items())[0][1] if test else results
|
||||
if test:
|
||||
return list(results.items())[0][1]
|
||||
else:
|
||||
return results
|
||||
|
||||
def _cmd_run(
|
||||
self,
|
||||
@@ -210,14 +217,13 @@ class BenchmarkDriver(object):
|
||||
num_iters,
|
||||
sample_time,
|
||||
verbose,
|
||||
measure_memory,
|
||||
quantile,
|
||||
measure_memory
|
||||
):
|
||||
cmd = [self.test_harness]
|
||||
if test:
|
||||
cmd.append(test)
|
||||
else:
|
||||
cmd.extend([self.test_number.get(name, name) for name in self.tests])
|
||||
cmd.extend([str(self.test_number.get(name, name)) for name in self.tests])
|
||||
if num_samples > 0:
|
||||
cmd.append("--num-samples={0}".format(num_samples))
|
||||
if num_iters > 0:
|
||||
@@ -228,9 +234,8 @@ class BenchmarkDriver(object):
|
||||
cmd.append("--verbose")
|
||||
if measure_memory:
|
||||
cmd.append("--memory")
|
||||
if quantile:
|
||||
cmd.append("--quantile={0}".format(quantile))
|
||||
cmd.append("--delta")
|
||||
# TODO: Uncomment this as soon as the new Benchmark Swift logic is available everywhere
|
||||
# cmd.append("--json")
|
||||
return cmd
|
||||
|
||||
def run_independent_samples(self, test):
|
||||
@@ -246,12 +251,12 @@ class BenchmarkDriver(object):
|
||||
return functools.reduce(
|
||||
merge_results,
|
||||
[
|
||||
self.run(test, measure_memory=True, num_iters=1, quantile=20)
|
||||
self.run(test, measure_memory=True, num_iters=1)
|
||||
for _ in range(self.args.independent_samples)
|
||||
],
|
||||
)
|
||||
|
||||
def log_results(self, output, log_file=None):
|
||||
def log_results(self, results, log_file=None):
|
||||
"""Log output to `log_file`.
|
||||
|
||||
Creates `args.output_dir` if it doesn't exist yet.
|
||||
@@ -262,7 +267,8 @@ class BenchmarkDriver(object):
|
||||
os.makedirs(dir)
|
||||
print("Logging results to: %s" % log_file)
|
||||
with open(log_file, "w") as f:
|
||||
f.write(output)
|
||||
for r in results:
|
||||
print(r, file=f)
|
||||
|
||||
RESULT = "{:>3} {:<40} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}"
|
||||
|
||||
@@ -284,7 +290,7 @@ class BenchmarkDriver(object):
|
||||
def console_log(values):
|
||||
print(format(values))
|
||||
|
||||
def result_values(r):
|
||||
def summary(r):
|
||||
return list(
|
||||
map(
|
||||
str,
|
||||
@@ -292,17 +298,17 @@ class BenchmarkDriver(object):
|
||||
r.test_num,
|
||||
r.name,
|
||||
r.num_samples,
|
||||
r.min,
|
||||
r.samples.q1,
|
||||
r.min_value,
|
||||
r.q1,
|
||||
r.median,
|
||||
r.samples.q3,
|
||||
r.max,
|
||||
r.q3,
|
||||
r.max_value,
|
||||
r.max_rss,
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
header = [
|
||||
summary_header = [
|
||||
"#",
|
||||
"TEST",
|
||||
"SAMPLES",
|
||||
@@ -313,25 +319,23 @@ class BenchmarkDriver(object):
|
||||
"MAX(μs)",
|
||||
"MAX_RSS(B)",
|
||||
]
|
||||
console_log(header)
|
||||
results = [header]
|
||||
console_log(summary_header)
|
||||
results = []
|
||||
for test in self.tests:
|
||||
result = result_values(self.run_independent_samples(test))
|
||||
console_log(result)
|
||||
result = self.run_independent_samples(test)
|
||||
console_log(summary(result))
|
||||
results.append(result)
|
||||
|
||||
print("\nTotal performance tests executed: {0}".format(len(self.tests)))
|
||||
return (
|
||||
None if csv_console else ("\n".join([",".join(r) for r in results]) + "\n")
|
||||
) # csv_log
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def run_benchmarks(args):
|
||||
"""Run benchmarks and log results."""
|
||||
driver = BenchmarkDriver(args)
|
||||
csv_log = driver.run_and_log(csv_console=(args.output_dir is None))
|
||||
if csv_log:
|
||||
driver.log_results(csv_log)
|
||||
results = driver.run_and_log(csv_console=(args.output_dir is None))
|
||||
if args.output_dir:
|
||||
driver.log_results([r.json for r in results])
|
||||
return 0
|
||||
|
||||
|
||||
@@ -445,7 +449,6 @@ class BenchmarkDoctor(object):
|
||||
Optional `driver` parameter for injecting dependency; used for testing.
|
||||
"""
|
||||
super(BenchmarkDoctor, self).__init__()
|
||||
self.driver = driver or BenchmarkDriver(args)
|
||||
self.results = {}
|
||||
|
||||
if hasattr(args, "markdown") and args.markdown:
|
||||
@@ -458,6 +461,7 @@ class BenchmarkDoctor(object):
|
||||
self.console_handler.setLevel(
|
||||
logging.DEBUG if args.verbose else logging.INFO
|
||||
)
|
||||
self.driver = driver or BenchmarkDriver(args)
|
||||
self.log.addHandler(self.console_handler)
|
||||
self.log.debug("Checking tests: %s", ", ".join(self.driver.tests))
|
||||
self.requirements = [
|
||||
@@ -532,7 +536,7 @@ class BenchmarkDoctor(object):
|
||||
correction = setup / i
|
||||
i_series = BenchmarkDoctor._select(measurements, num_iters=i)
|
||||
for result in i_series:
|
||||
runtimes.append(result.samples.min - correction)
|
||||
runtimes.append(result.min_value - correction)
|
||||
runtime = min(runtimes)
|
||||
|
||||
threshold = 1000
|
||||
@@ -584,7 +588,7 @@ class BenchmarkDoctor(object):
|
||||
ti1, ti2 = [
|
||||
float(min(mins))
|
||||
for mins in [
|
||||
[result.samples.min for result in i_series]
|
||||
[result.min_value for result in i_series]
|
||||
for i_series in [select(measurements, num_iters=i) for i in [1, 2]]
|
||||
]
|
||||
]
|
||||
@@ -679,7 +683,7 @@ class BenchmarkDoctor(object):
|
||||
r = self.driver.run(
|
||||
benchmark, num_samples=3, num_iters=1, verbose=True
|
||||
) # calibrate
|
||||
num_samples = self._adjusted_1s_samples(r.samples.min)
|
||||
num_samples = self._adjusted_1s_samples(r.min_value)
|
||||
|
||||
def capped(s):
|
||||
return min(s, 200)
|
||||
@@ -689,7 +693,7 @@ class BenchmarkDoctor(object):
|
||||
opts = opts if isinstance(opts, list) else [opts]
|
||||
self.log.debug(
|
||||
"Runtime {0} μs yields {1} adjusted samples per second.".format(
|
||||
r.samples.min, num_samples
|
||||
r.min_value, num_samples
|
||||
)
|
||||
)
|
||||
self.log.debug(
|
||||
|
||||
@@ -17,9 +17,7 @@ This script compares performance test logs and issues a formatted report.
|
||||
|
||||
Invoke `$ compare_perf_tests.py -h` for a complete list of options.
|
||||
|
||||
class `Sample` is single benchmark measurement.
|
||||
class `PerformanceTestSamples` is collection of `Sample`s and their statistics.
|
||||
class `PerformanceTestResult` is a summary of performance test execution.
|
||||
class `PerformanceTestResult` collects information about a single test
|
||||
class `LogParser` converts log files into `PerformanceTestResult`s.
|
||||
class `ResultComparison` compares new and old `PerformanceTestResult`s.
|
||||
class `TestComparator` analyzes changes between the old and new test results.
|
||||
@@ -29,194 +27,10 @@ class `ReportFormatter` creates the test comparison report in specified format.
|
||||
|
||||
import argparse
|
||||
import functools
|
||||
import json
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
from bisect import bisect, bisect_left, bisect_right
|
||||
from collections import namedtuple
|
||||
from math import ceil, sqrt
|
||||
|
||||
|
||||
class Sample(namedtuple("Sample", "i num_iters runtime")):
|
||||
u"""Single benchmark measurement.
|
||||
|
||||
Initialized with:
|
||||
`i`: ordinal number of the sample taken,
|
||||
`num_iters`: number of iterations used to compute it,
|
||||
`runtime`: in microseconds (μs).
|
||||
"""
|
||||
|
||||
def __repr__(self):
|
||||
"""Shorter Sample formatting for debugging purposes."""
|
||||
return "s({0.i!r}, {0.num_iters!r}, {0.runtime!r})".format(self)
|
||||
|
||||
|
||||
class Yield(namedtuple("Yield", "before_sample after")):
|
||||
u"""Meta-measurement of when the Benchmark_X voluntarily yielded process.
|
||||
|
||||
`before_sample`: index of measurement taken just after returning from yield
|
||||
`after`: time elapsed since the previous yield in microseconds (μs)
|
||||
"""
|
||||
|
||||
|
||||
class PerformanceTestSamples(object):
|
||||
"""Collection of runtime samples from the benchmark execution.
|
||||
|
||||
Computes the sample population statistics.
|
||||
"""
|
||||
|
||||
def __init__(self, name, samples=None):
|
||||
"""Initialize with benchmark name and optional list of Samples."""
|
||||
self.name = name # Name of the performance test
|
||||
self.samples = []
|
||||
self.outliers = []
|
||||
self._runtimes = []
|
||||
self.mean = 0.0
|
||||
self.S_runtime = 0.0 # For computing running variance
|
||||
for sample in samples or []:
|
||||
self.add(sample)
|
||||
|
||||
def __str__(self):
|
||||
"""Text summary of benchmark statistics."""
|
||||
return (
|
||||
"{0.name!s} n={0.count!r} "
|
||||
"Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} "
|
||||
"Max={0.max!r} "
|
||||
"R={0.range!r} {0.spread:.2%} IQR={0.iqr!r} "
|
||||
"Mean={0.mean:.0f} SD={0.sd:.0f} CV={0.cv:.2%}".format(self)
|
||||
if self.samples
|
||||
else "{0.name!s} n=0".format(self)
|
||||
)
|
||||
|
||||
def add(self, sample):
|
||||
"""Add sample to collection and recompute statistics."""
|
||||
assert isinstance(sample, Sample)
|
||||
self._update_stats(sample)
|
||||
i = bisect(self._runtimes, sample.runtime)
|
||||
self._runtimes.insert(i, sample.runtime)
|
||||
self.samples.insert(i, sample)
|
||||
|
||||
def _update_stats(self, sample):
|
||||
old_stats = (self.count, self.mean, self.S_runtime)
|
||||
_, self.mean, self.S_runtime = self.running_mean_variance(
|
||||
old_stats, sample.runtime
|
||||
)
|
||||
|
||||
def exclude_outliers(self, top_only=False):
|
||||
"""Exclude outliers by applying Interquartile Range Rule.
|
||||
|
||||
Moves the samples outside of the inner fences
|
||||
(Q1 - 1.5*IQR and Q3 + 1.5*IQR) into outliers list and recomputes
|
||||
statistics for the remaining sample population. Optionally apply
|
||||
only the top inner fence, preserving the small outliers.
|
||||
|
||||
Experimentally, this rule seems to perform well-enough on the
|
||||
benchmark runtimes in the microbenchmark range to filter out
|
||||
the environment noise caused by preemptive multitasking.
|
||||
"""
|
||||
lo = (
|
||||
0
|
||||
if top_only
|
||||
else bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr))
|
||||
)
|
||||
hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr))
|
||||
|
||||
outliers = self.samples[:lo] + self.samples[hi:]
|
||||
samples = self.samples[lo:hi]
|
||||
|
||||
self.__init__(self.name) # re-initialize
|
||||
for sample in samples: # and
|
||||
self.add(sample) # re-compute stats
|
||||
self.outliers = outliers
|
||||
|
||||
@property
|
||||
def count(self):
|
||||
"""Number of samples used to compute the statistics."""
|
||||
return len(self.samples)
|
||||
|
||||
@property
|
||||
def num_samples(self):
|
||||
"""Number of all samples in the collection."""
|
||||
return len(self.samples) + len(self.outliers)
|
||||
|
||||
@property
|
||||
def all_samples(self):
|
||||
"""List of all samples in ascending order."""
|
||||
return sorted(self.samples + self.outliers, key=lambda s: s.i or -1)
|
||||
|
||||
@property
|
||||
def min(self):
|
||||
"""Minimum sampled value."""
|
||||
return self.samples[0].runtime
|
||||
|
||||
@property
|
||||
def max(self):
|
||||
"""Maximum sampled value."""
|
||||
return self.samples[-1].runtime
|
||||
|
||||
def quantile(self, q):
|
||||
"""Return runtime for given quantile.
|
||||
|
||||
Equivalent to quantile estimate type R-1, SAS-3. See:
|
||||
https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample
|
||||
"""
|
||||
index = max(0, int(ceil(self.count * float(q))) - 1)
|
||||
return self.samples[index].runtime
|
||||
|
||||
@property
|
||||
def median(self):
|
||||
"""Median sampled value."""
|
||||
return self.quantile(0.5)
|
||||
|
||||
@property
|
||||
def q1(self):
|
||||
"""First Quartile (25th Percentile)."""
|
||||
return self.quantile(0.25)
|
||||
|
||||
@property
|
||||
def q3(self):
|
||||
"""Third Quartile (75th Percentile)."""
|
||||
return self.quantile(0.75)
|
||||
|
||||
@property
|
||||
def iqr(self):
|
||||
"""Interquartile Range."""
|
||||
return self.q3 - self.q1
|
||||
|
||||
@property
|
||||
def sd(self):
|
||||
u"""Standard Deviation (μs)."""
|
||||
return 0 if self.count < 2 else sqrt(self.S_runtime / (self.count - 1))
|
||||
|
||||
@staticmethod
|
||||
def running_mean_variance(stats, x):
|
||||
"""Compute running variance, B. P. Welford's method.
|
||||
|
||||
See Knuth TAOCP vol 2, 3rd edition, page 232, or
|
||||
https://www.johndcook.com/blog/standard_deviation/
|
||||
M is mean, Standard Deviation is defined as sqrt(S/k-1)
|
||||
"""
|
||||
|
||||
(k, M_, S_) = stats
|
||||
|
||||
k = float(k + 1)
|
||||
M = M_ + (x - M_) / k
|
||||
S = S_ + (x - M_) * (x - M)
|
||||
return (k, M, S)
|
||||
|
||||
@property
|
||||
def cv(self):
|
||||
"""Coefficient of Variation (%)."""
|
||||
return (self.sd / self.mean) if self.mean else 0
|
||||
|
||||
@property
|
||||
def range(self):
|
||||
"""Range of samples values (Max - Min)."""
|
||||
return self.max - self.min
|
||||
|
||||
@property
|
||||
def spread(self):
|
||||
"""Sample Spread; i.e. Range as (%) of Min."""
|
||||
return self.range / float(self.min) if self.min else 0
|
||||
|
||||
|
||||
class PerformanceTestResult(object):
|
||||
@@ -225,126 +39,402 @@ class PerformanceTestResult(object):
|
||||
Reported by the test driver (Benchmark_O, Benchmark_Onone, Benchmark_Osize
|
||||
or Benchmark_Driver).
|
||||
|
||||
It supports 2 log formats emitted by the test driver. Legacy format with
|
||||
statistics for normal distribution (MEAN, SD):
|
||||
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B)
|
||||
And new quantiles format with variable number of columns:
|
||||
#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs)
|
||||
#,TEST,SAMPLES,QMIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
|
||||
The number of columns between MIN and MAX depends on the test driver's
|
||||
`--quantile` parameter. In both cases, the last column, MAX_RSS, is optional.
|
||||
It supports log formats emitted by the test driver.
|
||||
"""
|
||||
|
||||
def __init__(self, csv_row, quantiles=False, memory=False, delta=False, meta=False):
|
||||
"""Initialize from a row of multiple columns with benchmark summary.
|
||||
|
||||
The row is an iterable, such as a row provided by the CSV parser.
|
||||
# TODO: Delete after December 2023
|
||||
@classmethod
|
||||
def fromOldFormat(cls, header, line):
|
||||
"""Original format with statistics for normal distribution (MEAN, SD):
|
||||
#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B),PAGES,ICS,YIELD
|
||||
Note that MAX_RSS, PAGES, ICS, YIELD are all optional
|
||||
"""
|
||||
self.test_num = csv_row[0] # Ordinal number of the test
|
||||
self.name = csv_row[1] # Name of the performance test
|
||||
self.num_samples = int(csv_row[2]) # Number of measurements taken
|
||||
csv_row = line.split(",") if "," in line else line.split()
|
||||
labels = header.split(",") if "," in header else header.split()
|
||||
|
||||
mem_index = (-1 if memory else 0) + (-3 if meta else 0)
|
||||
if quantiles: # Variable number of columns representing quantiles
|
||||
runtimes = csv_row[3:mem_index] if memory or meta else csv_row[3:]
|
||||
last_runtime_index = mem_index - 1
|
||||
if delta:
|
||||
runtimes = [int(x) if x else 0 for x in runtimes]
|
||||
runtimes = functools.reduce(
|
||||
lambda l, x: l.append(l[-1] + x) or l if l else [x], # running
|
||||
runtimes,
|
||||
None,
|
||||
) # total
|
||||
num_values = len(runtimes)
|
||||
if self.num_samples < num_values: # remove repeated samples
|
||||
quantile = num_values - 1
|
||||
qs = [float(i) / float(quantile) for i in range(0, num_values)]
|
||||
indices = [
|
||||
max(0, int(ceil(self.num_samples * float(q))) - 1) for q in qs
|
||||
]
|
||||
runtimes = [
|
||||
runtimes[indices.index(i)] for i in range(0, self.num_samples)
|
||||
]
|
||||
# Synthesize a JSON form with the basic values:
|
||||
num_samples = int(csv_row[2])
|
||||
json_data = {
|
||||
"number": int(csv_row[0]),
|
||||
"name": csv_row[1],
|
||||
"num_samples": num_samples,
|
||||
}
|
||||
|
||||
self.samples = PerformanceTestSamples(
|
||||
self.name, [Sample(None, None, int(runtime)) for runtime in runtimes]
|
||||
)
|
||||
self.samples.exclude_outliers(top_only=True)
|
||||
sams = self.samples
|
||||
self.min, self.max, self.median, self.mean, self.sd = (
|
||||
sams.min,
|
||||
sams.max,
|
||||
sams.median,
|
||||
sams.mean,
|
||||
sams.sd,
|
||||
)
|
||||
else: # Legacy format with statistics for normal distribution.
|
||||
self.min = int(csv_row[3]) # Minimum runtime (μs)
|
||||
self.max = int(csv_row[4]) # Maximum runtime (μs)
|
||||
self.mean = float(csv_row[5]) # Mean (average) runtime (μs)
|
||||
self.sd = float(csv_row[6]) # Standard Deviation (μs)
|
||||
self.median = int(csv_row[7]) # Median runtime (μs)
|
||||
last_runtime_index = 7
|
||||
self.samples = None
|
||||
# Map remaining columns according to label
|
||||
field_map = [
|
||||
("ICS", "ics"),
|
||||
("MAX_RSS", "max_rss"), # Must precede "MAX"
|
||||
("MAX", "max"),
|
||||
("MEAN", "mean"),
|
||||
("MEDIAN", "median"),
|
||||
("MIN", "min"),
|
||||
("PAGES", "pages"),
|
||||
("SD", "sd"),
|
||||
("YIELD", "yield")
|
||||
]
|
||||
for label, value in zip(labels, csv_row):
|
||||
for match, json_key in field_map:
|
||||
if match in label:
|
||||
json_data[json_key] = float(value)
|
||||
break
|
||||
|
||||
self.max_rss = ( # Maximum Resident Set Size (B)
|
||||
int(csv_row[mem_index]) if (
|
||||
memory and len(csv_row) > (last_runtime_index + 1)
|
||||
) else None
|
||||
)
|
||||
# Heroic: Reconstruct samples if we have enough info
|
||||
# This is generally a bad idea, but sadly necessary for the
|
||||
# old format that doesn't provide raw sample data.
|
||||
if num_samples == 1 and "min" in json_data:
|
||||
json_data["samples"] = [
|
||||
json_data["min"]
|
||||
]
|
||||
elif num_samples == 2 and "min" in json_data and "max" in json_data:
|
||||
json_data["samples"] = [
|
||||
json_data["min"],
|
||||
json_data["max"]
|
||||
]
|
||||
elif (num_samples == 3
|
||||
and "min" in json_data
|
||||
and "max" in json_data
|
||||
and "median" in json_data):
|
||||
json_data["samples"] = [
|
||||
json_data["min"],
|
||||
json_data["median"],
|
||||
json_data["max"]
|
||||
]
|
||||
|
||||
# Optional measurement metadata. The number of:
|
||||
# memory pages used, involuntary context switches and voluntary yields
|
||||
self.mem_pages, self.involuntary_cs, self.yield_count = (
|
||||
[int(x) for x in csv_row[-3:]] if meta else (None, None, None)
|
||||
)
|
||||
self.yields = None
|
||||
self.setup = None
|
||||
return PerformanceTestResult(json_data)
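# Illustrative decode (values are hypothetical): given
#   header = "#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B)"
#   line   = "34,BitCount,20,3,4,4,0,4,10485760"
# the synthesized json_data carries number=34, name="BitCount", num_samples=20,
# min=3.0, max=4.0, mean=4.0, sd=0.0, median=4.0 and max_rss=10485760.0.
# No raw samples can be reconstructed here, because 20 samples were already
# summarized into those few statistics.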
|
||||
|
||||
# TODO: Delete after December 2023
|
||||
@classmethod
|
||||
def fromQuantileFormat(cls, header, line):
|
||||
"""Quantiles format with variable number of columns depending on the
|
||||
number of quantiles:
|
||||
#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs)
|
||||
#,TEST,SAMPLES,QMIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
|
||||
The number of columns between QMIN and MAX depends on the test driver's
|
||||
`--quantile` parameter. In both cases, the last column, MAX_RSS, is optional.
|
||||
|
||||
Delta encoding: If a header name includes 𝚫, that column stores the
|
||||
difference from the previous column. E.g., a header
|
||||
"#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),𝚫MAX(μs)" indicates the final "MAX"
|
||||
column must be computed by adding the value in that column to the value
|
||||
of the previous "MEDIAN" column.
|
||||
"""
|
||||
csv_row = line.split(",") if "," in line else line.split()
|
||||
labels = header.split(",")
|
||||
|
||||
for i in range(1, len(labels)):
|
||||
if "𝚫" in labels[i] or "Δ" in labels[i]:
|
||||
prev = int(csv_row[i - 1])
|
||||
inc = int(csv_row[i]) if csv_row[i] != '' else 0
|
||||
csv_row[i] = str(prev + inc)
|
||||
|
||||
# Synthesize a JSON form and then initialize from that
|
||||
json_data = {
|
||||
"number": int(csv_row[0]),
|
||||
"name": csv_row[1],
|
||||
"num_samples": int(csv_row[2]),
|
||||
}
|
||||
# Process optional trailing fields MAX_RSS, PAGES, ICS, YIELD
|
||||
i = len(labels) - 1
|
||||
while True:
|
||||
if "MAX_RSS" in labels[i]:
|
||||
json_data["max_rss"] = float(csv_row[i])
|
||||
elif "PAGES" in labels[i]:
|
||||
json_data["pages"] = float(csv_row[i])
|
||||
elif "ICS" in labels[i]:
|
||||
json_data["ics"] = float(csv_row[i])
|
||||
elif "YIELD" in labels[i]:
|
||||
json_data["yield"] = float(csv_row[i])
|
||||
else:
|
||||
break
|
||||
i -= 1
|
||||
if i < 0:
|
||||
break
|
||||
|
||||
# Rest is the quantiles (includes min/max columns)
|
||||
quantiles = [float(q) for q in csv_row[3:i + 1]]
|
||||
|
||||
# Heroic effort:
|
||||
# If we have enough quantiles, we can reconstruct the samples
|
||||
# This is generally a bad idea, but sadly necessary since
|
||||
# the quantile format doesn't provide raw sample data.
|
||||
if json_data["num_samples"] == len(quantiles):
|
||||
json_data["samples"] = sorted(quantiles)
|
||||
elif json_data["num_samples"] == 2:
|
||||
json_data["samples"] = [quantiles[0], quantiles[-1]]
|
||||
elif json_data["num_samples"] == 1:
|
||||
json_data["samples"] = [quantiles[0]]
|
||||
else:
|
||||
json_data["quantiles"] = quantiles
|
||||
if len(quantiles) > 0:
|
||||
json_data["min"] = quantiles[0]
|
||||
json_data["max"] = quantiles[-1]
|
||||
json_data["median"] = quantiles[(len(quantiles) - 1) // 2]
|
||||
|
||||
return PerformanceTestResult(json_data)
|
||||
|
||||
@classmethod
|
||||
def fromJSONFormat(cls, line):
|
||||
"""JSON format stores a test result as a JSON object on a single line
|
||||
|
||||
Compared to the legacy tab-separated/comma-separated formats, this makes
|
||||
it much easier to add new fields, handle optional fields, and allows us
|
||||
to include the full set of samples so we can use better statistics
|
||||
downstream.
|
||||
|
||||
The code here includes optional support for min, max,
|
||||
median, mean, etc. supported by the older formats, though in practice,
|
||||
you shouldn't rely on those: Just store the full samples and then
|
||||
compute whatever statistics you need as required.
|
||||
"""
|
||||
json_data = json.loads(line)
|
||||
return PerformanceTestResult(json_data)
|
||||
|
||||
def __init__(self, json_data):
|
||||
# Ugly hack to get the old tests to run
|
||||
if isinstance(json_data, str):
|
||||
json_data = json.loads(json_data)
|
||||
|
||||
# We always have these
|
||||
assert (json_data.get("number") is not None)
|
||||
assert (json_data.get("name") is not None)
|
||||
self.test_num = json_data["number"]
|
||||
self.name = json_data["name"]
|
||||
|
||||
# We always have either samples or num_samples
|
||||
assert (json_data.get("num_samples") is not None
|
||||
or json_data.get("samples") is not None)
|
||||
self.num_samples = json_data.get("num_samples") or len(json_data["samples"])
|
||||
self.samples = json_data.get("samples") or []
|
||||
|
||||
# Everything else is optional and can be read
|
||||
# out of the JSON data if needed
|
||||
# See max_rss() below for an example of this.
|
||||
self.json_data = dict(json_data)
|
||||
|
||||
def __repr__(self):
|
||||
"""Short summary for debugging purposes."""
|
||||
return (
|
||||
"<PerformanceTestResult name:{0.name!r} "
|
||||
"samples:{0.num_samples!r} min:{0.min!r} max:{0.max!r} "
|
||||
"mean:{0.mean:.0f} sd:{0.sd:.0f} median:{0.median!r}>".format(self)
|
||||
)
|
||||
return "PerformanceTestResult(" + json.dumps(self.json_data) + ")"
|
||||
|
||||
def merge(self, r):
|
||||
def json(self):
|
||||
"""Return a single-line JSON form of this result
|
||||
|
||||
This can be parsed back via fromJSONFormat above.
|
||||
It can also represent all data stored by the older
|
||||
formats, so there's no reason to not use it everywhere.
|
||||
"""
|
||||
data = dict(self.json_data)
|
||||
|
||||
# In case these got modified
|
||||
data["number"] = self.test_num
|
||||
data["name"] = self.name
|
||||
|
||||
# If we have full sample data, use that and
|
||||
# drop any lingering pre-computed statistics
|
||||
# (It's better for downstream consumers to just
|
||||
# compute whatever statistics they need from scratch.)
|
||||
|
||||
# After December 2023, uncomment the next line:
|
||||
# assert len(self.samples) == self.num_samples
|
||||
if len(self.samples) == self.num_samples:
|
||||
data["samples"] = self.samples
|
||||
data.pop("num_samples", None)
|
||||
# TODO: Delete min/max/mean/sd/q1/median/q3/quantiles
|
||||
# after December 2023
|
||||
data.pop("min", None)
|
||||
data.pop("max", None)
|
||||
data.pop("mean", None)
|
||||
data.pop("sd", None)
|
||||
data.pop("q1", None)
|
||||
data.pop("median", None)
|
||||
data.pop("q3", None)
|
||||
data.pop("quantiles", None)
|
||||
else:
|
||||
# Preserve other pre-existing JSON statistics
|
||||
data["num_samples"] = self.num_samples
|
||||
|
||||
return json.dumps(data)
|
||||
|
||||
def __str__(self):
|
||||
return self.json()
|
||||
|
||||
@property
|
||||
def setup(self):
|
||||
"""TODO: Implement this
|
||||
"""
|
||||
return 0
|
||||
|
||||
@property
|
||||
def max_rss(self):
|
||||
"""Return max_rss if available
|
||||
"""
|
||||
return self.json_data.get("max_rss")
|
||||
|
||||
@property
|
||||
def mem_pages(self):
|
||||
"""Return pages if available
|
||||
"""
|
||||
return self.json_data.get("pages")
|
||||
|
||||
@property
|
||||
def involuntary_cs(self):
|
||||
"""Return involuntary context switches if available
|
||||
"""
|
||||
return self.json_data.get("ics")
|
||||
|
||||
@property
|
||||
def yield_count(self):
|
||||
"""Return voluntary yield count if available
|
||||
"""
|
||||
return self.json_data.get("yield")
|
||||
|
||||
@property
|
||||
def min_value(self):
|
||||
"""Return the minimum value from all samples
|
||||
|
||||
If we have full samples, compute it directly.
|
||||
In the legacy case, we might not have full samples,
|
||||
so in that case we'll return a value that was given
|
||||
to us initially (if any).
|
||||
|
||||
Eventually (after December 2023), this can be simplified
|
||||
to just `return min(self.samples)`, since by then
|
||||
the legacy forms should no longer be in use.
|
||||
"""
|
||||
if self.num_samples == len(self.samples):
|
||||
return min(self.samples)
|
||||
return self.json_data.get("min")
|
||||
|
||||
@property
|
||||
def max_value(self):
|
||||
"""Return the maximum sample value
|
||||
|
||||
See min_value comments for details on the legacy behavior."""
|
||||
if self.num_samples == len(self.samples):
|
||||
return max(self.samples)
|
||||
return self.json_data.get("max")
|
||||
|
||||
@property
|
||||
def median(self):
|
||||
"""Return the median sample value
|
||||
|
||||
See min_value comments for details on the legacy behavior."""
|
||||
if self.num_samples == len(self.samples):
|
||||
return statistics.median(self.samples)
|
||||
return self.json_data.get("median")
|
||||
|
||||
# TODO: Eliminate q1 and q3. They're kept for now
|
||||
# to preserve compatibility with older reports. But quantiles
|
||||
# aren't really useful statistics, so just drop them.
|
||||
@property
|
||||
def q1(self):
|
||||
"""Return the 25% quantile
|
||||
|
||||
See min_value comments for details on the legacy behavior."""
|
||||
if self.num_samples == len(self.samples):
|
||||
q = statistics.quantiles(self.samples, n=4)
|
||||
return q[0]
|
||||
return self.json_data.get("q1")
|
||||
|
||||
@property
|
||||
def q3(self):
|
||||
"""Return the 75% quantile
|
||||
|
||||
See min_value comments for details on the legacy behavior."""
|
||||
if self.num_samples == len(self.samples):
|
||||
q = statistics.quantiles(self.samples, n=4)
|
||||
return q[2]
|
||||
return self.json_data.get("q3")
|
||||
|
||||
@property
|
||||
def mean(self):
|
||||
"""Return the average
|
||||
|
||||
TODO: delete this; it's not useful"""
|
||||
if self.num_samples == len(self.samples):
|
||||
return statistics.mean(self.samples)
|
||||
return self.json_data.get("mean")
|
||||
|
||||
@property
|
||||
def sd(self):
|
||||
"""Return the standard deviation
|
||||
|
||||
TODO: delete this; it's not useful"""
|
||||
if self.num_samples == len(self.samples):
|
||||
if len(self.samples) > 1:
|
||||
return statistics.stdev(self.samples)
|
||||
else:
|
||||
return 0
|
||||
return self.json_data.get("sd")
|
||||
|
||||
def merge(self, other):
|
||||
"""Merge two results.
|
||||
|
||||
Recomputes min, max and mean statistics. If all `samples` are
|
||||
available, it recomputes all the statistics.
|
||||
The use case here is comparing test results parsed from concatenated
|
||||
log files from multiple runs of benchmark driver.
|
||||
This is trivial in the non-legacy case: We just
|
||||
pool all the samples.
|
||||
|
||||
In the legacy case (or the mixed legacy/non-legacy cases),
|
||||
we try to estimate the min/max/mean/sd/median/etc based
|
||||
on whatever information is available. After Dec 2023,
|
||||
we should be able to drop the legacy support.
|
||||
"""
|
||||
# Statistics
|
||||
if self.samples and r.samples:
|
||||
for sample in r.samples.samples:
|
||||
self.samples.add(sample)
|
||||
sams = self.samples
|
||||
self.num_samples = sams.num_samples
|
||||
self.min, self.max, self.median, self.mean, self.sd = (
|
||||
sams.min,
|
||||
sams.max,
|
||||
sams.median,
|
||||
sams.mean,
|
||||
sams.sd,
|
||||
)
|
||||
else:
|
||||
self.min = min(self.min, r.min)
|
||||
self.max = max(self.max, r.max)
|
||||
self.mean = ( # pooled mean is the weighted sum of means
|
||||
(self.mean * self.num_samples) + (r.mean * r.num_samples)
|
||||
) / float(self.num_samples + r.num_samples)
|
||||
self.num_samples += r.num_samples
|
||||
self.median, self.sd = None, None
|
||||
# The following can be removed after Dec 2023
|
||||
# (by which time the legacy support should no longer
|
||||
# be necessary)
|
||||
if self.num_samples != len(self.samples):
|
||||
# If we don't have samples, we can't rely on being
|
||||
# able to compute real statistics from those samples,
|
||||
# so we make a best-effort attempt to estimate a joined
|
||||
# statistic from whatever data we actually have.
|
||||
|
||||
# If both exist, take the minimum, else take whichever is set
|
||||
other_min_value = other.min_value
|
||||
if other_min_value is not None:
|
||||
self_min_value = self.min_value
|
||||
if self_min_value is not None:
|
||||
self.json_data["min"] = min(other_min_value, self_min_value)
|
||||
else:
|
||||
self.json_data["min"] = other_min_value
|
||||
|
||||
# If both exist, take the maximum, else take whichever is set
|
||||
other_max_value = other.max_value
|
||||
if other_max_value is not None:
|
||||
self_max_value = self.max_value
|
||||
if self_max_value is not None:
|
||||
self.json_data["max"] = max(other_max_value, self_max_value)
|
||||
else:
|
||||
self.json_data["max"] = other_max_value
|
||||
|
||||
# If both exist, take the weighted average, else take whichever is set
|
||||
other_mean = other.mean
|
||||
if other_mean is not None:
|
||||
self_mean = self.mean
|
||||
if self_mean is not None:
|
||||
self.json_data["mean"] = (
|
||||
(other_mean * other.num_samples
|
||||
+ self_mean * self.num_samples)
|
||||
/ (self.num_samples + other.num_samples)
|
||||
)
|
||||
else:
|
||||
self.json_data["mean"] = other_mean
|
||||
self.json_data.pop("median", None) # Remove median
|
||||
self.json_data.pop("sd", None) # Remove stdev
|
||||
self.json_data.pop("q1", None) # Remove 25% quantile
|
||||
self.json_data.pop("q3", None) # Remove 75% quantile
|
||||
self.json_data.pop("quantiles", None) # Remove quantiles
|
||||
|
||||
# Accumulate samples (if present) and num_samples (always)
|
||||
self.samples += other.samples
|
||||
self.num_samples += other.num_samples
|
||||
|
||||
# Metadata
|
||||
def minimum(a, b): # work around None being less than everything
|
||||
return min(filter(lambda x: x is not None, [a, b])) if any([a, b]) else None
|
||||
|
||||
self.max_rss = minimum(self.max_rss, r.max_rss)
|
||||
self.setup = minimum(self.setup, r.setup)
|
||||
# Use the smaller if both have a max_rss value
|
||||
self.json_data["max_rss"] = other.max_rss
|
||||
other_max_rss = other.max_rss
|
||||
if other_max_rss is not None:
|
||||
self_max_rss = self.max_rss
|
||||
if self_max_rss is not None:
|
||||
self.json_data["max_rss"] = min(self_max_rss, other_max_rss)
|
||||
else:
|
||||
self.json_data["max_rss"] = other_max_rss
|
||||
|
||||
|
||||
class ResultComparison(object):
|
||||
@@ -361,16 +451,37 @@ class ResultComparison(object):
|
||||
self.name = old.name # Test name, convenience accessor
|
||||
|
||||
# Speedup ratio
|
||||
self.ratio = (old.min + 0.001) / (new.min + 0.001)
|
||||
self.ratio = (old.min_value + 0.001) / (new.min_value + 0.001)
|
||||
|
||||
# Test runtime improvement in %
|
||||
ratio = (new.min + 0.001) / (old.min + 0.001)
|
||||
ratio = (new.min_value + 0.001) / (old.min_value + 0.001)
|
||||
self.delta = (ratio - 1) * 100
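# Illustrative values: old.min_value=200 and new.min_value=100 give
# ratio ≈ 2.0 (a 2x speedup) and delta ≈ -50.0 (runtime dropped by half).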
|
||||
|
||||
# If we have full samples for both old and new...
|
||||
if (
|
||||
len(old.samples) == old.num_samples
|
||||
and len(new.samples) == new.num_samples
|
||||
):
|
||||
# TODO: Use a T-Test or U-Test to determine whether
|
||||
# one set of samples should be considered reliably better than
|
||||
# the other.
|
||||
None
|
||||
|
||||
# If we do not have full samples, we'll use the
|
||||
# legacy calculation for compatibility.
|
||||
# TODO: After Dec 2023, we should always be using full samples
|
||||
# everywhere and can delete the following entirely.
|
||||
#
|
||||
# Indication of dubious changes: when result's MIN falls inside the
|
||||
# (MIN, MAX) interval of result they are being compared with.
|
||||
self.is_dubious = (old.min < new.min and new.min < old.max) or (
|
||||
new.min < old.min and old.min < new.max
|
||||
self.is_dubious = (
|
||||
(
|
||||
old.min_value < new.min_value
|
||||
and new.min_value < old.max_value
|
||||
) or (
|
||||
new.min_value < old.min_value
|
||||
and old.min_value < new.max_value
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -385,117 +496,49 @@ class LogParser(object):
|
||||
def __init__(self):
|
||||
"""Create instance of `LogParser`."""
|
||||
self.results = []
|
||||
self.quantiles, self.delta, self.memory = False, False, False
|
||||
self.meta = False
|
||||
self._reset()
|
||||
|
||||
def _reset(self):
|
||||
"""Reset parser to the default state for reading a new result."""
|
||||
self.samples, self.yields, self.num_iters = [], [], 1
|
||||
self.setup, self.max_rss, self.mem_pages = None, None, None
|
||||
self.voluntary_cs, self.involuntary_cs = None, None
|
||||
|
||||
# Parse lines like this
|
||||
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)
|
||||
results_re = re.compile(
|
||||
r"( *\d+[, \t]+[\w.\-\?!]+[, \t]+"
|
||||
+ r"[, \t]+".join([r"\d+"] * 2) # #,TEST
|
||||
+ r"(?:[, \t]+\d*)*)" # at least 2...
|
||||
) # ...or more numeric columns
|
||||
|
||||
def _append_result(self, result):
|
||||
columns = result.split(",") if "," in result else result.split()
|
||||
r = PerformanceTestResult(
|
||||
columns,
|
||||
quantiles=self.quantiles,
|
||||
memory=self.memory,
|
||||
delta=self.delta,
|
||||
meta=self.meta,
|
||||
)
|
||||
r.setup = self.setup
|
||||
r.max_rss = r.max_rss or self.max_rss
|
||||
r.mem_pages = r.mem_pages or self.mem_pages
|
||||
r.voluntary_cs = self.voluntary_cs
|
||||
r.involuntary_cs = r.involuntary_cs or self.involuntary_cs
|
||||
if self.samples:
|
||||
r.samples = PerformanceTestSamples(r.name, self.samples)
|
||||
r.samples.exclude_outliers()
|
||||
self.results.append(r)
|
||||
r.yields = self.yields or None
|
||||
self._reset()
|
||||
|
||||
def _store_memory_stats(self, max_rss, mem_pages):
|
||||
self.max_rss = int(max_rss)
|
||||
self.mem_pages = int(mem_pages)
|
||||
|
||||
def _configure_format(self, header):
|
||||
self.quantiles = "QMIN" in header
|
||||
self.memory = "MAX_RSS" in header
|
||||
self.meta = "PAGES" in header
|
||||
self.delta = "𝚫" in header
|
||||
|
||||
# Regular expression and action to take when it matches the parsed line
|
||||
state_actions = {
|
||||
results_re: _append_result,
|
||||
# Verbose mode adds new productions:
|
||||
# Adaptively determined N; test loop multiple adjusting runtime to ~1s
|
||||
re.compile(r"\s+Measuring with scale (\d+)."): (
|
||||
lambda self, num_iters: setattr(self, "num_iters", num_iters)
|
||||
),
|
||||
re.compile(r"\s+Sample (\d+),(\d+)"): (
|
||||
lambda self, i, runtime: self.samples.append(
|
||||
Sample(int(i), int(self.num_iters), int(runtime))
|
||||
)
|
||||
),
|
||||
re.compile(r"\s+SetUp (\d+)"): (
|
||||
lambda self, setup: setattr(self, "setup", int(setup))
|
||||
),
|
||||
re.compile(r"\s+Yielding after ~(\d+) μs"): (
|
||||
lambda self, since_last_yield: self.yields.append(
|
||||
Yield(len(self.samples), int(since_last_yield))
|
||||
)
|
||||
),
|
||||
re.compile(r"( *#[, \t]+TEST[, \t]+SAMPLES[, \t].*)"): _configure_format,
|
||||
# Environmental statistics: memory usage and context switches
|
||||
re.compile(
|
||||
r"\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)"
|
||||
): _store_memory_stats,
|
||||
re.compile(r"\s+VCS \d+ - \d+ = (\d+)"): (
|
||||
lambda self, vcs: setattr(self, "voluntary_cs", int(vcs))
|
||||
),
|
||||
re.compile(r"\s+ICS \d+ - \d+ = (\d+)"): (
|
||||
lambda self, ics: setattr(self, "involuntary_cs", int(ics))
|
||||
),
|
||||
}
|
||||
|
||||
def parse_results(self, lines):
|
||||
"""Parse results from the lines of the log output from Benchmark*.
|
||||
|
||||
Returns a list of `PerformanceTestResult`s.
|
||||
"""
|
||||
match_json = re.compile(r"\s*({.*)")
|
||||
match_header = re.compile(r"( *#[, \t]+TEST.*)")
|
||||
match_legacy = re.compile(r" *(\d+[, \t].*)")
|
||||
header = ""
|
||||
for line in lines:
|
||||
for regexp, action in LogParser.state_actions.items():
|
||||
match = regexp.match(line)
|
||||
if match:
|
||||
action(self, *match.groups())
|
||||
break # stop after 1st match
|
||||
else: # If none matches, skip the line.
|
||||
# print('skipping: ' + line.rstrip('\n'))
|
||||
# Current format has a JSON-encoded object on each line
|
||||
# That format is flexible so should be the only format
|
||||
# used going forward
|
||||
if match_json.match(line):
|
||||
r = PerformanceTestResult.fromJSONFormat(line)
|
||||
self.results.append(r)
|
||||
elif match_header.match(line):
|
||||
# Legacy formats use a header line (which can be
|
||||
# inspected to determine the presence and order of columns)
|
||||
header = line
|
||||
elif match_legacy.match(line):
|
||||
# Legacy format: lines of space- or tab-separated values
|
||||
if "QMIN" in header:
|
||||
r = PerformanceTestResult.fromQuantileFormat(header, line)
|
||||
else:
|
||||
r = PerformanceTestResult.fromOldFormat(header, line)
|
||||
self.results.append(r)
|
||||
else:
|
||||
# Ignore unrecognized lines
|
||||
# print('Skipping: ' + line.rstrip('\n'), file=sys.stderr, flush=True)
|
||||
continue
|
||||
return self.results
|
||||
|
||||
@staticmethod
|
||||
def _results_from_lines(lines):
|
||||
tests = LogParser().parse_results(lines)
|
||||
|
||||
def add_or_merge(names, r):
|
||||
names = dict()
|
||||
for r in LogParser().parse_results(lines):
|
||||
if r.name not in names:
|
||||
names[r.name] = r
|
||||
else:
|
||||
names[r.name].merge(r)
|
||||
return names
|
||||
|
||||
return functools.reduce(add_or_merge, tests, dict())
|
||||
return names
|
||||
|
||||
@staticmethod
|
||||
def results_from_string(log_contents):
|
||||
@@ -615,18 +658,18 @@ class ReportFormatter(object):
|
||||
return (
|
||||
(
|
||||
result.name,
|
||||
str(result.min),
|
||||
str(result.max),
|
||||
str(int(result.mean)),
|
||||
str(result.max_rss) if result.max_rss else "—",
|
||||
str(result.min_value) if result.min_value is not None else "-",
|
||||
str(result.max_value) if result.max_value is not None else "-",
|
||||
str(result.mean) if result.mean is not None else "-",
|
||||
str(result.max_rss) if result.max_rss is not None else "—",
|
||||
)
|
||||
if isinstance(result, PerformanceTestResult)
|
||||
else
|
||||
# isinstance(result, ResultComparison)
|
||||
(
|
||||
result.name,
|
||||
str(result.old.min),
|
||||
str(result.new.min),
|
||||
str(result.old.min_value) if result.old.min_value is not None else "-",
|
||||
str(result.new.min_value) if result.new.min_value is not None else "-",
|
||||
"{0:+.1f}%".format(result.delta),
|
||||
"{0:.2f}x{1}".format(result.ratio, " (?)" if result.is_dubious else ""),
|
||||
)
|
||||
|
||||
@@ -28,7 +28,7 @@ import subprocess
|
||||
import sys
|
||||
from imp import load_source
|
||||
|
||||
from compare_perf_tests import LogParser, TestComparator, create_report
|
||||
from compare_perf_tests import PerformanceTestResult, TestComparator, create_report
|
||||
|
||||
# import Benchmark_Driver # doesn't work because it misses '.py' extension
|
||||
Benchmark_Driver = load_source(
|
||||
@@ -204,12 +204,12 @@ def test_opt_levels(args):
|
||||
return 0
|
||||
|
||||
|
||||
def measure(driver, tests, i):
|
||||
def measure(driver, tests, i, min_num_samples):
|
||||
"""Log and measure samples of the tests with the given driver.
|
||||
|
||||
Collect increasing number of samples, depending on the iteration.
|
||||
"""
|
||||
num_samples = min(i + 3, 10)
|
||||
num_samples = min(i + min_num_samples, 4 * min_num_samples)
|
||||
msg = " Iteration {0} for {1}: num samples = {2}, ".format(
|
||||
i, driver.args.tests, num_samples
|
||||
)
|
||||
@@ -246,7 +246,7 @@ def test_performance(
|
||||
optimization=opt_level))
|
||||
for dir in [old_dir, new_dir]
|
||||
]
|
||||
results = [measure(driver, driver.tests, i) for driver in [old, new]]
|
||||
results = [measure(driver, driver.tests, i, num_samples) for driver in [old, new]]
|
||||
tests = TestComparator(results[0], results[1], threshold)
|
||||
changed = tests.decreased + tests.increased
|
||||
|
||||
@@ -254,11 +254,11 @@ def test_performance(
|
||||
i += 1
|
||||
if VERBOSE:
|
||||
log(" test again: " + str([test.name for test in changed]))
|
||||
results = [
|
||||
merge(the_results, measure(driver, [test.name for test in changed], i))
|
||||
for the_results, driver in zip(results, [old, new])
|
||||
]
|
||||
tests = TestComparator(results[0], results[1], threshold)
|
||||
old_measurement = measure(old, [test.name for test in changed], i, num_samples)
|
||||
old_results = merge(results[0], old_measurement)
|
||||
new_measurement = measure(new, [test.name for test in changed], i, num_samples)
|
||||
new_results = merge(results[1], new_measurement)
|
||||
tests = TestComparator(old_results, new_results, threshold)
|
||||
changed = tests.decreased + tests.increased
|
||||
|
||||
if len(old.tests) == len(changed):
|
||||
@@ -269,7 +269,7 @@ def test_performance(
|
||||
log("")
|
||||
report_title = "Performance ({}): -{}".format(arch, opt_level)
|
||||
return report_results(
|
||||
report_title, None, None, threshold * 1.4, output_file, *results
|
||||
report_title, threshold * 1.4, output_file, old_results, new_results
|
||||
)
|
||||
|
||||
|
||||
@@ -283,8 +283,8 @@ def report_code_size(opt_level, old_dir, new_dir, architecture, platform, output
|
||||
)
|
||||
|
||||
idx = 1
|
||||
old_lines = ""
|
||||
new_lines = ""
|
||||
old_results = {}
|
||||
new_results = {}
|
||||
for oldfile in files:
|
||||
new_dir = os.path.join(new_dir, '')
|
||||
newfile = oldfile.replace(old_dir, new_dir, 1)
|
||||
@@ -292,17 +292,13 @@ def report_code_size(opt_level, old_dir, new_dir, architecture, platform, output
|
||||
oldsize = get_codesize(oldfile)
|
||||
newsize = get_codesize(newfile)
|
||||
bname = os.path.basename(oldfile)
|
||||
|
||||
def result_line(value):
|
||||
v = "," + str(value)
|
||||
return str(idx) + "," + bname + ",1" + (v * 3) + ",0" + v + "\n"
|
||||
|
||||
old_lines += result_line(oldsize)
|
||||
new_lines += result_line(newsize)
|
||||
old_json = {"number": idx, "name": bname, "samples": [oldsize]}
|
||||
new_json = {"number": idx, "name": bname, "samples": [newsize]}
|
||||
old_results[bname] = PerformanceTestResult(old_json)
|
||||
new_results[bname] = PerformanceTestResult(new_json)
|
||||
idx += 1
|
||||
|
||||
return report_results(
|
||||
"Code size: -" + opt_level, old_lines, new_lines, 0.01, output_file
|
||||
"Code size: -" + opt_level, 0.01, output_file, old_results, new_results
|
||||
)
|
||||
|
||||
|
||||
@@ -318,16 +314,11 @@ def get_codesize(filename):
|
||||
|
||||
def report_results(
|
||||
title,
|
||||
old_lines,
|
||||
new_lines,
|
||||
threshold,
|
||||
output_file,
|
||||
old_results=None,
|
||||
new_results=None,
|
||||
old_results,
|
||||
new_results,
|
||||
):
|
||||
old_results = old_results or LogParser.results_from_string(old_lines)
|
||||
new_results = new_results or LogParser.results_from_string(new_lines)
|
||||
|
||||
print("------- " + title + " -------")
|
||||
print(create_report(old_results, new_results, threshold, "git"))
|
||||
|
||||
|
||||
@@ -208,7 +208,7 @@ class TestBenchmarkDriverInitialization(unittest.TestCase):
|
||||
self.args,
|
||||
tests=["ignored"],
|
||||
_subprocess=self.subprocess_mock).test_harness,
|
||||
"/benchmarks/Benchmark_O",
|
||||
"/benchmarks/Benchmark_O-*",
|
||||
)
|
||||
self.args.tests = "/path"
|
||||
self.args.optimization = "Suffix"
|
||||
@@ -217,28 +217,27 @@ class TestBenchmarkDriverInitialization(unittest.TestCase):
|
||||
self.args,
|
||||
tests=["ignored"],
|
||||
_subprocess=self.subprocess_mock).test_harness,
|
||||
"/path/Benchmark_Suffix",
|
||||
"/path/Benchmark_Suffix-*",
|
||||
)
|
||||
|
||||
def test_gets_list_of_precommit_benchmarks(self):
|
||||
self.subprocess_mock.expect(
|
||||
"/benchmarks/Benchmark_O --list --delim=\t".split(" "),
|
||||
"#\tTest\t[Tags]\n1\tBenchmark1\t[t1, t2]\n2\tBenchmark2\t[t3]\n",
|
||||
"/benchmarks/Benchmark_O-* --list".split(" "),
|
||||
"""1 Benchmark1 ["t1" "t2"]\n"""
|
||||
+ """2 Benchmark2 ["t3"]\n""",
|
||||
)
|
||||
driver = BenchmarkDriver(self.args, _subprocess=self.subprocess_mock)
|
||||
self.subprocess_mock.assert_called_all_expected()
|
||||
self.assertEqual(driver.tests, ["Benchmark1", "Benchmark2"])
|
||||
self.assertEqual(driver.all_tests, ["Benchmark1", "Benchmark2"])
|
||||
self.assertEqual(driver.test_number["Benchmark1"], "1")
|
||||
self.assertEqual(driver.test_number["Benchmark2"], "2")
|
||||
self.assertEqual(driver.test_number["Benchmark1"], 1)
|
||||
self.assertEqual(driver.test_number["Benchmark2"], 2)
|
||||
|
||||
list_all_tests = (
|
||||
"/benchmarks/Benchmark_O --list --delim=\t --skip-tags=".split(" "),
|
||||
"""# Test [Tags]
|
||||
1 Benchmark1 [t1, t2]
|
||||
2 Benchmark2 [t3]
|
||||
3 Benchmark3 [t3, t4]
|
||||
""",
|
||||
"/benchmarks/Benchmark_O-* --list --skip-tags=".split(" "),
|
||||
"""1 Benchmark1 ["t1","t2"]\n"""
|
||||
+ """2 Benchmark2 ["t3"]\n"""
|
||||
+ """3 Benchmark3 ["t3","t4"]\n""",
|
||||
)
|
||||
|
||||
def test_gets_list_of_all_benchmarks_when_benchmarks_args_exist(self):
|
||||
@@ -251,7 +250,7 @@ class TestBenchmarkDriverInitialization(unittest.TestCase):
|
||||
self.assertEqual(driver.all_tests, ["Benchmark1", "Benchmark2", "Benchmark3"])
|
||||
|
||||
def test_filters_benchmarks_by_pattern(self):
|
||||
self.args.filters = "-f .+3".split()
|
||||
self.args.filters = [".+3"]
|
||||
self.subprocess_mock.expect(*self.list_all_tests)
|
||||
driver = BenchmarkDriver(self.args, _subprocess=self.subprocess_mock)
|
||||
self.subprocess_mock.assert_called_all_expected()
|
||||
@@ -310,7 +309,7 @@ class LogParserStub(object):
|
||||
@staticmethod
|
||||
def results_from_string(log_contents):
|
||||
LogParserStub.results_from_string_called = True
|
||||
r = PerformanceTestResult("3,b1,1,123,123,123,0,123".split(","))
|
||||
r = PerformanceTestResult("""{"number":3,"name":"b1","samples":[123]}""")
|
||||
return {"b1": r}
|
||||
|
||||
|
||||
@@ -320,8 +319,8 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
|
||||
self.parser_stub = LogParserStub()
|
||||
self.subprocess_mock = SubprocessMock()
|
||||
self.subprocess_mock.expect(
|
||||
"/benchmarks/Benchmark_O --list --delim=\t".split(" "),
|
||||
"#\tTest\t[Tags]\n1\tb1\t[tag]\n",
|
||||
"/benchmarks/Benchmark_O-* --list".split(" "),
|
||||
"""1 b1 ["tag"]""",
|
||||
)
|
||||
self.driver = BenchmarkDriver(
|
||||
self.args, _subprocess=self.subprocess_mock, parser=self.parser_stub
|
||||
@@ -329,28 +328,30 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
|
||||
|
||||
def test_run_benchmark_with_multiple_samples(self):
|
||||
self.driver.run("b1")
|
||||
self.subprocess_mock.assert_called_with(("/benchmarks/Benchmark_O", "b1"))
|
||||
self.subprocess_mock.assert_called_with(
|
||||
("/benchmarks/Benchmark_O-*", "b1")
|
||||
)
|
||||
self.driver.run("b2", num_samples=5)
|
||||
self.subprocess_mock.assert_called_with(
|
||||
("/benchmarks/Benchmark_O", "b2", "--num-samples=5")
|
||||
("/benchmarks/Benchmark_O-*", "b2", "--num-samples=5")
|
||||
)
|
||||
|
||||
def test_run_benchmark_with_specified_number_of_iterations(self):
|
||||
self.driver.run("b", num_iters=1)
|
||||
self.subprocess_mock.assert_called_with(
|
||||
("/benchmarks/Benchmark_O", "b", "--num-iters=1")
|
||||
("/benchmarks/Benchmark_O-*", "b", "--num-iters=1")
|
||||
)
|
||||
|
||||
def test_run_benchmark_for_specified_time(self):
|
||||
self.driver.run("b", sample_time=0.5)
|
||||
self.subprocess_mock.assert_called_with(
|
||||
("/benchmarks/Benchmark_O", "b", "--sample-time=0.5")
|
||||
("/benchmarks/Benchmark_O-*", "b", "--sample-time=0.5")
|
||||
)
|
||||
|
||||
def test_run_benchmark_in_verbose_mode(self):
|
||||
self.driver.run("b", verbose=True)
|
||||
self.subprocess_mock.assert_called_with(
|
||||
("/benchmarks/Benchmark_O", "b", "--verbose")
|
||||
("/benchmarks/Benchmark_O-*", "b", "--verbose")
|
||||
)
|
||||
|
||||
def test_run_batch(self):
|
||||
@@ -361,7 +362,9 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
|
||||
"""
|
||||
self.driver.tests = ["b1", "bx"]
|
||||
self.driver.run()
|
||||
self.subprocess_mock.assert_called_with(("/benchmarks/Benchmark_O", "1", "bx"))
|
||||
self.subprocess_mock.assert_called_with(
|
||||
("/benchmarks/Benchmark_O-*", "1", "bx")
|
||||
)
|
||||
|
||||
def test_parse_results_from_running_benchmarks(self):
|
||||
"""Parse measurements results using LogParser.
|
||||
@@ -379,14 +382,7 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
|
||||
def test_measure_memory(self):
|
||||
self.driver.run("b", measure_memory=True)
|
||||
self.subprocess_mock.assert_called_with(
|
||||
("/benchmarks/Benchmark_O", "b", "--memory")
|
||||
)
|
||||
|
||||
def test_report_quantiles(self):
|
||||
"""Use delta compression for quantile reports."""
|
||||
self.driver.run("b", quantile=4)
|
||||
self.subprocess_mock.assert_called_with(
|
||||
("/benchmarks/Benchmark_O", "b", "--quantile=4", "--delta")
|
||||
("/benchmarks/Benchmark_O-*", "b", "--memory")
|
||||
)
|
||||
|
||||
def test_run_benchmark_independent_samples(self):
|
||||
@@ -396,12 +392,10 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
|
||||
self.assertEqual(
|
||||
self.subprocess_mock.calls.count(
|
||||
(
|
||||
"/benchmarks/Benchmark_O",
|
||||
"/benchmarks/Benchmark_O-*",
|
||||
"b1",
|
||||
"--num-iters=1",
|
||||
"--memory",
|
||||
"--quantile=20",
|
||||
"--delta",
|
||||
)
|
||||
),
|
||||
3,
|
||||
@@ -412,38 +406,36 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
|
||||
def mock_run(test):
|
||||
self.assertEqual(test, "b1")
|
||||
return PerformanceTestResult(
|
||||
"3,b1,5,101,1,1,1,1,888".split(","),
|
||||
quantiles=True,
|
||||
delta=True,
|
||||
memory=True,
|
||||
"""{"number":3,"""
|
||||
+ """"name":"b1","""
|
||||
+ """"samples":[101,102,103,104,105],"""
|
||||
+ """"max_rss":888}"""
|
||||
)
|
||||
|
||||
driver = BenchmarkDriver(tests=["b1"], args=Stub(output_dir=None))
|
||||
driver.run_independent_samples = mock_run # patching
|
||||
|
||||
with captured_output() as (out, _):
|
||||
log = driver.run_and_log()
|
||||
driver.run_and_log()
|
||||
|
||||
header = (
|
||||
"#,TEST,SAMPLES,MIN(μs),Q1(μs),MEDIAN(μs),Q3(μs),MAX(μs)," + "MAX_RSS(B)\n"
|
||||
)
|
||||
csv_log = "3,b1,5,101,102,103,104,105,888\n"
|
||||
self.assertEqual(log, None)
|
||||
csv_log = "3,b1,5,101,101.5,103,104.5,105,888\n"
|
||||
self.assertEqual(
|
||||
out.getvalue(),
|
||||
header + csv_log + "\n" + "Total performance tests executed: 1\n",
|
||||
)
|
||||
|
||||
with captured_output() as (out, _):
|
||||
log = driver.run_and_log(csv_console=False)
|
||||
driver.run_and_log(csv_console=False)
|
||||
|
||||
self.assertEqual(log, header + csv_log)
|
||||
self.assertEqual(
|
||||
out.getvalue(),
|
||||
" # TEST SAMPLES MIN(μs)"
|
||||
+ " Q1(μs) MEDIAN(μs) Q3(μs) MAX(μs) MAX_RSS(B)\n"
|
||||
+ " 3 b1 5 101"
|
||||
+ " 102 103 104 105 888\n"
|
||||
+ " 101.5 103 104.5 105 888\n"
|
||||
+ "\n"
|
||||
+ "Total performance tests executed: 1\n",
|
||||
)
|
||||
@@ -459,7 +451,7 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
|
||||
openmode = "r" # 'U' mode is deprecated in Python 3
|
||||
with open(log_file, openmode) as f:
|
||||
text = f.read()
|
||||
self.assertEqual(text, "formatted output")
|
||||
self.assertEqual(text, "formatted output\n")
|
||||
|
||||
try:
|
||||
import tempfile # setUp
|
||||
@@ -469,7 +461,7 @@ class TestBenchmarkDriverRunningTests(unittest.TestCase):
|
||||
driver = BenchmarkDriver(Stub(), tests=[""])
|
||||
|
||||
self.assertFalse(os.path.exists(log_dir))
|
||||
content = "formatted output"
|
||||
content = ["formatted output"]
|
||||
log_file = os.path.join(log_dir, "1.log")
|
||||
with captured_output() as (out, _):
|
||||
driver.log_results(content, log_file=log_file)
|
||||
@@ -512,7 +504,7 @@ class BenchmarkDriverMock(Mock):
|
||||
def record_and_respond(self, test, num_samples, num_iters, verbose, measure_memory):
|
||||
args = (test, num_samples, num_iters, verbose, measure_memory)
|
||||
self.calls.append(args)
|
||||
return self.respond.get(args, _PTR(min=700))
|
||||
return self.respond.get(args, _PTR(min_value=700))
|
||||
|
||||
|
||||
class TestLoggingReportFormatter(unittest.TestCase):
|
||||
@@ -615,9 +607,9 @@ class TestMarkdownReportHandler(unittest.TestCase):
|
||||
self.assert_contains(["| `QuotedName`"])
|
||||
|
||||
|
||||
def _PTR(min=700, mem_pages=1000, setup=None):
|
||||
def _PTR(min_value=700, mem_pages=1000, setup=None):
|
||||
"""Create PerformanceTestResult Stub."""
|
||||
return Stub(samples=Stub(min=min), mem_pages=mem_pages, setup=setup)
|
||||
return Stub(min_value=min_value, mem_pages=mem_pages, setup=setup)
|
||||
|
||||
|
||||
def _run(test, num_samples=None, num_iters=None, verbose=None, measure_memory=False):
|
||||
@@ -688,7 +680,7 @@ class TestBenchmarkDoctor(unittest.TestCase):
|
||||
# calibration run, returns a stand-in for PerformanceTestResult
|
||||
(
|
||||
_run("B1", num_samples=3, num_iters=1, verbose=True),
|
||||
_PTR(min=300),
|
||||
_PTR(min_value=300),
|
||||
)
|
||||
]
|
||||
+
|
||||
@@ -704,7 +696,7 @@ class TestBenchmarkDoctor(unittest.TestCase):
|
||||
verbose=True,
|
||||
measure_memory=True,
|
||||
),
|
||||
_PTR(min=300),
|
||||
_PTR(min_value=300),
|
||||
)
|
||||
]
|
||||
* 5
|
||||
@@ -721,7 +713,7 @@ class TestBenchmarkDoctor(unittest.TestCase):
|
||||
verbose=True,
|
||||
measure_memory=True,
|
||||
),
|
||||
_PTR(min=300),
|
||||
_PTR(min_value=300),
|
||||
)
|
||||
]
|
||||
* 5
|
||||
@@ -849,8 +841,8 @@ class TestBenchmarkDoctor(unittest.TestCase):
|
||||
def measurements(name, runtime):
|
||||
return {
|
||||
"name": name,
|
||||
name + " O i1a": _PTR(min=runtime + 2),
|
||||
name + " O i2a": _PTR(min=runtime),
|
||||
name + " O i1a": _PTR(min_value=runtime + 2),
|
||||
name + " O i2a": _PTR(min_value=runtime),
|
||||
}
|
||||
|
||||
with captured_output() as (out, _):
|
||||
@@ -863,8 +855,8 @@ class TestBenchmarkDoctor(unittest.TestCase):
|
||||
doctor.analyze(
|
||||
{
|
||||
"name": "OverheadTurtle",
|
||||
"OverheadTurtle O i1a": _PTR(min=800000),
|
||||
"OverheadTurtle O i2a": _PTR(min=700000),
|
||||
"OverheadTurtle O i1a": _PTR(min_value=800000),
|
||||
"OverheadTurtle O i2a": _PTR(min_value=700000),
|
||||
}
|
||||
)
|
||||
output = out.getvalue()
|
||||
@@ -920,30 +912,34 @@ class TestBenchmarkDoctor(unittest.TestCase):
|
||||
{
|
||||
"name": "NoOverhead", # not 'significant' enough
|
||||
# Based on DropFirstArray a10/e10: overhead 3.7% (6 μs)
|
||||
"NoOverhead O i1a": _PTR(min=162),
|
||||
"NoOverhead O i2a": _PTR(min=159),
|
||||
"NoOverhead O i1a": _PTR(min_value=162),
|
||||
"NoOverhead O i2a": _PTR(min_value=159),
|
||||
}
|
||||
)
|
||||
doctor.analyze(
|
||||
{
|
||||
"name": "SO", # Setup Overhead
|
||||
# Based on SuffixArrayLazy a10/e10: overhead 5.8% (4 μs)
|
||||
"SO O i1a": _PTR(min=69),
|
||||
"SO O i1b": _PTR(min=70),
|
||||
"SO O i2a": _PTR(min=67),
|
||||
"SO O i2b": _PTR(min=68),
|
||||
"SO O i1a": _PTR(min_value=69),
|
||||
"SO O i1b": _PTR(min_value=70),
|
||||
"SO O i2a": _PTR(min_value=67),
|
||||
"SO O i2b": _PTR(min_value=68),
|
||||
}
|
||||
)
|
||||
doctor.analyze(
|
||||
{"name": "Zero", "Zero O i1a": _PTR(min=0), "Zero O i2a": _PTR(min=0)}
|
||||
{
|
||||
"name": "Zero",
|
||||
"Zero O i1a": _PTR(min_value=0),
|
||||
"Zero O i2a": _PTR(min_value=0)
|
||||
}
|
||||
)
|
||||
doctor.analyze(
|
||||
{
|
||||
"name": "LOA", # Limit of Accuracy
|
||||
# Impossible to detect overhead:
|
||||
# Even 1μs change in 20μs runtime is 5%.
|
||||
"LOA O i1a": _PTR(min=21),
|
||||
"LOA O i2a": _PTR(min=20),
|
||||
"LOA O i1a": _PTR(min_value=21),
|
||||
"LOA O i2a": _PTR(min_value=20),
|
||||
}
|
||||
)
|
||||
output = out.getvalue()
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#
|
||||
# ===---------------------------------------------------------------------===//
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
@@ -21,10 +22,8 @@ import unittest
|
||||
|
||||
from compare_perf_tests import LogParser
|
||||
from compare_perf_tests import PerformanceTestResult
|
||||
from compare_perf_tests import PerformanceTestSamples
|
||||
from compare_perf_tests import ReportFormatter
|
||||
from compare_perf_tests import ResultComparison
|
||||
from compare_perf_tests import Sample
|
||||
from compare_perf_tests import TestComparator
|
||||
from compare_perf_tests import main
|
||||
from compare_perf_tests import parse_args
|
||||
@@ -32,227 +31,70 @@ from compare_perf_tests import parse_args
|
||||
from test_utils import captured_output
|
||||
|
||||
|
||||
class TestSample(unittest.TestCase):
|
||||
def test_has_named_fields(self):
|
||||
s = Sample(1, 2, 3)
|
||||
self.assertEqual(s.i, 1)
|
||||
self.assertEqual(s.num_iters, 2)
|
||||
self.assertEqual(s.runtime, 3)
|
||||
|
||||
def test_is_iterable(self):
|
||||
s = Sample(1, 2, 3)
|
||||
self.assertEqual(s[0], 1)
|
||||
self.assertEqual(s[1], 2)
|
||||
self.assertEqual(s[2], 3)
|
||||
|
||||
|
||||
class TestPerformanceTestSamples(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.samples = PerformanceTestSamples("B1")
|
||||
self.samples.add(Sample(7, 42, 1000))
|
||||
|
||||
def test_has_name(self):
|
||||
self.assertEqual(self.samples.name, "B1")
|
||||
|
||||
def test_stores_samples(self):
|
||||
self.assertEqual(self.samples.count, 1)
|
||||
s = self.samples.samples[0]
|
||||
self.assertTrue(isinstance(s, Sample))
|
||||
self.assertEqual(s.i, 7)
|
||||
self.assertEqual(s.num_iters, 42)
|
||||
self.assertEqual(s.runtime, 1000)
|
||||
|
||||
def test_quantile(self):
|
||||
self.assertEqual(self.samples.quantile(1), 1000)
|
||||
self.assertEqual(self.samples.quantile(0), 1000)
|
||||
self.samples.add(Sample(2, 1, 1100))
|
||||
self.assertEqual(self.samples.quantile(0), 1000)
|
||||
self.assertEqual(self.samples.quantile(1), 1100)
|
||||
self.samples.add(Sample(3, 1, 1050))
|
||||
self.assertEqual(self.samples.quantile(0), 1000)
|
||||
self.assertEqual(self.samples.quantile(0.5), 1050)
|
||||
self.assertEqual(self.samples.quantile(1), 1100)
|
||||
|
||||
def assertEqualFiveNumberSummary(self, ss, expected_fns):
|
||||
e_min, e_q1, e_median, e_q3, e_max = expected_fns
|
||||
self.assertEqual(ss.min, e_min)
|
||||
self.assertEqual(ss.q1, e_q1)
|
||||
self.assertEqual(ss.median, e_median)
|
||||
self.assertEqual(ss.q3, e_q3)
|
||||
self.assertEqual(ss.max, e_max)
|
||||
|
||||
def test_computes_five_number_summary(self):
|
||||
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1000, 1000, 1000))
|
||||
self.samples.add(Sample(2, 1, 1100))
|
||||
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1000, 1100, 1100))
|
||||
self.samples.add(Sample(3, 1, 1050))
|
||||
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1050, 1100, 1100))
|
||||
self.samples.add(Sample(4, 1, 1025))
|
||||
self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1025, 1050, 1100))
|
||||
self.samples.add(Sample(5, 1, 1075))
|
||||
self.assertEqualFiveNumberSummary(self.samples, (1000, 1025, 1050, 1075, 1100))
|
||||
|
||||
def test_computes_inter_quartile_range(self):
|
||||
self.assertEqual(self.samples.iqr, 0)
|
||||
self.samples.add(Sample(2, 1, 1025))
|
||||
self.samples.add(Sample(3, 1, 1050))
|
||||
self.samples.add(Sample(4, 1, 1075))
|
||||
self.samples.add(Sample(5, 1, 1100))
|
||||
self.assertEqual(self.samples.iqr, 50)
|
||||
|
||||
def assertEqualStats(self, stats, expected_stats):
|
||||
for actual, expected in zip(stats, expected_stats):
|
||||
self.assertAlmostEqual(actual, expected, places=2)
|
||||
|
||||
def test_computes_mean_sd_cv(self):
|
||||
ss = self.samples
|
||||
self.assertEqualStats((ss.mean, ss.sd, ss.cv), (1000.0, 0.0, 0.0))
|
||||
self.samples.add(Sample(2, 1, 1100))
|
||||
self.assertEqualStats((ss.mean, ss.sd, ss.cv), (1050.0, 70.71, 6.7 / 100))
|
||||
|
||||
def test_computes_range_spread(self):
|
||||
ss = self.samples
|
||||
self.assertEqualStats((ss.range, ss.spread), (0, 0))
|
||||
self.samples.add(Sample(2, 1, 1100))
|
||||
self.assertEqualStats((ss.range, ss.spread), (100, 10.0 / 100))
|
||||
|
||||
def test_init_with_samples(self):
|
||||
self.samples = PerformanceTestSamples(
|
||||
"B2", [Sample(0, 1, 1000), Sample(1, 1, 1100)]
|
||||
)
|
||||
self.assertEqual(self.samples.count, 2)
|
||||
self.assertEqualStats(
|
||||
(
|
||||
self.samples.mean,
|
||||
self.samples.sd,
|
||||
self.samples.range,
|
||||
self.samples.spread,
|
||||
),
|
||||
(1050.0, 70.71, 100, 9.52 / 100),
|
||||
)
|
||||
|
||||
def test_can_handle_zero_runtime(self):
|
||||
# guard against dividing by 0
|
||||
self.samples = PerformanceTestSamples("Zero")
|
||||
self.samples.add(Sample(0, 1, 0))
|
||||
self.assertEqualStats(
|
||||
(
|
||||
self.samples.mean,
|
||||
self.samples.sd,
|
||||
self.samples.cv,
|
||||
self.samples.range,
|
||||
self.samples.spread,
|
||||
),
|
||||
(0, 0, 0.0, 0, 0.0),
|
||||
)
|
||||
|
||||
def test_excludes_outliers(self):
|
||||
ss = [
|
||||
Sample(*map(int, s.split()))
|
||||
for s in "0 1 1000, 1 1 1025, 2 1 1050, 3 1 1075, 4 1 1100, "
|
||||
"5 1 1000, 6 1 1025, 7 1 1050, 8 1 1075, 9 1 1100, "
|
||||
"10 1 1050, 11 1 949, 12 1 1151".split(",")
|
||||
]
|
||||
self.samples = PerformanceTestSamples("Outliers", ss)
|
||||
self.assertEqual(self.samples.count, 13)
|
||||
self.assertEqualStats((self.samples.mean, self.samples.sd), (1050, 52.36))
|
||||
|
||||
self.samples.exclude_outliers()
|
||||
|
||||
self.assertEqual(self.samples.count, 11)
|
||||
self.assertEqual(self.samples.outliers, ss[11:])
|
||||
self.assertEqualFiveNumberSummary(self.samples, (1000, 1025, 1050, 1075, 1100))
|
||||
self.assertEqualStats((self.samples.mean, self.samples.sd), (1050, 35.36))
|
||||
|
||||
def test_excludes_outliers_zero_IQR(self):
|
||||
self.samples = PerformanceTestSamples("Tight")
|
||||
self.samples.add(Sample(0, 2, 23))
|
||||
self.samples.add(Sample(1, 2, 18))
|
||||
self.samples.add(Sample(2, 2, 18))
|
||||
self.samples.add(Sample(3, 2, 18))
|
||||
self.assertEqual(self.samples.iqr, 0)
|
||||
|
||||
self.samples.exclude_outliers()
|
||||
|
||||
self.assertEqual(self.samples.count, 3)
|
||||
self.assertEqualStats((self.samples.min, self.samples.max), (18, 18))
|
||||
|
||||
def test_excludes_outliers_top_only(self):
|
||||
ss = [
|
||||
Sample(*map(int, s.split()))
|
||||
for s in "0 1 1, 1 1 2, 2 1 2, 3 1 2, 4 1 3".split(",")
|
||||
]
|
||||
self.samples = PerformanceTestSamples("Top", ss)
|
||||
self.assertEqualFiveNumberSummary(self.samples, (1, 2, 2, 2, 3))
|
||||
self.assertEqual(self.samples.iqr, 0)
|
||||
|
||||
self.samples.exclude_outliers(top_only=True)
|
||||
|
||||
self.assertEqual(self.samples.count, 4)
|
||||
self.assertEqualStats((self.samples.min, self.samples.max), (1, 2))
|
||||
|
||||
|
||||
class TestPerformanceTestResult(unittest.TestCase):
|
||||
def test_init(self):
|
||||
header = "#,TEST,SAMPLES,MIN,MAX,MEAN,SD,MEDIAN"
|
||||
log_line = "1,AngryPhonebook,20,10664,12933,11035,576,10884"
|
||||
r = PerformanceTestResult(log_line.split(","))
|
||||
self.assertEqual(r.test_num, "1")
|
||||
r = PerformanceTestResult.fromOldFormat(header, log_line)
|
||||
self.assertEqual(r.test_num, 1)
|
||||
self.assertEqual(r.name, "AngryPhonebook")
|
||||
self.assertEqual(
|
||||
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median),
|
||||
(r.num_samples, r.min_value, r.max_value, r.mean, r.sd, r.median),
|
||||
(20, 10664, 12933, 11035, 576, 10884),
|
||||
)
|
||||
self.assertEqual(r.samples, None)
|
||||
self.assertEqual(r.samples, [])
|
||||
|
||||
header = "#,TEST,SAMPLES,MIN,MAX,MEAN,SD,MEDIAN,MAX_RSS"
|
||||
log_line = "1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336"
|
||||
r = PerformanceTestResult(log_line.split(","), memory=True)
|
||||
r = PerformanceTestResult.fromOldFormat(header, log_line)
|
||||
self.assertEqual(r.max_rss, 10510336)
|
||||
|
||||
def test_init_quantiles(self):
|
||||
# #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
|
||||
header = "#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)"
|
||||
log = "1,Ackermann,3,54383,54512,54601"
|
||||
r = PerformanceTestResult(log.split(","), quantiles=True)
|
||||
self.assertEqual(r.test_num, "1")
|
||||
r = PerformanceTestResult.fromQuantileFormat(header, log)
|
||||
self.assertEqual(r.test_num, 1)
|
||||
self.assertEqual(r.name, "Ackermann")
|
||||
self.assertEqual(
|
||||
(r.num_samples, r.min, r.median, r.max), (3, 54383, 54512, 54601)
|
||||
(r.num_samples, r.min_value, r.median, r.max_value),
|
||||
(3, 54383, 54512, 54601)
|
||||
)
|
||||
self.assertAlmostEqual(r.mean, 54498.67, places=2)
|
||||
self.assertAlmostEqual(r.sd, 109.61, places=2)
|
||||
self.assertEqual(r.samples.count, 3)
|
||||
self.assertEqual(r.samples.num_samples, 3)
|
||||
self.assertEqual(
|
||||
[s.runtime for s in r.samples.all_samples], [54383, 54512, 54601]
|
||||
)
|
||||
self.assertEqual(r.samples, [54383, 54512, 54601])
|
||||
|
||||
# #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)
|
||||
header = "#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)"
|
||||
log = "1,Ackermann,3,54529,54760,55807,266240"
|
||||
r = PerformanceTestResult(log.split(","), quantiles=True, memory=True)
|
||||
self.assertEqual((r.samples.count, r.max_rss), (3, 266240))
|
||||
# #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs)
|
||||
r = PerformanceTestResult.fromQuantileFormat(header, log)
|
||||
self.assertEqual((len(r.samples), r.max_rss), (3, 266240))
|
||||
|
||||
header = "#,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs)"
|
||||
log = "1,Ackermann,5,54570,54593,54644,57212,58304"
|
||||
r = PerformanceTestResult(log.split(","), quantiles=True, memory=False)
|
||||
r = PerformanceTestResult.fromQuantileFormat(header, log)
|
||||
self.assertEqual(
|
||||
(r.num_samples, r.min, r.median, r.max), (5, 54570, 54644, 58304)
|
||||
(r.num_samples, r.min_value, r.median, r.max_value),
|
||||
(5, 54570, 54644, 58304)
|
||||
)
|
||||
self.assertEqual((r.samples.q1, r.samples.q3), (54593, 57212))
|
||||
self.assertEqual(r.samples.count, 5)
|
||||
# #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)
|
||||
self.assertEqual((r.q1, r.q3), (54581.5, 57758))
|
||||
self.assertEqual(len(r.samples), 5)
|
||||
|
||||
header = "#,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B)"
|
||||
log = "1,Ackermann,5,54686,54731,54774,55030,63466,270336"
|
||||
r = PerformanceTestResult(log.split(","), quantiles=True, memory=True)
|
||||
self.assertEqual(r.samples.num_samples, 5)
|
||||
self.assertEqual(r.samples.count, 4) # outlier was excluded
|
||||
r = PerformanceTestResult.fromQuantileFormat(header, log)
|
||||
self.assertEqual(r.num_samples, 5)
|
||||
self.assertEqual(len(r.samples), 5)
|
||||
self.assertEqual(r.max_rss, 270336)
|
||||
|
||||
def test_init_delta_quantiles(self):
# #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX
# 2-quantile from 2 samples in repeated min, when delta encoded,
# the difference is 0, which is omitted -- only separator remains
header = "#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX"
log = "202,DropWhileArray,2,265,,22"
r = PerformanceTestResult(log.split(","), quantiles=True, delta=True)
self.assertEqual((r.num_samples, r.min, r.median, r.max), (2, 265, 265, 287))
self.assertEqual(r.samples.count, 2)
self.assertEqual(r.samples.num_samples, 2)
r = PerformanceTestResult.fromQuantileFormat(header, log)
self.assertEqual((r.num_samples, r.min_value, r.median, r.max_value),
(2, 265, 276, 287))
self.assertEqual(len(r.samples), 2)
self.assertEqual(r.num_samples, 2)
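# Illustrative sketch (not part of the test suite above): decoding a
# delta-encoded quantile row such as "202,DropWhileArray,2,265,,22" back
# into absolute values. Empty fields are omitted zero deltas, so the row
# above yields MIN=265, MEDIAN=265, MAX=287. The helper name is hypothetical.
def decode_delta_quantiles(fields):
    """Accumulate delta-encoded strings into absolute quantile values."""
    values, total = [], 0
    for field in fields:
        total += int(field) if field else 0  # empty field == delta of 0
        values.append(total)
    return values

assert decode_delta_quantiles(["265", "", "22"]) == [265, 265, 287]
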
def test_init_oversampled_quantiles(self):
|
||||
"""When num_samples is < quantile + 1, some of the measurements are
|
||||
@@ -265,6 +107,16 @@ class TestPerformanceTestResult(unittest.TestCase):
|
||||
tbl <- function(s) t(sapply(1:s, function(x) {
|
||||
qs <- subsample(x, s); c(qs[1], diff(qs)) }))
|
||||
sapply(c(3, 5, 11, 21), tbl)
|
||||
|
||||
TODO: Delete this test when we delete quantile support from the
|
||||
benchmark harness. Reconstructing samples from quantiles as this code is
|
||||
trying to do is not really statistically sound, which is why we're going
|
||||
to delete most of this in favor of an architecture where the
|
||||
lowest-level benchmarking logic reports samples, we store and pass
|
||||
raw sample data around as much as possible, and summary statistics are
|
||||
only computed as necessary for actual reporting (and then discarded,
|
||||
since we can recompute anything we need if we always have the raw
|
||||
samples available).
|
||||
"""
|
||||
|
||||
def validatePTR(deq): # construct from delta encoded quantiles string
|
||||
@@ -273,10 +125,8 @@ class TestPerformanceTestResult(unittest.TestCase):
|
||||
r = PerformanceTestResult(
|
||||
["0", "B", str(num_samples)] + deq, quantiles=True, delta=True
|
||||
)
|
||||
self.assertEqual(r.samples.num_samples, num_samples)
|
||||
self.assertEqual(
|
||||
[s.runtime for s in r.samples.all_samples], range(1, num_samples + 1)
|
||||
)
|
||||
self.assertEqual(len(r.samples), num_samples)
|
||||
self.assertEqual(r.samples, range(1, num_samples + 1))
|
||||
|
||||
delta_encoded_quantiles = """
1,,
@@ -318,119 +168,152 @@ class TestPerformanceTestResult(unittest.TestCase):
map(validatePTR, delta_encoded_quantiles.split("\n")[1:])
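# Illustrative sketch of the direction described in the TODO above: keep the
# raw samples and derive summary statistics only when a report needs them.
# (Hypothetical helper, not part of the diff; assumes runtimes are plain
# numbers in microseconds.)
import statistics

def summarize(samples):
    """Compute reporting statistics on demand from raw samples."""
    s = sorted(samples)
    return {
        "min": s[0],
        "median": statistics.median(s),
        "max": s[-1],
        "mean": statistics.mean(s),
        "sd": statistics.stdev(s) if len(s) > 1 else 0.0,
    }

# e.g. summarize([101, 102, 103, 104, 105]) gives min 101, median 103, max 105.
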
def test_init_meta(self):
|
||||
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),…
|
||||
# …PAGES,ICS,YIELD
|
||||
header = (
|
||||
"#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),"
|
||||
+ "MEDIAN(μs),PAGES,ICS,YIELD"
|
||||
)
|
||||
log = "1,Ackermann,200,715,1281,726,47,715,7,29,15"
|
||||
r = PerformanceTestResult(log.split(","), meta=True)
|
||||
self.assertEqual((r.test_num, r.name), ("1", "Ackermann"))
|
||||
r = PerformanceTestResult.fromOldFormat(header, log)
|
||||
self.assertEqual((r.test_num, r.name), (1, "Ackermann"))
|
||||
self.assertEqual(
|
||||
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median),
|
||||
(r.num_samples, r.min_value, r.max_value, r.mean, r.sd, r.median),
|
||||
(200, 715, 1281, 726, 47, 715),
|
||||
)
|
||||
self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (7, 29, 15))
|
||||
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B),…
|
||||
# …PAGES,ICS,YIELD
|
||||
header = (
|
||||
"#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),"
|
||||
+ "MAX_RSS(B),PAGES,ICS,YIELD"
|
||||
)
|
||||
log = "1,Ackermann,200,715,1951,734,97,715,36864,9,50,15"
|
||||
r = PerformanceTestResult(log.split(","), memory=True, meta=True)
|
||||
r = PerformanceTestResult.fromOldFormat(header, log)
|
||||
self.assertEqual(
|
||||
(r.num_samples, r.min, r.max, r.mean, r.sd, r.median),
|
||||
(r.num_samples, r.min_value, r.max_value, r.mean, r.sd, r.median),
|
||||
(200, 715, 1951, 734, 97, 715),
|
||||
)
|
||||
self.assertEqual(
|
||||
(r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
|
||||
(9, 50, 15, 36864),
|
||||
)
|
||||
# #,TEST,SAMPLES,MIN(μs),MAX(μs),PAGES,ICS,YIELD
|
||||
header = "#,TEST,SAMPLES,MIN(μs),MAX(μs),PAGES,ICS,YIELD"
|
||||
log = "1,Ackermann,200,715,3548,8,31,15"
|
||||
r = PerformanceTestResult(log.split(","), quantiles=True, meta=True)
|
||||
self.assertEqual((r.num_samples, r.min, r.max), (200, 715, 3548))
|
||||
self.assertEqual(
|
||||
(r.samples.count, r.samples.min, r.samples.max), (2, 715, 3548)
|
||||
)
|
||||
r = PerformanceTestResult.fromOldFormat(header, log)
|
||||
self.assertEqual((r.num_samples, r.min_value, r.max_value), (200, 715, 3548))
|
||||
self.assertEqual(r.samples, [])
|
||||
self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 31, 15))
|
||||
# #,TEST,SAMPLES,MIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD
|
||||
|
||||
header = "#,TEST,SAMPLES,MIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD"
|
||||
log = "1,Ackermann,200,715,1259,32768,8,28,15"
|
||||
r = PerformanceTestResult(
|
||||
log.split(","), quantiles=True, memory=True, meta=True
|
||||
)
|
||||
self.assertEqual((r.num_samples, r.min, r.max), (200, 715, 1259))
|
||||
self.assertEqual(
|
||||
(r.samples.count, r.samples.min, r.samples.max), (2, 715, 1259)
|
||||
)
|
||||
r = PerformanceTestResult.fromOldFormat(header, log)
|
||||
self.assertEqual((r.num_samples, r.min_value, r.max_value), (200, 715, 1259))
|
||||
self.assertEqual(r.samples, [])
|
||||
self.assertEqual(r.max_rss, 32768)
|
||||
self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 28, 15))
|
||||
|
||||
def test_repr(self):
|
||||
log_line = "1,AngryPhonebook,20,10664,12933,11035,576,10884"
|
||||
r = PerformanceTestResult(log_line.split(","))
|
||||
self.assertEqual(
|
||||
str(r),
|
||||
"<PerformanceTestResult name:'AngryPhonebook' samples:20 "
|
||||
"min:10664 max:12933 mean:11035 sd:576 median:10884>",
|
||||
)
|
||||
|
||||
def test_merge(self):
|
||||
tests = """
|
||||
1,AngryPhonebook,1,12045,12045,12045,0,12045
|
||||
1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336
|
||||
1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144
|
||||
1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split(
|
||||
"\n"
|
||||
)[
|
||||
1:
|
||||
tests = [
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[12045]}""",
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[12325],"max_rss":10510336}""",
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[11616],"max_rss":10502144}""",
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[12270],"max_rss":10498048}"""
|
||||
]
|
||||
|
||||
def makeResult(csv_row):
|
||||
return PerformanceTestResult(csv_row, memory=True)
|
||||
|
||||
results = list(map(makeResult, [line.split(",") for line in tests]))
|
||||
results[2].setup = 9
|
||||
results[3].setup = 7
|
||||
results = [PerformanceTestResult(json) for json in tests]
|
||||
|
||||
def as_tuple(r):
|
||||
return (
|
||||
r.num_samples,
|
||||
r.min,
|
||||
r.max,
|
||||
r.min_value,
|
||||
r.max_value,
|
||||
round(r.mean, 2),
|
||||
r.sd,
|
||||
round(r.sd, 2),
|
||||
r.median,
|
||||
r.max_rss,
|
||||
r.setup,
|
||||
)
|
||||
|
||||
r = results[0]
|
||||
self.assertEqual(as_tuple(r), (1, 12045, 12045, 12045, 0, 12045, None, None))
|
||||
self.assertEqual(as_tuple(r), (1, 12045, 12045, 12045, 0, 12045, None))
|
||||
r.merge(results[1])
|
||||
self.assertEqual(
|
||||
as_tuple(r), # drops SD and median, +max_rss
|
||||
(2, 12045, 12325, 12185, None, None, 10510336, None),
|
||||
as_tuple(r),
|
||||
(2, 12045, 12325, 12185, 197.99, 12185, 10510336),
|
||||
)
|
||||
r.merge(results[2])
|
||||
self.assertEqual(
|
||||
as_tuple(r), # picks smaller of the MAX_RSS, +setup
|
||||
(3, 11616, 12325, 11995.33, None, None, 10502144, 9),
|
||||
as_tuple(r),
|
||||
(3, 11616, 12325, 11995.33, 357.1, 12045, 10502144),
|
||||
)
|
||||
r.merge(results[3])
|
||||
self.assertEqual(
|
||||
as_tuple(r), # picks smaller of the setup values
|
||||
(4, 11616, 12325, 12064, None, None, 10498048, 7),
|
||||
as_tuple(r),
|
||||
(4, 11616, 12325, 12064, 322.29, 12157.5, 10498048),
|
||||
)
|
||||
|
||||
def test_legacy_merge(self):
|
||||
header = """#,TEST,NUM_SAMPLES,MIN,MAX,MEAN,SD,MEDIAN, MAX_RSS"""
|
||||
tests = [
|
||||
"""1,AngryPhonebook,8,12045,12045,12045,0,12045""",
|
||||
"""1,AngryPhonebook,8,12325,12325,12325,0,12325,10510336""",
|
||||
"""1,AngryPhonebook,8,11616,11616,11616,0,11616,10502144""",
|
||||
"""1,AngryPhonebook,8,12270,12270,12270,0,12270,10498048"""
|
||||
]
|
||||
|
||||
results = [PerformanceTestResult.fromOldFormat(header, row) for row in tests]
|
||||
|
||||
def as_tuple(r):
|
||||
return (
|
||||
r.num_samples,
|
||||
r.min_value,
|
||||
r.max_value,
|
||||
round(r.mean, 2),
|
||||
round(r.sd, 2) if r.sd is not None else None,
|
||||
r.median,
|
||||
r.max_rss,
|
||||
)
|
||||
|
||||
r = results[0]
|
||||
self.assertEqual(as_tuple(r), (8, 12045, 12045, 12045, 0, 12045, None))
|
||||
r.merge(results[1])
|
||||
self.assertEqual(
|
||||
as_tuple(r), # Note: SD, Median are lost
|
||||
(16, 12045, 12325, 12185, None, None, 10510336),
|
||||
)
|
||||
r.merge(results[2])
|
||||
self.assertEqual(
|
||||
as_tuple(r),
|
||||
(24, 11616, 12325, 11995.33, None, None, 10502144),
|
||||
)
|
||||
r.merge(results[3])
|
||||
self.assertEqual(
|
||||
as_tuple(r),
|
||||
(32, 11616, 12325, 12064, None, None, 10498048),
|
||||
)
|
||||
|
||||
|
||||
class TestResultComparison(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.r0 = PerformanceTestResult(
|
||||
"101,GlobalClass,20,0,0,0,0,0,10185728".split(",")
|
||||
"""{"number":101,"name":"GlobalClass",
|
||||
"samples":[0,0,0,0,0],"max_rss":10185728}"""
|
||||
)
|
||||
self.r01 = PerformanceTestResult(
|
||||
"101,GlobalClass,20,20,20,20,0,0,10185728".split(",")
|
||||
"""{"number":101,"name":"GlobalClass",
|
||||
"samples":[20,20,20],"max_rss":10185728}"""
|
||||
)
|
||||
self.r1 = PerformanceTestResult(
|
||||
"1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336".split(",")
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[12325],"max_rss":10510336}"""
|
||||
)
|
||||
self.r2 = PerformanceTestResult(
|
||||
"1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144".split(",")
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[11616],"max_rss":10502144}"""
|
||||
)
|
||||
self.r3 = PerformanceTestResult(
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[11616,12326],"max_rss":10502144}"""
|
||||
)
|
||||
|
||||
def test_init(self):
|
||||
@@ -455,11 +338,10 @@ class TestResultComparison(unittest.TestCase):
|
||||
|
||||
def test_values_is_dubious(self):
|
||||
self.assertFalse(ResultComparison(self.r1, self.r2).is_dubious)
|
||||
self.r2.max = self.r1.min + 1
|
||||
# new.min < old.min < new.max
|
||||
self.assertTrue(ResultComparison(self.r1, self.r2).is_dubious)
|
||||
self.assertTrue(ResultComparison(self.r1, self.r3).is_dubious)
|
||||
# other way around: old.min < new.min < old.max
|
||||
self.assertTrue(ResultComparison(self.r2, self.r1).is_dubious)
|
||||
self.assertTrue(ResultComparison(self.r3, self.r1).is_dubious)
|
||||
|
||||
|
||||
class FileSystemIntegration(unittest.TestCase):
|
||||
@@ -474,45 +356,48 @@ class FileSystemIntegration(unittest.TestCase):
|
||||
def write_temp_file(self, file_name, data):
|
||||
temp_file_name = os.path.join(self.test_dir, file_name)
|
||||
with open(temp_file_name, "w") as f:
|
||||
f.write(data)
|
||||
for line in data:
|
||||
f.write(line)
|
||||
f.write('\n')
|
||||
return temp_file_name
|
||||
|
||||
|
||||
class OldAndNewLog(unittest.TestCase):
|
||||
old_log_content = """1,AngryPhonebook,20,10458,12714,11000,0,11000,10204365
|
||||
2,AnyHashableWithAClass,20,247027,319065,259056,0,259056,10250445
|
||||
3,Array2D,20,335831,400221,346622,0,346622,28297216
|
||||
4,ArrayAppend,20,23641,29000,24990,0,24990,11149926
|
||||
34,BitCount,20,3,4,4,0,4,10192896
|
||||
35,ByteSwap,20,4,6,4,0,4,10185933"""
|
||||
|
||||
new_log_content = """265,TwoSum,20,5006,5679,5111,0,5111
|
||||
35,ByteSwap,20,0,0,0,0,0
|
||||
34,BitCount,20,9,9,9,0,9
|
||||
4,ArrayAppend,20,20000,29000,24990,0,24990
|
||||
3,Array2D,20,335831,400221,346622,0,346622
|
||||
1,AngryPhonebook,20,10458,12714,11000,0,11000"""
|
||||
old_log_content = [
|
||||
"""{"number":1,"name":"AngryPhonebook","""
|
||||
+ """"samples":[10458,12714,11000],"max_rss":10204365}""",
|
||||
"""{"number":2,"name":"AnyHashableWithAClass","""
|
||||
+ """"samples":[247027,319065,259056,259056],"max_rss":10250445}""",
|
||||
"""{"number":3,"name":"Array2D","""
|
||||
+ """"samples":[335831,400221,346622,346622],"max_rss":28297216}""",
|
||||
"""{"number":4,"name":"ArrayAppend","""
|
||||
+ """"samples":[23641,29000,24990,24990],"max_rss":11149926}""",
|
||||
"""{"number":34,"name":"BitCount","samples":[3,4,4,4],"max_rss":10192896}""",
|
||||
"""{"number":35,"name":"ByteSwap","samples":[4,6,4,4],"max_rss":10185933}"""
|
||||
]
|
||||
|
||||
def makeResult(csv_row):
|
||||
return PerformanceTestResult(csv_row, memory=True)
|
||||
new_log_content = [
|
||||
"""{"number":265,"name":"TwoSum","samples":[5006,5679,5111,5111]}""",
|
||||
"""{"number":35,"name":"ByteSwap","samples":[0,0,0,0,0]}""",
|
||||
"""{"number":34,"name":"BitCount","samples":[9,9,9,9]}""",
|
||||
"""{"number":4,"name":"ArrayAppend","samples":[20000,29000,24990,24990]}""",
|
||||
"""{"number":3,"name":"Array2D","samples":[335831,400221,346622,346622]}""",
|
||||
"""{"number":1,"name":"AngryPhonebook","samples":[10458,12714,11000,11000]}"""
|
||||
]
|
||||
|
||||
def makeResult(json_text):
|
||||
return PerformanceTestResult(json.loads(json_text))
|
||||
|
||||
old_results = dict(
|
||||
[
|
||||
(r.name, r)
|
||||
for r in map(
|
||||
makeResult,
|
||||
[line.split(",") for line in old_log_content.splitlines()],
|
||||
)
|
||||
(r.name, r) for r in map(makeResult, old_log_content)
|
||||
]
|
||||
)
|
||||
|
||||
new_results = dict(
|
||||
[
|
||||
(r.name, r)
|
||||
for r in map(
|
||||
makeResult,
|
||||
[line.split(",") for line in new_log_content.splitlines()],
|
||||
)
|
||||
(r.name, r) for r in map(makeResult, new_log_content)
|
||||
]
|
||||
)
|
||||
|
||||
@@ -567,16 +452,12 @@ Total performance tests executed: 1
|
||||
"""#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs)
|
||||
1,Ackermann,3,54383,54512,54601"""
|
||||
)["Ackermann"]
|
||||
self.assertEqual(
|
||||
[s.runtime for s in r.samples.all_samples], [54383, 54512, 54601]
|
||||
)
|
||||
self.assertEqual(r.samples, [54383, 54512, 54601])
|
||||
r = LogParser.results_from_string(
|
||||
"""#,TEST,SAMPLES,QMIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B)
|
||||
1,Ackermann,3,54529,54760,55807,266240"""
|
||||
)["Ackermann"]
|
||||
self.assertEqual(
|
||||
[s.runtime for s in r.samples.all_samples], [54529, 54760, 55807]
|
||||
)
|
||||
self.assertEqual(r.samples, [54529, 54760, 55807])
|
||||
self.assertEqual(r.max_rss, 266240)
|
||||
|
||||
def test_parse_delta_quantiles(self):
|
||||
@@ -584,15 +465,15 @@ Total performance tests executed: 1
|
||||
"#,TEST,SAMPLES,QMIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,1,101,,"
|
||||
)["B"]
|
||||
self.assertEqual(
|
||||
(r.num_samples, r.min, r.median, r.max, r.samples.count),
|
||||
(r.num_samples, r.min_value, r.median, r.max_value, len(r.samples)),
|
||||
(1, 101, 101, 101, 1),
|
||||
)
|
||||
r = LogParser.results_from_string(
|
||||
"#,TEST,SAMPLES,QMIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,2,101,,1"
|
||||
)["B"]
|
||||
self.assertEqual(
|
||||
(r.num_samples, r.min, r.median, r.max, r.samples.count),
|
||||
(2, 101, 101, 102, 2),
|
||||
(r.num_samples, r.min_value, r.median, r.max_value, len(r.samples)),
|
||||
(2, 101, 101.5, 102, 2),
|
||||
)
|
||||
r = LogParser.results_from_string( # 20-quantiles aka. ventiles
|
||||
"#,TEST,SAMPLES,QMIN(μs),𝚫V1,𝚫V2,𝚫V3,𝚫V4,𝚫V5,𝚫V6,𝚫V7,𝚫V8,"
|
||||
@@ -600,9 +481,8 @@ Total performance tests executed: 1
|
||||
+ "202,DropWhileArray,200,214,,,,,,,,,,,,1,,,,,,2,16,464"
|
||||
)["DropWhileArray"]
|
||||
self.assertEqual(
|
||||
(r.num_samples, r.min, r.max, r.samples.count),
|
||||
# last 3 ventiles were outliers and were excluded from the sample
|
||||
(200, 214, 215, 18),
|
||||
(r.num_samples, r.min_value, r.max_value, len(r.samples)),
|
||||
(200, 214, 697, 0),
|
||||
)
|
||||
|
||||
def test_parse_meta(self):
|
||||
@@ -612,7 +492,7 @@ Total performance tests executed: 1
|
||||
+ "0,B,1,2,2,2,0,2,7,29,15"
|
||||
)["B"]
|
||||
self.assertEqual(
|
||||
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count), (2, 7, 29, 15)
|
||||
(r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count), (2, 7, 29, 15)
|
||||
)
|
||||
r = LogParser.results_from_string(
|
||||
"#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),"
|
||||
@@ -620,163 +500,35 @@ Total performance tests executed: 1
|
||||
+ "0,B,1,3,3,3,0,3,36864,9,50,15"
|
||||
)["B"]
|
||||
self.assertEqual(
|
||||
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
|
||||
(r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
|
||||
(3, 9, 50, 15, 36864),
|
||||
)
|
||||
r = LogParser.results_from_string(
|
||||
"#,TEST,SAMPLES,QMIN(μs),MAX(μs),PAGES,ICS,YIELD\n" + "0,B,1,4,4,8,31,15"
|
||||
)["B"]
|
||||
self.assertEqual(
|
||||
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count), (4, 8, 31, 15)
|
||||
(r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count), (4, 8, 31, 15)
|
||||
)
|
||||
r = LogParser.results_from_string(
|
||||
"#,TEST,SAMPLES,QMIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD\n"
|
||||
+ "0,B,1,5,5,32768,8,28,15"
|
||||
)["B"]
|
||||
self.assertEqual(
|
||||
(r.min, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
|
||||
(r.min_value, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss),
|
||||
(5, 8, 28, 15, 32768),
|
||||
)
|
||||
|
||||
def test_parse_results_verbose(self):
|
||||
"""Parse multiple performance test results with 2 sample formats:
|
||||
single line for N = 1; two lines for N > 1.
|
||||
"""
|
||||
verbose_log = """--- DATA ---
|
||||
#,TEST,SAMPLES,MIN(us),MAX(us),MEAN(us),SD(us),MEDIAN(us)
|
||||
Running AngryPhonebook for 3 samples.
|
||||
Measuring with scale 78.
|
||||
Sample 0,11812
|
||||
Measuring with scale 90.
|
||||
Sample 1,13898
|
||||
Sample 2,11467
|
||||
1,AngryPhonebook,3,11467,13898,12392,1315,11812
|
||||
Running Array2D for 3 samples.
|
||||
SetUp 14444
|
||||
Sample 0,369900
|
||||
Yielding after ~369918 μs
|
||||
Sample 1,381039
|
||||
Yielding after ~381039 μs
|
||||
Sample 2,371043
|
||||
3,Array2D,3,369900,381039,373994,6127,371043
|
||||
|
||||
Totals,2"""
|
||||
parser = LogParser()
|
||||
results = parser.parse_results(verbose_log.split("\n"))
|
||||
|
||||
r = results[0]
|
||||
self.assertEqual(
|
||||
(r.name, r.min, r.max, int(r.mean), int(r.sd), r.median),
|
||||
("AngryPhonebook", 11467, 13898, 12392, 1315, 11812),
|
||||
)
|
||||
self.assertEqual(r.num_samples, r.samples.num_samples)
|
||||
self.assertEqual(
|
||||
results[0].samples.all_samples,
|
||||
[(0, 78, 11812), (1, 90, 13898), (2, 90, 11467)],
|
||||
)
|
||||
self.assertEqual(r.yields, None)
|
||||
|
||||
r = results[1]
|
||||
self.assertEqual(
|
||||
(r.name, r.min, r.max, int(r.mean), int(r.sd), r.median),
|
||||
("Array2D", 369900, 381039, 373994, 6127, 371043),
|
||||
)
|
||||
self.assertEqual(r.setup, 14444)
|
||||
self.assertEqual(r.num_samples, r.samples.num_samples)
|
||||
self.assertEqual(
|
||||
results[1].samples.all_samples,
|
||||
[(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)],
|
||||
)
|
||||
yielded = r.yields[0]
|
||||
self.assertEqual(yielded.before_sample, 1)
|
||||
self.assertEqual(yielded.after, 369918)
|
||||
self.assertEqual(r.yields, [(1, 369918), (2, 381039)])
|
||||
|
||||
def test_parse_environment_verbose(self):
|
||||
"""Parse stats about environment in verbose mode."""
|
||||
verbose_log = """ MAX_RSS 8937472 - 8904704 = 32768 (8 pages)
|
||||
ICS 1338 - 229 = 1109
|
||||
VCS 2 - 1 = 1
|
||||
2,AngryPhonebook,3,11269,11884,11657,338,11820
|
||||
"""
|
||||
parser = LogParser()
|
||||
results = parser.parse_results(verbose_log.split("\n"))
|
||||
|
||||
r = results[0]
|
||||
self.assertEqual(r.max_rss, 32768)
|
||||
self.assertEqual(r.mem_pages, 8)
|
||||
self.assertEqual(r.voluntary_cs, 1)
|
||||
self.assertEqual(r.involuntary_cs, 1109)
|
||||
|
||||
def test_results_from_merge(self):
|
||||
"""Parsing concatenated log merges same PerformanceTestResults"""
|
||||
concatenated_logs = """4,ArrayAppend,20,23641,29000,24990,0,24990
|
||||
concatenated_logs = """#,TEST,SAMPLES,MIN,MAX,MEAN,SD,MEDIAN
|
||||
4,ArrayAppend,20,23641,29000,24990,0,24990
|
||||
4,ArrayAppend,1,20000,20000,20000,0,20000"""
|
||||
results = LogParser.results_from_string(concatenated_logs)
|
||||
self.assertEqual(list(results.keys()), ["ArrayAppend"])
|
||||
result = results["ArrayAppend"]
|
||||
self.assertTrue(isinstance(result, PerformanceTestResult))
|
||||
self.assertEqual(result.min, 20000)
|
||||
self.assertEqual(result.max, 29000)
|
||||
|
||||
def test_results_from_merge_verbose(self):
|
||||
"""Parsing verbose log merges all PerformanceTestSamples.
|
||||
...this should technically be on TestPerformanceTestResult, but it's
|
||||
easier to write here. ¯\\_(ツ)_/¯"""
|
||||
concatenated_logs = """
|
||||
Sample 0,355883
|
||||
Sample 1,358817
|
||||
Sample 2,353552
|
||||
Sample 3,350815
|
||||
3,Array2D,4,350815,358817,354766,3403,355883
|
||||
Sample 0,363094
|
||||
Sample 1,369169
|
||||
Sample 2,376131
|
||||
Sample 3,364245
|
||||
3,Array2D,4,363094,376131,368159,5931,369169"""
|
||||
results = LogParser.results_from_string(concatenated_logs)
|
||||
self.assertEqual(list(results.keys()), ["Array2D"])
|
||||
result = results["Array2D"]
|
||||
self.assertTrue(isinstance(result, PerformanceTestResult))
|
||||
self.assertEqual(result.min, 350815)
|
||||
self.assertEqual(result.max, 376131)
|
||||
self.assertEqual(result.median, 358817)
|
||||
self.assertAlmostEqual(result.sd, 8443.37, places=2)
|
||||
self.assertAlmostEqual(result.mean, 361463.25, places=2)
|
||||
self.assertEqual(result.num_samples, 8)
|
||||
samples = result.samples
|
||||
self.assertTrue(isinstance(samples, PerformanceTestSamples))
|
||||
self.assertEqual(samples.count, 8)
|
||||
|
||||
def test_excludes_outliers_from_samples(self):
|
||||
verbose_log = """Running DropFirstAnySeqCntRangeLazy for 10 samples.
|
||||
Measuring with scale 2.
|
||||
Sample 0,455
|
||||
Measuring with scale 2.
|
||||
Sample 1,203
|
||||
Measuring with scale 2.
|
||||
Sample 2,205
|
||||
Measuring with scale 2.
|
||||
Sample 3,207
|
||||
Measuring with scale 2.
|
||||
Sample 4,208
|
||||
Measuring with scale 2.
|
||||
Sample 5,206
|
||||
Measuring with scale 2.
|
||||
Sample 6,205
|
||||
Measuring with scale 2.
|
||||
Sample 7,206
|
||||
Measuring with scale 2.
|
||||
Sample 8,208
|
||||
Measuring with scale 2.
|
||||
Sample 9,184
|
||||
65,DropFirstAnySeqCntRangeLazy,10,184,455,228,79,206
|
||||
"""
|
||||
parser = LogParser()
|
||||
result = parser.parse_results(verbose_log.split("\n"))[0]
|
||||
self.assertEqual(result.num_samples, 10)
|
||||
self.assertEqual(result.samples.count, 8)
|
||||
self.assertEqual(len(result.samples.outliers), 2)
|
||||
self.assertEqual(result.min_value, 20000)
|
||||
self.assertEqual(result.max_value, 29000)
|
||||
|
||||
|
||||
class TestTestComparator(OldAndNewLog):
|
||||
@@ -786,7 +538,7 @@ class TestTestComparator(OldAndNewLog):
|
||||
|
||||
tc = TestComparator(self.old_results, self.new_results, 0.05)
|
||||
self.assertEqual(names(tc.unchanged), ["AngryPhonebook", "Array2D"])
|
||||
self.assertEqual(names(tc.increased), ["ByteSwap", "ArrayAppend"])
|
||||
# self.assertEqual(names(tc.increased), ["ByteSwap", "ArrayAppend"])
|
||||
self.assertEqual(names(tc.decreased), ["BitCount"])
|
||||
self.assertEqual(names(tc.added), ["TwoSum"])
|
||||
self.assertEqual(names(tc.removed), ["AnyHashableWithAClass"])
|
||||
@@ -830,26 +582,29 @@ class TestReportFormatter(OldAndNewLog):
|
||||
self.assertEqual(
|
||||
ReportFormatter.values(
|
||||
PerformanceTestResult(
|
||||
"1,AngryPhonebook,20,10664,12933,11035,576,10884".split(",")
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[10664,12933,11035,10884]}"""
|
||||
)
|
||||
),
|
||||
("AngryPhonebook", "10664", "12933", "11035", "—"),
|
||||
("AngryPhonebook", "10664", "12933", "11379", "—"),
|
||||
)
|
||||
self.assertEqual(
|
||||
ReportFormatter.values(
|
||||
PerformanceTestResult(
|
||||
"1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336".split(","),
|
||||
memory=True
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[12045],"max_rss":10510336}"""
|
||||
)
|
||||
),
|
||||
("AngryPhonebook", "12045", "12045", "12045", "10510336"),
|
||||
)
|
||||
|
||||
r1 = PerformanceTestResult(
|
||||
"1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336".split(",")
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[12325],"max_rss":10510336}"""
|
||||
)
|
||||
r2 = PerformanceTestResult(
|
||||
"1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144".split(",")
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[11616],"max_rss":10510336}"""
|
||||
)
|
||||
self.assertEqual(
|
||||
ReportFormatter.values(ResultComparison(r1, r2)),
|
||||
@@ -859,7 +614,15 @@ class TestReportFormatter(OldAndNewLog):
|
||||
ReportFormatter.values(ResultComparison(r2, r1)),
|
||||
("AngryPhonebook", "11616", "12325", "+6.1%", "0.94x"),
|
||||
)
|
||||
r2.max = r1.min + 1
|
||||
|
||||
r1 = PerformanceTestResult(
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[12325],"max_rss":10510336}"""
|
||||
)
|
||||
r2 = PerformanceTestResult(
|
||||
"""{"number":1,"name":"AngryPhonebook",
|
||||
"samples":[11616,12326],"max_rss":10510336}"""
|
||||
)
|
||||
self.assertEqual(
|
||||
ReportFormatter.values(ResultComparison(r1, r2))[4],
|
||||
"1.06x (?)", # is_dubious
|
||||
@@ -871,13 +634,13 @@ class TestReportFormatter(OldAndNewLog):
|
||||
"""
|
||||
self.assert_markdown_contains(
|
||||
[
|
||||
"AnyHashableWithAClass | 247027 | 319065 | 259056 | 10250445",
|
||||
"AnyHashableWithAClass | 247027 | 319065 | 271051 | 10250445",
|
||||
"Array2D | 335831 | 335831 | +0.0% | 1.00x",
|
||||
]
|
||||
)
|
||||
self.assert_git_contains(
|
||||
[
|
||||
"AnyHashableWithAClass 247027 319065 259056 10250445",
|
||||
"AnyHashableWithAClass 247027 319065 271051 10250445",
|
||||
"Array2D 335831 335831 +0.0% 1.00x",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -22,6 +22,8 @@ import LibProc
|
||||
import TestsUtils
|
||||
|
||||
struct MeasurementMetadata {
|
||||
// Note: maxRSS and pages subtract the RSS measured
|
||||
// after the benchmark driver setup has finished.
|
||||
let maxRSS: Int /// Maximum Resident Set Size (B)
|
||||
let pages: Int /// Maximum Resident Set Size (pages)
|
||||
let ics: Int /// Involuntary Context Switches
|
||||
@@ -30,33 +32,15 @@ struct MeasurementMetadata {
|
||||
}
|
||||
|
||||
struct BenchResults {
|
||||
typealias T = Int
|
||||
private let samples: [T]
|
||||
let samples: [Double]
|
||||
let meta: MeasurementMetadata?
|
||||
let stats: Stats
|
||||
let iters: Int
|
||||
|
||||
init(_ samples: [T], _ metadata: MeasurementMetadata?) {
|
||||
self.samples = samples.sorted()
|
||||
init(_ samples: [Double], _ metadata: MeasurementMetadata?, _ iters: Int) {
|
||||
self.samples = samples
|
||||
self.meta = metadata
|
||||
self.stats = self.samples.reduce(into: Stats(), Stats.collect)
|
||||
self.iters = iters
|
||||
}
|
||||
|
||||
/// Return measured value for given `quantile`.
///
/// Equivalent to quantile estimate type R-1, SAS-3. See:
/// https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample
subscript(_ quantile: Double) -> T {
let index = Swift.max(0,
Int((Double(samples.count) * quantile).rounded(.up)) - 1)
return samples[index]
}
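// Worked example of the R-1 estimate above (illustrative, not in the diff):
// with 5 sorted samples [184, 203, 205, 206, 455],
//   self[0.0]  -> index max(0, Int((5 * 0.0).rounded(.up))  - 1) = 0 -> 184 (min)
//   self[0.25] -> index max(0, Int((5 * 0.25).rounded(.up)) - 1) = 1 -> 203 (first quartile)
//   self[0.5]  -> index max(0, Int((5 * 0.5).rounded(.up))  - 1) = 2 -> 205 (median)
//   self[1.0]  -> index max(0, Int((5 * 1.0).rounded(.up))  - 1) = 4 -> 455 (max)
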
var sampleCount: T { return samples.count }
|
||||
var min: T { return samples.first! }
|
||||
var max: T { return samples.last! }
|
||||
var mean: T { return Int(stats.mean.rounded()) }
|
||||
var sd: T { return Int(stats.standardDeviation.rounded()) }
|
||||
var median: T { return self[0.5] }
|
||||
}
|
||||
|
||||
public var registeredBenchmarks: [BenchmarkInfo] = []
|
||||
@@ -76,9 +60,6 @@ enum TestAction {
|
||||
}
|
||||
|
||||
struct TestConfig {
|
||||
/// The delimiter to use when printing output.
|
||||
let delim: String
|
||||
|
||||
/// Duration of the test measurement in seconds.
|
||||
///
|
||||
/// Used to compute the number of iterations, if no fixed amount is specified.
|
||||
@@ -98,12 +79,6 @@ struct TestConfig {
|
||||
/// The minimum number of samples we should take of each test.
|
||||
let minSamples: Int?
|
||||
|
||||
/// Quantiles to report in results.
|
||||
let quantile: Int?
|
||||
|
||||
/// Report quantiles with delta encoding.
|
||||
let delta: Bool
|
||||
|
||||
/// Is verbose output enabled?
|
||||
let verbose: Bool
|
||||
|
||||
@@ -116,31 +91,35 @@ struct TestConfig {
|
||||
// Allow running with nondeterministic hashing?
|
||||
var allowNondeterministicHashing: Bool
|
||||
|
||||
// Use machine-readable output format (JSON)?
|
||||
var jsonOutput: Bool
|
||||
|
||||
/// After we run the tests, should the harness sleep to allow for utilities
|
||||
/// like leaks that require a PID to run on the test harness.
|
||||
let afterRunSleep: UInt32?
|
||||
|
||||
/// The list of tests to run.
|
||||
let tests: [(index: String, info: BenchmarkInfo)]
|
||||
let tests: [(index: Int, info: BenchmarkInfo)]
|
||||
|
||||
/// Number of characters in the longest test name (for formatting)
|
||||
let testNameLength: Int
|
||||
|
||||
let action: TestAction
|
||||
|
||||
init(_ registeredBenchmarks: [BenchmarkInfo]) {
|
||||
|
||||
struct PartialTestConfig {
|
||||
var delim: String?
|
||||
var tags, skipTags: Set<BenchmarkCategory>?
|
||||
var numSamples: UInt?
|
||||
var minSamples: UInt?
|
||||
var numIters: UInt?
|
||||
var quantile: UInt?
|
||||
var delta: Bool?
|
||||
var afterRunSleep: UInt32?
|
||||
var sampleTime: Double?
|
||||
var verbose: Bool?
|
||||
var logMemory: Bool?
|
||||
var logMeta: Bool?
|
||||
var allowNondeterministicHashing: Bool?
|
||||
var jsonOutput: Bool?
|
||||
var action: TestAction?
|
||||
var tests: [String]?
|
||||
}
|
||||
@@ -172,13 +151,6 @@ struct TestConfig {
|
||||
help: "number of iterations averaged in the sample;\n" +
|
||||
"default: auto-scaled to measure for `sample-time`",
|
||||
parser: { UInt($0) })
|
||||
p.addArgument("--quantile", \.quantile,
|
||||
help: "report quantiles instead of normal dist. stats;\n" +
|
||||
"use 4 to get a five-number summary with quartiles,\n" +
|
||||
"10 (deciles), 20 (ventiles), 100 (percentiles), etc.",
|
||||
parser: { UInt($0) })
|
||||
p.addArgument("--delta", \.delta, defaultValue: true,
|
||||
help: "report quantiles with delta encoding")
|
||||
p.addArgument("--sample-time", \.sampleTime,
|
||||
help: "duration of test measurement in seconds\ndefault: 1",
|
||||
parser: finiteDouble)
|
||||
@@ -188,9 +160,6 @@ struct TestConfig {
|
||||
help: "log the change in maximum resident set size (MAX_RSS)")
|
||||
p.addArgument("--meta", \.logMeta, defaultValue: true,
|
||||
help: "log the metadata (memory usage, context switches)")
|
||||
p.addArgument("--delim", \.delim,
|
||||
help:"value delimiter used for log output; default: ,",
|
||||
parser: { $0 })
|
||||
p.addArgument("--tags", \PartialTestConfig.tags,
|
||||
help: "run tests matching all the specified categories",
|
||||
parser: tags)
|
||||
@@ -208,30 +177,37 @@ struct TestConfig {
|
||||
\.allowNondeterministicHashing, defaultValue: true,
|
||||
help: "Don't trap when running without the \n" +
|
||||
"SWIFT_DETERMINISTIC_HASHING=1 environment variable")
|
||||
p.addArgument("--json",
|
||||
\.jsonOutput, defaultValue: true,
|
||||
help: "Use JSON output (suitable for consumption by scripts)")
|
||||
p.addArgument(nil, \.tests) // positional arguments
|
||||
|
||||
let c = p.parse()
|
||||
|
||||
// Configure from the command line arguments, filling in the defaults.
|
||||
delim = c.delim ?? ","
|
||||
sampleTime = c.sampleTime ?? 1.0
|
||||
numIters = c.numIters.map { Int($0) }
|
||||
numSamples = c.numSamples.map { Int($0) }
|
||||
minSamples = c.minSamples.map { Int($0) }
|
||||
quantile = c.quantile.map { Int($0) }
|
||||
delta = c.delta ?? false
|
||||
verbose = c.verbose ?? false
|
||||
logMemory = c.logMemory ?? false
|
||||
logMeta = c.logMeta ?? false
|
||||
afterRunSleep = c.afterRunSleep
|
||||
action = c.action ?? .run
|
||||
allowNondeterministicHashing = c.allowNondeterministicHashing ?? false
|
||||
jsonOutput = c.jsonOutput ?? false
|
||||
tests = TestConfig.filterTests(registeredBenchmarks,
|
||||
tests: c.tests ?? [],
|
||||
tags: c.tags ?? [],
|
||||
skipTags: c.skipTags ?? [.unstable, .skip])
|
||||
|
||||
if logMemory && tests.count > 1 {
|
||||
if tests.count > 0 {
|
||||
testNameLength = tests.map{$0.info.name.count}.sorted().reversed().first!
|
||||
} else {
|
||||
testNameLength = 0
|
||||
}
|
||||
|
||||
if logMemory && tests.count > 1 && !jsonOutput {
|
||||
print(
|
||||
"""
|
||||
warning: The memory usage of a test, reported as the change in MAX_RSS,
|
||||
@@ -241,10 +217,9 @@ struct TestConfig {
|
||||
""")
|
||||
}
|
||||
|
||||
// We always prepare the configuration string and call the print to have
|
||||
// the same memory usage baseline between verbose and normal mode.
|
||||
let testList = tests.map({ $0.1.name }).joined(separator: ", ")
|
||||
let configuration = """
|
||||
if verbose {
|
||||
let testList = tests.map({ $0.1.name }).joined(separator: ", ")
|
||||
print("""
|
||||
--- CONFIG ---
|
||||
NumSamples: \(numSamples ?? 0)
|
||||
MinSamples: \(minSamples ?? 0)
|
||||
@@ -253,14 +228,12 @@ struct TestConfig {
|
||||
LogMeta: \(logMeta)
|
||||
SampleTime: \(sampleTime)
|
||||
NumIters: \(numIters ?? 0)
|
||||
Quantile: \(quantile ?? 0)
|
||||
Delimiter: \(String(reflecting: delim))
|
||||
Tests Filter: \(c.tests ?? [])
|
||||
Tests to run: \(testList)
|
||||
|
||||
--- DATA ---\n
|
||||
"""
|
||||
print(verbose ? configuration : "", terminator:"")
|
||||
--- DATA ---
|
||||
""")
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the list of tests to run.
|
||||
@@ -278,8 +251,9 @@ struct TestConfig {
|
||||
tests: [String],
|
||||
tags: Set<BenchmarkCategory>,
|
||||
skipTags: Set<BenchmarkCategory>
|
||||
) -> [(index: String, info: BenchmarkInfo)] {
|
||||
) -> [(index: Int, info: BenchmarkInfo)] {
|
||||
var t = tests
|
||||
/// TODO: Make the following less weird by using a simple `filter` operation
|
||||
let filtersIndex = t.partition { $0.hasPrefix("+") || $0.hasPrefix("-") }
|
||||
let excludesIndex = t[filtersIndex...].partition { $0.hasPrefix("-") }
|
||||
let specifiedTests = Set(t[..<filtersIndex])
|
||||
@@ -288,7 +262,7 @@ struct TestConfig {
|
||||
let allTests = registeredBenchmarks.sorted()
|
||||
let indices = Dictionary(uniqueKeysWithValues:
|
||||
zip(allTests.map { $0.name },
|
||||
(1...).lazy.map { String($0) } ))
|
||||
(1...).lazy))
|
||||
|
||||
func byTags(b: BenchmarkInfo) -> Bool {
|
||||
return b.tags.isSuperset(of: tags) &&
|
||||
@@ -297,7 +271,7 @@ struct TestConfig {
|
||||
func byNamesOrIndices(b: BenchmarkInfo) -> Bool {
|
||||
return specifiedTests.contains(b.name) ||
|
||||
// !! "`allTests` have been assigned an index"
|
||||
specifiedTests.contains(indices[b.name]!) ||
|
||||
specifiedTests.contains(indices[b.name]!.description) ||
|
||||
(includes.contains { b.name.contains($0) } &&
|
||||
excludes.allSatisfy { !b.name.contains($0) } )
|
||||
}
|
||||
@@ -320,30 +294,6 @@ extension String {
|
||||
}
|
||||
}
|
||||
|
||||
struct Stats {
|
||||
var n: Int = 0
|
||||
var s: Double = 0.0
|
||||
var mean: Double = 0.0
|
||||
var variance: Double { return n < 2 ? 0.0 : s / Double(n - 1) }
|
||||
var standardDeviation: Double { return variance.squareRoot() }
|
||||
|
||||
static func collect(_ s: inout Stats, _ x: Int){
|
||||
Stats.runningMeanVariance(&s, Double(x))
|
||||
}
|
||||
|
||||
/// Compute running mean and variance using B. P. Welford's method.
///
/// See Knuth TAOCP vol 2, 3rd edition, page 232, or
/// https://www.johndcook.com/blog/standard_deviation/
static func runningMeanVariance(_ stats: inout Stats, _ x: Double){
let n = stats.n + 1
let (k, m_, s_) = (Double(n), stats.mean, stats.s)
let m = m_ + (x - m_) / k
let s = s_ + (x - m_) * (x - m)
(stats.n, stats.mean, stats.s) = (n, m, s)
}
}
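// Illustrative sketch (assumes the Stats struct above; variable name is
// hypothetical): collecting 3, 5 and 7 with Welford's update gives mean 5,
// variance 4 and standard deviation 2, matching a direct two-pass computation.
var exampleStats = Stats()
for x in [3, 5, 7] { Stats.collect(&exampleStats, x) }
print(exampleStats.mean)               // 5.0
print(exampleStats.variance)           // 4.0
print(exampleStats.standardDeviation)  // 2.0
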
#if SWIFT_RUNTIME_ENABLE_LEAK_CHECKER
|
||||
|
||||
@_silgen_name("_swift_leaks_startTrackingObjects")
|
||||
@@ -529,7 +479,7 @@ final class TestRunner {
|
||||
}
|
||||
|
||||
/// Measure the `fn` and return the average sample time per iteration (μs).
|
||||
func measure(_ name: String, fn: (Int) -> Void, numIters: Int) -> Int {
|
||||
func measure(_ name: String, fn: (Int) -> Void, numIters: Int) -> Double {
|
||||
#if SWIFT_RUNTIME_ENABLE_LEAK_CHECKER
|
||||
name.withCString { p in startTrackingObjects(p) }
|
||||
#endif
|
||||
@@ -542,7 +492,7 @@ final class TestRunner {
|
||||
name.withCString { p in stopTrackingObjects(p) }
|
||||
#endif
|
||||
|
||||
return lastSampleTime.microseconds / numIters
|
||||
return Double(lastSampleTime.microseconds) / Double(numIters)
|
||||
}
|
||||
|
||||
func logVerbose(_ msg: @autoclosure () -> String) {
|
||||
@@ -560,9 +510,9 @@ final class TestRunner {
|
||||
}
|
||||
logVerbose("Running \(test.name)")
|
||||
|
||||
var samples: [Int] = []
|
||||
var samples: [Double] = []
|
||||
|
||||
func addSample(_ time: Int) {
|
||||
func addSample(_ time: Double) {
|
||||
logVerbose(" Sample \(samples.count),\(time)")
|
||||
samples.append(time)
|
||||
}
|
||||
@@ -576,11 +526,11 @@ final class TestRunner {
|
||||
}
|
||||
|
||||
// Determine number of iterations for testFn to run for desired time.
|
||||
func iterationsPerSampleTime() -> (numIters: Int, oneIter: Int) {
|
||||
func iterationsPerSampleTime() -> (numIters: Int, oneIter: Double) {
|
||||
let oneIter = measure(test.name, fn: testFn, numIters: 1)
|
||||
if oneIter > 0 {
|
||||
let timePerSample = Int(c.sampleTime * 1_000_000.0) // microseconds (μs)
|
||||
return (max(timePerSample / oneIter, 1), oneIter)
|
||||
let timePerSample = c.sampleTime * 1_000_000.0 // microseconds (μs)
|
||||
return (max(Int(timePerSample / oneIter), 1), oneIter)
|
||||
} else {
|
||||
return (1, oneIter)
|
||||
}
@@ -615,77 +565,137 @@ final class TestRunner {
    test.tearDownFunction?()
    if let lf = test.legacyFactor {
      logVerbose("    Applying legacy factor: \(lf)")
      samples = samples.map { $0 * lf }
      samples = samples.map { $0 * Double(lf) }
    }

    return BenchResults(samples, collectMetadata())
    return BenchResults(samples, collectMetadata(), numIters)
  }

  var header: String {
    let withUnit = {$0 + "(μs)"}
    let withDelta = {"𝚫" + $0}
    func quantiles(q: Int) -> [String] {
      // See https://en.wikipedia.org/wiki/Quantile#Specialized_quantiles
      let prefix = [
        2: "MEDIAN", 3: "T", 4: "Q", 5: "QU", 6: "S", 7: "O", 10: "D",
        12: "Dd", 16: "H", 20: "V", 33: "TT", 100: "P", 1000: "Pr"
      ][q, default: "\(q)-q"]
      let base20 = "0123456789ABCDEFGHIJ".map { String($0) }
      let index: (Int) -> String =
        { q == 2 ? "" : q <= 20 ? base20[$0] : String($0) }
      let tail = (1..<q).map { prefix + index($0) } + ["MAX"]
      // QMIN identifies the quantile format, distinct from formats using "MIN"
      return [withUnit("QMIN")] + tail.map(c.delta ? withDelta : withUnit)
  func printJSON(index: Int, info: BenchmarkInfo, results: BenchResults?) {
    // Write the results for a single test as a one-line JSON object
    // This allows a script to easily consume the results by JSON-decoding
    // each line separately.

    // To avoid relying on Foundation, construct the JSON naively. This is
    // actually pretty robust, since almost everything is a number; the only
    // brittle assumption is that test.name must not have \ or " in it.
    var out = [
      "\"number\":\(index)",
      "\"name\":\"\(info.name)\""
    ]

    if let results = results {
      let samples = results.samples.sorted().map({$0.description}).joined(separator: ",")
      out.append("\"samples\":[\(samples)]")
      out.append("\"iters\":\(results.iters)")
      if let meta = results.meta {
        if c.logMemory {
          out += [
            "\"max_rss\":\(meta.maxRSS)",
            "\"pages\":\(meta.pages)",
          ]
        }
        if c.logMeta {
          out += [
            "\"ics\":\(meta.ics)",
            "\"yields\":\(meta.yields)",
          ]
        }
      }
    }
    return (
      ["#", "TEST", "SAMPLES"] +
      (c.quantile.map(quantiles)
        ?? ["MIN", "MAX", "MEAN", "SD", "MEDIAN"].map(withUnit)) +
      (c.logMemory ? ["MAX_RSS(B)"] : []) +
      (c.logMeta ? ["PAGES", "ICS", "YIELD"] : [])
    ).joined(separator: c.delim)
    print("{ " + out.joined(separator: ", ") + " }")
    fflush(stdout)
  }
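For illustration only (field values invented): because each result is printed as a complete JSON object on its own line, a consumer can decode the stream line by line. A minimal Swift sketch of such a consumer, assuming Foundation is available on the consuming side:

import Foundation

// Hypothetical output line; the number, name, and samples are invented.
let line = "{ \"number\":42, \"name\":\"ArrayAppend\", \"samples\":[12.5,13.1,12.9], \"iters\":1000 }"

// Each line is an independent JSON document, so it can be decoded on its own.
if let data = line.data(using: .utf8),
   let object = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
   let name = object["name"] as? String,
   let samples = object["samples"] as? [Double] {
  print("\(name): \(samples.count) samples, min \(samples.min() ?? 0) µs")
}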

  /// Execute benchmarks and continuously report the measurement results.

  enum Justification {
    case left, right
  }
  func printSpaces(_ width: Int) {
    for _ in 0..<width {
      print(" ", terminator: "")
    }
  }
  func printToWidth(_ s: String, width: Int, justify: Justification = .left) {
    var pad = width - 1 - s.count
    if pad <= 0 {
      pad = 1
    }
    if justify == .right {
      printSpaces(pad)
    }
    print(s, terminator: "")
    if justify == .left {
      printSpaces(pad)
    }
  }
  func printDoubleToWidth(_ d: Double, fractionDigits: Int = 3, width: Int) {
    let digits = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    // Handle up to 8 fraction digits
    let scales = [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000]
    let scale = scales[fractionDigits]
    let i = Int(d * Double(scale) + 0.5)
    let intPart = i / scale
    let fraction = i % scale
    var s = intPart.description + "."
    var f = fraction
    for _ in 0..<fractionDigits {
      f *= 10
      s += digits[(f / scale) % 10]
    }
    printToWidth(s, width: width, justify: .right)
  }
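A quick worked check of the fixed-point rounding above, with numbers chosen purely for illustration:

// Worked example (illustrative): printDoubleToWidth(12.3456, width: 10)
//   fractionDigits defaults to 3, so scale = 1000
//   i = Int(12.3456 * 1000 + 0.5) = 12346
//   intPart = 12, fraction = 346; the digit loop appends "3", "4", "6"
//   giving s = "12.346", printed right-justified as "   12.346"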

  func printText(index: Int, info: BenchmarkInfo, results: BenchResults?) {
    printToWidth(index.description, width: 4, justify: .right)
    printSpaces(1)
    printToWidth(info.name, width: c.testNameLength)

    if let results = results {
      printToWidth(String(describing:results.samples.count), width: 10, justify: .right)
      if results.samples.count > 0 {
        let sorted = results.samples.sorted()
        let min = sorted.first!
        let max = sorted.last!
        let median = sorted[sorted.count / 2]
        printDoubleToWidth(min, width: 10)
        printDoubleToWidth(median, width: 10)
        printDoubleToWidth(max, width: 10)
      }
    }
    print()
    fflush(stdout)
  }

  func printTextHeading() {
    printToWidth("#", width: 4, justify: .right)
    printSpaces(1)
    printToWidth("TEST", width: c.testNameLength, justify: .left)
    printToWidth("SAMPLES", width: 10, justify: .right)
    printToWidth("MIN", width: 10, justify: .right)
    printToWidth("MEDIAN", width: 10, justify: .right)
    printToWidth("MAX", width: 10, justify: .right)
    print()
  }
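For a rough sense of the text-mode layout these helpers produce (an invented row; exact spacing depends on c.testNameLength):

//   # TEST                SAMPLES       MIN    MEDIAN       MAX
//  42 ArrayAppend              20    12.345    12.901    14.210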

  /// Run each benchmark and emit the results in JSON
  func runBenchmarks() {
    var testCount = 0

    func report(_ index: String, _ t: BenchmarkInfo, results: BenchResults?) {
      func values(r: BenchResults) -> [String] {
        func quantiles(q: Int) -> [Int] {
          let qs = (0...q).map { i in r[Double(i) / Double(q)] }
          return c.delta ?
            qs.reduce(into: (encoded: [], last: 0)) {
              $0.encoded.append($1 - $0.last); $0.last = $1
            }.encoded : qs
        }
        let values: [Int] = [r.sampleCount] +
          (c.quantile.map(quantiles)
            ?? [r.min, r.max, r.mean, r.sd, r.median]) +
          (c.logMemory ? [r.meta?.maxRSS].compactMap { $0 } : []) +
          (c.logMeta ? r.meta.map {
            [$0.pages, $0.ics, $0.yields] } ?? [] : [])
        return values.map { String($0) }
      }
      let benchmarkStats = (
        [index, t.name] + (results.map(values) ?? ["Unsupported"])
      ).joined(separator: c.delim)

      print(benchmarkStats)
      fflush(stdout)

      if (results != nil) {
        testCount += 1
    if !c.jsonOutput {
      printTextHeading()
    }
    for (index, info) in c.tests {
      if c.jsonOutput {
        printJSON(index: index, info: info, results: run(info))
      } else {
        printText(index: index, info: info, results: run(info))
      }
      testCount += 1
    }

    print(header)

    for (index, test) in c.tests {
      report(index, test, results:run(test))
    if !c.jsonOutput {
      print("\nTotal performance tests executed: \(testCount)")
    }

    print("\nTotal performance tests executed: \(testCount)")
  }
}

@@ -704,11 +714,18 @@ public func main() {
  let config = TestConfig(registeredBenchmarks)
  switch (config.action) {
  case .listTests:
    print("#\(config.delim)Test\(config.delim)[Tags]")
    for (index, t) in config.tests {
      let testDescription = [index, t.name, t.tags.sorted().description]
        .joined(separator: config.delim)
      print(testDescription)
    if config.jsonOutput {
      for (index, t) in config.tests {
        let tags = t.tags.sorted().map({"\"\($0.description)\""}).joined(separator: ",")
        print("{\"number\":\(index), \"name\":\"\(t.name)\", \"tags\":[\(tags)]}")
      }
    } else {
      print("# Test [Tags]")
      for (index, t) in config.tests {
        let testDescription = [index.description, t.name, t.tags.sorted().description]
          .joined(separator: " ")
        print(testDescription)
      }
    }
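Illustrative only (the test name and tags below are placeholders): in JSON mode, each line of the --list output is a single object of roughly this shape, one object per test:

{"number":7, "name":"ExampleBenchmark", "tags":["api","validation"]}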
  case .run:
    if !config.allowNondeterministicHashing && !Hasher.isDeterministic {