Files
swift-mirror/benchmark/scripts/compare_perf_tests.py
Andrew Trick 6bcbde2f1c [benchmark] Fix the comparison script to report possibly spurious data points.
When comparing scores from either multiple samples, or multiple runs,
print (?) if scores for the configurations being compared have
overlapping ranges.
2016-02-28 00:29:49 -08:00

217 lines
6.7 KiB
Python
Executable File

#!/usr/bin/python
# -*- coding: utf-8 -*-
# ===--- compare_perf_tests.py --------------------------------------------===//
#
# This source file is part of the Swift.org open source project
#
# Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
# Licensed under Apache License v2.0 with Runtime Library Exception
#
# See http://swift.org/LICENSE.txt for license information
# See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
#
# ===----------------------------------------------------------------------===//
# e.g.
# repeat.sh 3 tot/bin/Benchmark_Driver run -o -O > tot.O.times
# repeat.sh 3 mypatch/bin/Benchmark_Driver run -o -O > mypatch.O.times
# compare_perf_tests.py tot.O.times mypatch.O.times | sort -t, -n -k 6 | column -s, -t
import sys
import re
VERBOSE = 0
# #,TEST,SAMPLES,MIN(ms),MAX(ms),MEAN(ms),SD(ms),MEDIAN(ms)
SCORERE = re.compile(r"(\d+),[ \t]*(\w+),[ \t]*([\d.]+),[ \t]*([\d.]+),[ \t]*([\d.]+)")
TOTALRE = re.compile(r"()(Totals),[ \t]*([\d.]+),[ \t]*([\d.]+),[ \t]*([\d.]+)")
NUMGROUP = 1
KEYGROUP = 2
BESTGROUP = 4
WORSTGROUP = 5
IsTime = 1
ShowSpeedup = 1
PrintAllScores = 0
def parseInt(word):
try:
return int(word)
except:
raise Exception("Expected integer value, not " + word)
def getScores(fname):
scores = {}
worstscores = {}
nums = {}
runs = 0
f = open(fname)
try:
for line in f:
if VERBOSE:
print "Parsing", line,
m = SCORERE.match(line)
is_total = False
if not m:
is_total = True
m = TOTALRE.match(line)
if not m:
continue
if VERBOSE:
print " match", m.group(KEYGROUP), m.group(BESTGROUP)
if not m.group(KEYGROUP) in scores:
scores[m.group(KEYGROUP)] = []
worstscores[m.group(KEYGROUP)] = []
scores[m.group(KEYGROUP)].append(parseInt(m.group(BESTGROUP)))
worstscores[m.group(KEYGROUP)].append(parseInt(m.group(WORSTGROUP)))
if is_total:
nums[m.group(KEYGROUP)] = ""
else:
nums[m.group(KEYGROUP)] = m.group(NUMGROUP)
if len(scores[m.group(KEYGROUP)]) > runs:
runs = len(scores[m.group(KEYGROUP)])
finally:
f.close()
return scores, worstscores, runs, nums
def isMaxScore(newscore, maxscore, invert):
return not maxscore or (newscore > maxscore if not invert else newscore < maxscore)
def compareScores(key, score1, worstsample1, score2, worstsample2, runs, num):
print num.rjust(3),
print key.ljust(25),
bestscore1 = None
bestscore2 = None
worstscore1 = None
worstscore2 = None
minbest = IsTime
minworst = not minbest
r = 0
for score in score1:
if isMaxScore(newscore=score, maxscore=bestscore1, invert=minbest):
bestscore1 = score
if isMaxScore(newscore=score, maxscore=worstscore1, invert=minworst):
worstscore1 = score
if PrintAllScores:
print ("%d" % score).rjust(16),
for score in worstsample1:
if isMaxScore(newscore=score, maxscore=worstscore1, invert=minworst):
worstscore1 = score
for score in score2:
if isMaxScore(newscore=score, maxscore=bestscore2, invert=minbest):
bestscore2 = score
if isMaxScore(newscore=score, maxscore=worstscore2, invert=minworst):
worstscore2 = score
if PrintAllScores:
print ("%d" % score).rjust(16),
r += 1
for score in worstsample2:
if isMaxScore(newscore=score, maxscore=worstscore2, invert=minworst):
worstscore2 = score
while r < runs:
if PrintAllScores:
print ("0").rjust(9),
r += 1
if not PrintAllScores:
print ("%d" % bestscore1).rjust(16),
print ("%d" % bestscore2).rjust(16),
print ("%+d" % (bestscore2 - bestscore1)).rjust(9),
if bestscore1 != 0 and bestscore2 != 0:
print ("%+.1f%%" % (((float(bestscore2) / bestscore1) - 1) * 100)).rjust(9),
if ShowSpeedup:
Num, Den = float(bestscore2), float(bestscore1)
if IsTime:
Num, Den = Den, Num
print ("%.2fx" % (Num / Den)).rjust(9),
else:
print "*".rjust(9),
if ShowSpeedup:
print "*".rjust(9),
# check if the worst->best interval for each configuration overlap.
if minbest:
if (bestscore1 < bestscore2 and bestscore2 < worstscore1) \
or (bestscore2 < bestscore1 and bestscore1 < worstscore2):
print "(?)",
else:
if (worstscore1 < worstscore2 and worstscore2 < bestscore1) \
or (worstscore2 < worstscore1 and worstscore1 < bestscore2):
print "(?)",
print
def printBestScores(key, scores):
print key,
bestscore = None
minbest = IsTime
for score in scores:
if isMaxScore(newscore=score, maxscore=bestscore, invert=minbest):
bestscore = score
print ", %d" % bestscore
def usage():
print "repeat.sh <n> Benchmark_O[none|unchecked] > file.times"
print "compare_perf_tests.py <file.times> [<file2.times>]"
if __name__ == '__main__':
if len(sys.argv) < 2:
usage()
sys.exit(1)
file1 = sys.argv[1]
if len(sys.argv) < 3:
scores, worstscores, runs, nums = getScores(file1)
keys = list(set(scores.keys()))
keys.sort()
for key in keys:
printBestScores(key, scores[key])
sys.exit(0)
file2 = sys.argv[2]
if len(sys.argv) > 3:
SCORERE = re.compile(sys.argv[3])
scores1, worstscores1, runs1, nums = getScores(file1)
scores2, worstscores2, runs2, nums = getScores(file2)
runs = runs1
if runs2 > runs:
runs = runs2
if VERBOSE:
print scores1
print scores2
keys = list(set(scores1.keys() + scores2.keys()))
keys.sort()
if VERBOSE:
print "comparing ", file1, "vs", file2, "=",
if IsTime:
print file1, "/", file2
else:
print file2, "/", file1
print "#".rjust(3),
print "TEST".ljust(25),
if PrintAllScores:
for i in range(0, runs):
print ("OLD_RUN%d" % i).rjust(9),
for i in range(0, runs):
print ("NEW_RUN%d" % i).rjust(9),
else:
print "BEST_OLD_MIN(μs)".rjust(17),
print "BEST_NEW_MIN(μs)".rjust(17),
print 'DELTA'.rjust(9), '%DELTA'.rjust(9), 'SPEEDUP'.rjust(9)
for key in keys:
if key not in scores1:
print key, "not in", file1
continue
if key not in scores2:
print key, "not in", file2
continue
compareScores(key, scores1[key], worstscores1[key], scores2[key], worstscores2[key], runs, nums[key])