mirror of
https://github.com/google/benchmark.git
synced 2025-03-31 22:50:14 +08:00
The first problem you have to solve yourself. The second one can be aided. The benchmark library can compute some statistics over the repetitions, which helps with grasping the results somewhat. But that is only for the one set of results. It does not really help to compare the two benchmark results, which is the interesting bit. Thankfully, there are these bundled `tools/compare.py` and `tools/compare_bench.py` scripts. They can provide a diff between two benchmarking results. Yay! Except not really, it's just a diff, while it is very informative and better than nothing, it does not really help answer The Question - am i just looking at the noise? It's like not having these per-benchmark statistics... Roughly, we can formulate the question as: > Are these two benchmarks the same? > Did my change actually change anything, or is the difference below the noise level? Well, this really sounds like a [null hypothesis](https://en.wikipedia.org/wiki/Null_hypothesis), does it not? So maybe we can use statistics here, and solve all our problems? lol, no, it won't solve all the problems. But maybe it will act as a tool, to better understand the output, just like the usual statistics on the repetitions... I'm making an assumption here that most of the people care about the change of average value, not the standard deviation. Thus i believe we can use T-Test, be it either [Student's t-test](https://en.wikipedia.org/wiki/Student%27s_t-test), or [Welch's t-test](https://en.wikipedia.org/wiki/Welch%27s_t-test). **EDIT**: however, after @dominichamon review, it was decided that it is better to use more robust [Mann–Whitney U test](https://en.wikipedia.org/wiki/Mann–Whitney_U_test) I'm using [scipy.stats.mannwhitneyu](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html#scipy.stats.mannwhitneyu). There are two new user-facing knobs: ``` $ ./compare.py --help usage: compare.py [-h] [-u] [--alpha UTEST_ALPHA] {benchmarks,filters,benchmarksfiltered} ... 
versatile benchmark output compare tool <...> optional arguments: -h, --help show this help message and exit -u, --utest Do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample. WARNING: requires **LARGE** (9 or more) number of repetitions to be meaningful! --alpha UTEST_ALPHA significance level alpha. if the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected. (default: 0.0500) ``` Example output:  As you can guess, the alpha does not affect anything but the coloring of the computed p-values. If it is green, then the change in the average values is statistically-significant. I'm detecting the repetitions by matching name. This way, no changes to the json are _needed_. Caveats: * This won't work if the json is not in the same order as outputted by the benchmark, or if the parsing does not retain the ordering. * This won't work if after the grouped repetitions there isn't at least one row with different name (e.g. statistic). Since there isn't a knob to disable printing of statistics (only the other way around), i'm not too worried about this. * **The results will be wrong if the repetition count is different between the two benchmarks being compared.** * Even though i have added (hopefully full) test coverage, the code of these python tools is starting to look a bit jumbled. * So far i have added this only to the `tools/compare.py`. Should i add it to `tools/compare_bench.py` too? Or should we deduplicate them (by removing the latter one)?
373 lines
15 KiB
Python
Executable File
373 lines
15 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
"""
|
|
compare.py - versatile benchmark output compare tool
|
|
"""
|
|
|
|
import argparse
|
|
from argparse import ArgumentParser
|
|
import sys
|
|
import gbench
|
|
from gbench import util, report
|
|
from gbench.util import *
|
|
|
|
|
|
def check_inputs(in1, in2, flags):
    """
    Perform checking on the user provided inputs and diagnose any abnormalities.

    in1, in2 -- the two inputs to compare; each is handed to
                classify_input_file() to determine its kind.
    flags    -- extra command-line flags that will be forwarded to the
                benchmarks; scanned for '--benchmark_out=' and
                '--benchmark_out_format='.

    Warnings are printed to stdout and do not stop execution. An unsupported
    '--benchmark_out_format=' value is fatal: the process exits with status 1.
    """
    # Only the kind is needed here; the second element of each result is
    # deliberately ignored.
    in1_kind, _ = classify_input_file(in1)
    in2_kind, _ = classify_input_file(in2)
    output_file = find_benchmark_flag('--benchmark_out=', flags)
    output_type = find_benchmark_flag('--benchmark_out_format=', flags)

    both_executables = in1_kind == IT_Executable and in2_kind == IT_Executable
    both_json = in1_kind == IT_JSON and in2_kind == IT_JSON

    if both_executables and output_file:
        # The same flag list is given to both runs, so the second run would
        # clobber the first run's output file.
        print(("WARNING: '--benchmark_out=%s' will be passed to both "
               "benchmarks causing it to be overwritten") % output_file)
    if both_json and flags:
        print("WARNING: passing optional flags has no effect since both "
              "inputs are JSON")
    if output_type is not None and output_type != 'json':
        print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py'"
               " is not supported.") % output_type)
        sys.exit(1)
|
|
|
|
|
|
def create_parser():
|
|
parser = ArgumentParser(
|
|
description='versatile benchmark output compare tool')
|
|
|
|
utest = parser.add_argument_group()
|
|
utest.add_argument(
|
|
'-u',
|
|
'--utest',
|
|
action="store_true",
|
|
help="Do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires **LARGE** (no less than 9) number of repetitions to be meaningful!")
|
|
alpha_default = 0.05
|
|
utest.add_argument(
|
|
"--alpha",
|
|
dest='utest_alpha',
|
|
default=alpha_default,
|
|
type=float,
|
|
help=("significance level alpha. if the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)") %
|
|
alpha_default)
|
|
|
|
subparsers = parser.add_subparsers(
|
|
help='This tool has multiple modes of operation:',
|
|
dest='mode')
|
|
|
|
parser_a = subparsers.add_parser(
|
|
'benchmarks',
|
|
help='The most simple use-case, compare all the output of these two benchmarks')
|
|
baseline = parser_a.add_argument_group(
|
|
'baseline', 'The benchmark baseline')
|
|
baseline.add_argument(
|
|
'test_baseline',
|
|
metavar='test_baseline',
|
|
type=argparse.FileType('r'),
|
|
nargs=1,
|
|
help='A benchmark executable or JSON output file')
|
|
contender = parser_a.add_argument_group(
|
|
'contender', 'The benchmark that will be compared against the baseline')
|
|
contender.add_argument(
|
|
'test_contender',
|
|
metavar='test_contender',
|
|
type=argparse.FileType('r'),
|
|
nargs=1,
|
|
help='A benchmark executable or JSON output file')
|
|
parser_a.add_argument(
|
|
'benchmark_options',
|
|
metavar='benchmark_options',
|
|
nargs=argparse.REMAINDER,
|
|
help='Arguments to pass when running benchmark executables')
|
|
|
|
parser_b = subparsers.add_parser(
|
|
'filters', help='Compare filter one with the filter two of benchmark')
|
|
baseline = parser_b.add_argument_group(
|
|
'baseline', 'The benchmark baseline')
|
|
baseline.add_argument(
|
|
'test',
|
|
metavar='test',
|
|
type=argparse.FileType('r'),
|
|
nargs=1,
|
|
help='A benchmark executable or JSON output file')
|
|
baseline.add_argument(
|
|
'filter_baseline',
|
|
metavar='filter_baseline',
|
|
type=str,
|
|
nargs=1,
|
|
help='The first filter, that will be used as baseline')
|
|
contender = parser_b.add_argument_group(
|
|
'contender', 'The benchmark that will be compared against the baseline')
|
|
contender.add_argument(
|
|
'filter_contender',
|
|
metavar='filter_contender',
|
|
type=str,
|
|
nargs=1,
|
|
help='The second filter, that will be compared against the baseline')
|
|
parser_b.add_argument(
|
|
'benchmark_options',
|
|
metavar='benchmark_options',
|
|
nargs=argparse.REMAINDER,
|
|
help='Arguments to pass when running benchmark executables')
|
|
|
|
parser_c = subparsers.add_parser(
|
|
'benchmarksfiltered',
|
|
help='Compare filter one of first benchmark with filter two of the second benchmark')
|
|
baseline = parser_c.add_argument_group(
|
|
'baseline', 'The benchmark baseline')
|
|
baseline.add_argument(
|
|
'test_baseline',
|
|
metavar='test_baseline',
|
|
type=argparse.FileType('r'),
|
|
nargs=1,
|
|
help='A benchmark executable or JSON output file')
|
|
baseline.add_argument(
|
|
'filter_baseline',
|
|
metavar='filter_baseline',
|
|
type=str,
|
|
nargs=1,
|
|
help='The first filter, that will be used as baseline')
|
|
contender = parser_c.add_argument_group(
|
|
'contender', 'The benchmark that will be compared against the baseline')
|
|
contender.add_argument(
|
|
'test_contender',
|
|
metavar='test_contender',
|
|
type=argparse.FileType('r'),
|
|
nargs=1,
|
|
help='The second benchmark executable or JSON output file, that will be compared against the baseline')
|
|
contender.add_argument(
|
|
'filter_contender',
|
|
metavar='filter_contender',
|
|
type=str,
|
|
nargs=1,
|
|
help='The second filter, that will be compared against the baseline')
|
|
parser_c.add_argument(
|
|
'benchmark_options',
|
|
metavar='benchmark_options',
|
|
nargs=argparse.REMAINDER,
|
|
help='Arguments to pass when running benchmark executables')
|
|
|
|
return parser
|
|
|
|
|
|
def _release_input(handle):
    """Return the path of a file opened by argparse and close the handle."""
    path = handle.name
    # Only the path is used from here on; argparse.FileType hands back
    # sys.stdin itself for '-', which must not be closed.
    if handle is not sys.stdin:
        handle.close()
    return path


def main():
    """
    Entry point: parse the command line, obtain both benchmark results,
    and print the difference report.

    Exits with status 1 (after printing help) when no mode is selected, and
    with a usage error when unrecognized arguments are present.
    """
    # Parse the command line flags
    parser = create_parser()
    args, unknown_args = parser.parse_known_args()
    if args.mode is None:
        parser.print_help()
        sys.exit(1)
    if unknown_args:
        # Report stray arguments as a usage error, the way parse_args() would,
        # rather than through an assert that vanishes under `python -O`.
        parser.error('unrecognized arguments: %s' % ' '.join(unknown_args))
    benchmark_options = args.benchmark_options

    if args.mode == 'benchmarks':
        test_baseline = _release_input(args.test_baseline[0])
        test_contender = _release_input(args.test_contender[0])
        filter_baseline = ''
        filter_contender = ''

        # NOTE: if test_baseline == test_contender, you are analyzing the stdev

        description = 'Comparing %s to %s' % (test_baseline, test_contender)
    elif args.mode == 'filters':
        # A single input plays both roles; only the filters differ.
        test_baseline = test_contender = _release_input(args.test[0])
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if filter_baseline == filter_contender, you are analyzing the
        # stdev

        description = 'Comparing %s to %s (from %s)' % (
            filter_baseline, filter_contender, test_baseline)
    elif args.mode == 'benchmarksfiltered':
        test_baseline = _release_input(args.test_baseline[0])
        test_contender = _release_input(args.test_contender[0])
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if test_baseline == test_contender and
        # filter_baseline == filter_contender, you are analyzing the stdev

        description = 'Comparing %s (from %s) to %s (from %s)' % (
            filter_baseline, test_baseline, filter_contender, test_contender)
    else:
        # should never happen
        print("Unrecognized mode of operation: '%s'" % args.mode)
        parser.print_help()
        sys.exit(1)

    check_inputs(test_baseline, test_contender, benchmark_options)

    options_baseline = []
    options_contender = []

    use_filters = bool(filter_baseline and filter_contender)
    if use_filters:
        options_baseline = ['--benchmark_filter=%s' % filter_baseline]
        options_contender = ['--benchmark_filter=%s' % filter_contender]

    # Run the benchmarks and report the results
    json1 = gbench.util.run_or_load_benchmark(
        test_baseline, benchmark_options + options_baseline)
    json2 = gbench.util.run_or_load_benchmark(
        test_contender, benchmark_options + options_contender)

    # Now, filter the benchmarks so that the difference report can work
    if use_filters:
        # Both sides get the same replacement label -- presumably so that
        # differently-named benchmarks line up in the report; verify against
        # gbench.report.filter_benchmark.
        replacement = '[%s vs. %s]' % (filter_baseline, filter_contender)
        json1 = gbench.report.filter_benchmark(
            json1, filter_baseline, replacement)
        json2 = gbench.report.filter_benchmark(
            json2, filter_contender, replacement)

    # Diff and output
    output_lines = gbench.report.generate_difference_report(
        json1, json2, args.utest, args.utest_alpha)
    print(description)
    for ln in output_lines:
        print(ln)
|
|
|
|
|
|
import unittest
|
|
|
|
|
|
class TestParser(unittest.TestCase):
    """Command-line parsing tests for the parser built by create_parser()."""

    def setUp(self):
        # Imported locally so the tests do not depend on `os` leaking into
        # this module through a star import.
        import os

        self.parser = create_parser()
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'gbench',
            'Inputs')
        self.testInput0 = os.path.join(testInputs, 'test1_run1.json')
        self.testInput1 = os.path.join(testInputs, 'test1_run2.json')

    def test_benchmarks_basic(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    # Previously this test shared its name with the '--utest' variant below,
    # so the later definition replaced it and the short flag was never
    # exercised.
    def test_benchmarks_basic_with_utest_short(self):
        parsed = self.parser.parse_args(
            ['-u', 'benchmarks', self.testInput0, self.testInput1])
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.05)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_with_utest(self):
        parsed = self.parser.parse_args(
            ['--utest', 'benchmarks', self.testInput0, self.testInput1])
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.05)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ['--utest', '--alpha=0.314', 'benchmarks',
             self.testInput0, self.testInput1])
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_with_remainder(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1, 'd'])
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ['d'])

    def test_benchmarks_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1, '--', 'e'])
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ['e'])

    def test_filters_basic(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd'])
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertFalse(parsed.benchmark_options)

    def test_filters_with_remainder(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd', 'e'])
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertEqual(parsed.benchmark_options, ['e'])

    def test_filters_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd', '--', 'f'])
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertEqual(parsed.benchmark_options, ['f'])

    def test_benchmarksfiltered_basic(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c',
             self.testInput1, 'e'])
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarksfiltered_with_remainder(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c',
             self.testInput1, 'e', 'f'])
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        # Compare the whole list, as the other remainder tests do, so extra
        # stray options cannot slip through.
        self.assertEqual(parsed.benchmark_options, ['f'])

    def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c',
             self.testInput1, 'e', '--', 'g'])
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertEqual(parsed.benchmark_options, ['g'])
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point. To run the TestParser suite instead, call
    # unittest.main() here in place of main().
    main()
|
|
|
|
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
|
|
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
|
|
# kate: indent-mode python; remove-trailing-spaces modified;
|