diff --git a/tools/compare.py b/tools/compare.py
index f0a4455f..f293306e 100755
--- a/tools/compare.py
+++ b/tools/compare.py
@@ -35,6 +35,22 @@ def check_inputs(in1, in2, flags):
 def create_parser():
     parser = ArgumentParser(
         description='versatile benchmark output compare tool')
+
+    utest = parser.add_argument_group()
+    utest.add_argument(
+        '-u',
+        '--utest',
+        action="store_true",
+        help="Do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires a **LARGE** number of repetitions (no fewer than 9) to be meaningful!")
+    alpha_default = 0.05
+    utest.add_argument(
+        "--alpha",
+        dest='utest_alpha',
+        default=alpha_default,
+        type=float,
+        help=("Significance level alpha. If the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)") %
+        alpha_default)
+
     subparsers = parser.add_subparsers(
         help='This tool has multiple modes of operation:',
         dest='mode')
@@ -139,8 +155,8 @@ def main():
     parser = create_parser()
     args, unknown_args = parser.parse_known_args()
     if args.mode is None:
-      parser.print_help()
-      exit(1)
+        parser.print_help()
+        exit(1)
     assert not unknown_args
     benchmark_options = args.benchmark_options

@@ -205,7 +221,8 @@ def main():
             json2_orig, filter_contender, replacement)

     # Diff and output
-    output_lines = gbench.report.generate_difference_report(json1, json2)
+    output_lines = gbench.report.generate_difference_report(
+        json1, json2, args.utest, args.utest_alpha)
     print(description)
     for ln in output_lines:
         print(ln)
@@ -228,6 +245,37 @@ class TestParser(unittest.TestCase):
     def test_benchmarks_basic(self):
         parsed = self.parser.parse_args(
             ['benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.utest)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_basic_with_utest(self):
+        parsed = self.parser.parse_args(
+            ['-u', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.utest_alpha, 0.05)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_basic_with_utest_long_option(self):
+        parsed = self.parser.parse_args(
+            ['--utest', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.utest_alpha, 0.05)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_basic_with_utest_alpha(self):
+        parsed = self.parser.parse_args(
+            ['--utest', '--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.utest_alpha, 0.314)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.test_contender[0].name, self.testInput1)
@@ -236,6 +284,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarks_with_remainder(self):
         parsed = self.parser.parse_args(
             ['benchmarks', self.testInput0, self.testInput1, 'd'])
+        self.assertFalse(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.test_contender[0].name, self.testInput1)
@@ -244,6 +293,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarks_with_remainder_after_doubleminus(self):
         parsed = self.parser.parse_args(
             ['benchmarks', self.testInput0, self.testInput1, '--', 'e'])
+        self.assertFalse(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.test_contender[0].name, self.testInput1)
@@ -252,6 +302,7 @@ class TestParser(unittest.TestCase):
     def test_filters_basic(self):
         parsed = self.parser.parse_args(
             ['filters', self.testInput0, 'c', 'd'])
+        self.assertFalse(parsed.utest)
         self.assertEqual(parsed.mode, 'filters')
         self.assertEqual(parsed.test[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -261,6 +312,7 @@ class TestParser(unittest.TestCase):
     def test_filters_with_remainder(self):
         parsed = self.parser.parse_args(
             ['filters', self.testInput0, 'c', 'd', 'e'])
+        self.assertFalse(parsed.utest)
         self.assertEqual(parsed.mode, 'filters')
         self.assertEqual(parsed.test[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -270,6 +322,7 @@ class TestParser(unittest.TestCase):
     def test_filters_with_remainder_after_doubleminus(self):
         parsed = self.parser.parse_args(
             ['filters', self.testInput0, 'c', 'd', '--', 'f'])
+        self.assertFalse(parsed.utest)
         self.assertEqual(parsed.mode, 'filters')
         self.assertEqual(parsed.test[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -279,6 +332,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarksfiltered_basic(self):
         parsed = self.parser.parse_args(
             ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e'])
+        self.assertFalse(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarksfiltered')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -289,6 +343,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarksfiltered_with_remainder(self):
         parsed = self.parser.parse_args(
             ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', 'f'])
+        self.assertFalse(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarksfiltered')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
@@ -299,6 +354,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
         parsed = self.parser.parse_args(
             ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', '--', 'g'])
+        self.assertFalse(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarksfiltered')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
         self.assertEqual(parsed.filter_baseline[0], 'c')
diff --git a/tools/gbench/Inputs/test3_run0.json b/tools/gbench/Inputs/test3_run0.json
new file mode 100644
index 00000000..c777bb0c
--- /dev/null
+++ b/tools/gbench/Inputs/test3_run0.json
@@ -0,0 +1,39 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_One",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 9,
+      "cpu_time": 90,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 80,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two_stat",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 80,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/gbench/Inputs/test3_run1.json b/tools/gbench/Inputs/test3_run1.json
new file mode 100644
index 00000000..03503339
--- /dev/null
+++ b/tools/gbench/Inputs/test3_run1.json
@@ -0,0 +1,39 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_One",
+      "iterations": 1000,
+      "real_time": 9,
+      "cpu_time": 110,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 89,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 7,
+      "cpu_time": 70,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two_stat",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 80,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/gbench/report.py b/tools/gbench/report.py
index 0c090981..4cdd3b74 100644
--- a/tools/gbench/report.py
+++ b/tools/gbench/report.py
@@ -4,6 +4,9 @@ import os
 import re
 import copy

+from scipy.stats import mannwhitneyu
+
+
 class BenchmarkColor(object):
     def __init__(self, name, code):
         self.name = name
@@ -16,11 +19,13 @@ class BenchmarkColor(object):
     def __format__(self, format):
         return self.code

+
 # Benchmark Colors Enumeration
 BC_NONE = BenchmarkColor('NONE', '')
 BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m')
 BC_CYAN = BenchmarkColor('CYAN', '\033[96m')
 BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m')
+BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m')
 BC_HEADER = BenchmarkColor('HEADER', '\033[92m')
 BC_WARNING = BenchmarkColor('WARNING', '\033[93m')
 BC_WHITE = BenchmarkColor('WHITE', '\033[97m')
@@ -29,6 +34,7 @@ BC_ENDC = BenchmarkColor('ENDC', '\033[0m')
 BC_BOLD = BenchmarkColor('BOLD', '\033[1m')
 BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')

+
 def color_format(use_color, fmt_str, *args, **kwargs):
     """
     Return the result of 'fmt_str.format(*args, **kwargs)' after transforming
@@ -78,30 +84,82 @@ def filter_benchmark(json_orig, family, replacement=""):
     for be in json_orig['benchmarks']:
         if not regex.search(be['name']):
             continue
-        filteredbench = copy.deepcopy(be) # Do NOT modify the old name!
+        filteredbench = copy.deepcopy(be)  # Do NOT modify the old name!
         filteredbench['name'] = regex.sub(replacement, filteredbench['name'])
         filtered['benchmarks'].append(filteredbench)
     return filtered


-def generate_difference_report(json1, json2, use_color=True):
+def generate_difference_report(
+        json1,
+        json2,
+        utest=False,
+        utest_alpha=0.05,
+        use_color=True):
     """
     Calculate and report the difference between each test of two benchmarks
     runs specified as 'json1' and 'json2'.
""" + assert utest is True or utest is False first_col_width = find_longest_name(json1['benchmarks']) + def find_test(name): for b in json2['benchmarks']: if b['name'] == name: return b return None - first_col_width = max(first_col_width, len('Benchmark')) + + utest_col_name = "U-test (p-value)" + first_col_width = max( + first_col_width, + len('Benchmark'), + len(utest_col_name)) first_line = "{:<{}s}Time CPU Time Old Time New CPU Old CPU New".format( 'Benchmark', 12 + first_col_width) output_strs = [first_line, '-' * len(first_line)] - gen = (bn for bn in json1['benchmarks'] if 'real_time' in bn and 'cpu_time' in bn) + last_name = None + timings_time = [[], []] + timings_cpu = [[], []] + + gen = (bn for bn in json1['benchmarks'] + if 'real_time' in bn and 'cpu_time' in bn) for bn in gen: + fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}" + special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}" + + if last_name is None: + last_name = bn['name'] + if last_name != bn['name']: + MIN_REPETITIONS = 2 + if ((len(timings_time[0]) >= MIN_REPETITIONS) and + (len(timings_time[1]) >= MIN_REPETITIONS) and + (len(timings_cpu[0]) >= MIN_REPETITIONS) and + (len(timings_cpu[1]) >= MIN_REPETITIONS)): + if utest: + def get_utest_color(pval): + if pval >= utest_alpha: + return BC_FAIL + else: + return BC_OKGREEN + time_pvalue = mannwhitneyu( + timings_time[0], timings_time[1], alternative='two-sided').pvalue + cpu_pvalue = mannwhitneyu( + timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue + output_strs += [color_format(use_color, + special_str, + BC_HEADER, + utest_col_name, + first_col_width, + get_utest_color(time_pvalue), + time_pvalue, + get_utest_color(cpu_pvalue), + cpu_pvalue, + endc=BC_ENDC)] + last_name = bn['name'] + timings_time = [[], []] + timings_cpu = [[], []] + other_bench = find_test(bn['name']) if not other_bench: continue @@ -116,26 +174,44 @@ def generate_difference_report(json1, json2, use_color=True): return BC_WHITE else: return BC_CYAN - fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}" - tres = calculate_change(bn['real_time'], other_bench['real_time']) - cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time']) - output_strs += [color_format(use_color, fmt_str, - BC_HEADER, bn['name'], first_col_width, - get_color(tres), tres, get_color(cpures), cpures, - bn['real_time'], other_bench['real_time'], - bn['cpu_time'], other_bench['cpu_time'], - endc=BC_ENDC)] + + timings_time[0].append(bn['real_time']) + timings_time[1].append(other_bench['real_time']) + timings_cpu[0].append(bn['cpu_time']) + timings_cpu[1].append(other_bench['cpu_time']) + + tres = calculate_change(timings_time[0][-1], timings_time[1][-1]) + cpures = calculate_change(timings_cpu[0][-1], timings_cpu[1][-1]) + output_strs += [color_format(use_color, + fmt_str, + BC_HEADER, + bn['name'], + first_col_width, + get_color(tres), + tres, + get_color(cpures), + cpures, + timings_time[0][-1], + timings_time[1][-1], + timings_cpu[0][-1], + timings_cpu[1][-1], + endc=BC_ENDC)] return output_strs ############################################################################### # Unit tests + import unittest + class TestReportDifference(unittest.TestCase): def load_results(self): import json - testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Inputs') + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') testOutput1 = os.path.join(testInputs, 
         testOutput2 = os.path.join(testInputs, 'test1_run2.json')
         with open(testOutput1, 'r') as f:
@@ -160,7 +236,8 @@ class TestReportDifference(unittest.TestCase):
             ['BM_BadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
         ]
         json1, json2 = self.load_results()
-        output_lines_with_header = generate_difference_report(json1, json2, use_color=False)
+        output_lines_with_header = generate_difference_report(
+            json1, json2, use_color=False)
         output_lines = output_lines_with_header[2:]
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
@@ -173,7 +250,10 @@ class TestReportDifference(unittest.TestCase):
 class TestReportDifferenceBetweenFamilies(unittest.TestCase):
     def load_result(self):
         import json
-        testInputs = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Inputs')
+        testInputs = os.path.join(
+            os.path.dirname(
+                os.path.realpath(__file__)),
+            'Inputs')
         testOutput = os.path.join(testInputs, 'test2_run.json')
         with open(testOutput, 'r') as f:
             json = json.load(f)
@@ -189,7 +269,8 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
         json = self.load_result()
         json1 = filter_benchmark(json, "BM_Z.ro", ".")
         json2 = filter_benchmark(json, "BM_O.e", ".")
-        output_lines_with_header = generate_difference_report(json1, json2, use_color=False)
+        output_lines_with_header = generate_difference_report(
+            json1, json2, use_color=False)
         output_lines = output_lines_with_header[2:]
         print("\n")
         print("\n".join(output_lines_with_header))
@@ -200,6 +281,40 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
             self.assertEqual(parts, expect_lines[i])


+class TestReportDifferenceWithUTest(unittest.TestCase):
+    def load_results(self):
+        import json
+        testInputs = os.path.join(
+            os.path.dirname(
+                os.path.realpath(__file__)),
+            'Inputs')
+        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
+        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
+        with open(testOutput1, 'r') as f:
+            json1 = json.load(f)
+        with open(testOutput2, 'r') as f:
+            json2 = json.load(f)
+        return json1, json2
+
+    def test_utest(self):
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
+            ['BM_Two', '+0.2500', '+0.1125', '8', '10', '80', '89'],
+            ['U-test', '(p-value)', '0.2207', '0.6831'],
+            ['BM_Two_stat', '+0.0000', '+0.0000', '8', '8', '80', '80'],
+        ]
+        json1, json2 = self.load_results()
+        output_lines_with_header = generate_difference_report(
+            json1, json2, True, 0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(parts, expect_lines[i])
+
+
 if __name__ == '__main__':
     unittest.main()
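
---

A note on where test_utest's expected statistics come from: the U-test row
values 0.2207 (time) and 0.6831 (CPU time) are scipy.stats.mannwhitneyu applied
to the two BM_Two repetitions, the same two-sided call that
generate_difference_report() makes once it has collected at least
MIN_REPETITIONS (2) paired timings for a benchmark name. The standalone sketch
below reproduces them; it is illustrative only (the file name and variable
names are made up here, the input values are lifted from the test fixtures,
and it assumes a scipy version that accepts the alternative= keyword):

    # repro_utest.py -- hypothetical sanity check, not part of this patch
    from scipy.stats import mannwhitneyu

    # BM_Two repetitions: baseline values from test3_run0.json,
    # contender values from test3_run1.json (real_time / cpu_time, in ns).
    time_baseline, time_contender = [9, 8], [10, 7]
    cpu_baseline, cpu_contender = [90, 80], [89, 70]

    # Two-sided test, mirroring the calls in generate_difference_report().
    time_pvalue = mannwhitneyu(
        time_baseline, time_contender, alternative='two-sided').pvalue
    cpu_pvalue = mannwhitneyu(
        cpu_baseline, cpu_contender, alternative='two-sided').pvalue

    # test_utest expects 0.2207 and 0.6831. Both exceed the default alpha
    # of 0.05, so neither difference counts as statistically significant:
    # with only two repetitions per side the U test cannot reach
    # significance at all, which is exactly what the --help text warns
    # about (use no fewer than 9 repetitions).
    print('%.4f %.4f' % (time_pvalue, cpu_pvalue))

On the command line the new options live on the top-level parser, so they go
before the mode, as the TestParser cases exercise (file names illustrative):

    tools/compare.py -u --alpha=0.05 benchmarks baseline.json contender.json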