From 859641cb0c3892d079925e6320c0acc938acd0d4 Mon Sep 17 00:00:00 2001
From: Matej Ferencevic <matej.ferencevic@memgraph.io>
Date: Thu, 28 Dec 2017 16:35:22 +0100
Subject: [PATCH] Changed macro benchmark summary format

Reviewers: buda, mtomic, mislav.bradac

Reviewed By: mislav.bradac

Subscribers: florijan, pullbot

Differential Revision: https://phabricator.memgraph.io/D972
---
 .gitignore                           |   2 +-
 tests/macro_benchmark/harness        |  43 ++++-------
 tests/macro_benchmark/query_suite.py |  61 ++++++++-------
 tools/apollo/macro_benchmark_summary | 109 +++++++++++++--------------
 4 files changed, 98 insertions(+), 117 deletions(-)

diff --git a/.gitignore b/.gitignore
index 860d3da5b..108515286 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,4 +28,4 @@ ve/
 ve3/
 perf.data*
 TAGS
-*.apollo_data
+*.apollo_measurements
diff --git a/tests/macro_benchmark/harness b/tests/macro_benchmark/harness
index 75e1e4aa2..523213396 100755
--- a/tests/macro_benchmark/harness
+++ b/tests/macro_benchmark/harness
@@ -242,46 +242,33 @@ def main():
         log.info("No scenarios to execute")
         return
 
+    results = []
+
     # Run scenarios.
     log.info("Executing %d scenarios", len(filtered_scenarios))
-    results = []
     for (group, scenario_name), scenario in sorted(filtered_scenarios.items()):
         log.info("Executing group.scenario '%s.%s' with elements %s",
                  group, scenario_name, list(scenario.keys()))
-        for iter_result in suite.run(scenario, group, scenario_name, runner):
-            iter_result["group"] = group
-            iter_result["scenario"] = scenario_name
-            results.append(iter_result)
-
-    # Save results.
-    run = dict()
-    run["suite"] = args.suite
-    run["runner"] = runner.__class__.__name__
-    run["runner_config"] = vars(runner.args)
-    run.update(args.additional_run_fields)
-
-    # Currently this output is not used anywhere, and has a tendancy to create huge files..
-    # TODO(dgleich): Revise this in the near future.
-    #for result in results:
-    #    jail.store_data(result)
+        results.append(suite.run(scenario, group, scenario_name, runner))
 
     # Print summary.
     print("\n\nMacro benchmark summary:")
     print("{}\n".format(suite.summary))
+
+    # Save data points.
     with open(get_absolute_path(".harness_summary"), "w") as f:
-        print(suite.summary, file=f)
+        json.dump({"results": results, "headers": suite.headers}, f)
 
     # Export data points.
-    with open(get_absolute_path(".apollo_data"), "w") as f:
-        apollo_data = ""
-        data = list(filter(lambda x: x.strip(), suite.summary.split("\n")))
-        headers = data[0].strip().split()
-        for row in data[1:]:
-            row = row.strip().split()
-            group, scenario = row[0:2]
-            for header, value in zip(headers[2:], row[2:]):
-                apollo_data += "{}.{}.{} {}\n".format(group, scenario, header, value)
-        f.write(apollo_data)
+    with open(get_absolute_path(".apollo_measurements"), "w") as f:
+        headers = list(suite.headers)
+        headers.remove("group_name")
+        headers.remove("scenario_name")
+        for row in results:
+            group, scenario = row.pop("group_name"), row.pop("scenario_name")
+            for header in headers:
+                f.write("{}.{}.{} {:.20f}\n".format(group, scenario,
+                        header, row[header]["median"]))
 
 
 if __name__ == "__main__":
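
Note on the new harness outputs: instead of the old .apollo_data dump, the harness now writes .harness_summary (the raw per-scenario result dictionaries plus the column headers, as JSON) and .apollo_measurements (one "group.scenario.header value" line per exported median). A minimal sketch of how the two files relate, using a hypothetical scenario name and made-up numbers rather than values from a real run:

    import json

    headers = ["group_name", "scenario_name", "wall_time"]
    results = [{"group_name": "match", "scenario_name": "vertex_on_label",
                "wall_time": {"mean": 0.0021, "median": 0.0020,
                              "stdev": 0.0002, "count": 3}}]

    # JSON summary with the raw results and headers; this is the shape that
    # tools/apollo/macro_benchmark_summary reads back via load_file().
    with open(".harness_summary", "w") as f:
        json.dump({"results": results, "headers": headers}, f)

    # Flat "group.scenario.header value" lines, one per exported median.
    with open(".apollo_measurements", "w") as f:
        for row in results:
            group, scenario = row.pop("group_name"), row.pop("scenario_name")
            for header in headers[2:]:
                f.write("{}.{}.{} {:.20f}\n".format(group, scenario, header,
                                                    row[header]["median"]))
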
diff --git a/tests/macro_benchmark/query_suite.py b/tests/macro_benchmark/query_suite.py
index 0fc208e55..4b9bd6026 100644
--- a/tests/macro_benchmark/query_suite.py
+++ b/tests/macro_benchmark/query_suite.py
@@ -7,7 +7,7 @@ import json
 from argparse import ArgumentParser
 from collections import defaultdict
 import tempfile
-from statistics import median
+from statistics import median, mean, stdev
 from common import get_absolute_path, WALL_TIME, CPU_TIME, MAX_MEMORY, APOLLO
 from databases import Memgraph, Neo
 from clients import QueryClient
@@ -28,10 +28,10 @@ class _QuerySuite:
     FORMAT = ["{:>24}", "{:>28}", "{:>16}", "{:>18}", "{:>22}",
               "{:>16}", "{:>16}", "{:>16}"]
     FULL_FORMAT = "".join(FORMAT) + "\n"
-    summary = FULL_FORMAT.format(
-                      "group_name", "scenario_name", "parsing_time",
-                      "planning_time", "plan_execution_time",
-                      WALL_TIME, CPU_TIME, MAX_MEMORY)
+    headers = ["group_name", "scenario_name", "parsing_time",
+               "planning_time", "plan_execution_time",
+               WALL_TIME, CPU_TIME, MAX_MEMORY]
+    summary = summary_raw = FULL_FORMAT.format(*headers)
 
     def __init__(self, args):
         argp = ArgumentParser("MemgraphRunnerArgumentParser")
@@ -55,21 +55,7 @@ class _QuerySuite:
                                                     time.time() - start_time))
             return r_val
 
-        def add_measurement(dictionary, iteration, key):
-            if key in dictionary:
-                measurement = {"target": key,
-                               "value": float(dictionary[key]),
-                               "unit": "s",
-                               "type": "time",
-                               "iteration": iteration}
-                measurements.append(measurement)
-                try:
-                    measurement_lists[key].append(float(dictionary[key]))
-                except:
-                    pass
-
-        measurements = []
-        measurement_lists = defaultdict(list)
+        measurements = defaultdict(list)
 
         # Run the whole test three times because memgraph is sometimes
         # consistently slow and with this hack we get a good median
@@ -107,29 +93,42 @@ class _QuerySuite:
                 run_result = execute("run")
 
                 if self.args.perf:
-                    self.perf_proc.terminate() 
+                    self.perf_proc.terminate()
                     self.perf_proc.wait()
 
-                add_measurement(run_result, iteration, CPU_TIME)
-                add_measurement(run_result, iteration, MAX_MEMORY)
+                measurements["cpu_time"].append(run_result["cpu_time"])
+                measurements["max_memory"].append(run_result["max_memory"])
+
                 assert len(run_result["groups"]) == 1, \
                         "Multiple groups in run step not yet supported"
+
                 group = run_result["groups"][0]
-                add_measurement(group, iteration, WALL_TIME)
-                for measurement in ["parsing_time",
-                                    "plan_execution_time",
-                                    "planning_time"] :
+                measurements["wall_time"].append(group["wall_time"])
+
+                for key in ["parsing_time", "plan_execution_time",
+                            "planning_time"]:
                     for i in range(len(group.get("metadatas", []))):
-                        add_measurement(group["metadatas"][i], iteration,
-                                        measurement)
+                        if key not in group["metadatas"][i]: continue
+                        measurements[key].append(group["metadatas"][i][key])
+
                 execute("iterteardown")
 
-            # TODO value outlier detection and warning across iterations
             execute("teardown")
             runner.stop()
 
         self.append_scenario_summary(group_name, scenario_name,
-                                     measurement_lists, num_iterations)
+                                     measurements, num_iterations)
+
+        # calculate mean, median and stdev of measurements
+        for key in measurements:
+            samples = measurements[key]
+            measurements[key] = {"mean": mean(samples),
+                                 "median": median(samples),
+                                 "stdev": stdev(samples),
+                                 "count": len(samples)}
+        measurements["group_name"] = group_name
+        measurements["scenario_name"] = scenario_name
+
         return measurements
 
     def append_scenario_summary(self, group_name, scenario_name,
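
Note on the suite change: run() now collects raw samples per measurement key across iterations and, just before returning, collapses each sample list into mean/median/stdev/count and tags the dictionary with the group and scenario names. A short sketch of that final aggregation step, using hypothetical wall_time samples and names:

    from statistics import mean, median, stdev

    measurements = {"wall_time": [0.0019, 0.0021, 0.0024]}  # hypothetical samples
    for key in measurements:
        samples = measurements[key]
        measurements[key] = {"mean": mean(samples),
                             "median": median(samples),
                             "stdev": stdev(samples),
                             "count": len(samples)}
    measurements["group_name"] = "match"               # hypothetical
    measurements["scenario_name"] = "vertex_on_label"  # hypothetical
    # measurements["wall_time"]["median"] == 0.0021; this dictionary is what
    # run() returns and what the harness serializes into .harness_summary.
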
diff --git a/tools/apollo/macro_benchmark_summary b/tools/apollo/macro_benchmark_summary
index 040608d07..78bc5c549 100755
--- a/tools/apollo/macro_benchmark_summary
+++ b/tools/apollo/macro_benchmark_summary
@@ -1,59 +1,46 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 import argparse
+import json
 import os
 import sys
 
-def convert2float(val):
-    try:
-        return float(val)
-    except:
-        return val
-
-def parse_file(fname):
+def load_file(fname):
     with open(fname) as f:
-        data = f.readlines()
-    ret = []
-    for row in data:
-        row = row.strip()
-        if row == "": continue
-        ret.append(list(map(convert2float, row.split())))
-    return ret
+        data = f.read()
+    try:
+        return json.loads(data)
+    except json.decoder.JSONDecodeError:
+        return {"results": [], "headers": []}
 
 def strip_integers(row):
-    return list(filter(lambda x: type(x) == str, row))
+    return {k: v for k, v in row.items() if type(v) == str}
 
-def find_item(data, header, row):
-    headers = data[0]
-    row = strip_integers(row)
-    pos_x = -1
-    for i in range(len(data)):
-        s = strip_integers(data[i])
-        if s != row: continue
-        pos_x = i
-        break
-    if pos_x == -1: return None
-    pos_y = -1
-    for j in range(len(headers)):
-        if headers[j] != header: continue
-        pos_y = j
-        break
-    if pos_y == -1: return None
-    return data[pos_x][pos_y]
+def find_item(results_prev, header_cur, row_cur):
+    row_cur = strip_integers(row_cur)
+    row_prev = None
+    for result in results_prev:
+        s = strip_integers(result)
+        if s == row_cur:
+            row_prev = result
+            break
+    if row_prev is None: return None
+    if not header_cur in row_prev: return None
+    return row_prev[header_cur]
 
-def compare_values(data_cur, data_prev):
-    ret = []
-    headers = data_cur[0]
-    for i in range(len(data_cur)):
+def compare_values(headers_cur, results_cur, headers_prev, results_prev):
+    ret = [list(map(lambda x: " ".join(x.split("_")).capitalize(),
+            headers_cur))]
+    for row_cur in results_cur:
         ret.append([])
-        row_cur = data_cur[i]
         performance_change = False
-        for j in range(len(row_cur)):
-            item_cur = row_cur[j]
+        for header in headers_cur:
+            item_cur = row_cur[header]
             if type(item_cur) == str:
                 item = " ".join(item_cur.split("_")).capitalize()
             else:
-                item_prev = find_item(data_prev, headers[j], row_cur)
-                if j != len(row_cur) - 1:
+                value_cur = item_cur["median"]
+                item_prev = find_item(results_prev, header, row_cur)
+                if header != "max_memory":
                     fmt = "{:.3f}ms"
                     scale = 1000.0
                     treshold = 0.050
@@ -61,27 +48,29 @@ def compare_values(data_cur, data_prev):
                     fmt = "{:.2f}MiB"
                     scale = 1.0 / 1024.0
                     treshold = 0.025
+                # TODO: add statistics check
                 if item_prev != None:
-                    if item_prev != 0.0:
-                        diff = (item_cur - item_prev) / item_prev
+                    value_prev = item_prev["median"]
+                    if value_prev != 0.0:
+                        diff = (value_cur - value_prev) / value_prev
                     else:
                         diff = 0.0
-                    if diff < -treshold and item_cur > 0.0005:
+                    if diff < -treshold and value_cur > 0.0005:
                         performance_change = True
                         sign = " {icon arrow-down color=green}"
-                    elif diff > treshold and item_cur > 0.0005:
+                    elif diff > treshold and value_cur > 0.0005:
                         performance_change = True
                         sign = " {icon arrow-up color=red}"
                     else:
                         sign = ""
                     fmt += " //({:+.2%})//{}"
-                    item = fmt.format(item_cur * scale, diff, sign)
+                    item = fmt.format(value_cur * scale, diff, sign)
                 else:
                     fmt += " //(new)// {{icon plus color=blue}}"
-                    item = fmt.format(item_cur * scale)
+                    item = fmt.format(value_cur * scale)
                     performance_change = True
             ret[-1].append(item)
-        if performance_change == False and i > 0: ret.pop()
+        if not performance_change: ret.pop()
     return ret
 
 def generate_remarkup(data):
@@ -113,15 +102,21 @@ if __name__ == "__main__":
 
     args = parser.parse_args()
 
-    data_cur, data_prev = [], []
-    for i, current in enumerate(args.current):
-        off = 0 if i == 0 else 1
-        data_cur += parse_file(current)[off:]
-    for i, previous in enumerate(args.previous):
-        off = 0 if i == 0 else 1
-        data_prev += parse_file(previous)[off:]
+    headers_cur, headers_prev = None, None
+    results_cur, results_prev = [], []
+    for current in args.current:
+        data = load_file(current)
+        if headers_cur is None:
+            headers_cur = data["headers"]
+        results_cur += data["results"]
+    for previous in args.previous:
+        data = load_file(previous)
+        if headers_prev is None:
+            headers_prev = data["headers"]
+        results_prev += data["results"]
 
-    markup = generate_remarkup(compare_values(data_cur, data_prev))
+    markup = generate_remarkup(compare_values(headers_cur, results_cur,
+            headers_prev, results_prev))
 
     if args.output == "":
         sys.stdout.write(markup)
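
Note on the summary tool change: compare_values() now pairs each current row with its previous counterpart by comparing only the string-valued fields (the group and scenario names) and then diffs the stored medians, flagging a row when a median moves by more than 5% (2.5% for max_memory). A minimal sketch of that matching and relative-difference check on two hypothetical rows:

    def strip_integers(row):
        # Keep only the string-valued fields (group/scenario names).
        return {k: v for k, v in row.items() if type(v) == str}

    row_cur = {"group_name": "match", "scenario_name": "vertex_on_label",
               "wall_time": {"median": 0.0021}}
    results_prev = [{"group_name": "match", "scenario_name": "vertex_on_label",
                     "wall_time": {"median": 0.0020}}]

    key = strip_integers(row_cur)
    row_prev = next((r for r in results_prev if strip_integers(r) == key), None)
    if row_prev is not None:
        value_cur = row_cur["wall_time"]["median"]
        value_prev = row_prev["wall_time"]["median"]
        diff = (value_cur - value_prev) / value_prev if value_prev != 0.0 else 0.0
        # 0.050 is the time threshold; max_memory uses 0.025. The 0.0005 guard
        # mirrors the one in compare_values() that ignores negligible values.
        flagged = abs(diff) > 0.050 and value_cur > 0.0005
        print("{:+.2%} change, flagged: {}".format(diff, flagged))
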