From 142b1f42b1d81f5d9e3ba487956d8c427e4432b6 Mon Sep 17 00:00:00 2001
From: Marko Budiselic <marko.budiselic@memgraph.io>
Date: Tue, 23 Jan 2018 15:31:57 +0100
Subject: [PATCH] Add run_pokec script and minimal refactoring

Summary:
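  * add run_pokec script, because running the pokec benchmark takes several
    steps (download the dataset, run memgraph and neo, plot the results)
  * refactor the plot_througput script (renamed to tools/plot/pokec_throughput)
    so that it reads the JSON summary written by the harness
  * move all plot scripts under tools/plot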

Reviewers: mferencevic, teon.banek, mislav.bradac

Reviewed By: mferencevic

Subscribers: florijan, pullbot, buda

Differential Revision: https://phabricator.memgraph.io/D1106
---
 tests/macro_benchmark/.gitignore              |  1 +
 .../macro_benchmark/groups/pokec/config.json  |  2 +-
 .../download_dataset}                         |  3 --
 tests/macro_benchmark/harness                 |  7 ++++
 tests/macro_benchmark/long_running_suite.py   | 21 +++++-------
 tests/macro_benchmark/run_pokec               | 22 ++++++++++++
 tests/public_benchmark/ldbc/apollo_runs.yaml  |  2 +-
 .../ldbc/continuous_integration               |  4 +--
 tools/{plot_gbench_json => plot/gbench_json}  |  0
 .../{plot_ldbc_latency => plot/ldbc_latency}  |  2 +-
 .../{plot_througput => plot/pokec_throughput} | 34 ++++++++-----------
 11 files changed, 58 insertions(+), 40 deletions(-)
 rename tests/macro_benchmark/groups/{download_datasets => pokec/download_dataset} (64%)
 create mode 100755 tests/macro_benchmark/run_pokec
 rename tools/{plot_gbench_json => plot/gbench_json} (100%)
 rename tools/{plot_ldbc_latency => plot/ldbc_latency} (99%)
 rename tools/{plot_througput => plot/pokec_throughput} (76%)

diff --git a/tests/macro_benchmark/.gitignore b/tests/macro_benchmark/.gitignore
index 7855f07fe..d6570241d 100644
--- a/tests/macro_benchmark/.gitignore
+++ b/tests/macro_benchmark/.gitignore
@@ -1,2 +1,3 @@
 .storage/
+.results/
 .harness_summary
diff --git a/tests/macro_benchmark/groups/pokec/config.json b/tests/macro_benchmark/groups/pokec/config.json
index fcafa44ac..ec6cf94bd 100644
--- a/tests/macro_benchmark/groups/pokec/config.json
+++ b/tests/macro_benchmark/groups/pokec/config.json
@@ -1,3 +1,3 @@
 {
-    "duration": 30
+    "duration": 60
 }
diff --git a/tests/macro_benchmark/groups/download_datasets b/tests/macro_benchmark/groups/pokec/download_dataset
similarity index 64%
rename from tests/macro_benchmark/groups/download_datasets
rename to tests/macro_benchmark/groups/pokec/download_dataset
index 3f55c90ec..cd315c5f1 100755
--- a/tests/macro_benchmark/groups/download_datasets
+++ b/tests/macro_benchmark/groups/pokec/download_dataset
@@ -3,7 +3,4 @@
 working_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 cd ${working_dir}
 
-cd pokec
 wget -nv -O pokec_small.setup.cypher http://deps.memgraph.io/pokec_small.setup.cypher
-wget -nv -O pokec_medium.setup.cypher http://deps.memgraph.io/pokec_medium.setup.cypher
-cd ..
diff --git a/tests/macro_benchmark/harness b/tests/macro_benchmark/harness
index 523213396..0170f5300 100755
--- a/tests/macro_benchmark/harness
+++ b/tests/macro_benchmark/harness
@@ -259,6 +259,13 @@ def main():
     with open(get_absolute_path(".harness_summary"), "w") as f:
         json.dump({"results": results, "headers": suite.headers}, f)
 
+    # The if block is here because the results from all suites
+    # aren't compatible with the export below.
+    if type(suite) not in [QuerySuite, QueryParallelSuite]:
+        log.warning("The results from the suite "
+                    "aren't compatible with the apollo measurements export.")
+        return
+
     # Export data points.
     with open(get_absolute_path(".apollo_measurements"), "w") as f:
         headers = list(suite.headers)
diff --git a/tests/macro_benchmark/long_running_suite.py b/tests/macro_benchmark/long_running_suite.py
index 7d71f8d94..dd57a5ce1 100644
--- a/tests/macro_benchmark/long_running_suite.py
+++ b/tests/macro_benchmark/long_running_suite.py
@@ -1,11 +1,5 @@
 import logging
-import os
-import time
-import itertools
-import json
 from argparse import ArgumentParser
-from collections import defaultdict
-from statistics import median
 from common import get_absolute_path, APOLLO
 from databases import Memgraph, Neo
 from clients import QueryClient, LongRunningClient
@@ -15,6 +9,7 @@ log = logging.getLogger(__name__)
 
 class LongRunningSuite:
     KNOWN_KEYS = {"config", "setup", "run"}
+    headers = ["elapsed_time", "num_executed_queries"]
 
     def __init__(self, args):
         argp = ArgumentParser("LongRunningSuiteArgumentParser")
@@ -45,17 +40,17 @@ class LongRunningSuite:
         for result in results:
             self.summary += summary_format.format(
                     result["elapsed_time"], result["num_executed_queries"])
-            # TODO: Revise this.
             measurements.append({
                 "target": "throughput",
-                "value": result["num_executed_queries"] / result["elapsed_time"],
-                "unit": "queries per second",
+                "time": result["elapsed_time"],
+                "value": result["num_executed_queries"],
+                "unit": "number of executed queries",
                 "type": "throughput"})
         self.summary += "\n\nThroughtput: " + str(measurements[-1]["value"])
         return measurements
 
     def runners(self):
-        return { "MemgraphRunner" : MemgraphRunner, "NeoRunner" : NeoRunner }
+        return {"MemgraphRunner": MemgraphRunner, "NeoRunner": NeoRunner}
 
     def groups(self):
         return ["pokec"]
@@ -100,9 +95,9 @@ class MemgraphRunner(_LongRunningRunner):
                           help="Number of clients")
         self.args, remaining_args = argp.parse_known_args(args)
         assert not APOLLO or self.args.num_database_workers, \
-                "--num-database-workers is obligatory flag on apollo"
+            "--num-database-workers is obligatory flag on apollo"
         assert not APOLLO or self.args.num_client_workers, \
-                "--num-client-workers is obligatory flag on apollo"
+            "--num-client-workers is obligatory flag on apollo"
         database = Memgraph(remaining_args, self.args.runner_config,
                             self.args.num_database_workers)
         super(MemgraphRunner, self).__init__(
@@ -122,7 +117,7 @@ class NeoRunner(_LongRunningRunner):
                           help="Number of clients")
         self.args, remaining_args = argp.parse_known_args(args)
         assert not APOLLO or self.args.num_client_workers, \
-                "--client-num-clients is obligatory flag on apollo"
+            "--client-num-clients is obligatory flag on apollo"
         database = Neo(remaining_args, self.args.runner_config)
         super(NeoRunner, self).__init__(
                 remaining_args, database, self.args.num_client_workers)
diff --git a/tests/macro_benchmark/run_pokec b/tests/macro_benchmark/run_pokec
new file mode 100755
index 000000000..e4cb63de6
--- /dev/null
+++ b/tests/macro_benchmark/run_pokec
@@ -0,0 +1,22 @@
+#!/bin/bash -e
+# Run pokec bench (download dataset, run neo and memgraph, plot the results).
+
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+results_dir="${script_dir}/.results/pokec"
+
+# ./harness and ../../tools below are resolved relative to this directory.
+cd "${script_dir}"
+mkdir -p "${results_dir}"
+"${script_dir}/groups/pokec/download_dataset"
+
+# Each harness run writes .harness_summary; move it aside before the next run.
+./harness LongRunningSuite MemgraphRunner --groups pokec
+mv .harness_summary "${results_dir}/memgraph.summary"
+./harness LongRunningSuite NeoRunner --groups pokec
+mv .harness_summary "${results_dir}/neo4j.summary"
+
+../../tools/plot/pokec_throughput \
+    --vendor-references neo4j memgraph \
+    --vendor-titles Neo4j Memgraph \
+    --results "${results_dir}/neo4j.summary" "${results_dir}/memgraph.summary" \
+    --plot-title "Pokec Small" --window-size 1
diff --git a/tests/public_benchmark/ldbc/apollo_runs.yaml b/tests/public_benchmark/ldbc/apollo_runs.yaml
index 1f7cd3d3b..c98f375f0 100644
--- a/tests/public_benchmark/ldbc/apollo_runs.yaml
+++ b/tests/public_benchmark/ldbc/apollo_runs.yaml
@@ -7,7 +7,7 @@
     - ../../../config # directory with config files
     - ../../../libs/neo4j # neo4j directory
     - ../../../tools/mg_import_csv # memgraph csv import tool
-    - ../../../tools/plot_ldbc_latency # ldbc plot generation tool
+    - ../../../tools/plot/ldbc_latency # ldbc plot generation tool
   outfile_paths: # TODO: maybe this should also accept relative paths?
     - \./memgraph/tests/public_benchmark/ldbc/results/.+
     - \./memgraph/tests/public_benchmark/ldbc/plots/.+
diff --git a/tests/public_benchmark/ldbc/continuous_integration b/tests/public_benchmark/ldbc/continuous_integration
index ad99aecf0..520aeb8bb 100644
--- a/tests/public_benchmark/ldbc/continuous_integration
+++ b/tests/public_benchmark/ldbc/continuous_integration
@@ -9,12 +9,12 @@ TIMEOUT=3600 ./build_dataset
 # run read benchmarks
 TIMEOUT=3600 ./run_benchmark --run-db memgraph --create-index --thread-count $THREADS --result-file-prefix read
 TIMEOUT=3600 ./run_benchmark --run-db neo4j --create-index --thread-count $THREADS --result-file-prefix read
-./ve3/bin/python3 ../../../tools/plot_ldbc_latency --results results/read-memgraph-scale_1-LDBC-results.json results/read-neo4j-scale_1-LDBC-results.json --logo-path plots/ldbc-logo.png --plot-title "Read queries, scale 1" --output plots/read-queries-scale_1.png
+./ve3/bin/python3 ../../../tools/plot/ldbc_latency --results results/read-memgraph-scale_1-LDBC-results.json results/read-neo4j-scale_1-LDBC-results.json --logo-path plots/ldbc-logo.png --plot-title "Read queries, scale 1" --output plots/read-queries-scale_1.png
 
 # run update benchmarks
 TIMEOUT=3600 ./run_benchmark --run-db memgraph --create-index --thread-count $THREADS --result-file-prefix update --test-type updates --time-compression-ratio 1.5 --operation-count 200
 TIMEOUT=3600 ./run_benchmark --run-db neo4j --create-index --thread-count $THREADS --result-file-prefix update --test-type updates --time-compression-ratio 1.5 --operation-count 200
-./ve3/bin/python3 ../../../tools/plot_ldbc_latency --results results/update-memgraph-scale_1-LDBC-results.json results/update-neo4j-scale_1-LDBC-results.json --logo-path plots/ldbc-logo.png --plot-title "Update queries, scale 1" --output plots/update-queries-scale_1.png
+./ve3/bin/python3 ../../../tools/plot/ldbc_latency --results results/update-memgraph-scale_1-LDBC-results.json results/update-neo4j-scale_1-LDBC-results.json --logo-path plots/ldbc-logo.png --plot-title "Update queries, scale 1" --output plots/update-queries-scale_1.png
 
 # convert results to Apollo measurements
 ./convert_results
diff --git a/tools/plot_gbench_json b/tools/plot/gbench_json
similarity index 100%
rename from tools/plot_gbench_json
rename to tools/plot/gbench_json
diff --git a/tools/plot_ldbc_latency b/tools/plot/ldbc_latency
similarity index 99%
rename from tools/plot_ldbc_latency
rename to tools/plot/ldbc_latency
index c3c73f451..2850aa228 100755
--- a/tools/plot_ldbc_latency
+++ b/tools/plot/ldbc_latency
@@ -20,7 +20,7 @@ from matplotlib.cbook import get_sample_data
 
 
 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
-COLORS = ['#ff7300', '#008cc2']  # TODO: add more colors!
+COLORS = ['#ff7300', '#008cc2']
 LDBC_TIME_FACTORS = {
     "SECONDS": 1.0,
     "MILLISECONDS": 1000.0,
diff --git a/tools/plot_througput b/tools/plot/pokec_throughput
similarity index 76%
rename from tools/plot_througput
rename to tools/plot/pokec_throughput
index 5684cb431..b6eb5fee1 100755
--- a/tools/plot_througput
+++ b/tools/plot/pokec_throughput
@@ -1,17 +1,17 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-#
-# Example usage:
-# ./plot_througput --vendor-references neo4j memgraph           \
-#                  --vendor-titles neo4j memgraph               \
-#                  --results neo_to_plot memgraph_to_plot       \
-#                  --plot-title "Pokec small" --window-size 10
+
+'''
+Example usage:
+./pokec_throughput --vendor-references neo4j memgraph           \
+                   --vendor-titles Neo4j Memgraph               \
+                   --results neo_to_plot memgraph_to_plot       \
+                   --plot-title "Pokec small" --window-size 10
+'''
 
 
 import json
-import os
 import matplotlib.pyplot as plt
-from matplotlib.cbook import get_sample_data
 from argparse import ArgumentParser
 
 COLORS = {
@@ -57,29 +57,25 @@ def main():
     ax.set_ylabel('Throughput (queries per second)')
     ax.set_xlabel('Time (seconds)')
     ax.set_title(args.plot_title)
-    ax.set_aspect(0.01)
 
     # Collect the benchmark data and plot lines.
     print("Pokec throughput")
     for vendor_reference, vendor_data in vendors.items():
         print("Vendor: %s" % vendor_reference)
         with open(vendor_data['results_path']) as results_file:
+            results = json.load(results_file)['results'][0]
             # Skip first line which contains titles.
-            prev_time, prev_num_queries  = 0.0, 0
-            for line in results_file.readlines()[1:]:
-                data = line.split()
-                if data == []: break
-                assert len(data) == 2, "Invalid data"
-                vendor_data['t'].append(float(data[0]))
-                vendor_data['q'].append(int(data[1]))
+            for measurement in results:
+                vendor_data['t'].append(float(measurement['time']))
+                vendor_data['q'].append(int(measurement['value']))
             for i in range(1, len(vendor_data['t'])):
                 j = max(0, i - args.window_size)
                 vendor_data['dq/dt'].append(
                         (vendor_data['q'][i] - vendor_data['q'][j]) /
                         (vendor_data['t'][i] - vendor_data['t'][j]))
-
-        line1, = ax.plot(vendor_data['t'], vendor_data['dq/dt'], '-', linewidth=2,
-                         label=vendor_data['title'], color=vendor_data['color'])
+        line1, = ax.plot(vendor_data['t'], vendor_data['dq/dt'], '-',
+                         linewidth=2, label=vendor_data['title'],
+                         color=vendor_data['color'])
 
     ax.legend(loc='lower right')
     plt.grid()