Add run_pokec script and minimal refactor

Summary:
  * add run_pokec script because running the benchmark requires more than one step
  * refactor of plot_throughput script
  * move all plot scripts under tools/plot

Reviewers: mferencevic, teon.banek, mislav.bradac

Reviewed By: mferencevic

Subscribers: florijan, pullbot, buda

Differential Revision: https://phabricator.memgraph.io/D1106
This commit is contained in:
Marko Budiselic 2018-01-23 15:31:57 +01:00
parent ca32538f63
commit 142b1f42b1
11 changed files with 58 additions and 40 deletions

View File

@ -1,2 +1,3 @@
.storage/ .storage/
.results/
.harness_summary .harness_summary

View File

@ -1,3 +1,3 @@
{ {
"duration": 30 "duration": 60
} }

View File

@ -3,7 +3,4 @@
working_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" working_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cd ${working_dir} cd ${working_dir}
cd pokec
wget -nv -O pokec_small.setup.cypher http://deps.memgraph.io/pokec_small.setup.cypher wget -nv -O pokec_small.setup.cypher http://deps.memgraph.io/pokec_small.setup.cypher
wget -nv -O pokec_medium.setup.cypher http://deps.memgraph.io/pokec_medium.setup.cypher
cd ..

View File

@ -259,6 +259,13 @@ def main():
with open(get_absolute_path(".harness_summary"), "w") as f: with open(get_absolute_path(".harness_summary"), "w") as f:
json.dump({"results": results, "headers": suite.headers}, f) json.dump({"results": results, "headers": suite.headers}, f)
# The if block is here because the results from all suites
# aren't compatible with the export below.
if type(suite) not in [QuerySuite, QueryParallelSuite]:
log.warning("The results from the suite "
"aren't compatible with the apollo measurements export.")
return
# Export data points. # Export data points.
with open(get_absolute_path(".apollo_measurements"), "w") as f: with open(get_absolute_path(".apollo_measurements"), "w") as f:
headers = list(suite.headers) headers = list(suite.headers)

View File

@ -1,11 +1,5 @@
import logging import logging
import os
import time
import itertools
import json
from argparse import ArgumentParser from argparse import ArgumentParser
from collections import defaultdict
from statistics import median
from common import get_absolute_path, APOLLO from common import get_absolute_path, APOLLO
from databases import Memgraph, Neo from databases import Memgraph, Neo
from clients import QueryClient, LongRunningClient from clients import QueryClient, LongRunningClient
@ -15,6 +9,7 @@ log = logging.getLogger(__name__)
class LongRunningSuite: class LongRunningSuite:
KNOWN_KEYS = {"config", "setup", "run"} KNOWN_KEYS = {"config", "setup", "run"}
headers = ["elapsed_time", "num_executed_queries"]
def __init__(self, args): def __init__(self, args):
argp = ArgumentParser("LongRunningSuiteArgumentParser") argp = ArgumentParser("LongRunningSuiteArgumentParser")
@ -45,11 +40,11 @@ class LongRunningSuite:
for result in results: for result in results:
self.summary += summary_format.format( self.summary += summary_format.format(
result["elapsed_time"], result["num_executed_queries"]) result["elapsed_time"], result["num_executed_queries"])
# TODO: Revise this.
measurements.append({ measurements.append({
"target": "throughput", "target": "throughput",
"value": result["num_executed_queries"] / result["elapsed_time"], "time": result["elapsed_time"],
"unit": "queries per second", "value": result["num_executed_queries"],
"unit": "number of executed queries",
"type": "throughput"}) "type": "throughput"})
self.summary += "\n\nThroughtput: " + str(measurements[-1]["value"]) self.summary += "\n\nThroughtput: " + str(measurements[-1]["value"])
return measurements return measurements

22
tests/macro_benchmark/run_pokec Executable file
View File

@ -0,0 +1,22 @@
#!/bin/bash -e
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# Run pokec bench (download dataset, run neo and memgraph, plot the results).
cd ${script_dir}
mkdir -p .results/pokec/
${script_dir}/groups/pokec/download_dataset
./harness LongRunningSuite MemgraphRunner --groups pokec
mv .harness_summary ${script_dir}/.results/pokec/memgraph.summary
./harness LongRunningSuite NeoRunner --groups pokec
mv .harness_summary ${script_dir}/.results/pokec/neo4j.summary
../../tools/plot/pokec_throughput \
--vendor-references neo4j memgraph \
--vendor-titles Neo4j Memgraph \
--results ${script_dir}/.results/pokec/neo4j.summary ${script_dir}/.results/pokec/memgraph.summary \
--plot-title "Pokec Small" --window-size 1

View File

@ -7,7 +7,7 @@
- ../../../config # directory with config files - ../../../config # directory with config files
- ../../../libs/neo4j # neo4j directory - ../../../libs/neo4j # neo4j directory
- ../../../tools/mg_import_csv # memgraph csv import tool - ../../../tools/mg_import_csv # memgraph csv import tool
- ../../../tools/plot_ldbc_latency # ldbc plot generation tool - ../../../tools/plot/ldbc_latency # ldbc plot generation tool
outfile_paths: # TODO: maybe this should also accept relative paths? outfile_paths: # TODO: maybe this should also accept relative paths?
- \./memgraph/tests/public_benchmark/ldbc/results/.+ - \./memgraph/tests/public_benchmark/ldbc/results/.+
- \./memgraph/tests/public_benchmark/ldbc/plots/.+ - \./memgraph/tests/public_benchmark/ldbc/plots/.+

View File

@ -9,12 +9,12 @@ TIMEOUT=3600 ./build_dataset
# run read benchmarks # run read benchmarks
TIMEOUT=3600 ./run_benchmark --run-db memgraph --create-index --thread-count $THREADS --result-file-prefix read TIMEOUT=3600 ./run_benchmark --run-db memgraph --create-index --thread-count $THREADS --result-file-prefix read
TIMEOUT=3600 ./run_benchmark --run-db neo4j --create-index --thread-count $THREADS --result-file-prefix read TIMEOUT=3600 ./run_benchmark --run-db neo4j --create-index --thread-count $THREADS --result-file-prefix read
./ve3/bin/python3 ../../../tools/plot_ldbc_latency --results results/read-memgraph-scale_1-LDBC-results.json results/read-neo4j-scale_1-LDBC-results.json --logo-path plots/ldbc-logo.png --plot-title "Read queries, scale 1" --output plots/read-queries-scale_1.png ./ve3/bin/python3 ../../../tools/plot/ldbc_latency --results results/read-memgraph-scale_1-LDBC-results.json results/read-neo4j-scale_1-LDBC-results.json --logo-path plots/ldbc-logo.png --plot-title "Read queries, scale 1" --output plots/read-queries-scale_1.png
# run update benchmarks # run update benchmarks
TIMEOUT=3600 ./run_benchmark --run-db memgraph --create-index --thread-count $THREADS --result-file-prefix update --test-type updates --time-compression-ratio 1.5 --operation-count 200 TIMEOUT=3600 ./run_benchmark --run-db memgraph --create-index --thread-count $THREADS --result-file-prefix update --test-type updates --time-compression-ratio 1.5 --operation-count 200
TIMEOUT=3600 ./run_benchmark --run-db neo4j --create-index --thread-count $THREADS --result-file-prefix update --test-type updates --time-compression-ratio 1.5 --operation-count 200 TIMEOUT=3600 ./run_benchmark --run-db neo4j --create-index --thread-count $THREADS --result-file-prefix update --test-type updates --time-compression-ratio 1.5 --operation-count 200
./ve3/bin/python3 ../../../tools/plot_ldbc_latency --results results/update-memgraph-scale_1-LDBC-results.json results/update-neo4j-scale_1-LDBC-results.json --logo-path plots/ldbc-logo.png --plot-title "Update queries, scale 1" --output plots/update-queries-scale_1.png ./ve3/bin/python3 ../../../tools/plot/ldbc_latency --results results/update-memgraph-scale_1-LDBC-results.json results/update-neo4j-scale_1-LDBC-results.json --logo-path plots/ldbc-logo.png --plot-title "Update queries, scale 1" --output plots/update-queries-scale_1.png
# convert results to Apollo measurements # convert results to Apollo measurements
./convert_results ./convert_results

View File

@ -20,7 +20,7 @@ from matplotlib.cbook import get_sample_data
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
COLORS = ['#ff7300', '#008cc2'] # TODO: add more colors! COLORS = ['#ff7300', '#008cc2']
LDBC_TIME_FACTORS = { LDBC_TIME_FACTORS = {
"SECONDS": 1.0, "SECONDS": 1.0,
"MILLISECONDS": 1000.0, "MILLISECONDS": 1000.0,

View File

@ -1,17 +1,17 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
#
# Example usage: '''
# ./plot_througput --vendor-references neo4j memgraph \ Example usage:
# --vendor-titles neo4j memgraph \ ./pokec_throughput --vendor-references neo4j memgraph \
# --results neo_to_plot memgraph_to_plot \ --vendor-titles Neo4j Memgraph \
# --plot-title "Pokec small" --window-size 10 --results neo_to_plot memgraph_to_plot \
--plot-title "Pokec small" --window-size 10
'''
import json import json
import os
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from matplotlib.cbook import get_sample_data
from argparse import ArgumentParser from argparse import ArgumentParser
COLORS = { COLORS = {
@ -57,29 +57,25 @@ def main():
ax.set_ylabel('Throughput (queries per second)') ax.set_ylabel('Throughput (queries per second)')
ax.set_xlabel('Time (seconds)') ax.set_xlabel('Time (seconds)')
ax.set_title(args.plot_title) ax.set_title(args.plot_title)
ax.set_aspect(0.01)
# Collect the benchmark data and plot lines. # Collect the benchmark data and plot lines.
print("Pokec throughput") print("Pokec throughput")
for vendor_reference, vendor_data in vendors.items(): for vendor_reference, vendor_data in vendors.items():
print("Vendor: %s" % vendor_reference) print("Vendor: %s" % vendor_reference)
with open(vendor_data['results_path']) as results_file: with open(vendor_data['results_path']) as results_file:
results = json.load(results_file)['results'][0]
# Skip first line which contains titles. # Skip first line which contains titles.
prev_time, prev_num_queries = 0.0, 0 for measurement in results:
for line in results_file.readlines()[1:]: vendor_data['t'].append(float(measurement['time']))
data = line.split() vendor_data['q'].append(int(measurement['value']))
if data == []: break
assert len(data) == 2, "Invalid data"
vendor_data['t'].append(float(data[0]))
vendor_data['q'].append(int(data[1]))
for i in range(1, len(vendor_data['t'])): for i in range(1, len(vendor_data['t'])):
j = max(0, i - args.window_size) j = max(0, i - args.window_size)
vendor_data['dq/dt'].append( vendor_data['dq/dt'].append(
(vendor_data['q'][i] - vendor_data['q'][j]) / (vendor_data['q'][i] - vendor_data['q'][j]) /
(vendor_data['t'][i] - vendor_data['t'][j])) (vendor_data['t'][i] - vendor_data['t'][j]))
line1, = ax.plot(vendor_data['t'], vendor_data['dq/dt'], '-',
line1, = ax.plot(vendor_data['t'], vendor_data['dq/dt'], '-', linewidth=2, linewidth=2, label=vendor_data['title'],
label=vendor_data['title'], color=vendor_data['color']) color=vendor_data['color'])
ax.legend(loc='lower right') ax.legend(loc='lower right')
plt.grid() plt.grid()