From 70d9f3f6f19914d028a0afc036367a8acbfb6e1a Mon Sep 17 00:00:00 2001
From: Matej Ferencevic
Date: Mon, 4 Sep 2017 14:16:12 +0200
Subject: [PATCH] Refactored harness and added PostgreSQL support.

Summary:
Moved Neo4j config to config dir.
Neo4j and PostgreSQL are now downloaded to libs.
Renamed metadata flags in memgraph.
Changed apollo generate for new harness.

Reviewers: mislav.bradac

Reviewed By: mislav.bradac

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D741
---
 libs/cleanup.sh                           |   4 +-
 libs/setup.sh                             |  14 +-
 src/query/engine.hpp                      |   6 +-
 src/query/interpreter.hpp                 |   8 +-
 .../{neo4j_config => config}/neo4j.conf   |  13 +-
 tests/macro_benchmark/harness/harness.py  | 228 ++++++++++++------
 tools/apollo/generate                     |  13 +-
 7 files changed, 190 insertions(+), 96 deletions(-)
 rename tests/macro_benchmark/harness/{neo4j_config => config}/neo4j.conf (98%)

diff --git a/libs/cleanup.sh b/libs/cleanup.sh
index b4d1d7506..5151c821d 100755
--- a/libs/cleanup.sh
+++ b/libs/cleanup.sh
@@ -4,8 +4,8 @@
 working_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 cd ${working_dir}
 
-# remove antlr parser generator
-rm *.jar
+# remove archives
+rm *.jar *.tar.gz *.tar 2>/dev/null
 
 # remove lib directories
 for folder in * ; do
diff --git a/libs/setup.sh b/libs/setup.sh
index e19b78716..5c2c986e4 100755
--- a/libs/setup.sh
+++ b/libs/setup.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -e
 
 # Download external dependencies.
 
@@ -88,3 +88,15 @@ gflags_tag="652651b421ca5ac7b722a34a301fb656deca5198" # May 6, 2017
 cd gflags
 git checkout ${gflags_tag}
 cd ..
+
+# neo4j
+wget http://deps.memgraph.io/neo4j-community-3.2.3-unix.tar.gz -O neo4j.tar.gz
+tar -xzf neo4j.tar.gz
+mv neo4j-community-3.2.3 neo4j
+rm neo4j.tar.gz
+
+# postgresql
+wget http://deps.memgraph.io/postgresql-9.6.5-1-linux-x64-binaries.tar.gz -O postgres.tar.gz
+tar -xzf postgres.tar.gz
+mv pgsql postgresql
+rm postgres.tar.gz
diff --git a/src/query/engine.hpp b/src/query/engine.hpp
index 0a2061555..15d7fc08a 100644
--- a/src/query/engine.hpp
+++ b/src/query/engine.hpp
@@ -99,11 +99,11 @@ class QueryEngine {
     }
 
     std::map summary;
-    summary["query_parsing_time"] = parsing_time.count();
+    summary["parsing_time"] = parsing_time.count();
     // This doesn't do any actual planning, but benchmarking harness knows how
     // to work with this field.
- summary["query_planning_time"] = planning_time.count(); - summary["query_plan_execution_time"] = execution_time.count(); + summary["planning_time"] = planning_time.count(); + summary["plan_execution_time"] = execution_time.count(); summary["type"] = "rw"; stream.Summary(summary); diff --git a/src/query/interpreter.hpp b/src/query/interpreter.hpp index 8aeeb8a65..ad71d203c 100644 --- a/src/query/interpreter.hpp +++ b/src/query/interpreter.hpp @@ -178,10 +178,10 @@ class Interpreter { } auto execution_time = execution_timer.Elapsed(); - summary["query_parsing_time"] = frontend_time.count(); - summary["query_planning_time"] = planning_time.count(); - summary["query_plan_execution_time"] = execution_time.count(); - summary["query_cost_estimate"] = query_plan_cost_estimation; + summary["parsing_time"] = frontend_time.count(); + summary["planning_time"] = planning_time.count(); + summary["plan_execution_time"] = execution_time.count(); + summary["cost_estimate"] = query_plan_cost_estimation; // TODO: set summary['type'] based on transaction metadata // the type can't be determined based only on top level LogicalOp diff --git a/tests/macro_benchmark/harness/neo4j_config/neo4j.conf b/tests/macro_benchmark/harness/config/neo4j.conf similarity index 98% rename from tests/macro_benchmark/harness/neo4j_config/neo4j.conf rename to tests/macro_benchmark/harness/config/neo4j.conf index 5e30252ae..a18b76851 100644 --- a/tests/macro_benchmark/harness/neo4j_config/neo4j.conf +++ b/tests/macro_benchmark/harness/config/neo4j.conf @@ -13,13 +13,13 @@ #dbms.directories.plugins=/var/lib/neo4j/plugins #dbms.directories.certificates=/var/lib/neo4j/certificates #dbms.directories.logs=/var/log/neo4j -dbms.directories.lib=/usr/share/neo4j/lib +#dbms.directories.lib=/usr/share/neo4j/lib #dbms.directories.run=/var/run/neo4j # This setting constrains all `LOAD CSV` import files to be under the `import` directory. Remove or comment it out to # allow files to be loaded from anywhere in the filesystem; this introduces possible security problems. See the # `LOAD CSV` section of the manual for details. -dbms.directories.import=/var/lib/neo4j/import +#dbms.directories.import=/var/lib/neo4j/import # Whether requests to Neo4j are authenticated. # To disable authentication, uncomment this line @@ -75,7 +75,7 @@ dbms.connector.http.enabled=true #dbms.connector.http.listen_address=:7474 # HTTPS Connector. There can be zero or one HTTPS connectors. -dbms.connector.https.enabled=true +dbms.connector.https.enabled=false #dbms.connector.https.listen_address=:7473 # Number of Neo4j worker threads. 
@@ -316,3 +316,10 @@ dbms.windows_service_name=neo4j
 # Other Neo4j system properties
 #********************************************************************
 dbms.jvm.additional=-Dunsupported.dbms.udc.source=debian
+
+# Disable Neo4j usage data collection
+dbms.udc.enabled=false
+
+# Disable query cache
+dbms.query_cache_size=0
+
diff --git a/tests/macro_benchmark/harness/harness.py b/tests/macro_benchmark/harness/harness.py
index 07a42fd71..8462fbc7a 100755
--- a/tests/macro_benchmark/harness/harness.py
+++ b/tests/macro_benchmark/harness/harness.py
@@ -3,11 +3,10 @@
 import logging
 import os
-from os import path
 import time
 import itertools
 import json
-from subprocess import check_output
+import subprocess
 from argparse import ArgumentParser
 from collections import OrderedDict
 from collections import defaultdict
@@ -15,6 +14,8 @@ import tempfile
 import shutil
 from statistics import median
 
+from perf import Perf
+
 try:
     import jail
     APOLLO = True
@@ -22,13 +23,34 @@ except:
     import jail_faker as jail
     APOLLO = False
 
-DIR_PATH = path.dirname(path.realpath(__file__))
+DIR_PATH = os.path.dirname(os.path.realpath(__file__))
 WALL_TIME = "wall_time"
 CPU_TIME = "cpu_time"
-from perf import Perf
 
 log = logging.getLogger(__name__)
 
+
+def get_absolute_path(path, base=""):
+    if base == "build":
+        extra = "../../../build"
+    elif base == "build_release":
+        extra = "../../../build_release"
+    elif base == "libs":
+        extra = "../../../libs"
+    elif base == "config":
+        extra = "../../../config"
+    else:
+        extra = ""
+    return os.path.normpath(os.path.join(DIR_PATH, extra, path))
+
+
+def wait_for_server(port, delay=1.0):
+    cmd = ["nc", "-z", "-w", "1", "127.0.0.1", port]
+    while subprocess.call(cmd) != 0:
+        time.sleep(0.5)
+    time.sleep(delay)
+
+
 def load_scenarios(args):
     """
     Scans through folder structure starting with groups_root and
@@ -67,9 +89,8 @@ def load_scenarios(args):
         {group: (scenario, {config: query_generator_function})
     """
     argp = ArgumentParser("QuerySuite.scenarios argument parser")
-    argp.add_argument("--query-scenarios-root", default=path.join(
-        DIR_PATH, "groups"),
-        dest="root")
+    argp.add_argument("--query-scenarios-root",
+                      default=get_absolute_path("groups"), dest="root")
     args, _ = argp.parse_known_args()
     log.info("Loading query scenarios from root: %s", args.root)
 
@@ -78,7 +99,7 @@ def load_scenarios(args):
         log.debug("Processing config file %s", config_file)
         config_name = config_file.split(".")[-2]
         config_dict[config_name] = QuerySuite.Loader(
-            path.join(base, config_file))
+            os.path.join(base, config_file))
 
         # validate that the scenario does not contain any illegal
         # keys (defense against typos in file naming)
@@ -89,19 +110,19 @@ def load_scenarios(args):
 
     def dir_content(root, predicate):
         return [p for p in os.listdir(root)
-                if predicate(path.join(root, p))]
+                if predicate(os.path.join(root, p))]
 
     group_scenarios = OrderedDict()
-    for group in dir_content(args.root, path.isdir):
+    for group in dir_content(args.root, os.path.isdir):
         log.info("Loading group: '%s'", group)
 
         group_scenarios[group] = []
-        files = dir_content(path.join(args.root, group),
-                            path.isfile)
+        files = dir_content(os.path.join(args.root, group),
+                            os.path.isfile)
 
         # process group default config
         group_config = {}
-        fill_config_dict(group_config, path.join(args.root, group),
+        fill_config_dict(group_config, os.path.join(args.root, group),
                          [f for f in files if f.count(".") == 1])
 
         # group files on scenario
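
The get_absolute_path() helper added above becomes the single place where the harness maps logical locations onto the repository layout. Illustrative resolutions (values assume the harness lives in tests/macro_benchmark/harness, as in this repo):

    get_absolute_path("groups")                  # <repo>/tests/macro_benchmark/harness/groups
    get_absolute_path("memgraph", "build")       # <repo>/build/memgraph
    get_absolute_path("neo4j/bin/neo4j", "libs") # <repo>/libs/neo4j/bin/neo4j
    get_absolute_path(".harness_summary")        # <repo>/tests/macro_benchmark/harness/.harness_summary
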
@@ -111,7 +132,7 @@ def load_scenarios(args):
             log.info("Loading scenario: '%s'", scenario_name)
             scenario = dict(group_config)
             fill_config_dict(scenario,
-                             path.join(args.root, group),
+                             os.path.join(args.root, group),
                              scenario_files)
             group_scenarios[group].append((scenario_name, scenario))
             log.debug("Loaded config for scenario '%s'\n%r", scenario_name,
@@ -130,12 +151,12 @@ class _QuerySuite:
     # what the QuerySuite can work with
     KNOWN_KEYS = {"config", "setup", "itersetup", "run", "iterteardown",
                   "teardown", "common"}
-    FORMAT = ["{:>24}", "{:>28}", "{:>22}", "{:>24}", "{:>28}",
+    FORMAT = ["{:>24}", "{:>28}", "{:>16}", "{:>18}", "{:>22}",
              "{:>16}", "{:>16}"]
     FULL_FORMAT = "".join(FORMAT) + "\n"
     summary = FULL_FORMAT.format(
-        "group_name", "scenario_name", "query_parsing_time",
-        "query_planning_time", "query_plan_execution_time",
+        "group_name", "scenario_name", "parsing_time",
+        "planning_time", "plan_execution_time",
         WALL_TIME, CPU_TIME)
 
     def __init__(self, args):
@@ -173,12 +194,12 @@ class _QuerySuite:
         """
         Yields queries found in the given file_path one by one
         """
         log.debug("Generating queries from file_path: %s", self.file_path)
-        _, extension = path.splitext(self.file_path)
+        _, extension = os.path.splitext(self.file_path)
         if extension == ".cypher":
             with open(self.file_path) as f:
                 return self._queries(f.read())
         elif extension == ".py":
-            return self._queries(check_output(
+            return self._queries(subprocess.check_output(
                 ["python3", self.file_path]).decode("ascii"))
         elif extension == ".json":
             with open(self.file_path) as f:
@@ -241,9 +262,9 @@ class _QuerySuite:
                 scenario_config.get("num_client_workers", 1))
             add_measurement(run_result, iteration, WALL_TIME)
             add_measurement(run_result, iteration, CPU_TIME)
-            for measurement in ["query_parsing_time",
-                                "query_plan_execution_time",
-                                "query_planning_time"] :
+            for measurement in ["parsing_time",
+                                "plan_execution_time",
+                                "planning_time"] :
                 for i in range(len(run_result.get("metadatas", []))):
                     add_measurement(run_result["metadatas"][i], iteration,
                                     measurement)
@@ -263,8 +284,8 @@ class _QuerySuite:
                       measurement_lists, num_iterations):
         self.summary += self.FORMAT[0].format(group_name)
         self.summary += self.FORMAT[1].format(scenario_name)
-        for i, key in enumerate(("query_parsing_time", "query_planning_time",
-                                 "query_plan_execution_time", WALL_TIME, CPU_TIME)):
+        for i, key in enumerate(("parsing_time", "planning_time",
+                                 "plan_execution_time", WALL_TIME, CPU_TIME)):
             if key not in measurement_lists:
                 time = "-"
             else:
@@ -305,15 +326,6 @@ class QueryParallelSuite(_QuerySuite):
         return ["aggregation_parallel", "create_parallel"]
 
 
-def get_common_runner_argument_parser():
-    argp = ArgumentParser("CommonRunnerArgumentParser")
-    argp.add_argument("--address", help="Database and client address",
-                      default="127.0.0.1")
-    argp.add_argument("--port", help="Database and client port",
-                      default="7687")
-    return argp
-
-
 # Database wrappers.
 
 class Memgraph:
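
The database wrappers that follow (Memgraph, Neo, and the new Postgres) are duck-typed against one lifecycle: construct with (args, cpus), start() boots the server inside the jail and blocks until its port answers, stop() sends SIGTERM and removes scratch state. A sketch of that implicit contract as a hypothetical base class (the patch itself adds no such class):

    class RunnerBase:
        """Hypothetical contract; the actual wrappers stay duck-typed."""

        def start(self):
            # Launch the server in the jail, block until the port accepts
            # connections (wait_for_server), return the pid (None under Apollo).
            raise NotImplementedError

        def stop(self):
            # SIGTERM the jailed process, wait for it, clean up temp dirs.
            raise NotImplementedError
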
@@ -322,15 +334,13 @@ class Memgraph:
     """
     def __init__(self, args, cpus):
         self.log = logging.getLogger("MemgraphRunner")
-        argp = ArgumentParser("MemgraphArgumentParser", add_help=False,
-                              parents=[get_common_runner_argument_parser()])
-        argp.add_argument("--RunnerBin",
-                          default=os.path.join(DIR_PATH,
-                                               "../../../build/memgraph"))
-        argp.add_argument("--RunnerConfig",
-                          default=os.path.normpath(os.path.join(
-                              DIR_PATH,
-                              "../../../config/benchmarking_latency.conf")))
+        argp = ArgumentParser("MemgraphArgumentParser", add_help=False)
+        argp.add_argument("--runner-bin",
+                          default=get_absolute_path("memgraph", "build"))
+        argp.add_argument("--runner-config",
+                          default=get_absolute_path("benchmarking_latency.conf", "config"))
+        argp.add_argument("--port", default="7687",
+                          help="Database and client port")
         self.log.info("Initializing Runner with arguments %r", args)
         self.args, _ = argp.parse_known_args(args)
         self.database_bin = jail.get_process()
@@ -338,14 +348,20 @@ class Memgraph:
 
     def start(self):
         self.log.info("start")
-        environment = os.environ.copy()
-        environment["MEMGRAPH_CONFIG"] = self.args.RunnerConfig
-        database_args = ["--interface", self.args.address,
-                         "--port", self.args.port]
-        self.database_bin.run(self.args.RunnerBin, database_args,
-                              env=environment, timeout=600)
-        # TODO change to a check via SIGUSR
-        time.sleep(1.0)
+        env = {"MEMGRAPH_CONFIG": self.args.runner_config}
+        database_args = ["--port", self.args.port]
+
+        # find executable path
+        runner_bin = self.args.runner_bin
+        if not os.path.exists(runner_bin):
+            # Apollo builds both debug and release binaries on diff
+            # so we need to use the release binary if the debug one
+            # doesn't exist
+            runner_bin = get_absolute_path("memgraph", "build_release")
+
+        # start memgraph
+        self.database_bin.run(runner_bin, database_args, env=env, timeout=600)
+        wait_for_server(self.args.port)
         return self.database_bin.get_pid() if not APOLLO else None
 
     def stop(self):
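
The wait_for_server() call above replaces the old fixed time.sleep() with an active readiness check, but it shells out to nc. An equivalent pure-Python poll, shown only as an illustrative alternative (not what this patch ships):

    import socket
    import time

    def wait_for_server_py(port, delay=1.0):
        # Retry until something accepts TCP connections on localhost:port,
        # then give the server `delay` extra seconds to settle.
        while True:
            try:
                with socket.create_connection(("127.0.0.1", int(port)), 1.0):
                    break
            except OSError:
                time.sleep(0.5)
        time.sleep(delay)
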
@@ -356,42 +372,105 @@ class Memgraph:
 
 class Neo:
     def __init__(self, args, cpus):
         self.log = logging.getLogger("NeoRunner")
-        argp = ArgumentParser("NeoArgumentParser", add_help=False,
-                              parents=[get_common_runner_argument_parser()])
-        argp.add_argument(
-            "--RunnerConfigDir",
-            default=path.join(DIR_PATH, "neo4j_config"))
+        argp = ArgumentParser("NeoArgumentParser", add_help=False)
+        argp.add_argument("--runner-bin", default=get_absolute_path(
+            "neo4j/bin/neo4j", "libs"))
+        argp.add_argument("--runner-config",
+                          default=get_absolute_path("config/neo4j.conf"))
+        argp.add_argument("--port", default="7687",
+                          help="Database and client port")
+        argp.add_argument("--http-port", default="7474",
+                          help="Database HTTP port")
         self.log.info("Initializing Runner with arguments %r", args)
         self.args, _ = argp.parse_known_args(args)
-        if self.args.address != "127.0.0.1" or self.args.port != "7687":
-            raise Exception(
-                "Neo wrapper doesn't support different address or port")
         self.database_bin = jail.get_process()
         self.database_bin.set_cpus(cpus)
 
     def start(self):
         self.log.info("start")
-        environment = os.environ.copy()
-        environment["NEO4J_CONF"] = self.args.RunnerConfigDir
+
+        # create home directory
         self.neo4j_home_path = tempfile.mkdtemp(dir="/dev/shm")
-        environment["NEO4J_HOME"] = self.neo4j_home_path
+
         try:
-            self.database_bin.run("/usr/share/neo4j/bin/neo4j", args=["console"],
-                                  env=environment, timeout=600)
-            # TODO change to a check via SIGUSR
-            time.sleep(5.0)
+            os.symlink(os.path.join(get_absolute_path("neo4j", "libs"), "lib"),
+                       os.path.join(self.neo4j_home_path, "lib"))
+            neo4j_conf_dir = os.path.join(self.neo4j_home_path, "conf")
+            neo4j_conf_file = os.path.join(neo4j_conf_dir, "neo4j.conf")
+            os.mkdir(neo4j_conf_dir)
+            shutil.copyfile(self.args.runner_config, neo4j_conf_file)
+            with open(neo4j_conf_file, "a") as f:
+                f.write("\ndbms.connector.bolt.listen_address=:" +
+                        self.args.port + "\n")
+                f.write("\ndbms.connector.http.listen_address=:" +
+                        self.args.http_port + "\n")
+
+            # environment
+            cwd = os.path.dirname(self.args.runner_bin)
+            env = {"NEO4J_HOME": self.neo4j_home_path}
+
+            self.database_bin.run(self.args.runner_bin, args=["console"],
+                                  env=env, timeout=600, cwd=cwd)
         except:
             shutil.rmtree(self.neo4j_home_path)
-            raise Exception("Couldn't create symlink or run neo4j")
+            raise Exception("Couldn't run Neo4j!")
+
+        wait_for_server(self.args.http_port, 2.0)
         return self.database_bin.get_pid() if not APOLLO else None
 
     def stop(self):
         self.database_bin.send_signal(jail.SIGTERM)
         self.database_bin.wait()
-        if path.exists(self.neo4j_home_path):
+        if os.path.exists(self.neo4j_home_path):
             shutil.rmtree(self.neo4j_home_path)
 
 
+class Postgres:
+    """
+    Knows how to start and stop PostgreSQL.
+    """
+    def __init__(self, args, cpus):
+        self.log = logging.getLogger("PostgresRunner")
+        argp = ArgumentParser("PostgresArgumentParser", add_help=False)
+        argp.add_argument("--init-bin", default=get_absolute_path(
+            "postgresql/bin/initdb", "libs"))
+        argp.add_argument("--runner-bin", default=get_absolute_path(
+            "postgresql/bin/postgres", "libs"))
+        argp.add_argument("--port", default="5432",
+                          help="Database and client port")
+        self.log.info("Initializing Runner with arguments %r", args)
+        self.args, _ = argp.parse_known_args(args)
+        self.username = "macro_benchmark"
+        self.database_bin = jail.get_process()
+        self.database_bin.set_cpus(cpus)
+
+    def start(self):
+        self.log.info("start")
+        self.data_path = tempfile.mkdtemp(dir="/dev/shm")
+        init_args = ["-D", self.data_path, "-U", self.username]
+        self.database_bin.run_and_wait(self.args.init_bin, init_args)
+
+        # args
+        runner_args = ["-D", self.data_path, "-c", "port=" + self.args.port,
+                       "-c", "ssl=false", "-c", "max_worker_processes=1"]
+
+        try:
+            self.database_bin.run(self.args.runner_bin, args=runner_args,
+                                  timeout=600)
+        except:
+            shutil.rmtree(self.data_path)
+            raise Exception("Couldn't run PostgreSQL!")
+
+        wait_for_server(self.args.port)
+        return self.database_bin.get_pid() if not APOLLO else None
+
+    def stop(self):
+        self.database_bin.send_signal(jail.SIGTERM)
+        self.database_bin.wait()
+        if os.path.exists(self.data_path):
+            shutil.rmtree(self.data_path)
+
+
 class _HarnessClientRunner:
     """
     Knows how to start and stop database (backend) some client frontend (bolt),
@@ -405,8 +484,7 @@ class _HarnessClientRunner:
         if cpus is None: cpus = [2, 3]
         self.log = logging.getLogger("_HarnessClientRunner")
         self.database = database
-        argp = ArgumentParser("RunnerArgumentParser", add_help=False,
-                              parents=[get_common_runner_argument_parser()])
+        argp = ArgumentParser("RunnerArgumentParser", add_help=False)
         self.args, _ = argp.parse_known_args()
         self.bolt_client = jail.get_process()
         self.bolt_client.set_cpus(cpus)
@@ -417,15 +495,13 @@ class _HarnessClientRunner:
 
     def execute(self, queries, num_client_workers):
         self.log.debug("execute('%s')", str(queries))
-        client = os.path.normpath(os.path.join(DIR_PATH,
-            "../../../build/tests/macro_benchmark/harness_client"))
+        client_path = "tests/macro_benchmark/harness_client"
+        client = get_absolute_path(client_path, "build")
         if not os.path.exists(client):
             # Apollo builds both debug and release binaries on diff
             # so we need to use the release client if the debug one
             # doesn't exist
-            client = os.path.normpath(os.path.join(DIR_PATH,
-                "../../../build_release/tests/macro_benchmark/"
-                "harness_client"))
+            client = get_absolute_path(client_path, "build_release")
 
         queries_fd, queries_path = tempfile.mkstemp()
         try:
@@ -440,7 +516,7 @@ class _HarnessClientRunner:
         output_fd, output = tempfile.mkstemp()
         os.close(output_fd)
 
-        client_args = ["--address", self.args.address, "--port", self.args.port,
+        client_args = ["--port", self.database.args.port,
                        "--num-workers", str(num_client_workers),
                        "--output", output]
 
@@ -590,7 +666,7 @@ def main():
     # Print summary.
     print("\n\nMacro benchmark summary:")
     print("{}\n".format(suite.summary))
-    with open(os.path.join(DIR_PATH, ".harness_summary"), "w") as f:
+    with open(get_absolute_path(".harness_summary"), "w") as f:
         print(suite.summary, file=f)
diff --git a/tools/apollo/generate b/tools/apollo/generate
index e88718e06..9cb9380ff 100755
--- a/tools/apollo/generate
+++ b/tools/apollo/generate
@@ -197,14 +197,13 @@ macro_bench_path = os.path.join(BASE_DIR, "tests", "macro_benchmark")
 harness_client_binary = os.path.join(BUILD_RELEASE_DIR, "tests",
                                      "macro_benchmark", "harness_client")
 infile = create_archive("macro_benchmark", [binary_release_path,
-        macro_bench_path, config_path, harness_client_binary],
-        cwd = WORKSPACE_DIR)
-supervisor = "./{}/tests/macro_benchmark/harness/harness.py".format(BASE_DIR_NAME)
-args = MACRO_BENCHMARK_ARGS + " --RunnerBin " + binary_release_path
-outfile_paths = "\./{}/tests/macro_benchmark/harness/\.harness_summary".format(
-    BASE_DIR_NAME)
+        binary_release_link_path, macro_bench_path, config_path,
+        harness_client_binary], cwd = WORKSPACE_DIR)
+supervisor = "./memgraph/tests/macro_benchmark/harness/harness.py"
+outfile_paths = "\./memgraph/tests/macro_benchmark/harness/\.harness_summary"
 RUNS.append(generate_run("macro_benchmark", supervisor = supervisor,
-    arguments = args, infile = infile, outfile_paths = outfile_paths))
+    arguments = MACRO_BENCHMARK_ARGS, infile = infile,
+    outfile_paths = outfile_paths))
 
 # macro benchmark parent tests
 if mode == "diff":
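
For reference, the PostgreSQL lifecycle that the new Postgres wrapper automates can be reproduced by hand. A rough standalone equivalent, assuming the libs/ layout produced by libs/setup.sh above (illustrative only; no jail involved):

    import shutil
    import subprocess
    import tempfile

    data_path = tempfile.mkdtemp(dir="/dev/shm")
    # Initialize a throwaway cluster owned by the benchmark user.
    subprocess.check_call(["libs/postgresql/bin/initdb",
                           "-D", data_path, "-U", "macro_benchmark"])
    # Start the server with the same flags the wrapper passes.
    server = subprocess.Popen(["libs/postgresql/bin/postgres",
                               "-D", data_path,
                               "-c", "port=5432",
                               "-c", "ssl=false",
                               "-c", "max_worker_processes=1"])
    # ... run benchmark queries against localhost:5432 ...
    server.terminate()
    server.wait()
    shutil.rmtree(data_path)
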