Refactored harness and added PostgreSQL support.

Summary:
Moved the Neo4j config into the config directory.

Neo4j and PostgreSQL are now downloaded into libs by the setup script.

Renamed the query summary metadata fields in memgraph.

Updated the Apollo generate script for the new harness.

Reviewers: mislav.bradac

Reviewed By: mislav.bradac

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D741
Matej Ferencevic, 2017-09-04 14:16:12 +02:00
commit 70d9f3f6f1 (parent b30e3252e2)
7 changed files with 190 additions and 96 deletions

View File

@@ -4,8 +4,8 @@
 working_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 cd ${working_dir}
 
-# remove antlr parser generator
-rm *.jar
+# remove archives
+rm *.jar *.tar.gz *.tar 2>/dev/null
 
 # remove lib directories
 for folder in * ; do

View File

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -e
 
 # Download external dependencies.
 
@@ -88,3 +88,15 @@ gflags_tag="652651b421ca5ac7b722a34a301fb656deca5198" # May 6, 2017
 cd gflags
 git checkout ${gflags_tag}
 cd ..
+
+# neo4j
+wget http://deps.memgraph.io/neo4j-community-3.2.3-unix.tar.gz -O neo4j.tar.gz
+tar -xzf neo4j.tar.gz
+mv neo4j-community-3.2.3 neo4j
+rm neo4j.tar.gz
+
+# postgresql
+wget http://deps.memgraph.io/postgresql-9.6.5-1-linux-x64-binaries.tar.gz -O postgres.tar.gz
+tar -xzf postgres.tar.gz
+mv pgsql postgresql
+rm postgres.tar.gz

View File

@@ -99,11 +99,11 @@ class QueryEngine {
   }
 
   std::map<std::string, query::TypedValue> summary;
-  summary["query_parsing_time"] = parsing_time.count();
+  summary["parsing_time"] = parsing_time.count();
   // This doesn't do any actual planning, but benchmarking harness knows how
   // to work with this field.
-  summary["query_planning_time"] = planning_time.count();
-  summary["query_plan_execution_time"] = execution_time.count();
+  summary["planning_time"] = planning_time.count();
+  summary["plan_execution_time"] = execution_time.count();
   summary["type"] = "rw";
   stream.Summary(summary);

View File

@@ -178,10 +178,10 @@ class Interpreter {
     }
 
     auto execution_time = execution_timer.Elapsed();
 
-    summary["query_parsing_time"] = frontend_time.count();
-    summary["query_planning_time"] = planning_time.count();
-    summary["query_plan_execution_time"] = execution_time.count();
-    summary["query_cost_estimate"] = query_plan_cost_estimation;
+    summary["parsing_time"] = frontend_time.count();
+    summary["planning_time"] = planning_time.count();
+    summary["plan_execution_time"] = execution_time.count();
+    summary["cost_estimate"] = query_plan_cost_estimation;
     // TODO: set summary['type'] based on transaction metadata
     // the type can't be determined based only on top level LogicalOp
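
Both the hardcoded QueryEngine and the Interpreter now emit identical, shortened metadata keys, which is what lets the harness aggregate either engine's summaries through a single code path. A minimal sketch of that consumption, assuming a hypothetical summarize() helper and made-up timings (the real aggregation happens via add_measurement() in harness.py below):

from statistics import median

# The three renamed keys shared by both execution paths above.
RENAMED_FIELDS = ["parsing_time", "planning_time", "plan_execution_time"]

def summarize(metadatas):
    """Return the median of each renamed field across iterations."""
    out = {}
    for field in RENAMED_FIELDS:
        values = [m[field] for m in metadatas if field in m]
        if values:
            out[field] = median(values)
    return out

# Two hypothetical per-iteration summaries.
print(summarize([
    {"parsing_time": 0.8, "planning_time": 1.1, "plan_execution_time": 5.0},
    {"parsing_time": 0.6, "planning_time": 1.3, "plan_execution_time": 4.8},
]))
# -> {'parsing_time': 0.7, 'planning_time': 1.2, 'plan_execution_time': 4.9}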

View File

@@ -13,13 +13,13 @@
 #dbms.directories.plugins=/var/lib/neo4j/plugins
 #dbms.directories.certificates=/var/lib/neo4j/certificates
 #dbms.directories.logs=/var/log/neo4j
-dbms.directories.lib=/usr/share/neo4j/lib
+#dbms.directories.lib=/usr/share/neo4j/lib
 #dbms.directories.run=/var/run/neo4j
 
 # This setting constrains all `LOAD CSV` import files to be under the `import` directory. Remove or comment it out to
 # allow files to be loaded from anywhere in the filesystem; this introduces possible security problems. See the
 # `LOAD CSV` section of the manual for details.
-dbms.directories.import=/var/lib/neo4j/import
+#dbms.directories.import=/var/lib/neo4j/import
 
 # Whether requests to Neo4j are authenticated.
 # To disable authentication, uncomment this line
@@ -75,7 +75,7 @@ dbms.connector.http.enabled=true
 #dbms.connector.http.listen_address=:7474
 
 # HTTPS Connector. There can be zero or one HTTPS connectors.
-dbms.connector.https.enabled=true
+dbms.connector.https.enabled=false
 #dbms.connector.https.listen_address=:7473
 
 # Number of Neo4j worker threads.
@@ -316,3 +316,10 @@ dbms.windows_service_name=neo4j
 # Other Neo4j system properties
 #********************************************************************
 dbms.jvm.additional=-Dunsupported.dbms.udc.source=debian
+
+# Disable Neo4j usage data collection
+dbms.udc.enabled=false
+
+# Disable query cache
+dbms.query_cache_size=0

View File

@@ -3,11 +3,10 @@
 import logging
 import os
-from os import path
 import time
 import itertools
 import json
-from subprocess import check_output
+import subprocess
 from argparse import ArgumentParser
 from collections import OrderedDict
 from collections import defaultdict
@@ -15,6 +14,8 @@ import tempfile
 import shutil
 from statistics import median
+from perf import Perf
+
 try:
     import jail
     APOLLO = True
@@ -22,13 +23,34 @@ except:
     import jail_faker as jail
     APOLLO = False
 
-DIR_PATH = path.dirname(path.realpath(__file__))
+DIR_PATH = os.path.dirname(os.path.realpath(__file__))
 WALL_TIME = "wall_time"
 CPU_TIME = "cpu_time"
-from perf import Perf
 
 log = logging.getLogger(__name__)
 
+
+def get_absolute_path(path, base=""):
+    if base == "build":
+        extra = "../../../build"
+    elif base == "build_release":
+        extra = "../../../build_release"
+    elif base == "libs":
+        extra = "../../../libs"
+    elif base == "config":
+        extra = "../../../config"
+    else:
+        extra = ""
+    return os.path.normpath(os.path.join(DIR_PATH, extra, path))
+
+
+def wait_for_server(port, delay=1.0):
+    cmd = ["nc", "-z", "-w", "1", "127.0.0.1", port]
+    while subprocess.call(cmd) != 0:
+        time.sleep(0.5)
+    time.sleep(delay)
+
+
 def load_scenarios(args):
     """
     Scans through folder structure starting with groups_root and
@@ -67,9 +89,8 @@ def load_scenarios(args):
     {group: (scenario, {config: query_generator_function})
     """
     argp = ArgumentParser("QuerySuite.scenarios argument parser")
-    argp.add_argument("--query-scenarios-root", default=path.join(
-        DIR_PATH, "groups"),
-        dest="root")
+    argp.add_argument("--query-scenarios-root",
+                      default=get_absolute_path("groups"), dest="root")
     args, _ = argp.parse_known_args()
     log.info("Loading query scenarios from root: %s", args.root)
@@ -78,7 +99,7 @@ def load_scenarios(args):
             log.debug("Processing config file %s", config_file)
             config_name = config_file.split(".")[-2]
             config_dict[config_name] = QuerySuite.Loader(
-                path.join(base, config_file))
+                os.path.join(base, config_file))
 
         # validate that the scenario does not contain any illegal
         # keys (defense against typos in file naming)
@@ -89,19 +110,19 @@ def load_scenarios(args):
     def dir_content(root, predicate):
         return [p for p in os.listdir(root)
-                if predicate(path.join(root, p))]
+                if predicate(os.path.join(root, p))]
 
     group_scenarios = OrderedDict()
-    for group in dir_content(args.root, path.isdir):
+    for group in dir_content(args.root, os.path.isdir):
         log.info("Loading group: '%s'", group)
 
         group_scenarios[group] = []
-        files = dir_content(path.join(args.root, group),
-                            path.isfile)
+        files = dir_content(os.path.join(args.root, group),
+                            os.path.isfile)
 
         # process group default config
         group_config = {}
-        fill_config_dict(group_config, path.join(args.root, group),
+        fill_config_dict(group_config, os.path.join(args.root, group),
                          [f for f in files if f.count(".") == 1])
 
         # group files on scenario
@@ -111,7 +132,7 @@ def load_scenarios(args):
             log.info("Loading scenario: '%s'", scenario_name)
             scenario = dict(group_config)
             fill_config_dict(scenario,
-                             path.join(args.root, group),
+                             os.path.join(args.root, group),
                              scenario_files)
             group_scenarios[group].append((scenario_name, scenario))
             log.debug("Loaded config for scenario '%s'\n%r", scenario_name,
@@ -130,12 +151,12 @@ class _QuerySuite:
     # what the QuerySuite can work with
     KNOWN_KEYS = {"config", "setup", "itersetup", "run", "iterteardown",
                   "teardown", "common"}
-    FORMAT = ["{:>24}", "{:>28}", "{:>22}", "{:>24}", "{:>28}",
+    FORMAT = ["{:>24}", "{:>28}", "{:>16}", "{:>18}", "{:>22}",
               "{:>16}", "{:>16}"]
     FULL_FORMAT = "".join(FORMAT) + "\n"
     summary = FULL_FORMAT.format(
-        "group_name", "scenario_name", "query_parsing_time",
-        "query_planning_time", "query_plan_execution_time",
+        "group_name", "scenario_name", "parsing_time",
+        "planning_time", "plan_execution_time",
         WALL_TIME, CPU_TIME)
 
     def __init__(self, args):
@@ -173,12 +194,12 @@ class _QuerySuite:
             """ Yields queries found in the given file_path one by one """
             log.debug("Generating queries from file_path: %s",
                       self.file_path)
-            _, extension = path.splitext(self.file_path)
+            _, extension = os.path.splitext(self.file_path)
             if extension == ".cypher":
                 with open(self.file_path) as f:
                     return self._queries(f.read())
             elif extension == ".py":
-                return self._queries(check_output(
+                return self._queries(subprocess.check_output(
                     ["python3", self.file_path]).decode("ascii"))
             elif extension == ".json":
                 with open(self.file_path) as f:
@@ -241,9 +262,9 @@ class _QuerySuite:
                 scenario_config.get("num_client_workers", 1))
             add_measurement(run_result, iteration, WALL_TIME)
             add_measurement(run_result, iteration, CPU_TIME)
-            for measurement in ["query_parsing_time",
-                                "query_plan_execution_time",
-                                "query_planning_time"] :
+            for measurement in ["parsing_time",
+                                "plan_execution_time",
+                                "planning_time"] :
                 for i in range(len(run_result.get("metadatas", []))):
                     add_measurement(run_result["metadatas"][i], iteration,
                                     measurement)
@@ -263,8 +284,8 @@ class _QuerySuite:
                        measurement_lists, num_iterations):
         self.summary += self.FORMAT[0].format(group_name)
         self.summary += self.FORMAT[1].format(scenario_name)
-        for i, key in enumerate(("query_parsing_time", "query_planning_time",
-                                 "query_plan_execution_time", WALL_TIME, CPU_TIME)):
+        for i, key in enumerate(("parsing_time", "planning_time",
+                                 "plan_execution_time", WALL_TIME, CPU_TIME)):
             if key not in measurement_lists:
                 time = "-"
             else:
@@ -305,15 +326,6 @@ class QueryParallelSuite(_QuerySuite):
         return ["aggregation_parallel", "create_parallel"]
 
 
-def get_common_runner_argument_parser():
-    argp = ArgumentParser("CommonRunnerArgumentParser")
-    argp.add_argument("--address", help="Database and client address",
-                      default="127.0.0.1")
-    argp.add_argument("--port", help="Database and client port",
-                      default="7687")
-    return argp
-
-
 # Database wrappers.
 class Memgraph:
@@ -322,15 +334,13 @@ class Memgraph:
     """
     def __init__(self, args, cpus):
         self.log = logging.getLogger("MemgraphRunner")
-        argp = ArgumentParser("MemgraphArgumentParser", add_help=False,
-                              parents=[get_common_runner_argument_parser()])
-        argp.add_argument("--RunnerBin",
-                          default=os.path.join(DIR_PATH,
-                                               "../../../build/memgraph"))
-        argp.add_argument("--RunnerConfig",
-                          default=os.path.normpath(os.path.join(
-                              DIR_PATH,
-                              "../../../config/benchmarking_latency.conf")))
+        argp = ArgumentParser("MemgraphArgumentParser", add_help=False)
+        argp.add_argument("--runner-bin",
+                          default=get_absolute_path("memgraph", "build"))
+        argp.add_argument("--runner-config",
+                          default=get_absolute_path("benchmarking_latency.conf", "config"))
+        argp.add_argument("--port", default="7687",
+                          help="Database and client port")
         self.log.info("Initializing Runner with arguments %r", args)
         self.args, _ = argp.parse_known_args(args)
         self.database_bin = jail.get_process()
@@ -338,14 +348,20 @@ class Memgraph:
     def start(self):
         self.log.info("start")
-        environment = os.environ.copy()
-        environment["MEMGRAPH_CONFIG"] = self.args.RunnerConfig
-        database_args = ["--interface", self.args.address,
-                         "--port", self.args.port]
-        self.database_bin.run(self.args.RunnerBin, database_args,
-                              env=environment, timeout=600)
-        # TODO change to a check via SIGUSR
-        time.sleep(1.0)
+        env = {"MEMGRAPH_CONFIG": self.args.runner_config}
+        database_args = ["--port", self.args.port]
+
+        # find executable path
+        runner_bin = self.args.runner_bin
+        if not os.path.exists(runner_bin):
+            # Apollo builds both debug and release binaries on diff
+            # so we need to use the release binary if the debug one
+            # doesn't exist
+            runner_bin = get_absolute_path("memgraph", "build_release")
+
+        # start memgraph
+        self.database_bin.run(runner_bin, database_args, env=env, timeout=600)
+        wait_for_server(self.args.port)
         return self.database_bin.get_pid() if not APOLLO else None
 
     def stop(self):
@@ -356,42 +372,105 @@ class Memgraph:
 class Neo:
     def __init__(self, args, cpus):
         self.log = logging.getLogger("NeoRunner")
-        argp = ArgumentParser("NeoArgumentParser", add_help=False,
-                              parents=[get_common_runner_argument_parser()])
-        argp.add_argument(
-            "--RunnerConfigDir",
-            default=path.join(DIR_PATH, "neo4j_config"))
+        argp = ArgumentParser("NeoArgumentParser", add_help=False)
+        argp.add_argument("--runner-bin", default=get_absolute_path(
+            "neo4j/bin/neo4j", "libs"))
+        argp.add_argument("--runner-config",
+                          default=get_absolute_path("config/neo4j.conf"))
+        argp.add_argument("--port", default="7687",
+                          help="Database and client port")
+        argp.add_argument("--http-port", default="7474",
+                          help="Database and client port")
         self.log.info("Initializing Runner with arguments %r", args)
         self.args, _ = argp.parse_known_args(args)
-        if self.args.address != "127.0.0.1" or self.args.port != "7687":
-            raise Exception(
-                "Neo wrapper doesn't support different address or port")
         self.database_bin = jail.get_process()
         self.database_bin.set_cpus(cpus)
 
     def start(self):
         self.log.info("start")
-        environment = os.environ.copy()
-        environment["NEO4J_CONF"] = self.args.RunnerConfigDir
+
+        # create home directory
         self.neo4j_home_path = tempfile.mkdtemp(dir="/dev/shm")
-        environment["NEO4J_HOME"] = self.neo4j_home_path
+
         try:
-            self.database_bin.run("/usr/share/neo4j/bin/neo4j", args=["console"],
-                                  env=environment, timeout=600)
-            # TODO change to a check via SIGUSR
-            time.sleep(5.0)
+            os.symlink(os.path.join(get_absolute_path("neo4j", "libs"), "lib"),
+                       os.path.join(self.neo4j_home_path, "lib"))
+            neo4j_conf_dir = os.path.join(self.neo4j_home_path, "conf")
+            neo4j_conf_file = os.path.join(neo4j_conf_dir, "neo4j.conf")
+            os.mkdir(neo4j_conf_dir)
+            shutil.copyfile(self.args.runner_config, neo4j_conf_file)
+            with open(neo4j_conf_file, "a") as f:
+                f.write("\ndbms.connector.bolt.listen_address=:" +
+                        self.args.port + "\n")
+                f.write("\ndbms.connector.http.listen_address=:" +
+                        self.args.http_port + "\n")
+
+            # environment
+            cwd = os.path.dirname(self.args.runner_bin)
+            env = {"NEO4J_HOME": self.neo4j_home_path}
+
+            self.database_bin.run(self.args.runner_bin, args=["console"],
+                                  env=env, timeout=600, cwd=cwd)
         except:
             shutil.rmtree(self.neo4j_home_path)
-            raise Exception("Couldn't create symlink or run neo4j")
+            raise Exception("Couldn't run Neo4j!")
+
+        wait_for_server(self.args.http_port, 2.0)
         return self.database_bin.get_pid() if not APOLLO else None
 
     def stop(self):
         self.database_bin.send_signal(jail.SIGTERM)
         self.database_bin.wait()
-        if path.exists(self.neo4j_home_path):
+        if os.path.exists(self.neo4j_home_path):
             shutil.rmtree(self.neo4j_home_path)
 
 
+class Postgres:
+    """
+    Knows how to start and stop PostgreSQL.
+    """
+    def __init__(self, args, cpus):
+        self.log = logging.getLogger("PostgresRunner")
+        argp = ArgumentParser("PostgresArgumentParser", add_help=False)
+        argp.add_argument("--init-bin", default=get_absolute_path(
+            "postgresql/bin/initdb", "libs"))
+        argp.add_argument("--runner-bin", default=get_absolute_path(
+            "postgresql/bin/postgres", "libs"))
+        argp.add_argument("--port", default="5432",
+                          help="Database and client port")
+        self.log.info("Initializing Runner with arguments %r", args)
+        self.args, _ = argp.parse_known_args(args)
+        self.username = "macro_benchmark"
+        self.database_bin = jail.get_process()
+        self.database_bin.set_cpus(cpus)
+
+    def start(self):
+        self.log.info("start")
+        self.data_path = tempfile.mkdtemp(dir="/dev/shm")
+        init_args = ["-D", self.data_path, "-U", self.username]
+        self.database_bin.run_and_wait(self.args.init_bin, init_args)
+
+        # args
+        runner_args = ["-D", self.data_path, "-c", "port=" + self.args.port,
+                       "-c", "ssl=false", "-c", "max_worker_processes=1"]
+
+        try:
+            self.database_bin.run(self.args.runner_bin, args=runner_args,
+                                  timeout=600)
+        except:
+            shutil.rmtree(self.data_path)
+            raise Exception("Couldn't run PostgreSQL!")
+
+        wait_for_server(self.args.port)
+        return self.database_bin.get_pid() if not APOLLO else None
+
+    def stop(self):
+        self.database_bin.send_signal(jail.SIGTERM)
+        self.database_bin.wait()
+        if os.path.exists(self.data_path):
+            shutil.rmtree(self.data_path)
+
+
 class _HarnessClientRunner:
     """
     Knows how to start and stop database (backend) some client frontend (bolt),
@@ -405,8 +484,7 @@ class _HarnessClientRunner:
         if cpus is None: cpus = [2, 3]
         self.log = logging.getLogger("_HarnessClientRunner")
         self.database = database
-        argp = ArgumentParser("RunnerArgumentParser", add_help=False,
-                              parents=[get_common_runner_argument_parser()])
+        argp = ArgumentParser("RunnerArgumentParser", add_help=False)
        self.args, _ = argp.parse_known_args()
        self.bolt_client = jail.get_process()
        self.bolt_client.set_cpus(cpus)
@@ -417,15 +495,13 @@ class _HarnessClientRunner:
     def execute(self, queries, num_client_workers):
         self.log.debug("execute('%s')", str(queries))
-        client = os.path.normpath(os.path.join(DIR_PATH,
-            "../../../build/tests/macro_benchmark/harness_client"))
+        client_path = "tests/macro_benchmark/harness_client"
+        client = get_absolute_path(client_path, "build")
         if not os.path.exists(client):
             # Apollo builds both debug and release binaries on diff
             # so we need to use the release client if the debug one
             # doesn't exist
-            client = os.path.normpath(os.path.join(DIR_PATH,
-                "../../../build_release/tests/macro_benchmark/"
-                "harness_client"))
+            client = get_absolute_path(client_path, "build_release")
 
         queries_fd, queries_path = tempfile.mkstemp()
         try:
@@ -440,7 +516,7 @@ class _HarnessClientRunner:
             output_fd, output = tempfile.mkstemp()
             os.close(output_fd)
 
-            client_args = ["--address", self.args.address, "--port", self.args.port,
+            client_args = ["--port", self.database.args.port,
                            "--num-workers", str(num_client_workers),
                            "--output", output]
@@ -590,7 +666,7 @@ def main():
     # Print summary.
     print("\n\nMacro benchmark summary:")
     print("{}\n".format(suite.summary))
-    with open(os.path.join(DIR_PATH, ".harness_summary"), "w") as f:
+    with open(get_absolute_path(".harness_summary"), "w") as f:
         print(suite.summary, file=f)
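
All three database wrappers (Memgraph, Neo, Postgres) now share one lifecycle: parse their own --runner-bin / --port style flags, launch the server in a jailed process, block in wait_for_server() until the port accepts connections, and clean up temporary state in stop(). A hedged usage sketch of that shared shape, assuming the classes above are importable and the PostgreSQL binaries were fetched into libs by setup.sh (illustrative driver code, not part of this commit):

# Illustrative only: drives the Postgres wrapper defined above.
runner = Postgres(["--port", "5432"], cpus=[1])
try:
    # start() blocks until wait_for_server() sees the port open,
    # then returns the jailed process pid (or None under Apollo).
    pid = runner.start()
    print("database up, pid:", pid)
    # ... run benchmark scenarios through _HarnessClientRunner here ...
finally:
    # SIGTERM + wait, then removal of the /dev/shm data directory.
    runner.stop()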

View File

@@ -197,14 +197,13 @@ macro_bench_path = os.path.join(BASE_DIR, "tests", "macro_benchmark")
 harness_client_binary = os.path.join(BUILD_RELEASE_DIR, "tests",
                                      "macro_benchmark", "harness_client")
 infile = create_archive("macro_benchmark", [binary_release_path,
-        macro_bench_path, config_path, harness_client_binary],
-        cwd = WORKSPACE_DIR)
-supervisor = "./{}/tests/macro_benchmark/harness/harness.py".format(BASE_DIR_NAME)
-args = MACRO_BENCHMARK_ARGS + " --RunnerBin " + binary_release_path
-outfile_paths = "\./{}/tests/macro_benchmark/harness/\.harness_summary".format(
-        BASE_DIR_NAME)
+        binary_release_link_path, macro_bench_path, config_path,
+        harness_client_binary], cwd = WORKSPACE_DIR)
+supervisor = "./memgraph/tests/macro_benchmark/harness/harness.py"
+outfile_paths = "\./memgraph/tests/macro_benchmark/harness/\.harness_summary"
 RUNS.append(generate_run("macro_benchmark", supervisor = supervisor,
-        arguments = args, infile = infile, outfile_paths = outfile_paths))
+        arguments = MACRO_BENCHMARK_ARGS, infile = infile,
+        outfile_paths = outfile_paths))
 
 # macro benchmark parent tests
 if mode == "diff":