From 70d9f3f6f19914d028a0afc036367a8acbfb6e1a Mon Sep 17 00:00:00 2001
From: Matej Ferencevic
Date: Mon, 4 Sep 2017 14:16:12 +0200
Subject: [PATCH] Refactored harness and added PostgreSQL support.

Summary:
Moved Neo4j config to config dir.
Neo4j and PostgreSQL are now downloaded to libs.
Renamed metadata flags in memgraph.
Changed apollo generate for new harness.

Reviewers: mislav.bradac

Reviewed By: mislav.bradac

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D741
---
 libs/cleanup.sh                           |   4 +-
 libs/setup.sh                             |  14 +-
 src/query/engine.hpp                      |   6 +-
 src/query/interpreter.hpp                 |   8 +-
 .../{neo4j_config => config}/neo4j.conf   |  13 +-
 tests/macro_benchmark/harness/harness.py  | 228 ++++++++++++------
 tools/apollo/generate                     |  13 +-
 7 files changed, 190 insertions(+), 96 deletions(-)
 rename tests/macro_benchmark/harness/{neo4j_config => config}/neo4j.conf (98%)

diff --git a/libs/cleanup.sh b/libs/cleanup.sh
index b4d1d7506..5151c821d 100755
--- a/libs/cleanup.sh
+++ b/libs/cleanup.sh
@@ -4,8 +4,8 @@
 working_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 cd ${working_dir}
 
-# remove antlr parser generator
-rm *.jar
+# remove archives
+rm *.jar *.tar.gz *.tar 2>/dev/null
 
 # remove lib directories
 for folder in * ; do
diff --git a/libs/setup.sh b/libs/setup.sh
index e19b78716..5c2c986e4 100755
--- a/libs/setup.sh
+++ b/libs/setup.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -e
 
 # Download external dependencies.
 
@@ -88,3 +88,15 @@ gflags_tag="652651b421ca5ac7b722a34a301fb656deca5198" # May 6, 2017
 cd gflags
 git checkout ${gflags_tag}
 cd ..
+
+# neo4j
+wget http://deps.memgraph.io/neo4j-community-3.2.3-unix.tar.gz -O neo4j.tar.gz
+tar -xzf neo4j.tar.gz
+mv neo4j-community-3.2.3 neo4j
+rm neo4j.tar.gz
+
+# postgresql
+wget http://deps.memgraph.io/postgresql-9.6.5-1-linux-x64-binaries.tar.gz -O postgres.tar.gz
+tar -xzf postgres.tar.gz
+mv pgsql postgresql
+rm postgres.tar.gz
diff --git a/src/query/engine.hpp b/src/query/engine.hpp
index 0a2061555..15d7fc08a 100644
--- a/src/query/engine.hpp
+++ b/src/query/engine.hpp
@@ -99,11 +99,11 @@ class QueryEngine {
     }
 
     std::map summary;
-    summary["query_parsing_time"] = parsing_time.count();
+    summary["parsing_time"] = parsing_time.count();
     // This doesn't do any actual planning, but benchmarking harness knows how
     // to work with this field.
- summary["query_planning_time"] = planning_time.count(); - summary["query_plan_execution_time"] = execution_time.count(); + summary["planning_time"] = planning_time.count(); + summary["plan_execution_time"] = execution_time.count(); summary["type"] = "rw"; stream.Summary(summary); diff --git a/src/query/interpreter.hpp b/src/query/interpreter.hpp index 8aeeb8a65..ad71d203c 100644 --- a/src/query/interpreter.hpp +++ b/src/query/interpreter.hpp @@ -178,10 +178,10 @@ class Interpreter { } auto execution_time = execution_timer.Elapsed(); - summary["query_parsing_time"] = frontend_time.count(); - summary["query_planning_time"] = planning_time.count(); - summary["query_plan_execution_time"] = execution_time.count(); - summary["query_cost_estimate"] = query_plan_cost_estimation; + summary["parsing_time"] = frontend_time.count(); + summary["planning_time"] = planning_time.count(); + summary["plan_execution_time"] = execution_time.count(); + summary["cost_estimate"] = query_plan_cost_estimation; // TODO: set summary['type'] based on transaction metadata // the type can't be determined based only on top level LogicalOp diff --git a/tests/macro_benchmark/harness/neo4j_config/neo4j.conf b/tests/macro_benchmark/harness/config/neo4j.conf similarity index 98% rename from tests/macro_benchmark/harness/neo4j_config/neo4j.conf rename to tests/macro_benchmark/harness/config/neo4j.conf index 5e30252ae..a18b76851 100644 --- a/tests/macro_benchmark/harness/neo4j_config/neo4j.conf +++ b/tests/macro_benchmark/harness/config/neo4j.conf @@ -13,13 +13,13 @@ #dbms.directories.plugins=/var/lib/neo4j/plugins #dbms.directories.certificates=/var/lib/neo4j/certificates #dbms.directories.logs=/var/log/neo4j -dbms.directories.lib=/usr/share/neo4j/lib +#dbms.directories.lib=/usr/share/neo4j/lib #dbms.directories.run=/var/run/neo4j # This setting constrains all `LOAD CSV` import files to be under the `import` directory. Remove or comment it out to # allow files to be loaded from anywhere in the filesystem; this introduces possible security problems. See the # `LOAD CSV` section of the manual for details. -dbms.directories.import=/var/lib/neo4j/import +#dbms.directories.import=/var/lib/neo4j/import # Whether requests to Neo4j are authenticated. # To disable authentication, uncomment this line @@ -75,7 +75,7 @@ dbms.connector.http.enabled=true #dbms.connector.http.listen_address=:7474 # HTTPS Connector. There can be zero or one HTTPS connectors. -dbms.connector.https.enabled=true +dbms.connector.https.enabled=false #dbms.connector.https.listen_address=:7473 # Number of Neo4j worker threads. 
@@ -316,3 +316,10 @@ dbms.windows_service_name=neo4j
 # Other Neo4j system properties
 #********************************************************************
 dbms.jvm.additional=-Dunsupported.dbms.udc.source=debian
+
+# Disable Neo4j usage data collection
+dbms.udc.enabled=false
+
+# Disable query cache
+dbms.query_cache_size=0
+
diff --git a/tests/macro_benchmark/harness/harness.py b/tests/macro_benchmark/harness/harness.py
index 07a42fd71..8462fbc7a 100755
--- a/tests/macro_benchmark/harness/harness.py
+++ b/tests/macro_benchmark/harness/harness.py
@@ -3,11 +3,10 @@
 import logging
 import os
-from os import path
 import time
 import itertools
 import json
-from subprocess import check_output
+import subprocess
 from argparse import ArgumentParser
 from collections import OrderedDict
 from collections import defaultdict
@@ -15,6 +14,8 @@ import tempfile
 import shutil
 from statistics import median
 
+from perf import Perf
+
 try:
     import jail
     APOLLO = True
@@ -22,13 +23,34 @@ except:
     import jail_faker as jail
     APOLLO = False
 
-DIR_PATH = path.dirname(path.realpath(__file__))
+DIR_PATH = os.path.dirname(os.path.realpath(__file__))
 WALL_TIME = "wall_time"
 CPU_TIME = "cpu_time"
-from perf import Perf
 
 log = logging.getLogger(__name__)
 
+
+def get_absolute_path(path, base=""):
+    if base == "build":
+        extra = "../../../build"
+    elif base == "build_release":
+        extra = "../../../build_release"
+    elif base == "libs":
+        extra = "../../../libs"
+    elif base == "config":
+        extra = "../../../config"
+    else:
+        extra = ""
+    return os.path.normpath(os.path.join(DIR_PATH, extra, path))
+
+
+def wait_for_server(port, delay=1.0):
+    cmd = ["nc", "-z", "-w", "1", "127.0.0.1", port]
+    while subprocess.call(cmd) != 0:
+        time.sleep(0.5)
+    time.sleep(delay)
+
+
 def load_scenarios(args):
     """
     Scans through folder structure starting with groups_root and
@@ -67,9 +89,8 @@ def load_scenarios(args):
         {group: (scenario, {config: query_generator_function})
     """
     argp = ArgumentParser("QuerySuite.scenarios argument parser")
-    argp.add_argument("--query-scenarios-root", default=path.join(
-        DIR_PATH, "groups"),
-        dest="root")
+    argp.add_argument("--query-scenarios-root",
+                      default=get_absolute_path("groups"), dest="root")
     args, _ = argp.parse_known_args()
     log.info("Loading query scenarios from root: %s", args.root)
 
@@ -78,7 +99,7 @@ def load_scenarios(args):
         log.debug("Processing config file %s", config_file)
         config_name = config_file.split(".")[-2]
         config_dict[config_name] = QuerySuite.Loader(
-            path.join(base, config_file))
+            os.path.join(base, config_file))
 
         # validate that the scenario does not contain any illegal
         # keys (defense against typos in file naming)
@@ -89,19 +110,19 @@ def load_scenarios(args):
 
     def dir_content(root, predicate):
         return [p for p in os.listdir(root)
-                if predicate(path.join(root, p))]
+                if predicate(os.path.join(root, p))]
 
     group_scenarios = OrderedDict()
-    for group in dir_content(args.root, path.isdir):
+    for group in dir_content(args.root, os.path.isdir):
         log.info("Loading group: '%s'", group)
 
         group_scenarios[group] = []
-        files = dir_content(path.join(args.root, group),
-                            path.isfile)
+        files = dir_content(os.path.join(args.root, group),
+                            os.path.isfile)
 
         # process group default config
         group_config = {}
-        fill_config_dict(group_config, path.join(args.root, group),
+        fill_config_dict(group_config, os.path.join(args.root, group),
                          [f for f in files if f.count(".") == 1])
 
         # group files on scenario
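
The get_absolute_path() helper added above becomes the single place where the harness maps logical locations onto the repository layout. Illustrative resolutions (values assume the harness lives in tests/macro_benchmark/harness, as in this repo):

    get_absolute_path("groups")                  # <repo>/tests/macro_benchmark/harness/groups
    get_absolute_path("memgraph", "build")       # <repo>/build/memgraph
    get_absolute_path("neo4j/bin/neo4j", "libs") # <repo>/libs/neo4j/bin/neo4j
    get_absolute_path(".harness_summary")        # <repo>/tests/macro_benchmark/harness/.harness_summary
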
@@ -111,7 +132,7 @@ def load_scenarios(args):
             log.info("Loading scenario: '%s'", scenario_name)
             scenario = dict(group_config)
             fill_config_dict(scenario,
-                             path.join(args.root, group),
+                             os.path.join(args.root, group),
                              scenario_files)
             group_scenarios[group].append((scenario_name, scenario))
             log.debug("Loaded config for scenario '%s'\n%r", scenario_name,
@@ -130,12 +151,12 @@ class _QuerySuite:
     # what the QuerySuite can work with
     KNOWN_KEYS = {"config", "setup", "itersetup", "run", "iterteardown",
                   "teardown", "common"}
-    FORMAT = ["{:>24}", "{:>28}", "{:>22}", "{:>24}", "{:>28}",
+    FORMAT = ["{:>24}", "{:>28}", "{:>16}", "{:>18}", "{:>22}",
              "{:>16}", "{:>16}"]
     FULL_FORMAT = "".join(FORMAT) + "\n"
     summary = FULL_FORMAT.format(
-        "group_name", "scenario_name", "query_parsing_time",
-        "query_planning_time", "query_plan_execution_time",
+        "group_name", "scenario_name", "parsing_time",
+        "planning_time", "plan_execution_time",
         WALL_TIME, CPU_TIME)
 
     def __init__(self, args):
@@ -173,12 +194,12 @@ class _QuerySuite:
         """
         Yields queries found in the given file_path one by one
         """
         log.debug("Generating queries from file_path: %s", self.file_path)
-        _, extension = path.splitext(self.file_path)
+        _, extension = os.path.splitext(self.file_path)
         if extension == ".cypher":
             with open(self.file_path) as f:
                 return self._queries(f.read())
         elif extension == ".py":
-            return self._queries(check_output(
+            return self._queries(subprocess.check_output(
                 ["python3", self.file_path]).decode("ascii"))
         elif extension == ".json":
             with open(self.file_path) as f:
@@ -241,9 +262,9 @@ class _QuerySuite:
                 scenario_config.get("num_client_workers", 1))
             add_measurement(run_result, iteration, WALL_TIME)
             add_measurement(run_result, iteration, CPU_TIME)
-            for measurement in ["query_parsing_time",
-                                "query_plan_execution_time",
-                                "query_planning_time"] :
+            for measurement in ["parsing_time",
+                                "plan_execution_time",
+                                "planning_time"] :
                 for i in range(len(run_result.get("metadatas", []))):
                     add_measurement(run_result["metadatas"][i], iteration,
                                     measurement)
@@ -263,8 +284,8 @@ class _QuerySuite:
                       measurement_lists, num_iterations):
         self.summary += self.FORMAT[0].format(group_name)
         self.summary += self.FORMAT[1].format(scenario_name)
-        for i, key in enumerate(("query_parsing_time", "query_planning_time",
-                                 "query_plan_execution_time", WALL_TIME, CPU_TIME)):
+        for i, key in enumerate(("parsing_time", "planning_time",
+                                 "plan_execution_time", WALL_TIME, CPU_TIME)):
             if key not in measurement_lists:
                 time = "-"
             else:
@@ -305,15 +326,6 @@ class QueryParallelSuite(_QuerySuite):
         return ["aggregation_parallel", "create_parallel"]
 
 
-def get_common_runner_argument_parser():
-    argp = ArgumentParser("CommonRunnerArgumentParser")
-    argp.add_argument("--address", help="Database and client address",
-                      default="127.0.0.1")
-    argp.add_argument("--port", help="Database and client port",
-                      default="7687")
-    return argp
-
-
 # Database wrappers.
 
 class Memgraph:
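
The database wrappers that follow (Memgraph, Neo, and the new Postgres) are duck-typed against one lifecycle: construct with (args, cpus), start() boots the server inside the jail and blocks until its port answers, stop() sends SIGTERM and removes scratch state. A sketch of that implicit contract as a hypothetical base class (the patch itself adds no such class):

    class RunnerBase:
        """Hypothetical contract; the actual wrappers stay duck-typed."""

        def start(self):
            # Launch the server in the jail, block until the port accepts
            # connections (wait_for_server), return the pid (None under Apollo).
            raise NotImplementedError

        def stop(self):
            # SIGTERM the jailed process, wait for it, clean up temp dirs.
            raise NotImplementedError
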
@@ -322,15 +334,13 @@ class Memgraph:
     """
     def __init__(self, args, cpus):
         self.log = logging.getLogger("MemgraphRunner")
-        argp = ArgumentParser("MemgraphArgumentParser", add_help=False,
-                              parents=[get_common_runner_argument_parser()])
-        argp.add_argument("--RunnerBin",
-                          default=os.path.join(DIR_PATH,
-                                               "../../../build/memgraph"))
-        argp.add_argument("--RunnerConfig",
-                          default=os.path.normpath(os.path.join(
-                              DIR_PATH,
-                              "../../../config/benchmarking_latency.conf")))
+        argp = ArgumentParser("MemgraphArgumentParser", add_help=False)
+        argp.add_argument("--runner-bin",
+                          default=get_absolute_path("memgraph", "build"))
+        argp.add_argument("--runner-config",
+                          default=get_absolute_path("benchmarking_latency.conf", "config"))
+        argp.add_argument("--port", default="7687",
+                          help="Database and client port")
         self.log.info("Initializing Runner with arguments %r", args)
         self.args, _ = argp.parse_known_args(args)
         self.database_bin = jail.get_process()
@@ -338,14 +348,20 @@ class Memgraph:
 
     def start(self):
         self.log.info("start")
-        environment = os.environ.copy()
-        environment["MEMGRAPH_CONFIG"] = self.args.RunnerConfig
-        database_args = ["--interface", self.args.address,
-                         "--port", self.args.port]
-        self.database_bin.run(self.args.RunnerBin, database_args,
-                              env=environment, timeout=600)
-        # TODO change to a check via SIGUSR
-        time.sleep(1.0)
+        env = {"MEMGRAPH_CONFIG": self.args.runner_config}
+        database_args = ["--port", self.args.port]
+
+        # find executable path
+        runner_bin = self.args.runner_bin
+        if not os.path.exists(runner_bin):
+            # Apollo builds both debug and release binaries on diff
+            # so we need to use the release binary if the debug one
+            # doesn't exist
+            runner_bin = get_absolute_path("memgraph", "build_release")
+
+        # start memgraph
+        self.database_bin.run(runner_bin, database_args, env=env, timeout=600)
+        wait_for_server(self.args.port)
         return self.database_bin.get_pid() if not APOLLO else None
 
     def stop(self):
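
The wait_for_server() call above replaces the old fixed time.sleep() with an active readiness check, but it shells out to nc. An equivalent pure-Python poll, shown only as an illustrative alternative (not what this patch ships):

    import socket
    import time

    def wait_for_server_py(port, delay=1.0):
        # Retry until something accepts TCP connections on localhost:port,
        # then give the server `delay` extra seconds to settle.
        while True:
            try:
                with socket.create_connection(("127.0.0.1", int(port)), 1.0):
                    break
            except OSError:
                time.sleep(0.5)
        time.sleep(delay)
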
@@ -356,42 +372,105 @@ class Memgraph:
 
 class Neo:
     def __init__(self, args, cpus):
         self.log = logging.getLogger("NeoRunner")
-        argp = ArgumentParser("NeoArgumentParser", add_help=False,
-                              parents=[get_common_runner_argument_parser()])
-        argp.add_argument(
-            "--RunnerConfigDir",
-            default=path.join(DIR_PATH, "neo4j_config"))
+        argp = ArgumentParser("NeoArgumentParser", add_help=False)
+        argp.add_argument("--runner-bin", default=get_absolute_path(
+            "neo4j/bin/neo4j", "libs"))
+        argp.add_argument("--runner-config",
+                          default=get_absolute_path("config/neo4j.conf"))
+        argp.add_argument("--port", default="7687",
+                          help="Database and client port")
+        argp.add_argument("--http-port", default="7474",
+                          help="Database HTTP port")
         self.log.info("Initializing Runner with arguments %r", args)
         self.args, _ = argp.parse_known_args(args)
-        if self.args.address != "127.0.0.1" or self.args.port != "7687":
-            raise Exception(
-                "Neo wrapper doesn't support different address or port")
         self.database_bin = jail.get_process()
         self.database_bin.set_cpus(cpus)
 
     def start(self):
         self.log.info("start")
-        environment = os.environ.copy()
-        environment["NEO4J_CONF"] = self.args.RunnerConfigDir
+
+        # create home directory
         self.neo4j_home_path = tempfile.mkdtemp(dir="/dev/shm")
-        environment["NEO4J_HOME"] = self.neo4j_home_path
+
         try:
-            self.database_bin.run("/usr/share/neo4j/bin/neo4j", args=["console"],
-                                  env=environment, timeout=600)
-            # TODO change to a check via SIGUSR
-            time.sleep(5.0)
+            os.symlink(os.path.join(get_absolute_path("neo4j", "libs"), "lib"),
+                       os.path.join(self.neo4j_home_path, "lib"))
+            neo4j_conf_dir = os.path.join(self.neo4j_home_path, "conf")
+            neo4j_conf_file = os.path.join(neo4j_conf_dir, "neo4j.conf")
+            os.mkdir(neo4j_conf_dir)
+            shutil.copyfile(self.args.runner_config, neo4j_conf_file)
+            with open(neo4j_conf_file, "a") as f:
+                f.write("\ndbms.connector.bolt.listen_address=:" +
+                        self.args.port + "\n")
+                f.write("\ndbms.connector.http.listen_address=:" +
+                        self.args.http_port + "\n")
+
+            # environment
+            cwd = os.path.dirname(self.args.runner_bin)
+            env = {"NEO4J_HOME": self.neo4j_home_path}
+
+            self.database_bin.run(self.args.runner_bin, args=["console"],
+                                  env=env, timeout=600, cwd=cwd)
         except:
             shutil.rmtree(self.neo4j_home_path)
-            raise Exception("Couldn't create symlink or run neo4j")
+            raise Exception("Couldn't run Neo4j!")
+
+        wait_for_server(self.args.http_port, 2.0)
         return self.database_bin.get_pid() if not APOLLO else None
 
     def stop(self):
         self.database_bin.send_signal(jail.SIGTERM)
         self.database_bin.wait()
-        if path.exists(self.neo4j_home_path):
+        if os.path.exists(self.neo4j_home_path):
             shutil.rmtree(self.neo4j_home_path)
 
 
+class Postgres:
+    """
+    Knows how to start and stop PostgreSQL.
+    """
+    def __init__(self, args, cpus):
+        self.log = logging.getLogger("PostgresRunner")
+        argp = ArgumentParser("PostgresArgumentParser", add_help=False)
+        argp.add_argument("--init-bin", default=get_absolute_path(
+            "postgresql/bin/initdb", "libs"))
+        argp.add_argument("--runner-bin", default=get_absolute_path(
+            "postgresql/bin/postgres", "libs"))
+        argp.add_argument("--port", default="5432",
+                          help="Database and client port")
+        self.log.info("Initializing Runner with arguments %r", args)
+        self.args, _ = argp.parse_known_args(args)
+        self.username = "macro_benchmark"
+        self.database_bin = jail.get_process()
+        self.database_bin.set_cpus(cpus)
+
+    def start(self):
+        self.log.info("start")
+        self.data_path = tempfile.mkdtemp(dir="/dev/shm")
+        init_args = ["-D", self.data_path, "-U", self.username]
+        self.database_bin.run_and_wait(self.args.init_bin, init_args)
+
+        # args
+        runner_args = ["-D", self.data_path, "-c", "port=" + self.args.port,
+                       "-c", "ssl=false", "-c", "max_worker_processes=1"]
+
+        try:
+            self.database_bin.run(self.args.runner_bin, args=runner_args,
+                                  timeout=600)
+        except:
+            shutil.rmtree(self.data_path)
+            raise Exception("Couldn't run PostgreSQL!")
+
+        wait_for_server(self.args.port)
+        return self.database_bin.get_pid() if not APOLLO else None
+
+    def stop(self):
+        self.database_bin.send_signal(jail.SIGTERM)
+        self.database_bin.wait()
+        if os.path.exists(self.data_path):
+            shutil.rmtree(self.data_path)
+
+
 class _HarnessClientRunner:
     """
     Knows how to start and stop database (backend) some client frontend (bolt),
@@ -405,8 +484,7 @@ class _HarnessClientRunner:
         if cpus is None: cpus = [2, 3]
         self.log = logging.getLogger("_HarnessClientRunner")
         self.database = database
-        argp = ArgumentParser("RunnerArgumentParser", add_help=False,
-                              parents=[get_common_runner_argument_parser()])
+        argp = ArgumentParser("RunnerArgumentParser", add_help=False)
         self.args, _ = argp.parse_known_args()
         self.bolt_client = jail.get_process()
         self.bolt_client.set_cpus(cpus)
@@ -417,15 +495,13 @@ class _HarnessClientRunner:
 
     def execute(self, queries, num_client_workers):
         self.log.debug("execute('%s')", str(queries))
-        client = os.path.normpath(os.path.join(DIR_PATH,
-            "../../../build/tests/macro_benchmark/harness_client"))
+        client_path = "tests/macro_benchmark/harness_client"
+        client = get_absolute_path(client_path, "build")
         if not os.path.exists(client):
             # Apollo builds both debug and release binaries on diff
             # so we need to use the release client if the debug one
             # doesn't exist
-            client = os.path.normpath(os.path.join(DIR_PATH,
-                "../../../build_release/tests/macro_benchmark/"
-                "harness_client"))
+            client = get_absolute_path(client_path, "build_release")
 
         queries_fd, queries_path = tempfile.mkstemp()
         try:
@@ -440,7 +516,7 @@ class _HarnessClientRunner:
         output_fd, output = tempfile.mkstemp()
         os.close(output_fd)
 
-        client_args = ["--address", self.args.address, "--port", self.args.port,
+        client_args = ["--port", self.database.args.port,
                        "--num-workers", str(num_client_workers),
                        "--output", output]
 
@@ -590,7 +666,7 @@ def main():
     # Print summary.
     print("\n\nMacro benchmark summary:")
     print("{}\n".format(suite.summary))
-    with open(os.path.join(DIR_PATH, ".harness_summary"), "w") as f:
+    with open(get_absolute_path(".harness_summary"), "w") as f:
         print(suite.summary, file=f)
diff --git a/tools/apollo/generate b/tools/apollo/generate
index e88718e06..9cb9380ff 100755
--- a/tools/apollo/generate
+++ b/tools/apollo/generate
@@ -197,14 +197,13 @@ macro_bench_path = os.path.join(BASE_DIR, "tests", "macro_benchmark")
 harness_client_binary = os.path.join(BUILD_RELEASE_DIR, "tests",
                                      "macro_benchmark", "harness_client")
 infile = create_archive("macro_benchmark", [binary_release_path,
-        macro_bench_path, config_path, harness_client_binary],
-        cwd = WORKSPACE_DIR)
-supervisor = "./{}/tests/macro_benchmark/harness/harness.py".format(BASE_DIR_NAME)
-args = MACRO_BENCHMARK_ARGS + " --RunnerBin " + binary_release_path
-outfile_paths = "\./{}/tests/macro_benchmark/harness/\.harness_summary".format(
-    BASE_DIR_NAME)
+        binary_release_link_path, macro_bench_path, config_path,
+        harness_client_binary], cwd = WORKSPACE_DIR)
+supervisor = "./memgraph/tests/macro_benchmark/harness/harness.py"
+outfile_paths = "\./memgraph/tests/macro_benchmark/harness/\.harness_summary"
 RUNS.append(generate_run("macro_benchmark", supervisor = supervisor,
-    arguments = args, infile = infile, outfile_paths = outfile_paths))
+    arguments = MACRO_BENCHMARK_ARGS, infile = infile,
+    outfile_paths = outfile_paths))
 
 # macro benchmark parent tests
 if mode == "diff":
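
For reference, the PostgreSQL lifecycle that the new Postgres wrapper automates can be reproduced by hand. A rough standalone equivalent, assuming the libs/ layout produced by libs/setup.sh above (illustrative only; no jail involved):

    import shutil
    import subprocess
    import tempfile

    data_path = tempfile.mkdtemp(dir="/dev/shm")
    # Initialize a throwaway cluster owned by the benchmark user.
    subprocess.check_call(["libs/postgresql/bin/initdb",
                           "-D", data_path, "-U", "macro_benchmark"])
    # Start the server with the same flags the wrapper passes.
    server = subprocess.Popen(["libs/postgresql/bin/postgres",
                               "-D", data_path,
                               "-c", "port=5432",
                               "-c", "ssl=false",
                               "-c", "max_worker_processes=1"])
    # ... run benchmark queries against localhost:5432 ...
    server.terminate()
    server.wait()
    shutil.rmtree(data_path)
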