Add bigger LDBC dataset to mgbench (#747)

parent 6349fc9501
commit cb813c3070
@@ -247,7 +247,7 @@ Index queries for each supported vendor can be downloaded from “https://s3.eu-
 |Q19|pattern_short| analytical | MATCH (n:User {id: $id})-[e]->(m) RETURN m LIMIT 1|
 |Q20|single_edge_write| write | MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m CREATE (n)-[e:Temp]->(m) RETURN e|
 |Q21|single_vertex_write| write |CREATE (n:UserTemp {id : $id}) RETURN n|
-|Q22|single_vertex_property_update| update | MATCH (n:User {id: $id})-[e]->(m) RETURN m LIMIT 1|
+|Q22|single_vertex_property_update| update | MATCH (n:User {id: $id}) SET n.property = -1|
 |Q23|single_vertex_read| read | MATCH (n:User {id : $id}) RETURN n|

 ## :computer: Platform
File diff suppressed because it is too large
tests/mgbench/benchmark_context.py (new file, 57 lines)
@@ -0,0 +1,57 @@
# Describes all the information of single benchmark.py run.


class BenchmarkContext:
    """
    Class for holding information on what type of benchmark is being executed
    """

    def __init__(
        self,
        benchmark_target_workload: str = None,  # Workload that needs to be executed (dataset/variant/group/query)
        vendor_binary: str = None,  # Benchmark vendor binary
        vendor_name: str = None,
        client_binary: str = None,
        num_workers_for_import: int = None,
        num_workers_for_benchmark: int = None,
        single_threaded_runtime_sec: int = 0,
        no_load_query_counts: bool = False,
        no_save_query_counts: bool = False,
        export_results: str = None,
        temporary_directory: str = None,
        workload_mixed: str = None,  # Default mode is isolated, mixed None
        workload_realistic: str = None,  # Default mode is isolated, realistic None
        time_dependent_execution: int = 0,
        warm_up: str = None,
        performance_tracking: bool = False,
        no_authorization: bool = True,
        customer_workloads: str = None,
        vendor_args: dict = {},
    ) -> None:
        self.benchmark_target_workload = benchmark_target_workload
        self.vendor_binary = vendor_binary
        self.vendor_name = vendor_name
        self.client_binary = client_binary
        self.num_workers_for_import = num_workers_for_import
        self.num_workers_for_benchmark = num_workers_for_benchmark
        self.single_threaded_runtime_sec = single_threaded_runtime_sec
        self.no_load_query_counts = no_load_query_counts
        self.no_save_query_counts = no_save_query_counts
        self.export_results = export_results
        self.temporary_directory = temporary_directory

        if workload_mixed != None:
            self.mode = "Mixed"
            self.mode_config = workload_mixed
        elif workload_realistic != None:
            self.mode = "Realistic"
            self.mode_config = workload_realistic
        else:
            self.mode = "Isolated"
            self.mode_config = "Isolated run does not have a config."

        self.time_dependent_execution = time_dependent_execution
        self.performance_tracking = performance_tracking
        self.warm_up = warm_up
        self.no_authorization = no_authorization
        self.customer_workloads = customer_workloads
        self.vendor_args = vendor_args
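For orientation, a minimal sketch of how this context object might be constructed by the benchmark driver; the binary paths, the workload pattern, and the result file name below are hypothetical placeholders, not values taken from this PR:

```python
from benchmark_context import BenchmarkContext

# Hypothetical values for illustration only.
context = BenchmarkContext(
    benchmark_target_workload=["ldbc_interactive/sf1/*/*"],  # dataset/variant/group/query pattern
    vendor_binary="/path/to/memgraph",
    vendor_name="memgraph",
    client_binary="/path/to/bolt_client",
    num_workers_for_import=12,
    num_workers_for_benchmark=12,
    export_results="memgraph_ldbc_interactive_sf1_cold_isolated.json",
    temporary_directory="/tmp",
)

# Neither workload_mixed nor workload_realistic was passed, so the run is isolated.
print(context.mode)  # "Isolated"
```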
@@ -289,6 +289,7 @@ void ExecuteTimeDependentWorkload(
  // Synchronize workers and collect runtime.
  while (ready.load(std::memory_order_acq_rel) < FLAGS_num_workers)
    ;

  run.store(true);
  for (int i = 0; i < FLAGS_num_workers; ++i) {
    threads[i].join();
@@ -310,6 +311,7 @@ void ExecuteTimeDependentWorkload(
  final_duration /= FLAGS_num_workers;
  double execution_delta = time_limit.count() / final_duration;

  // Throughput adjusted for how much longer the workload execution actually took.
  double throughput = (total_iterations / final_duration) * execution_delta;
  double raw_throughput = total_iterations / final_duration;
@@ -319,7 +321,6 @@ void ExecuteTimeDependentWorkload(
  summary["duration"] = final_duration;
  summary["time_limit"] = FLAGS_time_dependent_execution;
  summary["queries_executed"] = total_iterations;

  summary["throughput"] = throughput;
  summary["raw_throughput"] = raw_throughput;
  summary["latency_stats"] = LatencyStatistics(worker_query_durations);
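The adjustment above scales raw throughput down when the workload overruns its time budget, since execution_delta = time_limit / final_duration. A small Python sketch of the same arithmetic, with made-up numbers:

```python
# Made-up numbers mirroring the arithmetic above: a 10 s budget that actually ran for 12.5 s.
time_limit = 10.0          # requested time-dependent execution window (seconds)
final_duration = 12.5      # measured duration averaged over workers (seconds)
total_iterations = 50_000  # queries executed in that window

raw_throughput = total_iterations / final_duration   # 4000.0 queries/s
execution_delta = time_limit / final_duration         # 0.8
throughput = raw_throughput * execution_delta          # 3200.0 queries/s, penalized for the overrun

print(raw_throughput, execution_delta, throughput)
```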
@@ -77,10 +77,10 @@ def compare_results(results_from, results_to, fields, ignored, different_vendors
                    recursive_get(summary_from, "database", key, value=None),
                    summary_to["database"][key],
                )
-            elif summary_to.get("query_statistics") != None and key in summary_to["query_statistics"]:
+            elif summary_to.get("latency_stats") != None and key in summary_to["latency_stats"]:
                row[key] = compute_diff(
-                    recursive_get(summary_from, "query_statistics", key, value=None),
-                    summary_to["query_statistics"][key],
+                    recursive_get(summary_from, "latency_stats", key, value=None),
+                    summary_to["latency_stats"][key],
                )
            elif not different_vendors:
                row[key] = compute_diff(
@@ -160,7 +160,10 @@ if __name__ == "__main__":
        help="Comparing different vendors, there is no need for metadata, duration, count check.",
    )
    parser.add_argument(
-        "--difference-threshold", type=float, help="Difference threshold for memory and throughput, 0.02 = 2% "
+        "--difference-threshold",
+        type=float,
+        default=0.02,
+        help="Difference threshold for memory and throughput, 0.02 = 2% ",
    )

    args = parser.parse_args()
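The new default of 0.02 means a 2% relative change is the smallest difference treated as significant. A rough sketch of how such a threshold could be applied when comparing two run summaries; the relative-difference formula here is an assumption for illustration, not the actual compute_diff implementation:

```python
# Illustrative numbers; the formula below is an assumed relative difference, not compute_diff itself.
difference_threshold = 0.02  # 2%

throughput_from = 10_250.0  # queries/s in the baseline run
throughput_to = 9_900.0     # queries/s in the run under comparison

relative_change = (throughput_to - throughput_from) / throughput_from  # about -0.034
if abs(relative_change) > difference_threshold:
    print("Significant change: {:+.1%}".format(relative_change))  # Significant change: -3.4%
```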
tests/mgbench/cypher/__init__.py (new file, 0 lines)
tests/mgbench/cypher/ldbc_to_cypher.py (new file, 500 lines)
@@ -0,0 +1,500 @@
import argparse
import csv
import sys
from collections import defaultdict
from pathlib import Path

import helpers

# Most recent list of LDBC datasets available at: https://github.com/ldbc/data-sets-surf-repository
INTERACTIVE_LINK = {
    "sf0.1": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf0.1.tar.zst",
    "sf0.3": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf0.3.tar.zst",
    "sf1": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf1.tar.zst",
    "sf3": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf3.tar.zst",
    "sf10": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf10.tar.zst",
}


BI_LINK = {
    "sf1": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf1-composite-projected-fk.tar.zst",
    "sf3": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf3-composite-projected-fk.tar.zst",
    "sf10": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf10-composite-projected-fk.tar.zst",
}


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        prog="LDBC CSV to CYPHERL converter",
        description="""Converts all LDBC CSV files to CYPHERL transactions, for faster Memgraph load""",
    )
    parser.add_argument(
        "--size",
        required=True,
        choices=["0.1", "0.3", "1", "3", "10"],
        help="Interactive: (0.1 , 0.3, 1, 3, 10) BI: (1, 3, 10)",
    )
    parser.add_argument("--type", required=True, choices=["interactive", "bi"], help="interactive or bi")

    args = parser.parse_args()
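A hypothetical invocation of this converter; the dataset size is a placeholder and the script itself downloads and unpacks the LDBC archive on demand, so wget, zstd and network access to the mirrors listed above are assumed:

```python
import subprocess

# Illustrative only: convert the interactive SF1 dataset.
subprocess.run(
    ["python3", "ldbc_to_cypher.py", "--size", "1", "--type", "interactive"],
    check=True,
)
# Per the code below, the result is written to ./.cache/LDBC_generated/ldbc_interactive_sf1.cypher
```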
    output_directory = Path().absolute() / ".cache" / "LDBC_generated"
    output_directory.mkdir(exist_ok=True)

    if args.type == "interactive":

NODES_INTERACTIVE = [
|
||||
{"filename": "Place", "label": "Place"},
|
||||
{"filename": "Organisation", "label": "Organisation"},
|
||||
{"filename": "TagClass", "label": "TagClass"},
|
||||
{"filename": "Tag", "label": "Tag"},
|
||||
{"filename": "Comment", "label": "Message:Comment"},
|
||||
{"filename": "Forum", "label": "Forum"},
|
||||
{"filename": "Person", "label": "Person"},
|
||||
{"filename": "Post", "label": "Message:Post"},
|
||||
]
|
||||
|
||||
EDGES_INTERACTIVE = [
|
||||
{
|
||||
"filename": "Place_isPartOf_Place",
|
||||
"source_label": "Place",
|
||||
"type": "IS_PART_OF",
|
||||
"target_label": "Place",
|
||||
},
|
||||
{
|
||||
"filename": "TagClass_isSubclassOf_TagClass",
|
||||
"source_label": "TagClass",
|
||||
"type": "IS_SUBCLASS_OF",
|
||||
"target_label": "TagClass",
|
||||
},
|
||||
{
|
||||
"filename": "Organisation_isLocatedIn_Place",
|
||||
"source_label": "Organisation",
|
||||
"type": "IS_LOCATED_IN",
|
||||
"target_label": "Place",
|
||||
},
|
||||
{"filename": "Tag_hasType_TagClass", "source_label": "Tag", "type": "HAS_TYPE", "target_label": "TagClass"},
|
||||
{
|
||||
"filename": "Comment_hasCreator_Person",
|
||||
"source_label": "Comment",
|
||||
"type": "HAS_CREATOR",
|
||||
"target_label": "Person",
|
||||
},
|
||||
{
|
||||
"filename": "Comment_isLocatedIn_Place",
|
||||
"source_label": "Comment",
|
||||
"type": "IS_LOCATED_IN",
|
||||
"target_label": "Place",
|
||||
},
|
||||
{
|
||||
"filename": "Comment_replyOf_Comment",
|
||||
"source_label": "Comment",
|
||||
"type": "REPLY_OF",
|
||||
"target_label": "Comment",
|
||||
},
|
||||
{"filename": "Comment_replyOf_Post", "source_label": "Comment", "type": "REPLY_OF", "target_label": "Post"},
|
||||
{
|
||||
"filename": "Forum_containerOf_Post",
|
||||
"source_label": "Forum",
|
||||
"type": "CONTAINER_OF",
|
||||
"target_label": "Post",
|
||||
},
|
||||
{
|
||||
"filename": "Forum_hasMember_Person",
|
||||
"source_label": "Forum",
|
||||
"type": "HAS_MEMBER",
|
||||
"target_label": "Person",
|
||||
},
|
||||
{
|
||||
"filename": "Forum_hasModerator_Person",
|
||||
"source_label": "Forum",
|
||||
"type": "HAS_MODERATOR",
|
||||
"target_label": "Person",
|
||||
},
|
||||
{"filename": "Forum_hasTag_Tag", "source_label": "Forum", "type": "HAS_TAG", "target_label": "Tag"},
|
||||
{
|
||||
"filename": "Person_hasInterest_Tag",
|
||||
"source_label": "Person",
|
||||
"type": "HAS_INTEREST",
|
||||
"target_label": "Tag",
|
||||
},
|
||||
{
|
||||
"filename": "Person_isLocatedIn_Place",
|
||||
"source_label": "Person",
|
||||
"type": "IS_LOCATED_IN",
|
||||
"target_label": "Place",
|
||||
},
|
||||
{"filename": "Person_knows_Person", "source_label": "Person", "type": "KNOWS", "target_label": "Person"},
|
||||
{"filename": "Person_likes_Comment", "source_label": "Person", "type": "LIKES", "target_label": "Comment"},
|
||||
{"filename": "Person_likes_Post", "source_label": "Person", "type": "LIKES", "target_label": "Post"},
|
||||
{
|
||||
"filename": "Post_hasCreator_Person",
|
||||
"source_label": "Post",
|
||||
"type": "HAS_CREATOR",
|
||||
"target_label": "Person",
|
||||
},
|
||||
{"filename": "Comment_hasTag_Tag", "source_label": "Comment", "type": "HAS_TAG", "target_label": "Tag"},
|
||||
{"filename": "Post_hasTag_Tag", "source_label": "Post", "type": "HAS_TAG", "target_label": "Tag"},
|
||||
{
|
||||
"filename": "Post_isLocatedIn_Place",
|
||||
"source_label": "Post",
|
||||
"type": "IS_LOCATED_IN",
|
||||
"target_label": "Place",
|
||||
},
|
||||
{
|
||||
"filename": "Person_studyAt_Organisation",
|
||||
"source_label": "Person",
|
||||
"type": "STUDY_AT",
|
||||
"target_label": "Organisation",
|
||||
},
|
||||
{
|
||||
"filename": "Person_workAt_Organisation",
|
||||
"source_label": "Person",
|
||||
"type": "WORK_AT",
|
||||
"target_label": "Organisation",
|
||||
},
|
||||
]

        file_size = "sf{}".format(args.size)
        out_file = "ldbc_interactive_{}.cypher".format(file_size)
        output = output_directory / out_file
        if output.exists():
            output.unlink()

        files_present = None
        for file in output_directory.glob("**/*.tar.zst"):
            if "basic-" + file_size in file.name:
                files_present = file.with_suffix("").with_suffix("")
                break

        if not files_present:
            try:
                print("Downloading the file... " + INTERACTIVE_LINK[file_size])
                downloaded_file = helpers.download_file(INTERACTIVE_LINK[file_size], output_directory.absolute())
                print("Unpacking the file..." + downloaded_file)
                files_present = helpers.unpack_tar_zst(Path(downloaded_file))
            except:
                print("Issue with downloading and unpacking the file, check if links are working properly.")
                raise

        input_files = {}
        for file in files_present.glob("**/*.csv"):
            name = file.name.replace("_0_0.csv", "").lower()
            input_files[name] = file

        for node_file in NODES_INTERACTIVE:
            key = node_file["filename"].lower()
            default_label = node_file["label"]
            query = None
            if key in input_files.keys():
                with input_files[key].open("r") as input_f, output.open("a") as output_f:
                    reader = csv.DictReader(input_f, delimiter="|")

                    for row in reader:
                        if "type" in row.keys():
                            label = default_label + ":" + row.pop("type").capitalize()
                        else:
                            label = default_label

                        query = "CREATE (:{} {{id:{}, ".format(label, row.pop("id"))
                        # Format properties to fit Memgraph
                        for k, v in row.items():
                            if k == "creationDate":
                                row[k] = 'localDateTime("{}")'.format(v[0:-5])
                            elif k == "birthday":
                                row[k] = 'date("{}")'.format(v)
                            elif k == "length":
                                row[k] = "toInteger({})".format(v)
                            else:
                                row[k] = '"{}"'.format(v)

                        prop_string = ", ".join("{} : {}".format(k, v) for k, v in row.items())
                        query = query + prop_string + "});"
                        output_f.write(query + "\n")
                print("Converted file: " + input_files[key].name + " to " + output.name)
            else:
                print("Didn't process node file: " + key)
                raise Exception("Didn't find the file that was needed!")

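To make the generated CYPHERL concrete, here is what one node line (from the loop above) and one edge line (from the loop that follows) could look like; the ids and property values are invented for illustration:

```python
# Hypothetical output lines (values invented), one per CSV row:
node_line = (
    'CREATE (:Person {id:933, firstName : "Mahinda", lastName : "Perera", '
    'birthday : date("1989-12-03"), creationDate : localDateTime("2010-02-14T15:32:10")});'
)
edge_line = (
    "MATCH (n1:Person {id:933}), (n2:Tag {id:61}) "
    "CREATE (n1)-[:HAS_INTEREST{}]->(n2);"
)
```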
        for edge_file in EDGES_INTERACTIVE:
            key = edge_file["filename"].lower()
            source_label = edge_file["source_label"]
            edge_type = edge_file["type"]
            target_label = edge_file["target_label"]
            if key in input_files.keys():
                query = None
                with input_files[key].open("r") as input_f, output.open("a") as output_f:
                    sufixl = ".id"
                    sufixr = ".id"
                    # Handle identical label/key in CSV header
                    if source_label == target_label:
                        sufixl = "l"
                        sufixr = "r"
                    # Consume the header row; explicit field names are supplied to the reader below
                    header = next(input_f).strip().split("|")
                    reader = csv.DictReader(
                        input_f, delimiter="|", fieldnames=([source_label + sufixl, target_label + sufixr] + header[2:])
                    )

                    for row in reader:
                        query = "MATCH (n1:{} {{id:{}}}), (n2:{} {{id:{}}}) ".format(
                            source_label, row.pop(source_label + sufixl), target_label, row.pop(target_label + sufixr)
                        )
                        for k, v in row.items():
                            if "date" in k.lower():
                                # Take time zone out
                                row[k] = 'localDateTime("{}")'.format(v[0:-5])
                            elif "workfrom" in k.lower() or "classyear" in k.lower():
                                row[k] = 'toInteger("{}")'.format(v)
                            else:
                                row[k] = '"{}"'.format(v)

                        edge_part = "CREATE (n1)-[:{}{{".format(edge_type)
                        prop_string = ", ".join("{} : {}".format(k, v) for k, v in row.items())

                        query = query + edge_part + prop_string + "}]->(n2);"
                        output_f.write(query + "\n")
                print("Converted file: " + input_files[key].name + " to " + output.name)
            else:
                print("Didn't process Edge file: " + key)
                raise Exception("Didn't find the file that was needed!")

elif args.type == "bi":
|
||||
|
||||
NODES_BI = [
|
||||
{"filename": "Place", "label": "Place"},
|
||||
{"filename": "Organisation", "label": "Organisation"},
|
||||
{"filename": "TagClass", "label": "TagClass"},
|
||||
{"filename": "Tag", "label": "Tag"},
|
||||
{"filename": "Comment", "label": "Message:Comment"},
|
||||
{"filename": "Forum", "label": "Forum"},
|
||||
{"filename": "Person", "label": "Person"},
|
||||
{"filename": "Post", "label": "Message:Post"},
|
||||
]
|
||||
|
||||
EDGES_BI = [
|
||||
{
|
||||
"filename": "Place_isPartOf_Place",
|
||||
"source_label": "Place",
|
||||
"type": "IS_PART_OF",
|
||||
"target_label": "Place",
|
||||
},
|
||||
{
|
||||
"filename": "TagClass_isSubclassOf_TagClass",
|
||||
"source_label": "TagClass",
|
||||
"type": "IS_SUBCLASS_OF",
|
||||
"target_label": "TagClass",
|
||||
},
|
||||
{
|
||||
"filename": "Organisation_isLocatedIn_Place",
|
||||
"source_label": "Organisation",
|
||||
"type": "IS_LOCATED_IN",
|
||||
"target_label": "Place",
|
||||
},
|
||||
{"filename": "Tag_hasType_TagClass", "source_label": "Tag", "type": "HAS_TYPE", "target_label": "TagClass"},
|
||||
{
|
||||
"filename": "Comment_hasCreator_Person",
|
||||
"source_label": "Comment",
|
||||
"type": "HAS_CREATOR",
|
||||
"target_label": "Person",
|
||||
},
|
||||
# Change place to Country
|
||||
{
|
||||
"filename": "Comment_isLocatedIn_Country",
|
||||
"source_label": "Comment",
|
||||
"type": "IS_LOCATED_IN",
|
||||
"target_label": "Country",
|
||||
},
|
||||
{
|
||||
"filename": "Comment_replyOf_Comment",
|
||||
"source_label": "Comment",
|
||||
"type": "REPLY_OF",
|
||||
"target_label": "Comment",
|
||||
},
|
||||
{"filename": "Comment_replyOf_Post", "source_label": "Comment", "type": "REPLY_OF", "target_label": "Post"},
|
||||
{
|
||||
"filename": "Forum_containerOf_Post",
|
||||
"source_label": "Forum",
|
||||
"type": "CONTAINER_OF",
|
||||
"target_label": "Post",
|
||||
},
|
||||
{
|
||||
"filename": "Forum_hasMember_Person",
|
||||
"source_label": "Forum",
|
||||
"type": "HAS_MEMBER",
|
||||
"target_label": "Person",
|
||||
},
|
||||
{
|
||||
"filename": "Forum_hasModerator_Person",
|
||||
"source_label": "Forum",
|
||||
"type": "HAS_MODERATOR",
|
||||
"target_label": "Person",
|
||||
},
|
||||
{"filename": "Forum_hasTag_Tag", "source_label": "Forum", "type": "HAS_TAG", "target_label": "Tag"},
|
||||
{
|
||||
"filename": "Person_hasInterest_Tag",
|
||||
"source_label": "Person",
|
||||
"type": "HAS_INTEREST",
|
||||
"target_label": "Tag",
|
||||
},
|
||||
# Changed place to City
|
||||
{
|
||||
"filename": "Person_isLocatedIn_City",
|
||||
"source_label": "Person",
|
||||
"type": "IS_LOCATED_IN",
|
||||
"target_label": "City",
|
||||
},
|
||||
{"filename": "Person_knows_Person", "source_label": "Person", "type": "KNOWS", "target_label": "Person"},
|
||||
{"filename": "Person_likes_Comment", "source_label": "Person", "type": "LIKES", "target_label": "Comment"},
|
||||
{"filename": "Person_likes_Post", "source_label": "Person", "type": "LIKES", "target_label": "Post"},
|
||||
{
|
||||
"filename": "Post_hasCreator_Person",
|
||||
"source_label": "Post",
|
||||
"type": "HAS_CREATOR",
|
||||
"target_label": "Person",
|
||||
},
|
||||
{"filename": "Comment_hasTag_Tag", "source_label": "Comment", "type": "HAS_TAG", "target_label": "Tag"},
|
||||
{"filename": "Post_hasTag_Tag", "source_label": "Post", "type": "HAS_TAG", "target_label": "Tag"},
|
||||
# Change place to Country
|
||||
{
|
||||
"filename": "Post_isLocatedIn_Country",
|
||||
"source_label": "Post",
|
||||
"type": "IS_LOCATED_IN",
|
||||
"target_label": "Country",
|
||||
},
|
||||
# Changed organisation to University
|
||||
{
|
||||
"filename": "Person_studyAt_University",
|
||||
"source_label": "Person",
|
||||
"type": "STUDY_AT",
|
||||
"target_label": "University",
|
||||
},
|
||||
# Changed organisation to Company
|
||||
{
|
||||
"filename": "Person_workAt_Company",
|
||||
"source_label": "Person",
|
||||
"type": "WORK_AT",
|
||||
"target_label": "Company",
|
||||
},
|
||||
]
|
||||
|
||||
file_size = "sf{}".format(args.size)
|
||||
out_file = "ldbc_bi_{}.cypher".format(file_size)
|
||||
output = output_directory / out_file
|
||||
if output.exists():
|
||||
output.unlink()
|
||||
|
||||
files_present = None
|
||||
for file in output_directory.glob("**/*.tar.zst"):
|
||||
if "bi-" + file_size in file.name:
|
||||
files_present = file.with_suffix("").with_suffix("")
|
||||
break
|
||||
|
||||
if not files_present:
|
||||
try:
|
||||
print("Downloading the file... " + BI_LINK[file_size])
|
||||
downloaded_file = helpers.download_file(BI_LINK[file_size], output_directory.absolute())
|
||||
print("Unpacking the file..." + downloaded_file)
|
||||
files_present = helpers.unpack_tar_zst(Path(downloaded_file))
|
||||
except:
|
||||
print("Issue with downloading and unpacking the file, check if links are working properly.")
|
||||
raise
|
||||
|
||||
for file in files_present.glob("**/*.csv.gz"):
|
||||
if "initial_snapshot" in file.parts:
|
||||
helpers.unpack_gz(file)
|
||||
|
||||
input_files = defaultdict(list)
|
||||
for file in files_present.glob("**/*.csv"):
|
||||
key = file.parents[0].name
|
||||
input_files[file.parents[0].name].append(file)
|
||||
|
||||
for node_file in NODES_BI:
|
||||
key = node_file["filename"]
|
||||
default_label = node_file["label"]
|
||||
query = None
|
||||
if key in input_files.keys():
|
||||
for part_file in input_files[key]:
|
||||
with part_file.open("r") as input_f, output.open("a") as output_f:
|
||||
reader = csv.DictReader(input_f, delimiter="|")
|
||||
|
||||
for row in reader:
|
||||
if "type" in row.keys():
|
||||
label = default_label + ":" + row.pop("type")
|
||||
else:
|
||||
label = default_label
|
||||
|
||||
query = "CREATE (:{} {{id:{}, ".format(label, row.pop("id"))
|
||||
# Format properties to fit Memgraph
|
||||
for k, v in row.items():
|
||||
if k == "creationDate":
|
||||
row[k] = 'localDateTime("{}")'.format(v[0:-6])
|
||||
elif k == "birthday":
|
||||
row[k] = 'date("{}")'.format(v)
|
||||
elif k == "length":
|
||||
row[k] = "toInteger({})".format(v)
|
||||
else:
|
||||
row[k] = '"{}"'.format(v)
|
||||
|
||||
prop_string = ", ".join("{} : {}".format(k, v) for k, v in row.items())
|
||||
query = query + prop_string + "});"
|
||||
output_f.write(query + "\n")
|
||||
print("Key: " + key + " Converted file: " + part_file.name + " to " + output.name)
|
||||
else:
|
||||
print("Didn't process node file: " + key)
|
||||
|
||||
for edge_file in EDGES_BI:
|
||||
key = edge_file["filename"]
|
||||
source_label = edge_file["source_label"]
|
||||
edge_type = edge_file["type"]
|
||||
target_label = edge_file["target_label"]
|
||||
if key in input_files.keys():
|
||||
for part_file in input_files[key]:
|
||||
query = None
|
||||
with part_file.open("r") as input_f, output.open("a") as output_f:
|
||||
sufixl = "Id"
|
||||
sufixr = "Id"
|
||||
# Handle identical label/key in CSV header
|
||||
if source_label == target_label:
|
||||
sufixl = "l"
|
||||
sufixr = "r"
|
||||
# Move a place from header
|
||||
header = next(input_f).strip().split("|")
|
||||
if len(header) >= 3:
|
||||
reader = csv.DictReader(
|
||||
input_f,
|
||||
delimiter="|",
|
||||
fieldnames=(["date", source_label + sufixl, target_label + sufixr] + header[3:]),
|
||||
)
|
||||
else:
|
||||
reader = csv.DictReader(
|
||||
input_f,
|
||||
delimiter="|",
|
||||
fieldnames=([source_label + sufixl, target_label + sufixr] + header[2:]),
|
||||
)
|
||||
|
||||
for row in reader:
|
||||
query = "MATCH (n1:{} {{id:{}}}), (n2:{} {{id:{}}}) ".format(
|
||||
source_label,
|
||||
row.pop(source_label + sufixl),
|
||||
target_label,
|
||||
row.pop(target_label + sufixr),
|
||||
)
|
||||
for k, v in row.items():
|
||||
if "date" in k.lower():
|
||||
# Take time zone out
|
||||
row[k] = 'localDateTime("{}")'.format(v[0:-6])
|
||||
elif k == "classYear" or k == "workFrom":
|
||||
row[k] = 'toInteger("{}")'.format(v)
|
||||
else:
|
||||
row[k] = '"{}"'.format(v)
|
||||
|
||||
edge_part = "CREATE (n1)-[:{}{{".format(edge_type)
|
||||
prop_string = ", ".join("{} : {}".format(k, v) for k, v in row.items())
|
||||
|
||||
query = query + edge_part + prop_string + "}]->(n2);"
|
||||
output_f.write(query + "\n")
|
||||
print("Key: " + key + " Converted file: " + part_file.name + " to " + output.name)
|
||||
else:
|
||||
print("Didn't process Edge file: " + key)
|
||||
raise Exception("Didn't find the file that was needed!")
|
@@ -16,14 +16,20 @@ def parse_arguments():
        help="Forward name and paths to vendors binary"
        "Example: --vendor memgraph /path/to/binary --vendor neo4j /path/to/binary",
    )

    parser.add_argument(
-        "--dataset-size",
-        default="small",
-        choices=["small", "medium", "large"],
-        help="Pick a dataset size (small, medium, large)",
+        "--dataset-name",
+        default="",
+        help="Dataset name you wish to execute",
    )

-    parser.add_argument("--dataset-group", default="basic", help="Select a group of queries")
+    parser.add_argument(
+        "--dataset-size",
+        default="",
+        help="Pick a dataset variant you wish to execute",
+    )
+
+    parser.add_argument("--dataset-group", default="", help="Select a group of queries")

    parser.add_argument(
        "--realistic",
@ -53,88 +59,110 @@ def parse_arguments():
|
||||
return args
|
||||
|
||||
|
||||
def run_full_benchmarks(vendor, binary, dataset_size, dataset_group, realistic, mixed):
|
||||
def run_full_benchmarks(vendor, binary, dataset, dataset_size, dataset_group, realistic, mixed):
|
||||
|
||||
configurations = [
|
||||
# Basic full group test cold
|
||||
# Basic isolated test cold
|
||||
[
|
||||
"--export-results",
|
||||
vendor + "_" + dataset_size + "_cold_isolated.json",
|
||||
vendor + "_" + dataset + "_" + dataset_size + "_cold_isolated.json",
|
||||
],
|
||||
# Basic full group test hot
|
||||
# Basic isolated test hot
|
||||
[
|
||||
"--export-results",
|
||||
vendor + "_" + dataset_size + "_hot_isolated.json",
|
||||
"--warmup-run",
|
||||
vendor + "_" + dataset + "_" + dataset_size + "_hot_isolated.json",
|
||||
"--warm-up",
|
||||
"hot",
|
||||
],
|
||||
# Basic isolated test vulcanic
|
||||
[
|
||||
"--export-results",
|
||||
vendor + "_" + dataset + "_" + dataset_size + "_vulcanic_isolated.json",
|
||||
"--warm-up",
|
||||
"vulcanic",
|
||||
],
|
||||
]
|
||||
|
||||
# Configurations for full workload
|
||||
for count, write, read, update, analytical in realistic:
|
||||
cold = [
|
||||
"--export-results",
|
||||
vendor
|
||||
+ "_"
|
||||
+ dataset_size
|
||||
+ "_cold_realistic_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical),
|
||||
"--mixed-workload",
|
||||
count,
|
||||
write,
|
||||
read,
|
||||
update,
|
||||
analytical,
|
||||
]
|
||||
if realistic:
|
||||
# Configurations for full workload
|
||||
for count, write, read, update, analytical in realistic:
|
||||
cold = [
|
||||
"--export-results",
|
||||
vendor
|
||||
+ "_"
|
||||
+ dataset
|
||||
+ "_"
|
||||
+ dataset_size
|
||||
+ "_cold_realistic_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical),
|
||||
"--workload-realistic",
|
||||
count,
|
||||
write,
|
||||
read,
|
||||
update,
|
||||
analytical,
|
||||
]
|
||||
|
||||
hot = [
|
||||
"--export-results",
|
||||
vendor
|
||||
+ "_"
|
||||
+ dataset_size
|
||||
+ "_hot_realistic_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical),
|
||||
"--warmup-run",
|
||||
"--mixed-workload",
|
||||
count,
|
||||
write,
|
||||
read,
|
||||
update,
|
||||
analytical,
|
||||
]
|
||||
configurations.append(cold)
|
||||
configurations.append(hot)
|
||||
hot = [
|
||||
"--export-results",
|
||||
vendor
|
||||
+ "_"
|
||||
+ dataset
|
||||
+ "_"
|
||||
+ dataset_size
|
||||
+ "_hot_realistic_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical),
|
||||
"--warm-up",
|
||||
"hot",
|
||||
"--workload-realistic",
|
||||
count,
|
||||
write,
|
||||
read,
|
||||
update,
|
||||
analytical,
|
||||
]
|
||||
|
||||
# Configurations for workload per query
|
||||
for count, write, read, update, analytical, query in mixed:
|
||||
cold = [
|
||||
"--export-results",
|
||||
vendor
|
||||
+ "_"
|
||||
+ dataset_size
|
||||
+ "_cold_mixed_{}_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical, query),
|
||||
"--mixed-workload",
|
||||
count,
|
||||
write,
|
||||
read,
|
||||
update,
|
||||
analytical,
|
||||
query,
|
||||
]
|
||||
hot = [
|
||||
"--export-results",
|
||||
vendor
|
||||
+ "_"
|
||||
+ dataset_size
|
||||
+ "_hot_mixed_{}_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical, query),
|
||||
"--warmup-run",
|
||||
"--mixed-workload",
|
||||
count,
|
||||
write,
|
||||
read,
|
||||
update,
|
||||
analytical,
|
||||
query,
|
||||
]
|
||||
configurations.append(cold)
|
||||
configurations.append(hot)
|
||||
configurations.append(cold)
|
||||
configurations.append(hot)
|
||||
|
||||
if mixed:
|
||||
# Configurations for workload per query
|
||||
for count, write, read, update, analytical, query in mixed:
|
||||
cold = [
|
||||
"--export-results",
|
||||
vendor
|
||||
+ "_"
|
||||
+ dataset
|
||||
+ "_"
|
||||
+ dataset_size
|
||||
+ "_cold_mixed_{}_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical, query),
|
||||
"--workload-mixed",
|
||||
count,
|
||||
write,
|
||||
read,
|
||||
update,
|
||||
analytical,
|
||||
query,
|
||||
]
|
||||
hot = [
|
||||
"--export-results",
|
||||
vendor
|
||||
+ "_"
|
||||
+ dataset
|
||||
+ "_"
|
||||
+ dataset_size
|
||||
+ "_hot_mixed_{}_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical, query),
|
||||
"--warm-up",
|
||||
"hot",
|
||||
"--workload-mixed",
|
||||
count,
|
||||
write,
|
||||
read,
|
||||
update,
|
||||
analytical,
|
||||
query,
|
||||
]
|
||||
|
||||
configurations.append(cold)
|
||||
configurations.append(hot)
|
||||
|
||||
default_args = [
|
||||
"python3",
|
||||
@ -146,9 +174,7 @@ def run_full_benchmarks(vendor, binary, dataset_size, dataset_group, realistic,
|
||||
"--num-workers-for-benchmark",
|
||||
"12",
|
||||
"--no-authorization",
|
||||
"pokec/" + dataset_size + "/" + dataset_group + "/*",
|
||||
"--tail-latency",
|
||||
"100",
|
||||
dataset + "/" + dataset_size + "/" + dataset_group + "/*",
|
||||
]
|
||||
|
||||
for config in configurations:
|
||||
@ -157,11 +183,11 @@ def run_full_benchmarks(vendor, binary, dataset_size, dataset_group, realistic,
|
||||
subprocess.run(args=full_config, check=True)
|
||||
|
||||
|
||||
def collect_all_results(vendor_name, dataset_size, dataset_group):
|
||||
def collect_all_results(vendor_name, dataset, dataset_size, dataset_group):
|
||||
working_directory = Path().absolute()
|
||||
print(working_directory)
|
||||
results = sorted(working_directory.glob(vendor_name + "_" + dataset_size + "_*.json"))
|
||||
summary = {"pokec": {dataset_size: {dataset_group: {}}}}
|
||||
results = sorted(working_directory.glob(vendor_name + "_" + dataset + "_" + dataset_size + "_*.json"))
|
||||
summary = {dataset: {dataset_size: {dataset_group: {}}}}
|
||||
|
||||
for file in results:
|
||||
if "summary" in file.name:
|
||||
@ -169,19 +195,22 @@ def collect_all_results(vendor_name, dataset_size, dataset_group):
|
||||
f = file.open()
|
||||
data = json.loads(f.read())
|
||||
if data["__run_configuration__"]["condition"] == "hot":
|
||||
for key, value in data["pokec"][dataset_size][dataset_group].items():
|
||||
for key, value in data[dataset][dataset_size][dataset_group].items():
|
||||
key_condition = key + "_hot"
|
||||
summary["pokec"][dataset_size][dataset_group][key_condition] = value
|
||||
summary[dataset][dataset_size][dataset_group][key_condition] = value
|
||||
elif data["__run_configuration__"]["condition"] == "cold":
|
||||
for key, value in data["pokec"][dataset_size][dataset_group].items():
|
||||
for key, value in data[dataset][dataset_size][dataset_group].items():
|
||||
key_condition = key + "_cold"
|
||||
summary["pokec"][dataset_size][dataset_group][key_condition] = value
|
||||
|
||||
summary[dataset][dataset_size][dataset_group][key_condition] = value
|
||||
elif data["__run_configuration__"]["condition"] == "vulcanic":
|
||||
for key, value in data[dataset][dataset_size][dataset_group].items():
|
||||
key_condition = key + "_vulcanic"
|
||||
summary[dataset][dataset_size][dataset_group][key_condition] = value
|
||||
print(summary)
|
||||
|
||||
json_object = json.dumps(summary, indent=4)
|
||||
print(json_object)
|
||||
with open(vendor_name + "_" + dataset_size + "_summary.json", "w") as f:
|
||||
with open(vendor_name + "_" + dataset + "_" + dataset_size + "_summary.json", "w") as f:
|
||||
json.dump(summary, f)
|
||||
|
||||
|
||||
@@ -194,16 +223,17 @@ if __name__ == "__main__":
    vendor_names = {"memgraph", "neo4j"}
    for vendor_name, vendor_binary in args.vendor:
        path = Path(vendor_binary)
-        if vendor_name.lower() in vendor_names and (path.is_file() or path.is_dir()):
+        if vendor_name.lower() in vendor_names and path.is_file():
            run_full_benchmarks(
                vendor_name,
                vendor_binary,
+                args.dataset_name,
                args.dataset_size,
                args.dataset_group,
                realistic,
                mixed,
            )
-            collect_all_results(vendor_name, args.dataset_size, args.dataset_group)
+            collect_all_results(vendor_name, args.dataset_name, args.dataset_size, args.dataset_group)
        else:
            raise Exception(
                "Check that vendor: {} is supported and you are passing right path: {} to binary.".format(

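For context, a hypothetical invocation of this wrapper after the change; the binary path and the dataset/variant/group values below are placeholders, not values taken from this PR:

```python
import subprocess

# Illustrative only: run the full benchmark matrix for one vendor and one named dataset.
subprocess.run(
    [
        "python3", "graph_bench.py",
        "--vendor", "memgraph", "/path/to/memgraph",
        "--dataset-name", "ldbc_interactive",
        "--dataset-size", "sf1",
        "--dataset-group", "interactive",
    ],
    check=True,
)
```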
@@ -1,4 +1,4 @@
-# Copyright 2021 Memgraph Ltd.
+# Copyright 2023 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@@ -9,11 +9,21 @@
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.

import collections
import copy
import fnmatch
import importlib
import inspect
import json
import os
import subprocess
import sys
from pathlib import Path

import workloads
from benchmark_context import BenchmarkContext
from workloads import *
from workloads import base

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

@@ -28,22 +38,70 @@ def get_binary_path(path, base=""):


def download_file(url, path):
-    ret = subprocess.run(["wget", "-nv", "--content-disposition", url],
-                         stderr=subprocess.PIPE, cwd=path, check=True)
+    ret = subprocess.run(["wget", "-nv", "--content-disposition", url], stderr=subprocess.PIPE, cwd=path, check=True)
    data = ret.stderr.decode("utf-8")
    tmp = data.split("->")[1]
-    name = tmp[tmp.index('"') + 1:tmp.rindex('"')]
+    name = tmp[tmp.index('"') + 1 : tmp.rindex('"')]
    return os.path.join(path, name)


-def unpack_and_move_file(input_path, output_path):
+def unpack_gz_and_move_file(input_path, output_path):
    if input_path.endswith(".gz"):
-        subprocess.run(["gunzip", input_path],
-                       stdout=subprocess.DEVNULL, check=True)
+        subprocess.run(["gunzip", input_path], stdout=subprocess.DEVNULL, check=True)
        input_path = input_path[:-3]
    os.rename(input_path, output_path)


def unpack_gz(input_path: Path):
    if input_path.suffix == ".gz":
        subprocess.run(["gzip", "-d", input_path], capture_output=True, check=True)
        input_path = input_path.with_suffix("")
    return input_path


def unpack_zip(input_path: Path):
    if input_path.suffix == ".zip":
        subprocess.run(["unzip", input_path], capture_output=True, check=True, cwd=input_path.parent)
        input_path = input_path.with_suffix("")
    return input_path


def unpack_tar_zst(input_path: Path):
    if input_path.suffix == ".zst":
        subprocess.run(
            ["tar", "--use-compress-program=unzstd", "-xvf", input_path],
            cwd=input_path.parent,
            capture_output=True,
            check=True,
        )
        input_path = input_path.with_suffix("").with_suffix("")
    return input_path


def unpack_tar_gz(input_path: Path):
    if input_path.suffix == ".gz":
        subprocess.run(
            ["tar", "-xvf", input_path],
            cwd=input_path.parent,
            capture_output=True,
            check=True,
        )
        input_path = input_path.with_suffix("").with_suffix("")
    return input_path


def unpack_tar_zst_and_move(input_path: Path, output_path: Path):
    if input_path.suffix == ".zst":
        subprocess.run(
            ["tar", "--use-compress-program=unzstd", "-xvf", input_path],
            cwd=input_path.parent,
            capture_output=True,
            check=True,
        )
        input_path = input_path.with_suffix("").with_suffix("")
    return input_path.rename(output_path)
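As a quick illustration of how the LDBC converter above consumes these helpers, a hypothetical download-and-unpack sequence; the URL and paths are placeholders, and wget plus zstd are assumed to be on PATH:

```python
from pathlib import Path

import helpers  # assumes tests/mgbench is on sys.path

# Hypothetical archive; download_file returns the path reported by wget.
archive = Path(helpers.download_file("https://example.com/social_network-csv_basic-sf1.tar.zst", ".cache"))

# unpack_tar_zst shells out to `tar --use-compress-program=unzstd` and returns the
# path with both suffixes (.tar.zst) stripped, i.e. the extracted directory name.
extracted = helpers.unpack_tar_zst(archive)
print(extracted)  # .cache/social_network-csv_basic-sf1
```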


def ensure_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)
@ -51,6 +109,129 @@ def ensure_directory(path):
|
||||
raise Exception("The path '{}' should be a directory!".format(path))
|
||||
|
||||
|
||||
def get_available_workloads(customer_workloads: str = None) -> dict:
|
||||
generators = {}
|
||||
for module in map(workloads.__dict__.get, workloads.__all__):
|
||||
for key in dir(module):
|
||||
if key.startswith("_"):
|
||||
continue
|
||||
base_class = getattr(module, key)
|
||||
if not inspect.isclass(base_class) or not issubclass(base_class, base.Workload):
|
||||
continue
|
||||
queries = collections.defaultdict(list)
|
||||
for funcname in dir(base_class):
|
||||
if not funcname.startswith("benchmark__"):
|
||||
continue
|
||||
group, query = funcname.split("__")[1:]
|
||||
queries[group].append((query, funcname))
|
||||
generators[base_class.NAME] = (base_class, dict(queries))
|
||||
|
||||
if customer_workloads:
|
||||
head_tail = os.path.split(customer_workloads)
|
||||
path_without_dataset_name = head_tail[0]
|
||||
dataset_name = head_tail[1].split(".")[0]
|
||||
sys.path.append(path_without_dataset_name)
|
||||
dataset_to_use = importlib.import_module(dataset_name)
|
||||
|
||||
for key in dir(dataset_to_use):
|
||||
if key.startswith("_"):
|
||||
continue
|
||||
base_class = getattr(dataset_to_use, key)
|
||||
if not inspect.isclass(base_class) or not issubclass(base_class, base.Workload):
|
||||
continue
|
||||
queries = collections.defaultdict(list)
|
||||
for funcname in dir(base_class):
|
||||
if not funcname.startswith("benchmark__"):
|
||||
continue
|
||||
group, query = funcname.split("__")[1:]
|
||||
queries[group].append((query, funcname))
|
||||
generators[base_class.NAME] = (base_class, dict(queries))
|
||||
|
||||
return generators
|
||||
|
||||
|
||||
def list_available_workloads(customer_workloads: str = None):
|
||||
generators = get_available_workloads(customer_workloads)
|
||||
for name in sorted(generators.keys()):
|
||||
print("Dataset:", name)
|
||||
dataset, queries = generators[name]
|
||||
print(
|
||||
" Variants:",
|
||||
", ".join(dataset.VARIANTS),
|
||||
"(default: " + dataset.DEFAULT_VARIANT + ")",
|
||||
)
|
||||
for group in sorted(queries.keys()):
|
||||
print(" Group:", group)
|
||||
for query_name, query_func in queries[group]:
|
||||
print(" Query:", query_name)
|
||||
|
||||
|
||||
def match_patterns(workload, variant, group, query, is_default_variant, patterns):
|
||||
for pattern in patterns:
|
||||
verdict = [fnmatch.fnmatchcase(workload, pattern[0])]
|
||||
if pattern[1] != "":
|
||||
verdict.append(fnmatch.fnmatchcase(variant, pattern[1]))
|
||||
else:
|
||||
verdict.append(is_default_variant)
|
||||
verdict.append(fnmatch.fnmatchcase(group, pattern[2]))
|
||||
verdict.append(fnmatch.fnmatchcase(query, pattern[3]))
|
||||
if all(verdict):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def filter_workloads(available_workloads: dict, benchmark_context: BenchmarkContext) -> list:
|
||||
patterns = benchmark_context.benchmark_target_workload
|
||||
for i in range(len(patterns)):
|
||||
pattern = patterns[i].split("/")
|
||||
if len(pattern) > 5 or len(pattern) == 0:
|
||||
raise Exception("Invalid benchmark description '" + pattern + "'!")
|
||||
pattern.extend(["", "*", "*"][len(pattern) - 1 :])
|
||||
patterns[i] = pattern
|
||||
filtered = []
|
||||
for workload in sorted(available_workloads.keys()):
|
||||
generator, queries = available_workloads[workload]
|
||||
for variant in generator.VARIANTS:
|
||||
is_default_variant = variant == generator.DEFAULT_VARIANT
|
||||
current = collections.defaultdict(list)
|
||||
for group in queries:
|
||||
for query_name, query_func in queries[group]:
|
||||
if match_patterns(
|
||||
workload,
|
||||
variant,
|
||||
group,
|
||||
query_name,
|
||||
is_default_variant,
|
||||
patterns,
|
||||
):
|
||||
current[group].append((query_name, query_func))
|
||||
if len(current) == 0:
|
||||
continue
|
||||
|
||||
# Ignore benchgraph "basic" queries in standard CI/CD run
|
||||
for pattern in patterns:
|
||||
res = pattern.count("*")
|
||||
key = "basic"
|
||||
if res >= 2 and key in current.keys():
|
||||
current.pop(key)
|
||||
|
||||
filtered.append((generator(variant=variant, benchmark_context=benchmark_context), dict(current)))
|
||||
return filtered
|
||||
|
||||
|
||||
def parse_kwargs(items):
|
||||
"""
|
||||
Parse a series of key-value pairs and return a dictionary
|
||||
"""
|
||||
d = {}
|
||||
|
||||
if items:
|
||||
for item in items:
|
||||
key, value = item.split("=")
|
||||
d[key] = value
|
||||
return d
|
||||
|
||||
|
||||
class Directory:
|
||||
def __init__(self, path):
|
||||
self._path = path
|
||||
@ -103,6 +284,9 @@ class Cache:
|
||||
ensure_directory(path)
|
||||
return Directory(path)
|
||||
|
||||
def get_default_cache_directory(self):
|
||||
return self._directory
|
||||
|
||||
def load_config(self):
|
||||
if not os.path.isfile(self._config):
|
||||
return RecursiveDict()
|
||||
|
@@ -9,6 +9,8 @@
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.

import logging

COLOR_GRAY = 0
COLOR_RED = 1
COLOR_GREEN = 2
@@ -16,27 +18,45 @@ COLOR_YELLOW = 3
COLOR_BLUE = 4
COLOR_VIOLET = 5
COLOR_CYAN = 6
COLOR_WHITE = 7


-def log(color, *args):
+logger = logging.Logger("mgbench_logger")
+file_handler = logging.FileHandler("mgbench_logs.log")
+file_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+file_handler.setFormatter(file_format)
+logger.addHandler(file_handler)
+
+
+def _log(color, *args):
    print("\033[1;3{}m~~".format(color), *args, "~~\033[0m")


+def log(msg):
+    print(msg)
+    logger.info(msg=msg)
+
+
def init(*args):
-    log(COLOR_BLUE, *args)
+    _log(COLOR_BLUE, *args)
+    logger.info(*args)


def info(*args):
-    log(COLOR_CYAN, *args)
+    _log(COLOR_WHITE, *args)
+    logger.info(*args)


def success(*args):
-    log(COLOR_GREEN, *args)
+    _log(COLOR_GREEN, *args)
+    logger.info(*args)


def warning(*args):
-    log(COLOR_YELLOW, *args)
+    _log(COLOR_YELLOW, *args)
+    logger.warning(*args)


def error(*args):
-    log(COLOR_RED, *args)
+    _log(COLOR_RED, *args)
+    logger.critical(*args)

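A short, hypothetical use of the reworked logging helpers above (the messages are invented); console output stays colored while everything is also appended to mgbench_logs.log:

```python
import log

log.init("Starting benchmark run")         # blue console banner + INFO record in mgbench_logs.log
log.warning("Dataset cache was empty")     # yellow console banner + WARNING record
log.log("plain, uncolored progress line")  # printed as-is and logged at INFO level
```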
@@ -1,4 +1,4 @@
-# Copyright 2022 Memgraph Ltd.
+# Copyright 2023 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@@ -17,10 +17,13 @@ import subprocess
import tempfile
import threading
import time
from abc import ABC, abstractmethod
from pathlib import Path

from benchmark_context import BenchmarkContext


-def wait_for_server(port, delay=0.1):
+def _wait_for_server(port, delay=0.1):
    cmd = ["nc", "-z", "-w", "1", "127.0.0.1", str(port)]
    while subprocess.call(cmd) != 0:
        time.sleep(0.01)

@ -62,50 +65,165 @@ def _get_current_usage(pid):
|
||||
return rss / 1024
|
||||
|
||||
|
||||
class Memgraph:
|
||||
def __init__(self, memgraph_binary, temporary_dir, properties_on_edges, bolt_port, performance_tracking):
|
||||
self._memgraph_binary = memgraph_binary
|
||||
self._directory = tempfile.TemporaryDirectory(dir=temporary_dir)
|
||||
self._properties_on_edges = properties_on_edges
|
||||
class BaseClient(ABC):
|
||||
@abstractmethod
|
||||
def __init__(self, benchmark_context: BenchmarkContext):
|
||||
self.benchmark_context = benchmark_context
|
||||
|
||||
@abstractmethod
|
||||
def execute(self):
|
||||
pass
|
||||
|
||||
|
||||
class BoltClient(BaseClient):
|
||||
def __init__(self, benchmark_context: BenchmarkContext):
|
||||
self._client_binary = benchmark_context.client_binary
|
||||
self._directory = tempfile.TemporaryDirectory(dir=benchmark_context.temporary_directory)
|
||||
self._username = ""
|
||||
self._password = ""
|
||||
self._bolt_port = (
|
||||
benchmark_context.vendor_args["bolt-port"] if "bolt-port" in benchmark_context.vendor_args.keys() else 7687
|
||||
)
|
||||
|
||||
def _get_args(self, **kwargs):
|
||||
return _convert_args_to_flags(self._client_binary, **kwargs)
|
||||
|
||||
def set_credentials(self, username: str, password: str):
|
||||
self._username = username
|
||||
self._password = password
|
||||
|
||||
def execute(
|
||||
self,
|
||||
queries=None,
|
||||
file_path=None,
|
||||
num_workers=1,
|
||||
max_retries: int = 50,
|
||||
validation: bool = False,
|
||||
time_dependent_execution: int = 0,
|
||||
):
|
||||
if (queries is None and file_path is None) or (queries is not None and file_path is not None):
|
||||
raise ValueError("Either queries or input_path must be specified!")
|
||||
|
||||
queries_json = False
|
||||
if queries is not None:
|
||||
queries_json = True
|
||||
file_path = os.path.join(self._directory.name, "queries.json")
|
||||
with open(file_path, "w") as f:
|
||||
for query in queries:
|
||||
json.dump(query, f)
|
||||
f.write("\n")
|
||||
args = self._get_args(
|
||||
input=file_path,
|
||||
num_workers=num_workers,
|
||||
max_retries=max_retries,
|
||||
queries_json=queries_json,
|
||||
username=self._username,
|
||||
password=self._password,
|
||||
port=self._bolt_port,
|
||||
validation=validation,
|
||||
time_dependent_execution=time_dependent_execution,
|
||||
)
|
||||
|
||||
ret = None
|
||||
try:
|
||||
ret = subprocess.run(args, capture_output=True)
|
||||
finally:
|
||||
error = ret.stderr.decode("utf-8").strip().split("\n")
|
||||
data = ret.stdout.decode("utf-8").strip().split("\n")
|
||||
if error and error[0] != "":
|
||||
print("Reported errros from client")
|
||||
print(error)
|
||||
data = [x for x in data if not x.startswith("[")]
|
||||
return list(map(json.loads, data))
|
||||
|
||||
|
||||
class BaseRunner(ABC):
|
||||
subclasses = {}
|
||||
|
||||
def __init_subclass__(cls, **kwargs) -> None:
|
||||
super().__init_subclass__(**kwargs)
|
||||
cls.subclasses[cls.__name__.lower()] = cls
|
||||
return
|
||||
|
||||
@classmethod
|
||||
def create(cls, benchmark_context: BenchmarkContext):
|
||||
if benchmark_context.vendor_name not in cls.subclasses:
|
||||
raise ValueError("Missing runner with name: {}".format(benchmark_context.vendor_name))
|
||||
|
||||
return cls.subclasses[benchmark_context.vendor_name](
|
||||
benchmark_context=benchmark_context,
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def __init__(self, benchmark_context: BenchmarkContext):
|
||||
self.benchmark_context = benchmark_context
|
||||
|
||||
@abstractmethod
|
||||
def start_benchmark(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def start_preparation(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def stop(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def clean_db(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def fetch_client(self) -> BaseClient:
|
||||
pass
|
||||
|
||||
|
||||
class Memgraph(BaseRunner):
|
||||
def __init__(self, benchmark_context: BenchmarkContext):
|
||||
super().__init__(benchmark_context=benchmark_context)
|
||||
self._memgraph_binary = benchmark_context.vendor_binary
|
||||
self._performance_tracking = benchmark_context.performance_tracking
|
||||
self._directory = tempfile.TemporaryDirectory(dir=benchmark_context.temporary_directory)
|
||||
self._vendor_args = benchmark_context.vendor_args
|
||||
self._properties_on_edges = (
|
||||
self._vendor_args["no-properties-on-edges"]
|
||||
if "no-properties-on-edges" in self._vendor_args.keys()
|
||||
else False
|
||||
)
|
||||
self._bolt_port = self._vendor_args["bolt-port"] if "bolt-port" in self._vendor_args.keys() else 7687
|
||||
self._proc_mg = None
|
||||
self._bolt_port = bolt_port
|
||||
self.performance_tracking = performance_tracking
|
||||
self._stop_event = threading.Event()
|
||||
self._rss = []
|
||||
atexit.register(self._cleanup)
|
||||
|
||||
# Determine Memgraph version
|
||||
ret = subprocess.run([memgraph_binary, "--version"], stdout=subprocess.PIPE, check=True)
|
||||
ret = subprocess.run([self._memgraph_binary, "--version"], stdout=subprocess.PIPE, check=True)
|
||||
version = re.search(r"[0-9]+\.[0-9]+\.[0-9]+", ret.stdout.decode("utf-8")).group(0)
|
||||
self._memgraph_version = tuple(map(int, version.split(".")))
|
||||
|
||||
atexit.register(self._cleanup)
|
||||
|
||||
def __del__(self):
|
||||
self._cleanup()
|
||||
atexit.unregister(self._cleanup)
|
||||
|
||||
def _get_args(self, **kwargs):
|
||||
def _set_args(self, **kwargs):
|
||||
data_directory = os.path.join(self._directory.name, "memgraph")
|
||||
kwargs["bolt_port"] = self._bolt_port
|
||||
if self._memgraph_version >= (0, 50, 0):
|
||||
kwargs["data_directory"] = data_directory
|
||||
else:
|
||||
kwargs["durability_directory"] = data_directory
|
||||
if self._memgraph_version >= (0, 50, 0):
|
||||
kwargs["storage_properties_on_edges"] = self._properties_on_edges
|
||||
else:
|
||||
assert self._properties_on_edges, "Older versions of Memgraph can't disable properties on edges!"
|
||||
kwargs["data_directory"] = data_directory
|
||||
kwargs["storage_properties_on_edges"] = self._properties_on_edges
|
||||
return _convert_args_to_flags(self._memgraph_binary, **kwargs)
|
||||
|
||||
def _start(self, **kwargs):
|
||||
if self._proc_mg is not None:
|
||||
raise Exception("The database process is already running!")
|
||||
args = self._get_args(**kwargs)
|
||||
args = self._set_args(**kwargs)
|
||||
self._proc_mg = subprocess.Popen(args, stdout=subprocess.DEVNULL)
|
||||
time.sleep(0.2)
|
||||
if self._proc_mg.poll() is not None:
|
||||
self._proc_mg = None
|
||||
raise Exception("The database process died prematurely!")
|
||||
wait_for_server(self._bolt_port)
|
||||
_wait_for_server(self._bolt_port)
|
||||
ret = self._proc_mg.poll()
|
||||
assert ret is None, "The database process died prematurely " "({})!".format(ret)
|
||||
|
||||
@ -119,7 +237,7 @@ class Memgraph:
|
||||
return ret, usage
|
||||
|
||||
def start_preparation(self, workload):
|
||||
if self.performance_tracking:
|
||||
if self._performance_tracking:
|
||||
p = threading.Thread(target=self.res_background_tracking, args=(self._rss, self._stop_event))
|
||||
self._stop_event.clear()
|
||||
self._rss.clear()
|
||||
@ -127,13 +245,26 @@ class Memgraph:
|
||||
self._start(storage_snapshot_on_exit=True)
|
||||
|
||||
def start_benchmark(self, workload):
|
||||
if self.performance_tracking:
|
||||
if self._performance_tracking:
|
||||
p = threading.Thread(target=self.res_background_tracking, args=(self._rss, self._stop_event))
|
||||
self._stop_event.clear()
|
||||
self._rss.clear()
|
||||
p.start()
|
||||
self._start(storage_recover_on_startup=True)
|
||||
|
||||
def clean_db(self):
|
||||
if self._proc_mg is not None:
|
||||
raise Exception("The database process is already running, cannot clear data it!")
|
||||
else:
|
||||
out = subprocess.run(
|
||||
args="rm -Rf memgraph/snapshots/*",
|
||||
cwd=self._directory.name,
|
||||
capture_output=True,
|
||||
shell=True,
|
||||
)
|
||||
print(out.stderr.decode("utf-8"))
|
||||
print(out.stdout.decode("utf-8"))
|
||||
|
||||
def res_background_tracking(self, res, stop_event):
|
||||
print("Started rss tracking.")
|
||||
while not stop_event.is_set():
|
||||
@ -154,35 +285,46 @@ class Memgraph:
|
||||
f.close()
|
||||
|
||||
def stop(self, workload):
|
||||
if self.performance_tracking:
|
||||
if self._performance_tracking:
|
||||
self._stop_event.set()
|
||||
self.dump_rss(workload)
|
||||
ret, usage = self._cleanup()
|
||||
assert ret == 0, "The database process exited with a non-zero " "status ({})!".format(ret)
|
||||
return usage
|
||||
|
||||
def fetch_client(self) -> BoltClient:
|
||||
return BoltClient(benchmark_context=self.benchmark_context)
|
||||
|
||||
class Neo4j:
|
||||
def __init__(self, neo4j_path, temporary_dir, bolt_port, performance_tracking):
|
||||
self._neo4j_path = Path(neo4j_path)
|
||||
self._neo4j_binary = Path(neo4j_path) / "bin" / "neo4j"
|
||||
self._neo4j_config = Path(neo4j_path) / "conf" / "neo4j.conf"
|
||||
self._neo4j_pid = Path(neo4j_path) / "run" / "neo4j.pid"
|
||||
self._neo4j_admin = Path(neo4j_path) / "bin" / "neo4j-admin"
|
||||
self.performance_tracking = performance_tracking
|
||||
|
||||
class Neo4j(BaseRunner):
|
||||
def __init__(self, benchmark_context: BenchmarkContext):
|
||||
super().__init__(benchmark_context=benchmark_context)
|
||||
self._neo4j_binary = Path(benchmark_context.vendor_binary)
|
||||
self._neo4j_path = Path(benchmark_context.vendor_binary).parents[1]
|
||||
self._neo4j_config = self._neo4j_path / "conf" / "neo4j.conf"
|
||||
self._neo4j_pid = self._neo4j_path / "run" / "neo4j.pid"
|
||||
self._neo4j_admin = self._neo4j_path / "bin" / "neo4j-admin"
|
||||
self._performance_tracking = benchmark_context.performance_tracking
|
||||
self._vendor_args = benchmark_context.vendor_args
|
||||
self._stop_event = threading.Event()
|
||||
self._rss = []
|
||||
|
||||
if not self._neo4j_binary.is_file():
|
||||
raise Exception("Wrong path to binary!")
|
||||
self._directory = tempfile.TemporaryDirectory(dir=temporary_dir)
|
||||
self._bolt_port = bolt_port
|
||||
|
||||
tempfile.TemporaryDirectory(dir=benchmark_context.temporary_directory)
|
||||
self._bolt_port = (
|
||||
self.benchmark_context.vendor_args["bolt-port"]
|
||||
if "bolt-port" in self.benchmark_context.vendor_args.keys()
|
||||
else 7687
|
||||
)
|
||||
atexit.register(self._cleanup)
|
||||
configs = []
|
||||
memory_flag = "server.jvm.additional=-XX:NativeMemoryTracking=detail"
|
||||
auth_flag = "dbms.security.auth_enabled=false"
|
||||
|
||||
if self.performance_tracking:
|
||||
bolt_flag = "server.bolt.listen_address=:7687"
|
||||
http_flag = "server.http.listen_address=:7474"
|
||||
if self._performance_tracking:
|
||||
configs.append(memory_flag)
|
||||
else:
|
||||
lines = []
|
||||
@ -201,6 +343,8 @@ class Neo4j:
                file.close()

        configs.append(auth_flag)
        configs.append(bolt_flag)
        configs.append(http_flag)
        print("Check neo4j config flags:")
        for conf in configs:
            with self._neo4j_config.open("r+") as file:
@ -234,7 +378,7 @@ class Neo4j:
        else:
            raise Exception("The database process died prematurely!")
        print("Run server check:")
        wait_for_server(self._bolt_port)
        _wait_for_server(self._bolt_port)

    def _cleanup(self):
        if self._neo4j_pid.exists():
@ -248,7 +392,7 @@ class Neo4j:
        return 0

    def start_preparation(self, workload):
        if self.performance_tracking:
        if self._performance_tracking:
            p = threading.Thread(target=self.res_background_tracking, args=(self._rss, self._stop_event))
            self._stop_event.clear()
            self._rss.clear()
@ -257,11 +401,11 @@ class Neo4j:
        # Start DB
        self._start()

        if self.performance_tracking:
        if self._performance_tracking:
            self.get_memory_usage("start_" + workload)

    def start_benchmark(self, workload):
        if self.performance_tracking:
        if self._performance_tracking:
            p = threading.Thread(target=self.res_background_tracking, args=(self._rss, self._stop_event))
            self._stop_event.clear()
            self._rss.clear()
@ -269,7 +413,7 @@ class Neo4j:
        # Start DB
        self._start()

        if self.performance_tracking:
        if self._performance_tracking:
            self.get_memory_usage("start_" + workload)

    def dump_db(self, path):
@ -290,6 +434,20 @@ class Neo4j:
            check=True,
        )

    def clean_db(self):
        print("Cleaning the database")
        if self._neo4j_pid.exists():
            raise Exception("Cannot clean DB because it is running.")
        else:
            out = subprocess.run(
                args="rm -Rf data/databases/* data/transactions/*",
                cwd=self._neo4j_path,
                capture_output=True,
                shell=True,
            )
            print(out.stderr.decode("utf-8"))
            print(out.stdout.decode("utf-8"))

    def load_db_from_dump(self, path):
        print("Loading the neo4j database from dump...")
        if self._neo4j_pid.exists():
@ -300,7 +458,8 @@ class Neo4j:
                    self._neo4j_admin,
                    "database",
                    "load",
                    "--from-path=" + path,
                    "--from-path",
                    path,
                    "--overwrite-destination=true",
                    "neo4j",
                ],
@ -325,7 +484,7 @@ class Neo4j:
        return True

    def stop(self, workload):
        if self.performance_tracking:
        if self._performance_tracking:
            self._stop_event.set()
            self.get_memory_usage("stop_" + workload)
            self.dump_rss(workload)
@ -360,51 +519,5 @@ class Neo4j:
            f.write(memory_usage.stdout)
            f.close()

class Client:
    def __init__(
        self, client_binary: str, temporary_directory: str, bolt_port: int, username: str = "", password: str = ""
    ):
        self._client_binary = client_binary
        self._directory = tempfile.TemporaryDirectory(dir=temporary_directory)
        self._username = username
        self._password = password
        self._bolt_port = bolt_port

    def _get_args(self, **kwargs):
        return _convert_args_to_flags(self._client_binary, **kwargs)

    def execute(self, queries=None, file_path=None, num_workers=1):
        if (queries is None and file_path is None) or (queries is not None and file_path is not None):
            raise ValueError("Either queries or file_path must be specified!")

        # TODO: check `file_path.endswith(".json")` to support advanced
        # input queries

        queries_json = False
        if queries is not None:
            queries_json = True
            file_path = os.path.join(self._directory.name, "queries.json")
            with open(file_path, "w") as f:
                for query in queries:
                    json.dump(query, f)
                    f.write("\n")

        args = self._get_args(
            input=file_path,
            num_workers=num_workers,
            queries_json=queries_json,
            username=self._username,
            password=self._password,
            port=self._bolt_port,
        )

        ret = subprocess.run(args, capture_output=True, check=True)
        error = ret.stderr.decode("utf-8").strip().split("\n")
        if error and error[0] != "":
            print("Reported errors from client")
            print(error)

        data = ret.stdout.decode("utf-8").strip().split("\n")
        data = [x for x in data if not x.startswith("[")]
        return list(map(json.loads, data))

    def fetch_client(self) -> BoltClient:
        return BoltClient(benchmark_context=self.benchmark_context)
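For reference, `execute(queries=...)` above serializes every (query, parameters) pair as one JSON value per line before handing the file to the benchmark client binary through the generated flags. A minimal sketch of producing an equivalent input file by hand (the file name and the query below are illustrative only, not part of the benchmark):

import json

queries = [("MATCH (n:User {id: $id}) RETURN n", {"id": 1})]
with open("queries.json", "w") as f:
    for query in queries:
        json.dump(query, f)  # each line becomes a JSON array: ["MATCH ...", {"id": 1}]
        f.write("\n")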
|
244
tests/mgbench/validation.py
Normal file
244
tests/mgbench/validation.py
Normal file
@ -0,0 +1,244 @@
|
||||
import argparse
|
||||
import copy
|
||||
import multiprocessing
|
||||
import random
|
||||
|
||||
import helpers
|
||||
import runners
|
||||
import workloads
|
||||
from benchmark_context import BenchmarkContext
|
||||
from workloads import base
|
||||
|
||||
|
||||
def parse_args():
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="Validator for individual query checking",
|
||||
description="""Validates that query is running, and validates output between different vendors""",
|
||||
)
|
||||
parser.add_argument(
|
||||
"benchmarks",
|
||||
nargs="*",
|
||||
default="",
|
||||
help="descriptions of benchmarks that should be run; "
|
||||
"multiple descriptions can be specified to run multiple "
|
||||
"benchmarks; the description is specified as "
|
||||
"dataset/variant/group/query; Unix shell-style wildcards "
|
||||
"can be used in the descriptions; variant, group and query "
|
||||
"are optional and they can be left out; the default "
|
||||
"variant is '' which selects the default dataset variant; "
|
||||
"the default group is '*' which selects all groups; the"
|
||||
"default query is '*' which selects all queries",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--vendor-binary-1",
|
||||
help="Vendor binary used for benchmarking, by default it is memgraph",
|
||||
default=helpers.get_binary_path("memgraph"),
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--vendor-name-1",
|
||||
default="memgraph",
|
||||
choices=["memgraph", "neo4j"],
|
||||
help="Input vendor binary name (memgraph, neo4j)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--vendor-binary-2",
|
||||
help="Vendor binary used for benchmarking, by default it is memgraph",
|
||||
default=helpers.get_binary_path("memgraph"),
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--vendor-name-2",
|
||||
default="memgraph",
|
||||
choices=["memgraph", "neo4j"],
|
||||
help="Input vendor binary name (memgraph, neo4j)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--client-binary",
|
||||
default=helpers.get_binary_path("tests/mgbench/client"),
|
||||
help="Client binary used for benchmarking",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--temporary-directory",
|
||||
default="/tmp",
|
||||
help="directory path where temporary data should " "be stored",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--num-workers-for-import",
|
||||
type=int,
|
||||
default=multiprocessing.cpu_count() // 2,
|
||||
help="number of workers used to import the dataset",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def get_queries(gen, count):
|
||||
# Make the generator deterministic.
|
||||
random.seed(gen.__name__)
|
||||
# Generate queries.
|
||||
ret = []
|
||||
for i in range(count):
|
||||
ret.append(gen())
|
||||
return ret
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
args = parse_args()
|
||||
|
||||
benchmark_context_db_1 = BenchmarkContext(
|
||||
vendor_name=args.vendor_name_1,
|
||||
vendor_binary=args.vendor_binary_1,
|
||||
benchmark_target_workload=copy.copy(args.benchmarks),
|
||||
client_binary=args.client_binary,
|
||||
num_workers_for_import=args.num_workers_for_import,
|
||||
temporary_directory=args.temporary_directory,
|
||||
)
|
||||
|
||||
available_workloads = helpers.get_available_workloads()
|
||||
|
||||
print(helpers.list_available_workloads())
|
||||
|
||||
vendor_runner = runners.BaseRunner.create(
|
||||
benchmark_context=benchmark_context_db_1,
|
||||
)
|
||||
|
||||
cache = helpers.Cache()
|
||||
client = vendor_runner.fetch_client()
|
||||
|
||||
workloads = helpers.filter_workloads(
|
||||
available_workloads=available_workloads, benchmark_context=benchmark_context_db_1
|
||||
)
|
||||
|
||||
results_db_1 = {}
|
||||
|
||||
for workload, queries in workloads:
|
||||
|
||||
vendor_runner.clean_db()
|
||||
|
||||
generated_queries = workload.dataset_generator()
|
||||
if generated_queries:
|
||||
vendor_runner.start_preparation("import")
|
||||
client.execute(queries=generated_queries, num_workers=benchmark_context_db_1.num_workers_for_import)
|
||||
vendor_runner.stop("import")
|
||||
else:
|
||||
workload.prepare(cache.cache_directory("datasets", workload.NAME, workload.get_variant()))
|
||||
imported = workload.custom_import()
|
||||
if not imported:
|
||||
vendor_runner.start_preparation("import")
|
||||
print("Executing database cleanup and index setup...")
|
||||
client.execute(
|
||||
file_path=workload.get_index(), num_workers=benchmark_context_db_1.num_workers_for_import
|
||||
)
|
||||
print("Importing dataset...")
|
||||
ret = client.execute(
|
||||
file_path=workload.get_file(), num_workers=benchmark_context_db_1.num_workers_for_import
|
||||
)
|
||||
usage = vendor_runner.stop("import")
|
||||
|
||||
for group in sorted(queries.keys()):
|
||||
for query, funcname in queries[group]:
|
||||
print("Running query:{}/{}/{}".format(group, query, funcname))
|
||||
func = getattr(workload, funcname)
|
||||
count = 1
|
||||
vendor_runner.start_benchmark("validation")
|
||||
try:
|
||||
ret = client.execute(queries=get_queries(func, count), num_workers=1, validation=True)[0]
|
||||
results_db_1[funcname] = ret["results"].items()
|
||||
except Exception as e:
|
||||
print("Issue running the query" + funcname)
|
||||
print(e)
|
||||
results_db_1[funcname] = "Query not executed properly"
|
||||
finally:
|
||||
usage = vendor_runner.stop("validation")
|
||||
print("Database used {:.3f} seconds of CPU time.".format(usage["cpu"]))
|
||||
print("Database peaked at {:.3f} MiB of memory.".format(usage["memory"] / 1024.0 / 1024.0))
|
||||
|
||||
benchmark_context_db_2 = BenchmarkContext(
|
||||
vendor_name=args.vendor_name_2,
|
||||
vendor_binary=args.vendor_binary_2,
|
||||
benchmark_target_workload=copy.copy(args.benchmarks),
|
||||
client_binary=args.client_binary,
|
||||
num_workers_for_import=args.num_workers_for_import,
|
||||
temporary_directory=args.temporary_directory,
|
||||
)
|
||||
|
||||
vendor_runner = runners.BaseRunner.create(
|
||||
benchmark_context=benchmark_context_db_2,
|
||||
)
|
||||
available_workloads = helpers.get_available_workloads()
|
||||
|
||||
workloads = helpers.filter_workloads(available_workloads, benchmark_context=benchmark_context_db_2)
|
||||
|
||||
client = vendor_runner.fetch_client()
|
||||
|
||||
results_db_2 = {}
|
||||
|
||||
for workload, queries in workloads:
|
||||
|
||||
vendor_runner.clean_db()
|
||||
|
||||
generated_queries = workload.dataset_generator()
|
||||
if generated_queries:
|
||||
vendor_runner.start_preparation("import")
|
||||
client.execute(queries=generated_queries, num_workers=benchmark_context_db_2.num_workers_for_import)
|
||||
vendor_runner.stop("import")
|
||||
else:
|
||||
workload.prepare(cache.cache_directory("datasets", workload.NAME, workload.get_variant()))
|
||||
imported = workload.custom_import()
|
||||
if not imported:
|
||||
vendor_runner.start_preparation("import")
|
||||
print("Executing database cleanup and index setup...")
|
||||
client.execute(
|
||||
file_path=workload.get_index(), num_workers=benchmark_context_db_2.num_workers_for_import
|
||||
)
|
||||
print("Importing dataset...")
|
||||
ret = client.execute(
|
||||
file_path=workload.get_file(), num_workers=benchmark_context_db_2.num_workers_for_import
|
||||
)
|
||||
usage = vendor_runner.stop("import")
|
||||
|
||||
for group in sorted(queries.keys()):
|
||||
for query, funcname in queries[group]:
|
||||
print("Running query:{}/{}/{}".format(group, query, funcname))
|
||||
func = getattr(workload, funcname)
|
||||
count = 1
|
||||
vendor_runner.start_benchmark("validation")
|
||||
try:
|
||||
ret = client.execute(queries=get_queries(func, count), num_workers=1, validation=True)[0]
|
||||
results_db_2[funcname] = ret["results"].items()
|
||||
except Exception as e:
|
||||
print("Issue running the query" + funcname)
|
||||
print(e)
|
||||
results_db_2[funcname] = "Query not executed properly"
|
||||
finally:
|
||||
usage = vendor_runner.stop("validation")
|
||||
print("Database used {:.3f} seconds of CPU time.".format(usage["cpu"]))
|
||||
print("Database peaked at {:.3f} MiB of memory.".format(usage["memory"] / 1024.0 / 1024.0))
|
||||
|
||||
validation = {}
|
||||
for key in results_db_1.keys():
|
||||
if type(results_db_1[key]) is str:
|
||||
validation[key] = "Query not executed properly."
|
||||
else:
|
||||
db_1_values = set()
|
||||
for index, value in results_db_1[key]:
|
||||
db_1_values.add(value)
|
||||
neo4j_values = set()
|
||||
for index, value in results_db_2[key]:
|
||||
neo4j_values.add(value)
|
||||
|
||||
if db_1_values == neo4j_values:
|
||||
validation[key] = "Identical results"
|
||||
else:
|
||||
validation[key] = "Different results, check manually."
|
||||
|
||||
for key, value in validation.items():
|
||||
print(key + " " + value)
|
4
tests/mgbench/workloads/__init__.py
Normal file
4
tests/mgbench/workloads/__init__.py
Normal file
@ -0,0 +1,4 @@
from pathlib import Path

modules = Path(__file__).resolve().parent.glob("*.py")
__all__ = [f.name[:-3] for f in modules if f.is_file() and not f.name == "__init__.py"]
197
tests/mgbench/workloads/base.py
Normal file
197
tests/mgbench/workloads/base.py
Normal file
@ -0,0 +1,197 @@
|
||||
# Copyright 2022 Memgraph Ltd.
|
||||
#
|
||||
# Use of this software is governed by the Business Source License
|
||||
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
# License, and you may not use this file except in compliance with the Business Source License.
|
||||
#
|
||||
# As of the Change Date specified in that file, in accordance with
|
||||
# the Business Source License, use of this software will be governed
|
||||
# by the Apache License, Version 2.0, included in the file
|
||||
# licenses/APL.txt.
|
||||
|
||||
from abc import ABC, abstractclassmethod
|
||||
from pathlib import Path
|
||||
|
||||
import helpers
|
||||
from benchmark_context import BenchmarkContext
|
||||
|
||||
|
||||
# Base dataset class used as a template to create each individual dataset. All
|
||||
# common logic is handled here.
|
||||
class Workload(ABC):
|
||||
|
||||
# Name of the workload/dataset.
|
||||
NAME = ""
|
||||
# List of all variants of the workload/dataset that exist.
|
||||
VARIANTS = ["default"]
|
||||
# One of the available variants that should be used as the default variant.
|
||||
DEFAULT_VARIANT = "default"
|
||||
|
||||
# List of local files that should be used to import the dataset.
|
||||
LOCAL_FILE = None
|
||||
|
||||
# URLs of remote dataset files that should be used to import the dataset, compressed in gz format.
|
||||
URL_FILE = None
|
||||
|
||||
# Index files
|
||||
LOCAL_INDEX_FILE = None
|
||||
URL_INDEX_FILE = None
|
||||
|
||||
# Number of vertices/edges for each variant.
|
||||
SIZES = {
|
||||
"default": {"vertices": 0, "edges": 0},
|
||||
}
|
||||
|
||||
# Indicates whether the dataset has properties on edges.
|
||||
PROPERTIES_ON_EDGES = False
|
||||
|
||||
def __init_subclass__(cls) -> None:
|
||||
name_prerequisite = "NAME" in cls.__dict__
|
||||
generator_prerequisite = "dataset_generator" in cls.__dict__
|
||||
custom_import_prerequisite = "custom_import" in cls.__dict__
|
||||
basic_import_prerequisite = ("LOCAL_FILE" in cls.__dict__ or "URL_FILE" in cls.__dict__) and (
|
||||
"LOCAL_INDEX_FILE" in cls.__dict__ or "URL_INDEX_FILE" in cls.__dict__
|
||||
)
|
||||
|
||||
if not name_prerequisite:
|
||||
raise ValueError(
|
||||
"""Can't define a workload class {} without NAME property:
|
||||
NAME = "dataset name"
|
||||
Name property defines the workload you want to execute, for example: "demo/*/*/*"
|
||||
""".format(
|
||||
cls.__name__
|
||||
)
|
||||
)
|
||||
|
||||
# Check workload is in generator or dataset mode during interpretation (not both), not runtime
|
||||
if generator_prerequisite and (custom_import_prerequisite or basic_import_prerequisite):
|
||||
raise ValueError(
|
||||
"""
|
||||
The workload class {} cannot have both a dataset import and a dataset generator defined at
the same time.
|
||||
""".format(
|
||||
cls.__name__
|
||||
)
|
||||
)
|
||||
|
||||
if not generator_prerequisite and (not custom_import_prerequisite and not basic_import_prerequisite):
|
||||
raise ValueError(
|
||||
"""
|
||||
The workload class {} needs to have a defined dataset import or a dataset generator
|
||||
""".format(
|
||||
cls.__name__
|
||||
)
|
||||
)
|
||||
|
||||
return super().__init_subclass__()
|
||||
|
||||
def __init__(self, variant: str = None, benchmark_context: BenchmarkContext = None):
|
||||
"""
|
||||
Accepts a `variant` variable that indicates which variant
|
||||
of the dataset should be executed
|
||||
"""
|
||||
self.benchmark_context = benchmark_context
|
||||
self._variant = variant
|
||||
self._vendor = benchmark_context.vendor_name
|
||||
self._file = None
|
||||
self._file_index = None
|
||||
|
||||
if self.NAME == "":
|
||||
raise ValueError("Give your workload a name, by setting self.NAME")
|
||||
|
||||
if variant is None:
|
||||
variant = self.DEFAULT_VARIANT
|
||||
if variant not in self.VARIANTS:
|
||||
raise ValueError("Invalid test variant!")
|
||||
if (self.LOCAL_FILE and variant not in self.LOCAL_FILE) and (self.URL_FILE and variant not in self.URL_FILE):
|
||||
raise ValueError("The variant doesn't have a defined URL or LOCAL file path!")
|
||||
if variant not in self.SIZES:
|
||||
raise ValueError("The variant doesn't have a defined dataset " "size!")
|
||||
|
||||
if (self.LOCAL_INDEX_FILE and self._vendor not in self.LOCAL_INDEX_FILE) and (
|
||||
self.URL_INDEX_FILE and self._vendor not in self.URL_INDEX_FILE
|
||||
):
|
||||
raise ValueError("Vendor does not have INDEX for dataset!")
|
||||
|
||||
if self.LOCAL_FILE is not None:
|
||||
self._local_file = self.LOCAL_FILE.get(variant, None)
|
||||
else:
|
||||
self._local_file = None
|
||||
|
||||
if self.URL_FILE is not None:
|
||||
self._url_file = self.URL_FILE.get(variant, None)
|
||||
else:
|
||||
self._url_file = None
|
||||
|
||||
if self.LOCAL_INDEX_FILE is not None:
|
||||
self._local_index = self.LOCAL_INDEX_FILE.get(self._vendor, None)
|
||||
else:
|
||||
self._local_index = None
|
||||
|
||||
if self.URL_INDEX_FILE is not None:
|
||||
self._url_index = self.URL_INDEX_FILE.get(self._vendor, None)
|
||||
else:
|
||||
self._url_index = None
|
||||
|
||||
self._size = self.SIZES[variant]
|
||||
if "vertices" in self._size or "edges" in self._size:
|
||||
self._num_vertices = self._size["vertices"]
|
||||
self._num_edges = self._size["edges"]
|
||||
|
||||
def prepare(self, directory):
|
||||
if self._local_file is not None:
|
||||
print("Using local dataset file:", self._local_file)
|
||||
self._file = self._local_file
|
||||
elif self._url_file is not None:
|
||||
cached_input, exists = directory.get_file("dataset.cypher")
|
||||
if not exists:
|
||||
print("Downloading dataset file:", self._url_file)
|
||||
downloaded_file = helpers.download_file(self._url_file, directory.get_path())
|
||||
print("Unpacking and caching file:", downloaded_file)
|
||||
helpers.unpack_gz_and_move_file(downloaded_file, cached_input)
|
||||
print("Using cached dataset file:", cached_input)
|
||||
self._file = cached_input
|
||||
|
||||
if self._local_index is not None:
|
||||
print("Using local index file:", self._local_index)
|
||||
self._file_index = self._local_index
|
||||
elif self._url_index is not None:
|
||||
cached_index, exists = directory.get_file(self._vendor + ".cypher")
|
||||
if not exists:
|
||||
print("Downloading index file:", self._url_index)
|
||||
downloaded_file = helpers.download_file(self._url_index, directory.get_path())
|
||||
print("Unpacking and caching file:", downloaded_file)
|
||||
helpers.unpack_gz_and_move_file(downloaded_file, cached_index)
|
||||
print("Using cached index file:", cached_index)
|
||||
self._file_index = cached_index
|
||||
|
||||
def get_variant(self):
|
||||
"""Returns the current variant of the dataset."""
|
||||
return self._variant
|
||||
|
||||
def get_index(self):
|
||||
"""Get index file, defined by vendor"""
|
||||
return self._file_index
|
||||
|
||||
def get_file(self):
|
||||
"""
|
||||
Returns path to the file that contains dataset creation queries.
|
||||
"""
|
||||
return self._file
|
||||
|
||||
def get_size(self):
|
||||
"""Returns number of vertices/edges for the current variant."""
|
||||
return self._size
|
||||
|
||||
def custom_import(self) -> bool:
|
||||
print("Workload does not have a custom import")
|
||||
return False
|
||||
|
||||
def dataset_generator(self) -> list:
|
||||
print("Workload is not auto generated")
|
||||
return []
|
||||
|
||||
# All tests should be query generator functions that output all of the
|
||||
# queries that should be executed by the runner. The functions should be
|
||||
# named `benchmark__GROUPNAME__TESTNAME` and should not accept any
|
||||
# arguments.
|
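The demo workload that follows is the minimal example of the generator mode checked by `__init_subclass__` above. For contrast, a file-based workload would instead define the dataset and index file attributes; the sketch below is hypothetical (class name, URLs and sizes are placeholders, not an actual mgbench dataset):

from workloads.base import Workload


class ExampleFileWorkload(Workload):
    NAME = "example_file_workload"  # hypothetical workload name
    VARIANTS = ["small"]
    DEFAULT_VARIANT = "small"

    # Placeholder URLs; a real workload points at hosted .cypher.gz dataset and per-vendor index files.
    URL_FILE = {"small": "https://example.com/example_small.cypher.gz"}
    URL_INDEX_FILE = {
        "memgraph": "https://example.com/memgraph_index.cypher",
        "neo4j": "https://example.com/neo4j_index.cypher",
    }

    SIZES = {"small": {"vertices": 100, "edges": 200}}

    def benchmark__test__count_nodes(self):
        return ("MATCH (n) RETURN count(n);", {})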
28
tests/mgbench/workloads/demo.py
Normal file
28
tests/mgbench/workloads/demo.py
Normal file
@ -0,0 +1,28 @@
import random

from workloads.base import Workload


class Demo(Workload):

    NAME = "demo"

    def dataset_generator(self):

        queries = [("MATCH (n) DETACH DELETE n;", {})]
        for i in range(0, 100):
            queries.append(("CREATE (:NodeA{{ id:{}}});".format(i), {}))
            queries.append(("CREATE (:NodeB{{ id:{}}});".format(i), {}))

        for i in range(0, 100):
            a = random.randint(0, 99)
            b = random.randint(0, 99)
            queries.append(("MATCH(a:NodeA{{ id: {}}}),(b:NodeB{{id: {}}}) CREATE (a)-[:EDGE]->(b)".format(a, b), {}))

        return queries

    def benchmark__test__sample_query1(self):
        return ("MATCH (n) RETURN n", {})

    def benchmark__test__sample_query2(self):
        return ("MATCH (n) RETURN n", {})
0
tests/mgbench/workloads/importers/__init__.py
Normal file
0
tests/mgbench/workloads/importers/__init__.py
Normal file
213
tests/mgbench/workloads/importers/importer_ldbc_bi.py
Normal file
213
tests/mgbench/workloads/importers/importer_ldbc_bi.py
Normal file
@ -0,0 +1,213 @@
|
||||
import csv
|
||||
import subprocess
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import helpers
|
||||
from benchmark_context import BenchmarkContext
|
||||
from runners import BaseRunner
|
||||
|
||||
HEADERS_URL = "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/headers.tar.gz"
|
||||
|
||||
|
||||
class ImporterLDBCBI:
|
||||
def __init__(
|
||||
self, benchmark_context: BenchmarkContext, dataset_name: str, variant: str, index_file: str, csv_dict: dict
|
||||
) -> None:
|
||||
self._benchmark_context = benchmark_context
|
||||
self._dataset_name = dataset_name
|
||||
self._variant = variant
|
||||
self._index_file = index_file
|
||||
self._csv_dict = csv_dict
|
||||
|
||||
def execute_import(self):
|
||||
|
||||
vendor_runner = BaseRunner.create(
|
||||
benchmark_context=self._benchmark_context,
|
||||
)
|
||||
client = vendor_runner.fetch_client()
|
||||
|
||||
if self._benchmark_context.vendor_name == "neo4j":
|
||||
data_dir = Path() / ".cache" / "datasets" / self._dataset_name / self._variant / "data_neo4j"
|
||||
data_dir.mkdir(parents=True, exist_ok=True)
|
||||
dir_name = self._csv_dict[self._variant].split("/")[-1:][0].removesuffix(".tar.zst")
|
||||
if (data_dir / dir_name).exists():
|
||||
print("Files downloaded")
|
||||
data_dir = data_dir / dir_name
|
||||
else:
|
||||
print("Downloading files")
|
||||
downloaded_file = helpers.download_file(self._csv_dict[self._variant], data_dir.absolute())
|
||||
print("Unpacking the file..." + downloaded_file)
|
||||
data_dir = helpers.unpack_tar_zst(Path(downloaded_file))
|
||||
|
||||
headers_dir = Path() / ".cache" / "datasets" / self._dataset_name / self._variant / "headers_neo4j"
|
||||
headers_dir.mkdir(parents=True, exist_ok=True)
|
||||
headers = HEADERS_URL.split("/")[-1:][0].removesuffix(".tar.gz")
|
||||
if (headers_dir / headers).exists():
|
||||
print("Header files downloaded.")
|
||||
else:
|
||||
print("Downloading files")
|
||||
downloaded_file = helpers.download_file(HEADERS_URL, headers_dir.absolute())
|
||||
print("Unpacking the file..." + downloaded_file)
|
||||
headers_dir = helpers.unpack_tar_gz(Path(downloaded_file))
|
||||
|
||||
input_headers = {}
|
||||
for header_file in headers_dir.glob("**/*.csv"):
|
||||
key = "/".join(header_file.parts[-2:])[0:-4]
|
||||
input_headers[key] = header_file.as_posix()
|
||||
|
||||
for data_file in data_dir.glob("**/*.gz"):
|
||||
if "initial_snapshot" in data_file.parts:
|
||||
data_file = helpers.unpack_gz(data_file)
|
||||
output = data_file.parent / (data_file.stem + "_neo" + ".csv")
|
||||
if not output.exists():
|
||||
with data_file.open("r") as input_f, output.open("a") as output_f:
|
||||
reader = csv.reader(input_f, delimiter="|")
|
||||
header = next(reader)
|
||||
writer = csv.writer(output_f, delimiter="|")
|
||||
for line in reader:
|
||||
writer.writerow(line)
|
||||
else:
|
||||
print("Files converted")
|
||||
|
||||
input_files = defaultdict(list)
|
||||
for neo_file in data_dir.glob("**/*_neo.csv"):
|
||||
key = "/".join(neo_file.parts[-3:-1])
|
||||
input_files[key].append(neo_file.as_posix())
|
||||
|
||||
vendor_runner.clean_db()
|
||||
subprocess.run(
|
||||
args=[
|
||||
vendor_runner._neo4j_admin,
|
||||
"database",
|
||||
"import",
|
||||
"full",
|
||||
"--id-type=INTEGER",
|
||||
"--ignore-empty-strings=true",
|
||||
"--bad-tolerance=0",
|
||||
"--nodes=Place=" + input_headers["static/Place"] + "," + ",".join(input_files["static/Place"]),
|
||||
"--nodes=Organisation="
|
||||
+ input_headers["static/Organisation"]
|
||||
+ ","
|
||||
+ ",".join(input_files["static/Organisation"]),
|
||||
"--nodes=TagClass="
|
||||
+ input_headers["static/TagClass"]
|
||||
+ ","
|
||||
+ ",".join(input_files["static/TagClass"]),
|
||||
"--nodes=Tag=" + input_headers["static/Tag"] + "," + ",".join(input_files["static/Tag"]),
|
||||
"--nodes=Forum=" + input_headers["dynamic/Forum"] + "," + ",".join(input_files["dynamic/Forum"]),
|
||||
"--nodes=Person=" + input_headers["dynamic/Person"] + "," + ",".join(input_files["dynamic/Person"]),
|
||||
"--nodes=Message:Comment="
|
||||
+ input_headers["dynamic/Comment"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Comment"]),
|
||||
"--nodes=Message:Post="
|
||||
+ input_headers["dynamic/Post"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Post"]),
|
||||
"--relationships=IS_PART_OF="
|
||||
+ input_headers["static/Place_isPartOf_Place"]
|
||||
+ ","
|
||||
+ ",".join(input_files["static/Place_isPartOf_Place"]),
|
||||
"--relationships=IS_SUBCLASS_OF="
|
||||
+ input_headers["static/TagClass_isSubclassOf_TagClass"]
|
||||
+ ","
|
||||
+ ",".join(input_files["static/TagClass_isSubclassOf_TagClass"]),
|
||||
"--relationships=IS_LOCATED_IN="
|
||||
+ input_headers["static/Organisation_isLocatedIn_Place"]
|
||||
+ ","
|
||||
+ ",".join(input_files["static/Organisation_isLocatedIn_Place"]),
|
||||
"--relationships=HAS_TYPE="
|
||||
+ input_headers["static/Tag_hasType_TagClass"]
|
||||
+ ","
|
||||
+ ",".join(input_files["static/Tag_hasType_TagClass"]),
|
||||
"--relationships=HAS_CREATOR="
|
||||
+ input_headers["dynamic/Comment_hasCreator_Person"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Comment_hasCreator_Person"]),
|
||||
"--relationships=IS_LOCATED_IN="
|
||||
+ input_headers["dynamic/Comment_isLocatedIn_Country"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Comment_isLocatedIn_Country"]),
|
||||
"--relationships=REPLY_OF="
|
||||
+ input_headers["dynamic/Comment_replyOf_Comment"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Comment_replyOf_Comment"]),
|
||||
"--relationships=REPLY_OF="
|
||||
+ input_headers["dynamic/Comment_replyOf_Post"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Comment_replyOf_Post"]),
|
||||
"--relationships=CONTAINER_OF="
|
||||
+ input_headers["dynamic/Forum_containerOf_Post"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Forum_containerOf_Post"]),
|
||||
"--relationships=HAS_MEMBER="
|
||||
+ input_headers["dynamic/Forum_hasMember_Person"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Forum_hasMember_Person"]),
|
||||
"--relationships=HAS_MODERATOR="
|
||||
+ input_headers["dynamic/Forum_hasModerator_Person"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Forum_hasModerator_Person"]),
|
||||
"--relationships=HAS_TAG="
|
||||
+ input_headers["dynamic/Forum_hasTag_Tag"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Forum_hasTag_Tag"]),
|
||||
"--relationships=HAS_INTEREST="
|
||||
+ input_headers["dynamic/Person_hasInterest_Tag"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Person_hasInterest_Tag"]),
|
||||
"--relationships=IS_LOCATED_IN="
|
||||
+ input_headers["dynamic/Person_isLocatedIn_City"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Person_isLocatedIn_City"]),
|
||||
"--relationships=KNOWS="
|
||||
+ input_headers["dynamic/Person_knows_Person"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Person_knows_Person"]),
|
||||
"--relationships=LIKES="
|
||||
+ input_headers["dynamic/Person_likes_Comment"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Person_likes_Comment"]),
|
||||
"--relationships=LIKES="
|
||||
+ input_headers["dynamic/Person_likes_Post"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Person_likes_Post"]),
|
||||
"--relationships=HAS_CREATOR="
|
||||
+ input_headers["dynamic/Post_hasCreator_Person"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Post_hasCreator_Person"]),
|
||||
"--relationships=HAS_TAG="
|
||||
+ input_headers["dynamic/Comment_hasTag_Tag"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Comment_hasTag_Tag"]),
|
||||
"--relationships=HAS_TAG="
|
||||
+ input_headers["dynamic/Post_hasTag_Tag"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Post_hasTag_Tag"]),
|
||||
"--relationships=IS_LOCATED_IN="
|
||||
+ input_headers["dynamic/Post_isLocatedIn_Country"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Post_isLocatedIn_Country"]),
|
||||
"--relationships=STUDY_AT="
|
||||
+ input_headers["dynamic/Person_studyAt_University"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Person_studyAt_University"]),
|
||||
"--relationships=WORK_AT="
|
||||
+ input_headers["dynamic/Person_workAt_Company"]
|
||||
+ ","
|
||||
+ ",".join(input_files["dynamic/Person_workAt_Company"]),
|
||||
"--delimiter",
|
||||
"|",
|
||||
"neo4j",
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
|
||||
vendor_runner.start_preparation("Index preparation")
|
||||
print("Executing database index setup")
|
||||
client.execute(file_path=self._index_file, num_workers=1)
|
||||
vendor_runner.stop("Stop index preparation")
|
||||
return True
|
||||
else:
|
||||
return False
|
163
tests/mgbench/workloads/importers/importer_ldbc_interactive.py
Normal file
163
tests/mgbench/workloads/importers/importer_ldbc_interactive.py
Normal file
@ -0,0 +1,163 @@
|
||||
import csv
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import helpers
|
||||
from benchmark_context import BenchmarkContext
|
||||
from runners import BaseRunner
|
||||
|
||||
# Removed speaks/email from person header
|
||||
HEADERS_INTERACTIVE = {
|
||||
"static/organisation": "id:ID(Organisation)|:LABEL|name:STRING|url:STRING",
|
||||
"static/place": "id:ID(Place)|name:STRING|url:STRING|:LABEL",
|
||||
"static/tagclass": "id:ID(TagClass)|name:STRING|url:STRING",
|
||||
"static/tag": "id:ID(Tag)|name:STRING|url:STRING",
|
||||
"static/tagclass_isSubclassOf_tagclass": ":START_ID(TagClass)|:END_ID(TagClass)",
|
||||
"static/tag_hasType_tagclass": ":START_ID(Tag)|:END_ID(TagClass)",
|
||||
"static/organisation_isLocatedIn_place": ":START_ID(Organisation)|:END_ID(Place)",
|
||||
"static/place_isPartOf_place": ":START_ID(Place)|:END_ID(Place)",
|
||||
"dynamic/comment": "id:ID(Comment)|creationDate:LOCALDATETIME|locationIP:STRING|browserUsed:STRING|content:STRING|length:INT",
|
||||
"dynamic/forum": "id:ID(Forum)|title:STRING|creationDate:LOCALDATETIME",
|
||||
"dynamic/person": "id:ID(Person)|firstName:STRING|lastName:STRING|gender:STRING|birthday:LOCALDATETIME|creationDate:LOCALDATETIME|locationIP:STRING|browserUsed:STRING",
|
||||
"dynamic/post": "id:ID(Post)|imageFile:STRING|creationDate:LOCALDATETIME|locationIP:STRING|browserUsed:STRING|language:STRING|content:STRING|length:INT",
|
||||
"dynamic/comment_hasCreator_person": ":START_ID(Comment)|:END_ID(Person)",
|
||||
"dynamic/comment_isLocatedIn_place": ":START_ID(Comment)|:END_ID(Place)",
|
||||
"dynamic/comment_replyOf_comment": ":START_ID(Comment)|:END_ID(Comment)",
|
||||
"dynamic/comment_replyOf_post": ":START_ID(Comment)|:END_ID(Post)",
|
||||
"dynamic/forum_containerOf_post": ":START_ID(Forum)|:END_ID(Post)",
|
||||
"dynamic/forum_hasMember_person": ":START_ID(Forum)|:END_ID(Person)|joinDate:LOCALDATETIME",
|
||||
"dynamic/forum_hasModerator_person": ":START_ID(Forum)|:END_ID(Person)",
|
||||
"dynamic/forum_hasTag_tag": ":START_ID(Forum)|:END_ID(Tag)",
|
||||
"dynamic/person_hasInterest_tag": ":START_ID(Person)|:END_ID(Tag)",
|
||||
"dynamic/person_isLocatedIn_place": ":START_ID(Person)|:END_ID(Place)",
|
||||
"dynamic/person_knows_person": ":START_ID(Person)|:END_ID(Person)|creationDate:LOCALDATETIME",
|
||||
"dynamic/person_likes_comment": ":START_ID(Person)|:END_ID(Comment)|creationDate:LOCALDATETIME",
|
||||
"dynamic/person_likes_post": ":START_ID(Person)|:END_ID(Post)|creationDate:LOCALDATETIME",
|
||||
"dynamic/person_studyAt_organisation": ":START_ID(Person)|:END_ID(Organisation)|classYear:INT",
|
||||
"dynamic/person_workAt_organisation": ":START_ID(Person)|:END_ID(Organisation)|workFrom:INT",
|
||||
"dynamic/post_hasCreator_person": ":START_ID(Post)|:END_ID(Person)",
|
||||
"dynamic/comment_hasTag_tag": ":START_ID(Comment)|:END_ID(Tag)",
|
||||
"dynamic/post_hasTag_tag": ":START_ID(Post)|:END_ID(Tag)",
|
||||
"dynamic/post_isLocatedIn_place": ":START_ID(Post)|:END_ID(Place)",
|
||||
}
|
||||
|
||||
|
||||
class ImporterLDBCInteractive:
|
||||
def __init__(
|
||||
self, benchmark_context: BenchmarkContext, dataset_name: str, variant: str, index_file: str, csv_dict: dict
|
||||
) -> None:
|
||||
self._benchmark_context = benchmark_context
|
||||
self._dataset_name = dataset_name
|
||||
self._variant = variant
|
||||
self._index_file = index_file
|
||||
self._csv_dict = csv_dict
|
||||
|
||||
def execute_import(self):
|
||||
|
||||
vendor_runner = BaseRunner.create(
|
||||
benchmark_context=self._benchmark_context,
|
||||
)
|
||||
client = vendor_runner.fetch_client()
|
||||
|
||||
if self._benchmark_context.vendor_name == "neo4j":
|
||||
print("Runnning Neo4j import")
|
||||
dump_dir = Path() / ".cache" / "datasets" / self._dataset_name / self._variant / "dump"
|
||||
dump_dir.mkdir(parents=True, exist_ok=True)
|
||||
dir_name = self._csv_dict[self._variant].split("/")[-1:][0].removesuffix(".tar.zst")
|
||||
if (dump_dir / dir_name).exists():
|
||||
print("Files downloaded")
|
||||
dump_dir = dump_dir / dir_name
|
||||
else:
|
||||
print("Downloading files")
|
||||
downloaded_file = helpers.download_file(self._csv_dict[self._variant], dump_dir.absolute())
|
||||
print("Unpacking the file..." + downloaded_file)
|
||||
dump_dir = helpers.unpack_tar_zst(Path(downloaded_file))
|
||||
|
||||
input_files = {}
|
||||
for file in dump_dir.glob("*/*0.csv"):
|
||||
parts = file.parts[-2:]
|
||||
key = parts[0] + "/" + parts[1][:-8]
|
||||
input_files[key] = file
|
||||
|
||||
output_files = {}
|
||||
for key, file in input_files.items():
|
||||
output = file.parent / (file.stem + "_neo" + ".csv")
|
||||
if not output.exists():
|
||||
with file.open("r") as input_f, output.open("a") as output_f:
|
||||
reader = csv.reader(input_f, delimiter="|")
|
||||
header = next(reader)
|
||||
|
||||
writer = csv.writer(output_f, delimiter="|")
|
||||
if key in HEADERS_INTERACTIVE.keys():
|
||||
updated_header = HEADERS_INTERACTIVE[key].split("|")
|
||||
writer.writerow(updated_header)
|
||||
for line in reader:
|
||||
if "creationDate" in header:
|
||||
pos = header.index("creationDate")
|
||||
line[pos] = line[pos][0:-5]
|
||||
elif "joinDate" in header:
|
||||
pos = header.index("joinDate")
|
||||
line[pos] = line[pos][0:-5]
|
||||
|
||||
if "organisation_0_0.csv" == file.name:
|
||||
writer.writerow([line[0], line[1].capitalize(), line[2], line[3]])
|
||||
elif "place_0_0.csv" == file.name:
|
||||
writer.writerow([line[0], line[1], line[2], line[3].capitalize()])
|
||||
else:
|
||||
writer.writerow(line)
|
||||
|
||||
output_files[key] = output.as_posix()
|
||||
vendor_runner.clean_db()
|
||||
subprocess.run(
|
||||
args=[
|
||||
vendor_runner._neo4j_admin,
|
||||
"database",
|
||||
"import",
|
||||
"full",
|
||||
"--id-type=INTEGER",
|
||||
"--nodes=Place=" + output_files["static/place"],
|
||||
"--nodes=Organisation=" + output_files["static/organisation"],
|
||||
"--nodes=TagClass=" + output_files["static/tagclass"],
|
||||
"--nodes=Tag=" + output_files["static/tag"],
|
||||
"--nodes=Comment:Message=" + output_files["dynamic/comment"],
|
||||
"--nodes=Forum=" + output_files["dynamic/forum"],
|
||||
"--nodes=Person=" + output_files["dynamic/person"],
|
||||
"--nodes=Post:Message=" + output_files["dynamic/post"],
|
||||
"--relationships=IS_PART_OF=" + output_files["static/place_isPartOf_place"],
|
||||
"--relationships=IS_SUBCLASS_OF=" + output_files["static/tagclass_isSubclassOf_tagclass"],
|
||||
"--relationships=IS_LOCATED_IN=" + output_files["static/organisation_isLocatedIn_place"],
|
||||
"--relationships=HAS_TYPE=" + output_files["static/tag_hasType_tagclass"],
|
||||
"--relationships=HAS_CREATOR=" + output_files["dynamic/comment_hasCreator_person"],
|
||||
"--relationships=IS_LOCATED_IN=" + output_files["dynamic/comment_isLocatedIn_place"],
|
||||
"--relationships=REPLY_OF=" + output_files["dynamic/comment_replyOf_comment"],
|
||||
"--relationships=REPLY_OF=" + output_files["dynamic/comment_replyOf_post"],
|
||||
"--relationships=CONTAINER_OF=" + output_files["dynamic/forum_containerOf_post"],
|
||||
"--relationships=HAS_MEMBER=" + output_files["dynamic/forum_hasMember_person"],
|
||||
"--relationships=HAS_MODERATOR=" + output_files["dynamic/forum_hasModerator_person"],
|
||||
"--relationships=HAS_TAG=" + output_files["dynamic/forum_hasTag_tag"],
|
||||
"--relationships=HAS_INTEREST=" + output_files["dynamic/person_hasInterest_tag"],
|
||||
"--relationships=IS_LOCATED_IN=" + output_files["dynamic/person_isLocatedIn_place"],
|
||||
"--relationships=KNOWS=" + output_files["dynamic/person_knows_person"],
|
||||
"--relationships=LIKES=" + output_files["dynamic/person_likes_comment"],
|
||||
"--relationships=LIKES=" + output_files["dynamic/person_likes_post"],
|
||||
"--relationships=HAS_CREATOR=" + output_files["dynamic/post_hasCreator_person"],
|
||||
"--relationships=HAS_TAG=" + output_files["dynamic/comment_hasTag_tag"],
|
||||
"--relationships=HAS_TAG=" + output_files["dynamic/post_hasTag_tag"],
|
||||
"--relationships=IS_LOCATED_IN=" + output_files["dynamic/post_isLocatedIn_place"],
|
||||
"--relationships=STUDY_AT=" + output_files["dynamic/person_studyAt_organisation"],
|
||||
"--relationships=WORK_AT=" + output_files["dynamic/person_workAt_organisation"],
|
||||
"--delimiter",
|
||||
"|",
|
||||
"neo4j",
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
|
||||
vendor_runner.start_preparation("Index preparation")
|
||||
print("Executing database index setup")
|
||||
client.execute(file_path=self._index_file, num_workers=1)
|
||||
vendor_runner.stop("Stop index preparation")
|
||||
|
||||
return True
|
||||
else:
|
||||
return False
|
41
tests/mgbench/workloads/importers/importer_pokec.py
Normal file
41
tests/mgbench/workloads/importers/importer_pokec.py
Normal file
@ -0,0 +1,41 @@
from pathlib import Path

from benchmark_context import BenchmarkContext
from runners import BaseRunner


class ImporterPokec:
    def __init__(
        self, benchmark_context: BenchmarkContext, dataset_name: str, variant: str, index_file: str, dataset_file: str
    ) -> None:
        self._benchmark_context = benchmark_context
        self._dataset_name = dataset_name
        self._variant = variant
        self._index_file = index_file
        self._dataset_file = dataset_file

    def execute_import(self):
        if self._benchmark_context.vendor_name == "neo4j":

            vendor_runner = BaseRunner.create(
                benchmark_context=self._benchmark_context,
            )
            client = vendor_runner.fetch_client()
            vendor_runner.clean_db()
            vendor_runner.start_preparation("preparation")
            print("Executing database cleanup and index setup...")
            client.execute(file_path=self._index_file, num_workers=1)
            vendor_runner.stop("preparation")
            neo4j_dump = Path() / ".cache" / "datasets" / self._dataset_name / self._variant / "neo4j.dump"
            if neo4j_dump.exists():
                vendor_runner.load_db_from_dump(path=neo4j_dump.parent)
            else:
                vendor_runner.start_preparation("import")
                print("Importing dataset...")
                client.execute(file_path=self._dataset_file, num_workers=self._benchmark_context.num_workers_for_import)
                vendor_runner.stop("import")
                vendor_runner.dump_db(path=neo4j_dump.parent)

            return True
        else:
            return False
708
tests/mgbench/workloads/ldbc_bi.py
Normal file
708
tests/mgbench/workloads/ldbc_bi.py
Normal file
@ -0,0 +1,708 @@
|
||||
import inspect
|
||||
import random
|
||||
from pathlib import Path
|
||||
|
||||
import helpers
|
||||
from benchmark_context import BenchmarkContext
|
||||
from workloads.base import Workload
|
||||
from workloads.importers.importer_ldbc_bi import ImporterLDBCBI
|
||||
|
||||
|
||||
class LDBC_BI(Workload):
|
||||
NAME = "ldbc_bi"
|
||||
VARIANTS = ["sf1", "sf3", "sf10"]
|
||||
DEFAULT_VARIANT = "sf1"
|
||||
|
||||
URL_FILE = {
|
||||
"sf1": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/ldbc_bi_sf1.cypher.gz",
|
||||
"sf3": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/ldbc_bi_sf3.cypher.gz",
|
||||
"sf10": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/ldbc_bi_sf10.cypher.gz",
|
||||
}
|
||||
|
||||
URL_CSV = {
|
||||
"sf1": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf1-composite-projected-fk.tar.zst",
|
||||
"sf3": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf3-composite-projected-fk.tar.zst",
|
||||
"sf10": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf10-composite-projected-fk.tar.zst",
|
||||
}
|
||||
|
||||
SIZES = {
|
||||
"sf1": {"vertices": 2997352, "edges": 17196776},
|
||||
"sf3": {"vertices": 1, "edges": 1},
|
||||
"sf10": {"vertices": 1, "edges": 1},
|
||||
}
|
||||
|
||||
LOCAL_INDEX_FILE = None
|
||||
|
||||
URL_INDEX_FILE = {
|
||||
"memgraph": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/memgraph_bi_index.cypher",
|
||||
"neo4j": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/neo4j_bi_index.cypher",
|
||||
}
|
||||
|
||||
QUERY_PARAMETERS = {
|
||||
"sf1": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/parameters-2022-10-01.zip",
|
||||
"sf3": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/parameters-2022-10-01.zip",
|
||||
"sf10": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/parameters-2022-10-01.zip",
|
||||
}
|
||||
|
||||
def custom_import(self) -> bool:
|
||||
importer = ImporterLDBCBI(
|
||||
benchmark_context=self.benchmark_context,
|
||||
dataset_name=self.NAME,
|
||||
variant=self._variant,
|
||||
index_file=self._file_index,
|
||||
csv_dict=self.URL_CSV,
|
||||
)
|
||||
return importer.execute_import()
|
||||
|
||||
def _prepare_parameters_directory(self):
|
||||
parameters = Path() / ".cache" / "datasets" / self.NAME / self._variant / "parameters"
|
||||
parameters.mkdir(parents=True, exist_ok=True)
|
||||
if parameters.exists() and any(parameters.iterdir()):
|
||||
print("Files downloaded.")
|
||||
else:
|
||||
print("Downloading files")
|
||||
downloaded_file = helpers.download_file(self.QUERY_PARAMETERS[self._variant], parameters.parent.absolute())
|
||||
print("Unpacking the file..." + downloaded_file)
|
||||
parameters = helpers.unpack_zip(Path(downloaded_file))
|
||||
return parameters / ("parameters-" + self._variant)
|
||||
|
||||
def _get_query_parameters(self) -> dict:
|
||||
func_name = inspect.stack()[1].function
|
||||
parameters = {}
|
||||
for file in self._parameters_dir.glob("bi-*.csv"):
|
||||
file_name_query_id = file.name.split("-")[1][0:-4]
|
||||
func_name_id = func_name.split("_")[-1]
|
||||
if file_name_query_id == func_name_id or file_name_query_id == func_name_id + "a":
|
||||
with file.open("r") as input:
|
||||
lines = input.readlines()
|
||||
header = lines[0].strip("\n").split("|")
|
||||
position = random.randint(1, len(lines) - 1)
|
||||
data = lines[position].strip("\n").split("|")
|
||||
for i in range(len(header)):
|
||||
key, value_type = header[i].split(":")
|
||||
if value_type == "DATETIME":
|
||||
# Drop time zone
|
||||
converted = data[i][0:-6]
|
||||
parameters[key] = converted
|
||||
elif value_type == "DATE":
|
||||
converted = data[i] + "T00:00:00"
|
||||
parameters[key] = converted
|
||||
elif value_type == "INT":
|
||||
parameters[key] = int(data[i])
|
||||
elif value_type == "STRING[]":
|
||||
elements = data[i].split(";")
|
||||
parameters[key] = elements
|
||||
else:
|
||||
parameters[key] = data[i]
|
||||
break
|
||||
|
||||
return parameters
|
||||
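# Illustration only (the header and row values below are hypothetical examples, not shipped data):
# the parameter files read above are pipe-separated with "name:TYPE" headers, and
# _get_query_parameters turns one randomly chosen row into a Cypher parameter dict, e.g.
#
#   header = "datetime:DATETIME|tag:STRING".split("|")
#   row = "2012-11-09T07:37:14.000+00:00|ExampleTag".split("|")
#   row[0][0:-6]  -> "2012-11-09T07:37:14.000"   (DATETIME: the "+00:00" zone suffix is dropped)
#   "2012-11-09" + "T00:00:00"                   (DATE: a midnight time component is appended)
#   int("42")                                    (INT: cast to int)
#   "a;b;c".split(";") -> ["a", "b", "c"]        (STRING[]: split on ";")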
|
||||
def __init__(self, variant=None, benchmark_context: BenchmarkContext = None):
|
||||
super().__init__(variant, benchmark_context=benchmark_context)
|
||||
self._parameters_dir = self._prepare_parameters_directory()
|
||||
|
||||
def benchmark__bi__query_1_analytical(self):
|
||||
|
||||
memgraph = (
|
||||
"""
|
||||
MATCH (message:Message)
|
||||
WHERE message.creationDate < localDateTime($datetime)
|
||||
WITH count(message) AS totalMessageCountInt
|
||||
WITH toFloat(totalMessageCountInt) AS totalMessageCount
|
||||
MATCH (message:Message)
|
||||
WHERE message.creationDate < localDateTime($datetime)
|
||||
AND message.content IS NOT NULL
|
||||
WITH
|
||||
totalMessageCount,
|
||||
message,
|
||||
message.creationDate.year AS year
|
||||
WITH
|
||||
totalMessageCount,
|
||||
year,
|
||||
message:Comment AS isComment,
|
||||
CASE
|
||||
WHEN message.length < 40 THEN 0
|
||||
WHEN message.length < 80 THEN 1
|
||||
WHEN message.length < 160 THEN 2
|
||||
ELSE 3
|
||||
END AS lengthCategory,
|
||||
count(message) AS messageCount,
|
||||
sum(message.length) / toFloat(count(message)) AS averageMessageLength,
|
||||
sum(message.length) AS sumMessageLength
|
||||
RETURN
|
||||
year,
|
||||
isComment,
|
||||
lengthCategory,
|
||||
messageCount,
|
||||
averageMessageLength,
|
||||
sumMessageLength,
|
||||
messageCount / totalMessageCount AS percentageOfMessages
|
||||
ORDER BY
|
||||
year DESC,
|
||||
isComment ASC,
|
||||
lengthCategory ASC
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
|
||||
neo4j = (
|
||||
"""
|
||||
MATCH (message:Message)
|
||||
WHERE message.creationDate < DateTime($datetime)
|
||||
WITH count(message) AS totalMessageCountInt
|
||||
WITH toFloat(totalMessageCountInt) AS totalMessageCount
|
||||
MATCH (message:Message)
|
||||
WHERE message.creationDate < DateTime($datetime)
|
||||
AND message.content IS NOT NULL
|
||||
WITH
|
||||
totalMessageCount,
|
||||
message,
|
||||
message.creationDate.year AS year
|
||||
WITH
|
||||
totalMessageCount,
|
||||
year,
|
||||
message:Comment AS isComment,
|
||||
CASE
|
||||
WHEN message.length < 40 THEN 0
|
||||
WHEN message.length < 80 THEN 1
|
||||
WHEN message.length < 160 THEN 2
|
||||
ELSE 3
|
||||
END AS lengthCategory,
|
||||
count(message) AS messageCount,
|
||||
sum(message.length) / toFloat(count(message)) AS averageMessageLength,
|
||||
sum(message.length) AS sumMessageLength
|
||||
RETURN
|
||||
year,
|
||||
isComment,
|
||||
lengthCategory,
|
||||
messageCount,
|
||||
averageMessageLength,
|
||||
sumMessageLength,
|
||||
messageCount / totalMessageCount AS percentageOfMessages
|
||||
ORDER BY
|
||||
year DESC,
|
||||
isComment ASC,
|
||||
lengthCategory ASC
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
if self._vendor == "memgraph":
|
||||
return memgraph
|
||||
else:
|
||||
return neo4j
|
||||
|
||||
def benchmark__bi__query_2_analytical(self):
|
||||
|
||||
memgraph = (
|
||||
"""
|
||||
MATCH (tag:Tag)-[:HAS_TYPE]->(:TagClass {name: $tagClass})
|
||||
OPTIONAL MATCH (message1:Message)-[:HAS_TAG]->(tag)
|
||||
WHERE localDateTime($date) <= message1.creationDate
|
||||
AND message1.creationDate < localDateTime($date) + duration({day: 100})
|
||||
WITH tag, count(message1) AS countWindow1
|
||||
OPTIONAL MATCH (message2:Message)-[:HAS_TAG]->(tag)
|
||||
WHERE localDateTime($date) + duration({day: 100}) <= message2.creationDate
|
||||
AND message2.creationDate < localDateTime($date) + duration({day: 200})
|
||||
WITH
|
||||
tag,
|
||||
countWindow1,
|
||||
count(message2) AS countWindow2
|
||||
RETURN
|
||||
tag.name,
|
||||
countWindow1,
|
||||
countWindow2,
|
||||
abs(countWindow1 - countWindow2) AS diff
|
||||
ORDER BY
|
||||
diff DESC,
|
||||
tag.name ASC
|
||||
LIMIT 100
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
|
||||
neo4j = (
|
||||
"""
|
||||
MATCH (tag:Tag)-[:HAS_TYPE]->(:TagClass {name: $tagClass})
|
||||
OPTIONAL MATCH (message1:Message)-[:HAS_TAG]->(tag)
|
||||
WHERE DateTime($date) <= message1.creationDate
|
||||
AND message1.creationDate < DateTime($date) + duration({days: 100})
|
||||
WITH tag, count(message1) AS countWindow1
|
||||
OPTIONAL MATCH (message2:Message)-[:HAS_TAG]->(tag)
|
||||
WHERE DateTime($date) + duration({days: 100}) <= message2.creationDate
|
||||
AND message2.creationDate < DateTime($date) + duration({days: 200})
|
||||
WITH
|
||||
tag,
|
||||
countWindow1,
|
||||
count(message2) AS countWindow2
|
||||
RETURN
|
||||
tag.name,
|
||||
countWindow1,
|
||||
countWindow2,
|
||||
abs(countWindow1 - countWindow2) AS diff
|
||||
ORDER BY
|
||||
diff DESC,
|
||||
tag.name ASC
|
||||
LIMIT 100
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
if self._vendor == "memgraph":
|
||||
return memgraph
|
||||
else:
|
||||
return neo4j
|
||||
|
||||
def benchmark__bi__query_3_analytical(self):
|
||||
return (
|
||||
"""
|
||||
MATCH
|
||||
(:Country {name: $country})<-[:IS_PART_OF]-(:City)<-[:IS_LOCATED_IN]-
|
||||
(person:Person)<-[:HAS_MODERATOR]-(forum:Forum)-[:CONTAINER_OF]->
|
||||
(post:Post)<-[:REPLY_OF*0..]-(message:Message)-[:HAS_TAG]->(:Tag)-[:HAS_TYPE]->(:TagClass {name: $tagClass})
|
||||
RETURN
|
||||
forum.id as id,
|
||||
forum.title,
|
||||
person.id,
|
||||
count(DISTINCT message) AS messageCount
|
||||
ORDER BY
|
||||
messageCount DESC,
|
||||
id ASC
|
||||
LIMIT 20
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
|
||||
def benchmark__bi__query_5_analytical(self):
|
||||
return (
|
||||
"""
|
||||
MATCH (tag:Tag {name: $tag})<-[:HAS_TAG]-(message:Message)-[:HAS_CREATOR]->(person:Person)
|
||||
OPTIONAL MATCH (message)<-[likes:LIKES]-(:Person)
|
||||
WITH person, message, count(likes) AS likeCount
|
||||
OPTIONAL MATCH (message)<-[:REPLY_OF]-(reply:Comment)
|
||||
WITH person, message, likeCount, count(reply) AS replyCount
|
||||
WITH person, count(message) AS messageCount, sum(likeCount) AS likeCount, sum(replyCount) AS replyCount
|
||||
RETURN
|
||||
person.id,
|
||||
replyCount,
|
||||
likeCount,
|
||||
messageCount,
|
||||
1*messageCount + 2*replyCount + 10*likeCount AS score
|
||||
ORDER BY
|
||||
score DESC,
|
||||
person.id ASC
|
||||
LIMIT 100
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
|
||||
def benchmark__bi__query_6_analytical(self):
|
||||
return (
|
||||
"""
|
||||
MATCH (tag:Tag {name: $tag})<-[:HAS_TAG]-(message1:Message)-[:HAS_CREATOR]->(person1:Person)
|
||||
OPTIONAL MATCH (message1)<-[:LIKES]-(person2:Person)
|
||||
OPTIONAL MATCH (person2)<-[:HAS_CREATOR]-(message2:Message)<-[like:LIKES]-(person3:Person)
|
||||
RETURN
|
||||
person1.id as id,
|
||||
count(DISTINCT like) AS authorityScore
|
||||
ORDER BY
|
||||
authorityScore DESC,
|
||||
id ASC
|
||||
LIMIT 100
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
|
||||
def benchmark__bi__query_7_analytical(self):
|
||||
|
||||
memgraph = (
|
||||
"""
|
||||
MATCH
|
||||
(tag:Tag {name: $tag})<-[:HAS_TAG]-(message:Message),
|
||||
(message)<-[:REPLY_OF]-(comment:Comment)-[:HAS_TAG]->(relatedTag:Tag)
|
||||
OPTIONAL MATCH (comment)-[:HAS_TAG]->(tag)
|
||||
WHERE tag IS NOT NULL
|
||||
RETURN
|
||||
relatedTag,
|
||||
count(DISTINCT comment) AS count
|
||||
ORDER BY
|
||||
relatedTag.name ASC,
|
||||
count DESC
|
||||
LIMIT 100
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
|
||||
neo4j = (
|
||||
"""
|
||||
MATCH
|
||||
(tag:Tag {name: $tag})<-[:HAS_TAG]-(message:Message),
|
||||
(message)<-[:REPLY_OF]-(comment:Comment)-[:HAS_TAG]->(relatedTag:Tag)
|
||||
WHERE NOT (comment)-[:HAS_TAG]->(tag)
|
||||
RETURN
|
||||
relatedTag.name,
|
||||
count(DISTINCT comment) AS count
|
||||
ORDER BY
|
||||
relatedTag.name ASC,
|
||||
count DESC
|
||||
LIMIT 100
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
if self._vendor == "memgraph":
|
||||
return memgraph
|
||||
else:
|
||||
return neo4j
|
||||
|
||||
def benchmark__bi__query_9_analytical(self):
|
||||
memgraph = (
|
||||
"""
|
||||
MATCH (person:Person)<-[:HAS_CREATOR]-(post:Post)<-[:REPLY_OF*0..]-(reply:Message)
|
||||
WHERE post.creationDate >= localDateTime($startDate)
|
||||
AND post.creationDate <= localDateTime($endDate)
|
||||
AND reply.creationDate >= localDateTime($startDate)
|
||||
AND reply.creationDate <= localDateTime($endDate)
|
||||
RETURN
|
||||
person.id as id,
|
||||
person.firstName,
|
||||
person.lastName,
|
||||
count(DISTINCT post) AS threadCount,
|
||||
count(DISTINCT reply) AS messageCount
|
||||
ORDER BY
|
||||
messageCount DESC,
|
||||
id ASC
|
||||
LIMIT 100
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
neo4j = (
|
||||
"""
|
||||
MATCH (person:Person)<-[:HAS_CREATOR]-(post:Post)<-[:REPLY_OF*0..]-(reply:Message)
|
||||
WHERE post.creationDate >= DateTime($startDate)
|
||||
AND post.creationDate <= DateTime($endDate)
|
||||
AND reply.creationDate >= DateTime($startDate)
|
||||
AND reply.creationDate <= DateTime($endDate)
|
||||
RETURN
|
||||
person.id as id,
|
||||
person.firstName,
|
||||
person.lastName,
|
||||
count(DISTINCT post) AS threadCount,
|
||||
count(DISTINCT reply) AS messageCount
|
||||
ORDER BY
|
||||
messageCount DESC,
|
||||
id ASC
|
||||
LIMIT 100
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
if self._vendor == "memgraph":
|
||||
return memgraph
|
||||
else:
|
||||
return neo4j
|
||||
|
||||
def benchmark__bi__query_11_analytical(self):
|
||||
return (
|
||||
"""
|
||||
MATCH (a:Person)-[:IS_LOCATED_IN]->(:City)-[:IS_PART_OF]->(country:Country {name: $country}),
|
||||
(a)-[k1:KNOWS]-(b:Person)
|
||||
WHERE a.id < b.id
|
||||
AND localDateTime($startDate) <= k1.creationDate AND k1.creationDate <= localDateTime($endDate)
|
||||
WITH DISTINCT country, a, b
|
||||
MATCH (b)-[:IS_LOCATED_IN]->(:City)-[:IS_PART_OF]->(country)
|
||||
WITH DISTINCT country, a, b
|
||||
MATCH (b)-[k2:KNOWS]-(c:Person),
|
||||
(c)-[:IS_LOCATED_IN]->(:City)-[:IS_PART_OF]->(country)
|
||||
WHERE b.id < c.id
|
||||
AND localDateTime($startDate) <= k2.creationDate AND k2.creationDate <= localDateTime($endDate)
|
||||
WITH DISTINCT a, b, c
|
||||
MATCH (c)-[k3:KNOWS]-(a)
|
||||
WHERE localDateTime($startDate) <= k3.creationDate AND k3.creationDate <= localDateTime($endDate)
|
||||
WITH DISTINCT a, b, c
|
||||
RETURN count(*) AS count
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
|
||||
def benchmark__bi__query_12_analytical(self):
|
||||
return (
|
||||
"""
|
||||
MATCH (person:Person)
|
||||
OPTIONAL MATCH (person)<-[:HAS_CREATOR]-(message:Message)-[:REPLY_OF*0..]->(post:Post)
|
||||
WHERE message.content IS NOT NULL
|
||||
AND message.length < $lengthThreshold
|
||||
AND message.creationDate > localDateTime($startDate)
|
||||
AND post.language IN $languages
|
||||
WITH
|
||||
person,
|
||||
count(message) AS messageCount
|
||||
RETURN
|
||||
messageCount,
|
||||
count(person) AS personCount
|
||||
ORDER BY
|
||||
personCount DESC,
|
||||
messageCount DESC
|
||||
""".replace(
|
||||
"\n", ""
|
||||
),
|
||||
self._get_query_parameters(),
|
||||
)
|
||||
|
||||
    def benchmark__bi__query_13_analytical(self):
        memgraph = (
            """
            MATCH (country:Country {name: $country})<-[:IS_PART_OF]-(:City)<-[:IS_LOCATED_IN]-(zombie:Person)
            WHERE zombie.creationDate < localDateTime($endDate)
            WITH country, zombie
            OPTIONAL MATCH (zombie)<-[:HAS_CREATOR]-(message:Message)
            WHERE message.creationDate < localDateTime($endDate)
            WITH
            country,
            zombie,
            count(message) AS messageCount
            WITH
            country,
            zombie,
            12 * (localDateTime($endDate).year - zombie.creationDate.year )
            + (localDateTime($endDate).month - zombie.creationDate.month)
            + 1 AS months,
            messageCount
            WHERE messageCount / months < 1
            WITH
            country,
            collect(zombie) AS zombies
            UNWIND zombies AS zombie
            OPTIONAL MATCH
            (zombie)<-[:HAS_CREATOR]-(message:Message)<-[:LIKES]-(likerZombie:Person)
            WHERE likerZombie IN zombies
            WITH
            zombie,
            count(likerZombie) AS zombieLikeCount
            OPTIONAL MATCH
            (zombie)<-[:HAS_CREATOR]-(message:Message)<-[:LIKES]-(likerPerson:Person)
            WHERE likerPerson.creationDate < localDateTime($endDate)
            WITH
            zombie,
            zombieLikeCount,
            count(likerPerson) AS totalLikeCount
            RETURN
            zombie.id,
            zombieLikeCount,
            totalLikeCount,
            CASE totalLikeCount
            WHEN 0 THEN 0.0
            ELSE zombieLikeCount / toFloat(totalLikeCount)
            END AS zombieScore
            ORDER BY
            zombieScore DESC,
            zombie.id ASC
            LIMIT 100
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

        neo4j = (
            """
            MATCH (country:Country {name: $country})<-[:IS_PART_OF]-(:City)<-[:IS_LOCATED_IN]-(zombie:Person)
            WHERE zombie.creationDate < DateTime($endDate)
            WITH country, zombie
            OPTIONAL MATCH (zombie)<-[:HAS_CREATOR]-(message:Message)
            WHERE message.creationDate < DateTime($endDate)
            WITH
            country,
            zombie,
            count(message) AS messageCount
            WITH
            country,
            zombie,
            12 * (DateTime($endDate).year - zombie.creationDate.year )
            + (DateTime($endDate).month - zombie.creationDate.month)
            + 1 AS months,
            messageCount
            WHERE messageCount / months < 1
            WITH
            country,
            collect(zombie) AS zombies
            UNWIND zombies AS zombie
            OPTIONAL MATCH
            (zombie)<-[:HAS_CREATOR]-(message:Message)<-[:LIKES]-(likerZombie:Person)
            WHERE likerZombie IN zombies
            WITH
            zombie,
            count(likerZombie) AS zombieLikeCount
            OPTIONAL MATCH
            (zombie)<-[:HAS_CREATOR]-(message:Message)<-[:LIKES]-(likerPerson:Person)
            WHERE likerPerson.creationDate < DateTime($endDate)
            WITH
            zombie,
            zombieLikeCount,
            count(likerPerson) AS totalLikeCount
            RETURN
            zombie.id,
            zombieLikeCount,
            totalLikeCount,
            CASE totalLikeCount
            WHEN 0 THEN 0.0
            ELSE zombieLikeCount / toFloat(totalLikeCount)
            END AS zombieScore
            ORDER BY
            zombieScore DESC,
            zombie.id ASC
            LIMIT 100
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

        if self._vendor == "memgraph":
            return memgraph
        else:
            return neo4j
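The month arithmetic in Q13 is the subtle part: a person counts as a "zombie" when they average fewer than one message per month between their creation date and $endDate. A minimal Python sketch of the same calculation, using made-up dates and counts (not values from the benchmark itself):

    from datetime import datetime

    # Hypothetical values; in the query they come from $endDate and zombie.creationDate.
    end_date = datetime(2013, 1, 1)
    creation_date = datetime(2012, 10, 15)

    # Same formula as the Cypher above: whole months between creation and the end date, inclusive.
    months = 12 * (end_date.year - creation_date.year) + (end_date.month - creation_date.month) + 1  # -> 4

    message_count = 2
    is_zombie = message_count / months < 1  # 2 / 4 = 0.5 -> True, fewer than one message per month

    # zombieScore guards against division by zero exactly like the CASE expression.
    zombie_like_count, total_like_count = 3, 10
    zombie_score = 0.0 if total_like_count == 0 else zombie_like_count / float(total_like_count)  # 0.3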

    def benchmark__bi__query_14_analytical(self):
        return (
            """
            MATCH
            (country1:Country {name: $country1})<-[:IS_PART_OF]-(city1:City)<-[:IS_LOCATED_IN]-(person1:Person),
            (country2:Country {name: $country2})<-[:IS_PART_OF]-(city2:City)<-[:IS_LOCATED_IN]-(person2:Person),
            (person1)-[:KNOWS]-(person2)
            WITH person1, person2, city1, 0 AS score
            OPTIONAL MATCH (person1)<-[:HAS_CREATOR]-(c:Comment)-[:REPLY_OF]->(:Message)-[:HAS_CREATOR]->(person2)
            WITH DISTINCT person1, person2, city1, score + (CASE c WHEN null THEN 0 ELSE 4 END) AS score
            OPTIONAL MATCH (person1)<-[:HAS_CREATOR]-(m:Message)<-[:REPLY_OF]-(:Comment)-[:HAS_CREATOR]->(person2)
            WITH DISTINCT person1, person2, city1, score + (CASE m WHEN null THEN 0 ELSE 1 END) AS score
            OPTIONAL MATCH (person1)-[:LIKES]->(m:Message)-[:HAS_CREATOR]->(person2)
            WITH DISTINCT person1, person2, city1, score + (CASE m WHEN null THEN 0 ELSE 10 END) AS score
            OPTIONAL MATCH (person1)<-[:HAS_CREATOR]-(m:Message)<-[:LIKES]-(person2)
            WITH DISTINCT person1, person2, city1, score + (CASE m WHEN null THEN 0 ELSE 1 END) AS score
            ORDER BY
            city1.name ASC,
            score DESC,
            person1.id ASC,
            person2.id ASC
            WITH city1, collect({score: score, person1Id: person1.id, person2Id: person2.id})[0] AS top
            RETURN
            top.person1Id,
            top.person2Id,
            city1.name,
            top.score
            ORDER BY
            top.score DESC,
            top.person1Id ASC,
            top.person2Id ASC
            LIMIT 100
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

    def benchmark__bi__query_17_analytical(self):

        memgraph = (
            """
            MATCH
            (tag:Tag {name: $tag}),
            (person1:Person)<-[:HAS_CREATOR]-(message1:Message)-[:REPLY_OF*0..]->(post1:Post)<-[:CONTAINER_OF]-(forum1:Forum),
            (message1)-[:HAS_TAG]->(tag),
            (forum1)<-[:HAS_MEMBER]->(person2:Person)<-[:HAS_CREATOR]-(comment:Comment)-[:HAS_TAG]->(tag),
            (forum1)<-[:HAS_MEMBER]->(person3:Person)<-[:HAS_CREATOR]-(message2:Message),
            (comment)-[:REPLY_OF]->(message2)-[:REPLY_OF*0..]->(post2:Post)<-[:CONTAINER_OF]-(forum2:Forum)
            MATCH (comment)-[:HAS_TAG]->(tag)
            MATCH (message2)-[:HAS_TAG]->(tag)
            OPTIONAL MATCH (forum2)-[:HAS_MEMBER]->(person1)
            WHERE forum1 <> forum2 AND message2.creationDate > message1.creationDate + duration({hours: $delta}) AND person1 IS NULL
            RETURN person1, count(DISTINCT message2) AS messageCount
            ORDER BY messageCount DESC, person1.id ASC
            LIMIT 10
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

        neo4j = (
            """
            MATCH
            (tag:Tag {name: $tag}),
            (person1:Person)<-[:HAS_CREATOR]-(message1:Message)-[:REPLY_OF*0..]->(post1:Post)<-[:CONTAINER_OF]-(forum1:Forum),
            (message1)-[:HAS_TAG]->(tag),
            (forum1)<-[:HAS_MEMBER]->(person2:Person)<-[:HAS_CREATOR]-(comment:Comment)-[:HAS_TAG]->(tag),
            (forum1)<-[:HAS_MEMBER]->(person3:Person)<-[:HAS_CREATOR]-(message2:Message),
            (comment)-[:REPLY_OF]->(message2)-[:REPLY_OF*0..]->(post2:Post)<-[:CONTAINER_OF]-(forum2:Forum)
            MATCH (comment)-[:HAS_TAG]->(tag)
            MATCH (message2)-[:HAS_TAG]->(tag)
            WHERE forum1 <> forum2
            AND message2.creationDate > message1.creationDate + duration({hours: $delta})
            AND NOT (forum2)-[:HAS_MEMBER]->(person1)
            RETURN person1, count(DISTINCT message2) AS messageCount
            ORDER BY messageCount DESC, person1.id ASC
            LIMIT 10
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )
        if self._vendor == "memgraph":
            return memgraph
        else:
            return neo4j

    def benchmark__bi__query_18_analytical(self):

        memgraph = (
            """
            MATCH (tag:Tag {name: $tag})<-[:HAS_INTEREST]-(person1:Person)-[:KNOWS]-(mutualFriend:Person)-[:KNOWS]-(person2:Person)-[:HAS_INTEREST]->(tag)
            OPTIONAL MATCH (person1)-[:KNOWS]-(person2)
            WHERE person1 <> person2
            RETURN person1.id AS person1Id, person2.id AS person2Id, count(DISTINCT mutualFriend) AS mutualFriendCount
            ORDER BY mutualFriendCount DESC, person1Id ASC, person2Id ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

        neo4j = (
            """
            MATCH (tag:Tag {name: $tag})<-[:HAS_INTEREST]-(person1:Person)-[:KNOWS]-(mutualFriend:Person)-[:KNOWS]-(person2:Person)-[:HAS_INTEREST]->(tag)
            WHERE person1 <> person2
            AND NOT (person1)-[:KNOWS]-(person2)
            RETURN person1.id AS person1Id, person2.id AS person2Id, count(DISTINCT mutualFriend) AS mutualFriendCount
            ORDER BY mutualFriendCount DESC, person1Id ASC, person2Id ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )
        if self._vendor == "memgraph":
            return memgraph
        else:
            return neo4j
684
tests/mgbench/workloads/ldbc_interactive.py
Normal file
@ -0,0 +1,684 @@
import inspect
import random
from datetime import datetime
from pathlib import Path

import helpers
from benchmark_context import BenchmarkContext
from workloads.base import Workload
from workloads.importers.importer_ldbc_interactive import *


class LDBC_Interactive(Workload):

    NAME = "ldbc_interactive"
    VARIANTS = ["sf0.1", "sf1", "sf3", "sf10"]
    DEFAULT_VARIANT = "sf1"

    URL_FILE = {
        "sf0.1": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/interactive/ldbc_interactive_sf0.1.cypher.gz",
        "sf1": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/interactive/ldbc_interactive_sf1.cypher.gz",
        "sf3": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/interactive/ldbc_interactive_sf3.cypher.gz",
        "sf10": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/interactive/ldbc_interactive_sf10.cypher.gz",
    }
    URL_CSV = {
        "sf0.1": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf0.1.tar.zst",
        "sf1": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf1.tar.zst",
        "sf3": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf3.tar.zst",
        "sf10": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf10.tar.zst",
    }

    SIZES = {
        "sf0.1": {"vertices": 327588, "edges": 1477965},
        "sf1": {"vertices": 3181724, "edges": 17256038},
        "sf3": {"vertices": 1, "edges": 1},
        "sf10": {"vertices": 1, "edges": 1},
    }

    URL_INDEX_FILE = {
        "memgraph": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/interactive/memgraph_interactive_index.cypher",
        "neo4j": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/interactive/neo4j_interactive_index.cypher",
    }

    PROPERTIES_ON_EDGES = True

    QUERY_PARAMETERS = {
        "sf0.1": "https://repository.surfsara.nl/datasets/cwi/snb/files/substitution_parameters/substitution_parameters-sf0.1.tar.zst",
        "sf1": "https://repository.surfsara.nl/datasets/cwi/snb/files/substitution_parameters/substitution_parameters-sf0.1.tar.zst",
        "sf3": "https://repository.surfsara.nl/datasets/cwi/snb/files/substitution_parameters/substitution_parameters-sf0.1.tar.zst",
    }

    def custom_import(self) -> bool:
        importer = ImporterLDBCInteractive(
            benchmark_context=self.benchmark_context,
            dataset_name=self.NAME,
            variant=self._variant,
            index_file=self._file_index,
            csv_dict=self.URL_CSV,
        )
        return importer.execute_import()

    def _prepare_parameters_directory(self):
        parameters = Path() / ".cache" / "datasets" / self.NAME / self._variant / "parameters"
        parameters.mkdir(parents=True, exist_ok=True)
        dir_name = self.QUERY_PARAMETERS[self._variant].split("/")[-1:][0].removesuffix(".tar.zst")
        if (parameters / dir_name).exists():
            print("Files downloaded:")
            parameters = parameters / dir_name
        else:
            print("Downloading files")
            downloaded_file = helpers.download_file(self.QUERY_PARAMETERS[self._variant], parameters.absolute())
            print("Unpacking the file..." + downloaded_file)
            parameters = helpers.unpack_tar_zst(Path(downloaded_file))
        return parameters

    def _get_query_parameters(self) -> dict:
        func_name = inspect.stack()[1].function
        parameters = {}
        for file in self._parameters_dir.glob("interactive_*.txt"):
            if file.name.split("_")[1] == func_name.split("_")[-2]:
                with file.open("r") as input:
                    lines = input.readlines()
                    position = random.randint(1, len(lines) - 1)
                    header = lines[0].strip("\n").split("|")
                    data = lines[position].strip("\n").split("|")
                    for i in range(len(header)):
                        if "Date" in header[i]:
                            time = int(data[i]) / 1000
                            converted = datetime.utcfromtimestamp(time).strftime("%Y-%m-%dT%H:%M:%S")
                            parameters[header[i]] = converted
                        elif data[i].isdigit():
                            parameters[header[i]] = int(data[i])
                        else:
                            parameters[header[i]] = data[i]

        return parameters
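The substitution parameter files read above are pipe-delimited with a header row; a randomly chosen data row is turned into the query's parameter dict. A minimal sketch of that mapping, using made-up file contents (the name `interactive_2_param.txt` and the values are only illustrative):

    from datetime import datetime

    # Hypothetical row from e.g. interactive_2_param.txt in the substitution parameters archive.
    header = "personId|maxDate".split("|")
    data = "10995116278009|1354060800000".split("|")

    parameters = {}
    for name, value in zip(header, data):
        if "Date" in name:
            # Epoch milliseconds -> ISO-like string, as in _get_query_parameters above.
            parameters[name] = datetime.utcfromtimestamp(int(value) / 1000).strftime("%Y-%m-%dT%H:%M:%S")
        elif value.isdigit():
            parameters[name] = int(value)
        else:
            parameters[name] = value

    # parameters == {"personId": 10995116278009, "maxDate": "2012-11-28T00:00:00"}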

    def __init__(self, variant: str = None, benchmark_context: BenchmarkContext = None):
        super().__init__(variant, benchmark_context=benchmark_context)
        self._parameters_dir = self._prepare_parameters_directory()
        self.benchmark_context = benchmark_context

    def benchmark__interactive__complex_query_1_analytical(self):
        memgraph = (
            """
            MATCH (p:Person {id: $personId}), (friend:Person {firstName: $firstName})
            WHERE NOT p=friend
            WITH p, friend
            MATCH path =((p)-[:KNOWS *BFS 1..3]-(friend))
            WITH min(size(path)) AS distance, friend
            ORDER BY
            distance ASC,
            friend.lastName ASC,
            toInteger(friend.id) ASC
            LIMIT 20

            MATCH (friend)-[:IS_LOCATED_IN]->(friendCity:City)
            OPTIONAL MATCH (friend)-[studyAt:STUDY_AT]->(uni:University)-[:IS_LOCATED_IN]->(uniCity:City)
            WITH friend, collect(
            CASE uni.name
            WHEN null THEN null
            ELSE [uni.name, studyAt.classYear, uniCity.name]
            END ) AS unis, friendCity, distance

            OPTIONAL MATCH (friend)-[workAt:WORK_AT]->(company:Company)-[:IS_LOCATED_IN]->(companyCountry:Country)
            WITH friend, collect(
            CASE company.name
            WHEN null THEN null
            ELSE [company.name, workAt.workFrom, companyCountry.name]
            END ) AS companies, unis, friendCity, distance

            RETURN
            friend.id AS friendId,
            friend.lastName AS friendLastName,
            distance AS distanceFromPerson,
            friend.birthday AS friendBirthday,
            friend.gender AS friendGender,
            friend.browserUsed AS friendBrowserUsed,
            friend.locationIP AS friendLocationIp,
            friend.email AS friendEmails,
            friend.speaks AS friendLanguages,
            friendCity.name AS friendCityName,
            unis AS friendUniversities,
            companies AS friendCompanies
            ORDER BY
            distanceFromPerson ASC,
            friendLastName ASC,
            toInteger(friendId) ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )
        neo4j = (
            """
            MATCH (p:Person {id: $personId}), (friend:Person {firstName: $firstName})
            WHERE NOT p=friend
            WITH p, friend
            MATCH path = shortestPath((p)-[:KNOWS*1..3]-(friend))
            WITH min(length(path)) AS distance, friend
            ORDER BY
            distance ASC,
            friend.lastName ASC,
            toInteger(friend.id) ASC
            LIMIT 20

            MATCH (friend)-[:IS_LOCATED_IN]->(friendCity:City)
            OPTIONAL MATCH (friend)-[studyAt:STUDY_AT]->(uni:University)-[:IS_LOCATED_IN]->(uniCity:City)
            WITH friend, collect(
            CASE uni.name
            WHEN null THEN null
            ELSE [uni.name, studyAt.classYear, uniCity.name]
            END ) AS unis, friendCity, distance

            OPTIONAL MATCH (friend)-[workAt:WORK_AT]->(company:Company)-[:IS_LOCATED_IN]->(companyCountry:Country)
            WITH friend, collect(
            CASE company.name
            WHEN null THEN null
            ELSE [company.name, workAt.workFrom, companyCountry.name]
            END ) AS companies, unis, friendCity, distance

            RETURN
            friend.id AS friendId,
            friend.lastName AS friendLastName,
            distance AS distanceFromPerson,
            friend.birthday AS friendBirthday,
            friend.gender AS friendGender,
            friend.browserUsed AS friendBrowserUsed,
            friend.locationIP AS friendLocationIp,
            friend.email AS friendEmails,
            friend.speaks AS friendLanguages,
            friendCity.name AS friendCityName,
            unis AS friendUniversities,
            companies AS friendCompanies
            ORDER BY
            distanceFromPerson ASC,
            friendLastName ASC,
            toInteger(friendId) ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )
        if self._vendor == "memgraph":
            return memgraph
        else:
            return neo4j

    def benchmark__interactive__complex_query_2_analytical(self):
        return (
            """
            MATCH (:Person {id: $personId })-[:KNOWS]-(friend:Person)<-[:HAS_CREATOR]-(message:Message)
            WHERE message.creationDate <= localDateTime($maxDate)
            RETURN
            friend.id AS personId,
            friend.firstName AS personFirstName,
            friend.lastName AS personLastName,
            message.id AS postOrCommentId,
            coalesce(message.content,message.imageFile) AS postOrCommentContent,
            message.creationDate AS postOrCommentCreationDate
            ORDER BY
            postOrCommentCreationDate DESC,
            toInteger(postOrCommentId) ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

    def benchmark__interactive__complex_query_3_analytical(self):

        memgraph = (
            """
            MATCH (countryX:Country {name: $countryXName }),
            (countryY:Country {name: $countryYName }),
            (person:Person {id: $personId })
            WITH person, countryX, countryY
            LIMIT 1
            MATCH (city:City)-[:IS_PART_OF]->(country:Country)
            WHERE country IN [countryX, countryY]
            WITH person, countryX, countryY, collect(city) AS cities
            MATCH (person)-[:KNOWS*1..2]-(friend)-[:IS_LOCATED_IN]->(city)
            WHERE NOT person=friend AND NOT city IN cities
            WITH DISTINCT friend, countryX, countryY
            MATCH (friend)<-[:HAS_CREATOR]-(message),
            (message)-[:IS_LOCATED_IN]->(country)
            WHERE localDateTime($startDate) + duration({day:$durationDays}) > message.creationDate >= localDateTime($startDate) AND
            country IN [countryX, countryY]
            WITH friend,
            CASE WHEN country=countryX THEN 1 ELSE 0 END AS messageX,
            CASE WHEN country=countryY THEN 1 ELSE 0 END AS messageY
            WITH friend, sum(messageX) AS xCount, sum(messageY) AS yCount
            WHERE xCount>0 AND yCount>0
            RETURN friend.id AS friendId,
            friend.firstName AS friendFirstName,
            friend.lastName AS friendLastName,
            xCount,
            yCount,
            xCount + yCount AS xyCount
            ORDER BY xyCount DESC, friendId ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )
        neo4j = (
            """
            MATCH (countryX:Country {name: $countryXName }),
            (countryY:Country {name: $countryYName }),
            (person:Person {id: $personId })
            WITH person, countryX, countryY
            LIMIT 1
            MATCH (city:City)-[:IS_PART_OF]->(country:Country)
            WHERE country IN [countryX, countryY]
            WITH person, countryX, countryY, collect(city) AS cities
            MATCH (person)-[:KNOWS*1..2]-(friend)-[:IS_LOCATED_IN]->(city)
            WHERE NOT person=friend AND NOT city IN cities
            WITH DISTINCT friend, countryX, countryY
            MATCH (friend)<-[:HAS_CREATOR]-(message),
            (message)-[:IS_LOCATED_IN]->(country)
            WHERE localDateTime($startDate) + duration({days:$durationDays}) > message.creationDate >= localDateTime($startDate) AND
            country IN [countryX, countryY]
            WITH friend,
            CASE WHEN country=countryX THEN 1 ELSE 0 END AS messageX,
            CASE WHEN country=countryY THEN 1 ELSE 0 END AS messageY
            WITH friend, sum(messageX) AS xCount, sum(messageY) AS yCount
            WHERE xCount>0 AND yCount>0
            RETURN friend.id AS friendId,
            friend.firstName AS friendFirstName,
            friend.lastName AS friendLastName,
            xCount,
            yCount,
            xCount + yCount AS xyCount
            ORDER BY xyCount DESC, friendId ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

        if self._vendor == "memgraph":
            return memgraph
        else:
            return neo4j

    def benchmark__interactive__complex_query_4_analytical(self):
        memgraph = (
            """
            MATCH (person:Person {id: $personId })-[:KNOWS]-(friend:Person),
            (friend)<-[:HAS_CREATOR]-(post:Post)-[:HAS_TAG]->(tag)
            WITH DISTINCT tag, post
            WITH tag,
            CASE
            WHEN localDateTime($startDate) + duration({day:$durationDays}) > post.creationDate >= localDateTime($startDate) THEN 1
            ELSE 0
            END AS valid,
            CASE
            WHEN localDateTime($startDate) > post.creationDate THEN 1
            ELSE 0
            END AS inValid
            WITH tag, sum(valid) AS postCount, sum(inValid) AS inValidPostCount
            WHERE postCount>0 AND inValidPostCount=0
            RETURN tag.name AS tagName, postCount
            ORDER BY postCount DESC, tagName ASC
            LIMIT 10

            """,
            self._get_query_parameters(),
        )

        neo4j = (
            """
            MATCH (person:Person {id: $personId })-[:KNOWS]-(friend:Person),
            (friend)<-[:HAS_CREATOR]-(post:Post)-[:HAS_TAG]->(tag)
            WITH DISTINCT tag, post
            WITH tag,
            CASE
            WHEN localDateTime($startDate) + duration({days:$durationDays}) > post.creationDate >= localDateTime($startDate) THEN 1
            ELSE 0
            END AS valid,
            CASE
            WHEN localDateTime($startDate) > post.creationDate THEN 1
            ELSE 0
            END AS inValid
            WITH tag, sum(valid) AS postCount, sum(inValid) AS inValidPostCount
            WHERE postCount>0 AND inValidPostCount=0
            RETURN tag.name AS tagName, postCount
            ORDER BY postCount DESC, tagName ASC
            LIMIT 10

            """,
            self._get_query_parameters(),
        )

        if self._vendor == "memgraph":
            return memgraph
        else:
            return neo4j

    def benchmark__interactive__complex_query_5_analytical(self):
        return (
            """
            MATCH (person:Person { id: $personId })-[:KNOWS*1..2]-(friend)
            WHERE
            NOT person=friend
            WITH DISTINCT friend
            MATCH (friend)<-[membership:HAS_MEMBER]-(forum)
            WHERE
            membership.joinDate > localDateTime($minDate)
            WITH
            forum,
            collect(friend) AS friends
            OPTIONAL MATCH (friend)<-[:HAS_CREATOR]-(post)<-[:CONTAINER_OF]-(forum)
            WHERE
            friend IN friends
            WITH
            forum,
            count(post) AS postCount
            RETURN
            forum.title AS forumName,
            postCount
            ORDER BY
            postCount DESC,
            forum.id ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

    def benchmark__interactive__complex_query_6_analytical(self):
        return (
            """
            MATCH (knownTag:Tag { name: $tagName })
            WITH knownTag.id as knownTagId

            MATCH (person:Person { id: $personId })-[:KNOWS*1..2]-(friend)
            WHERE NOT person=friend
            WITH
            knownTagId,
            collect(distinct friend) as friends
            UNWIND friends as f
            MATCH (f)<-[:HAS_CREATOR]-(post:Post),
            (post)-[:HAS_TAG]->(t:Tag{id: knownTagId}),
            (post)-[:HAS_TAG]->(tag:Tag)
            WHERE NOT t = tag
            WITH
            tag.name as tagName,
            count(post) as postCount
            RETURN
            tagName,
            postCount
            ORDER BY
            postCount DESC,
            tagName ASC
            LIMIT 10
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

    def benchmark__interactive__complex_query_7_analytical(self):
        memgraph = (
            """
            MATCH (person:Person {id: $personId})<-[:HAS_CREATOR]-(message:Message)<-[like:LIKES]-(liker:Person)
            WITH liker, message, like.creationDate AS likeTime, person
            ORDER BY likeTime DESC, toInteger(message.id) ASC
            WITH liker, head(collect({msg: message, likeTime: likeTime})) AS latestLike, person
            OPTIONAL MATCH (liker)-[:KNOWS]-(person)
            WITH liker, latestLike, person,
            CASE WHEN person IS null THEN TRUE ELSE FALSE END AS isNew
            RETURN
            liker.id AS personId,
            liker.firstName AS personFirstName,
            liker.lastName AS personLastName,
            latestLike.likeTime AS likeCreationDate,
            latestLike.msg.id AS commentOrPostId,
            coalesce(latestLike.msg.content, latestLike.msg.imageFile) AS commentOrPostContent,
            (latestLike.likeTime - latestLike.msg.creationDate).minute AS minutesLatency
            ORDER BY
            likeCreationDate DESC,
            toInteger(personId) ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )
        neo4j = (
            """
            MATCH (person:Person {id: $personId})<-[:HAS_CREATOR]-(message:Message)<-[like:LIKES]-(liker:Person)
            WITH liker, message, like.creationDate AS likeTime, person
            ORDER BY likeTime DESC, toInteger(message.id) ASC
            WITH liker, head(collect({msg: message, likeTime: likeTime})) AS latestLike, person
            RETURN
            liker.id AS personId,
            liker.firstName AS personFirstName,
            liker.lastName AS personLastName,
            latestLike.likeTime AS likeCreationDate,
            latestLike.msg.id AS commentOrPostId,
            coalesce(latestLike.msg.content, latestLike.msg.imageFile) AS commentOrPostContent,
            duration.between(latestLike.likeTime, latestLike.msg.creationDate).minutes AS minutesLatency,
            not((liker)-[:KNOWS]-(person)) AS isNew
            ORDER BY
            likeCreationDate DESC,
            toInteger(personId) ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )
        if self._vendor == "memgraph":
            return memgraph
        else:
            return neo4j

    def benchmark__interactive__complex_query_8_analytical(self):
        return (
            """
            MATCH (start:Person {id: $personId})<-[:HAS_CREATOR]-(:Message)<-[:REPLY_OF]-(comment:Comment)-[:HAS_CREATOR]->(person:Person)
            RETURN
            person.id AS personId,
            person.firstName AS personFirstName,
            person.lastName AS personLastName,
            comment.creationDate AS commentCreationDate,
            comment.id AS commentId,
            comment.content AS commentContent
            ORDER BY
            commentCreationDate DESC,
            commentId ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

    def benchmark__interactive__complex_query_9_analytical(self):
        return (
            """
            MATCH (root:Person {id: $personId })-[:KNOWS*1..2]-(friend:Person)
            WHERE NOT friend = root
            WITH collect(distinct friend) as friends
            UNWIND friends as friend
            MATCH (friend)<-[:HAS_CREATOR]-(message:Message)
            WHERE message.creationDate < localDateTime($maxDate)
            RETURN
            friend.id AS personId,
            friend.firstName AS personFirstName,
            friend.lastName AS personLastName,
            message.id AS commentOrPostId,
            coalesce(message.content,message.imageFile) AS commentOrPostContent,
            message.creationDate AS commentOrPostCreationDate
            ORDER BY
            commentOrPostCreationDate DESC,
            message.id ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

    def benchmark__interactive__complex_query_10_analytical(self):
        memgraph = (
            """
            MATCH (person:Person {id: $personId})-[:KNOWS*2..2]-(friend),
            (friend)-[:IS_LOCATED_IN]->(city:City)
            WHERE NOT friend=person AND
            NOT (friend)-[:KNOWS]-(person)
            WITH person, city, friend, datetime({epochMillis: friend.birthday}) as birthday
            WHERE (birthday.month=$month AND birthday.day>=21) OR
            (birthday.month=($month%12)+1 AND birthday.day<22)
            WITH DISTINCT friend, city, person
            OPTIONAL MATCH (friend)<-[:HAS_CREATOR]-(post:Post)
            WITH friend, city, collect(post) AS posts, person
            WITH friend,
            city,
            size(posts) AS postCount,
            size([p IN posts WHERE (p)-[:HAS_TAG]->()<-[:HAS_INTEREST]-(person)]) AS commonPostCount
            RETURN friend.id AS personId,
            friend.firstName AS personFirstName,
            friend.lastName AS personLastName,
            commonPostCount - (postCount - commonPostCount) AS commonInterestScore,
            friend.gender AS personGender,
            city.name AS personCityName
            ORDER BY commonInterestScore DESC, personId ASC
            LIMIT 10
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

        neo4j = (
            """
            MATCH (person:Person {id: $personId})-[:KNOWS*2..2]-(friend),
            (friend)-[:IS_LOCATED_IN]->(city:City)
            WHERE NOT friend=person AND
            NOT (friend)-[:KNOWS]-(person)
            WITH person, city, friend, datetime({epochMillis: friend.birthday}) as birthday
            WHERE (birthday.month=$month AND birthday.day>=21) OR
            (birthday.month=($month%12)+1 AND birthday.day<22)
            WITH DISTINCT friend, city, person
            OPTIONAL MATCH (friend)<-[:HAS_CREATOR]-(post:Post)
            WITH friend, city, collect(post) AS posts, person
            WITH friend,
            city,
            size(posts) AS postCount,
            size([p IN posts WHERE (p)-[:HAS_TAG]->()<-[:HAS_INTEREST]-(person)]) AS commonPostCount
            RETURN friend.id AS personId,
            friend.firstName AS personFirstName,
            friend.lastName AS personLastName,
            commonPostCount - (postCount - commonPostCount) AS commonInterestScore,
            friend.gender AS personGender,
            city.name AS personCityName
            ORDER BY commonInterestScore DESC, personId ASC
            LIMIT 10
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

        if self._vendor == "memgraph":
            return memgraph
        else:
            return neo4j

    def benchmark__interactive__complex_query_11_analytical(self):
        return (
            """
            MATCH (person:Person {id: $personId })-[:KNOWS*1..2]-(friend:Person)
            WHERE not(person=friend)
            WITH DISTINCT friend
            MATCH (friend)-[workAt:WORK_AT]->(company:Company)-[:IS_LOCATED_IN]->(:Country {name: $countryName })
            WHERE workAt.workFrom < $workFromYear
            RETURN
            friend.id AS personId,
            friend.firstName AS personFirstName,
            friend.lastName AS personLastName,
            company.name AS organizationName,
            workAt.workFrom AS organizationWorkFromYear
            ORDER BY
            organizationWorkFromYear ASC,
            toInteger(personId) ASC,
            organizationName DESC
            LIMIT 10
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

    def benchmark__interactive__complex_query_12_analytical(self):
        return (
            """
            MATCH (tag:Tag)-[:HAS_TYPE|IS_SUBCLASS_OF*0..]->(baseTagClass:TagClass)
            WHERE tag.name = $tagClassName OR baseTagClass.name = $tagClassName
            WITH collect(tag.id) as tags
            MATCH (:Person {id: $personId })-[:KNOWS]-(friend:Person)<-[:HAS_CREATOR]-(comment:Comment)-[:REPLY_OF]->(:Post)-[:HAS_TAG]->(tag:Tag)
            WHERE tag.id in tags
            RETURN
            friend.id AS personId,
            friend.firstName AS personFirstName,
            friend.lastName AS personLastName,
            collect(DISTINCT tag.name) AS tagNames,
            count(DISTINCT comment) AS replyCount
            ORDER BY
            replyCount DESC,
            toInteger(personId) ASC
            LIMIT 20
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

    def benchmark__interactive__complex_query_13_analytical(self):
        memgraph = (
            """
            MATCH
            (person1:Person {id: $person1Id}),
            (person2:Person {id: $person2Id}),
            path = (person1)-[:KNOWS *BFS]-(person2)
            RETURN
            CASE path IS NULL
            WHEN true THEN -1
            ELSE size(path)
            END AS shortestPathLength
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

        neo4j = (
            """
            MATCH
            (person1:Person {id: $person1Id}),
            (person2:Person {id: $person2Id}),
            path = shortestPath((person1)-[:KNOWS*]-(person2))
            RETURN
            CASE path IS NULL
            WHEN true THEN -1
            ELSE length(path)
            END AS shortestPathLength
            """.replace(
                "\n", ""
            ),
            self._get_query_parameters(),
        )

        if self._vendor == "memgraph":
            return memgraph
        else:
            return neo4j
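All of the interactive generators above share the same contract: each call samples fresh parameters from the substitution files and returns a (query, parameters) tuple, with the vendor-specific text (e.g. Memgraph's `*BFS` vs Neo4j's `shortestPath`) selected via `self._vendor`. A rough consumption sketch, assuming a `BenchmarkContext` built elsewhere; this is illustrative only, not the actual runner code:

    # Illustrative: the real runner discovers these methods by name and feeds
    # the generated queries to the client binary.
    workload = LDBC_Interactive(variant="sf0.1", benchmark_context=context)  # `context` built from CLI flags
    query, params = workload.benchmark__interactive__complex_query_2_analytical()
    print(query)   # vendor-specific Cypher text, newlines already stripped
    print(params)  # e.g. {"personId": ..., "maxDate": "..."} sampled from the parameter files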
@ -1,134 +1,17 @@
# Copyright 2022 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.

import random

import helpers
from benchmark_context import BenchmarkContext
from workloads.base import Workload
from workloads.importers.importer_pokec import ImporterPokec


# Base dataset class used as a template to create each individual dataset. All
# common logic is handled here.
class Dataset:
    # Name of the dataset.
    NAME = "Base dataset"
    # List of all variants of the dataset that exist.
    VARIANTS = ["default"]
    # One of the available variants that should be used as the default variant.
    DEFAULT_VARIANT = "default"
    # List of query files that should be used to import the dataset.
    FILES = {
        "default": "/foo/bar",
    }
    INDEX = None
    INDEX_FILES = {"default": ""}
    # List of query file URLs that should be used to import the dataset.
    URLS = None
    # Number of vertices/edges for each variant.
    SIZES = {
        "default": {"vertices": 0, "edges": 0},
    }
    # Indicates whether the dataset has properties on edges.
    PROPERTIES_ON_EDGES = False

    def __init__(self, variant=None, vendor=None):
        """
        Accepts a `variant` variable that indicates which variant
        of the dataset should be executed.
        """
        if variant is None:
            variant = self.DEFAULT_VARIANT
        if variant not in self.VARIANTS:
            raise ValueError("Invalid test variant!")
        if (self.FILES and variant not in self.FILES) and (self.URLS and variant not in self.URLS):
            raise ValueError("The variant doesn't have a defined URL or " "file path!")
        if variant not in self.SIZES:
            raise ValueError("The variant doesn't have a defined dataset " "size!")
        if vendor not in self.INDEX_FILES:
            raise ValueError("Vendor does not have INDEX for dataset!")
        self._variant = variant
        self._vendor = vendor
        if self.FILES is not None:
            self._file = self.FILES.get(variant, None)
        else:
            self._file = None
        if self.URLS is not None:
            self._url = self.URLS.get(variant, None)
        else:
            self._url = None

        if self.INDEX_FILES is not None:
            self._index = self.INDEX_FILES.get(vendor, None)
        else:
            self._index = None

        self._size = self.SIZES[variant]
        if "vertices" not in self._size or "edges" not in self._size:
            raise ValueError("The size defined for this variant doesn't " "have the number of vertices and/or edges!")
        self._num_vertices = self._size["vertices"]
        self._num_edges = self._size["edges"]

    def prepare(self, directory):
        if self._file is not None:
            print("Using dataset file:", self._file)
        else:
            # TODO: add support for JSON datasets
            cached_input, exists = directory.get_file("dataset.cypher")
            if not exists:
                print("Downloading dataset file:", self._url)
                downloaded_file = helpers.download_file(self._url, directory.get_path())
                print("Unpacking and caching file:", downloaded_file)
                helpers.unpack_and_move_file(downloaded_file, cached_input)
            print("Using cached dataset file:", cached_input)
            self._file = cached_input

        cached_index, exists = directory.get_file(self._vendor + ".cypher")
        if not exists:
            print("Downloading index file:", self._index)
            downloaded_file = helpers.download_file(self._index, directory.get_path())
            print("Unpacking and caching file:", downloaded_file)
            helpers.unpack_and_move_file(downloaded_file, cached_index)
        print("Using cached index file:", cached_index)
        self._index = cached_index

    def get_variant(self):
        """Returns the current variant of the dataset."""
        return self._variant

    def get_index(self):
        """Get index file, defined by vendor"""
        return self._index

    def get_file(self):
        """
        Returns path to the file that contains dataset creation queries.
        """
        return self._file

    def get_size(self):
        """Returns number of vertices/edges for the current variant."""
        return self._size

    # All tests should be query generator functions that output all of the
    # queries that should be executed by the runner. The functions should be
    # named `benchmark__GROUPNAME__TESTNAME` and should not accept any
    # arguments.
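To make the `benchmark__GROUPNAME__TESTNAME` convention concrete, here is a minimal, hypothetical generator; the workload name, class attributes, query text and parameter value are placeholders for illustration and are not part of this commit:

    from workloads.base import Workload


    class ExampleWorkload(Workload):
        """Hypothetical workload used only to illustrate the naming convention."""

        NAME = "example"
        VARIANTS = ["default"]
        DEFAULT_VARIANT = "default"
        SIZES = {"default": {"vertices": 0, "edges": 0}}

        # Discovered by the runner as group "match", test "vertex_by_id";
        # it takes no arguments and returns a (query, parameters) tuple.
        def benchmark__match__vertex_by_id(self):
            return ("MATCH (n:User {id: $id}) RETURN n", {"id": 42})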


class Pokec(Dataset):
class Pokec(Workload):
    NAME = "pokec"
    VARIANTS = ["small", "medium", "large"]
    DEFAULT_VARIANT = "small"
    FILES = None
    FILE = None

    URLS = {
    URL_FILE = {
        "small": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/benchmark/pokec_small_import.cypher",
        "medium": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/benchmark/pokec_medium_import.cypher",
        "large": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/benchmark/pokec_large.setup.cypher.gz",
@ -138,16 +21,28 @@ class Pokec(Dataset):
        "medium": {"vertices": 100000, "edges": 1768515},
        "large": {"vertices": 1632803, "edges": 30622564},
    }
    INDEX = None
    INDEX_FILES = {

    URL_INDEX_FILE = {
        "memgraph": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/benchmark/memgraph.cypher",
        "neo4j": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/benchmark/neo4j.cypher",
    }

    PROPERTIES_ON_EDGES = False

    # Helpers used to generate the queries
    def __init__(self, variant: str = None, benchmark_context: BenchmarkContext = None):
        super().__init__(variant, benchmark_context=benchmark_context)

    def custom_import(self) -> bool:
        importer = ImporterPokec(
            benchmark_context=self.benchmark_context,
            dataset_name=self.NAME,
            index_file=self._file_index,
            dataset_file=self._file,
            variant=self._variant,
        )
        return importer.execute_import()
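A rough usage sketch of the refactored workload API; the module path, the empty context and the chosen variant below are assumptions for illustration only (a real run builds the `BenchmarkContext` from benchmark.py's CLI arguments):

    from benchmark_context import BenchmarkContext
    from workloads.pokec import Pokec  # hypothetical module path for this class

    # Placeholder context; in practice it carries vendor, worker counts, paths, etc.
    context = BenchmarkContext()

    workload = Pokec(variant="small", benchmark_context=context)
    workload.custom_import()  # delegates the actual data load to ImporterPokec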

    # Helpers used to generate the queries
    def _get_random_vertex(self):
        # All vertices in the Pokec dataset have an ID in the range
        # [1, _num_vertices].
@ -343,7 +238,7 @@ class Pokec(Dataset):
        return ("MATCH (n:User {id: $id}) RETURN n", {"id": self._get_random_vertex()})

    def benchmark__match__vertex_on_property(self):
        return ("MATCH (n {id: $id}) RETURN n", {"id": self._get_random_vertex()})
        return ("MATCH (n:User {id: $id}) RETURN n", {"id": self._get_random_vertex()})

    def benchmark__update__vertex_on_property(self):
        return (
@ -364,7 +259,7 @@ class Pokec(Dataset):

    def benchmark__basic__single_vertex_property_update_update(self):
        return (
            "MATCH (n {id: $id}) SET n.property = -1",
            "MATCH (n:User {id: $id}) SET n.property = -1",
            {"id": self._get_random_vertex()},
        )