Add bigger LDBC dataset to mgbench (#747)

This commit is contained in:
Ante Javor 2023-03-21 21:44:11 +01:00 committed by GitHub
parent 6349fc9501
commit cb813c3070
22 changed files with 3907 additions and 903 deletions


@ -247,7 +247,7 @@ Index queries for each supported vendor can be downloaded from “https://s3.eu-
|Q19|pattern_short| analytical | MATCH (n:User {id: $id})-[e]->(m) RETURN m LIMIT 1|
|Q20|single_edge_write| write | MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m CREATE (n)-[e:Temp]->(m) RETURN e|
|Q21|single_vertex_write| write |CREATE (n:UserTemp {id : $id}) RETURN n|
|Q22|single_vertex_property_update| update | MATCH (n:User {id: $id})-[e]->(m) RETURN m LIMIT 1|
|Q22|single_vertex_property_update| update | MATCH (n:User {id: $id}) SET n.property = -1|
|Q23|single_vertex_read| read | MATCH (n:User {id : $id}) RETURN n|
## :computer: Platform

File diff suppressed because it is too large.


@ -0,0 +1,57 @@
# Describes all the information of a single benchmark.py run.
class BenchmarkContext:
"""
Class for holding information on what type of benchmark is being executed
"""
def __init__(
self,
benchmark_target_workload: str = None, # Workload that needs to be executed (dataset/variant/group/query)
vendor_binary: str = None, # Benchmark vendor binary
vendor_name: str = None,
client_binary: str = None,
num_workers_for_import: int = None,
num_workers_for_benchmark: int = None,
single_threaded_runtime_sec: int = 0,
no_load_query_counts: bool = False,
no_save_query_counts: bool = False,
export_results: str = None,
temporary_directory: str = None,
workload_mixed: str = None,  # Default mode is Isolated; set only when running a mixed workload
workload_realistic: str = None,  # Default mode is Isolated; set only when running a realistic workload
time_dependent_execution: int = 0,
warm_up: str = None,
performance_tracking: bool = False,
no_authorization: bool = True,
customer_workloads: str = None,
vendor_args: dict = {},
) -> None:
self.benchmark_target_workload = benchmark_target_workload
self.vendor_binary = vendor_binary
self.vendor_name = vendor_name
self.client_binary = client_binary
self.num_workers_for_import = num_workers_for_import
self.num_workers_for_benchmark = num_workers_for_benchmark
self.single_threaded_runtime_sec = single_threaded_runtime_sec
self.no_load_query_counts = no_load_query_counts
self.no_save_query_counts = no_save_query_counts
self.export_results = export_results
self.temporary_directory = temporary_directory
if workload_mixed != None:
self.mode = "Mixed"
self.mode_config = workload_mixed
elif workload_realistic != None:
self.mode = "Realistic"
self.mode_config = workload_realistic
else:
self.mode = "Isolated"
self.mode_config = "Isolated run does not have a config."
self.time_dependent_execution = time_dependent_execution
self.performance_tracking = performance_tracking
self.warm_up = warm_up
self.no_authorization = no_authorization
self.customer_workloads = customer_workloads
self.vendor_args = vendor_args
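For orientation, a minimal construction sketch follows; the paths and the shape of the mixed-workload value are illustrative assumptions, not taken from this commit.

# Illustrative sketch only: all values and the mixed-workload shape are assumed.
from benchmark_context import BenchmarkContext

context = BenchmarkContext(
    benchmark_target_workload=["pokec/small/basic/*"],  # hypothetical target pattern
    vendor_name="memgraph",
    vendor_binary="/path/to/memgraph",  # hypothetical path
    client_binary="/path/to/client",  # hypothetical path
    num_workers_for_import=4,
    num_workers_for_benchmark=12,
    temporary_directory="/tmp",
    workload_mixed=[100, 30, 30, 30, 10, "Q1"],  # assumed (count, write, read, update, analytical, query)
)
# Because workload_mixed is set, mode becomes "Mixed" and mode_config holds the list above.
assert context.mode == "Mixed"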


@ -289,6 +289,7 @@ void ExecuteTimeDependentWorkload(
// Synchronize workers and collect runtime.
while (ready.load(std::memory_order_acq_rel) < FLAGS_num_workers)
;
run.store(true);
for (int i = 0; i < FLAGS_num_workers; ++i) {
threads[i].join();
@ -310,6 +311,7 @@ void ExecuteTimeDependentWorkload(
final_duration /= FLAGS_num_workers;
double execution_delta = time_limit.count() / final_duration;
// Throughput adjusted for how much longer the workload execution took than the configured time limit.
double throughput = (total_iterations / final_duration) * execution_delta;
double raw_throughput = total_iterations / final_duration;
@ -319,7 +321,6 @@ void ExecuteTimeDependentWorkload(
summary["duration"] = final_duration;
summary["time_limit"] = FLAGS_time_dependent_execution;
summary["queries_executed"] = total_iterations;
summary["throughput"] = throughput;
summary["raw_throughput"] = raw_throughput;
summary["latency_stats"] = LatencyStatistics(worker_query_durations);


@ -77,10 +77,10 @@ def compare_results(results_from, results_to, fields, ignored, different_vendors
recursive_get(summary_from, "database", key, value=None),
summary_to["database"][key],
)
elif summary_to.get("query_statistics") != None and key in summary_to["query_statistics"]:
elif summary_to.get("latency_stats") != None and key in summary_to["latency_stats"]:
row[key] = compute_diff(
recursive_get(summary_from, "query_statistics", key, value=None),
summary_to["query_statistics"][key],
recursive_get(summary_from, "latency_stats", key, value=None),
summary_to["latency_stats"][key],
)
elif not different_vendors:
row[key] = compute_diff(
@ -160,7 +160,10 @@ if __name__ == "__main__":
help="Comparing different vendors, there is no need for metadata, duration, count check.",
)
parser.add_argument(
"--difference-threshold", type=float, help="Difference threshold for memory and throughput, 0.02 = 2% "
"--difference-threshold",
type=float,
default=0.02,
help="Difference threshold for memory and throughput, 0.02 = 2% ",
)
args = parser.parse_args()
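compute_diff itself is not part of this excerpt; as a rough sketch of how a 0.02 (2%) threshold is typically applied to a relative difference (assumed semantics, not the script's actual logic):

# Assumed semantics only, for orientation; not the actual compare_results implementation.
def exceeds_threshold(value_from: float, value_to: float, threshold: float = 0.02) -> bool:
    if value_from == 0:
        return value_to != 0
    return abs(value_to - value_from) / abs(value_from) > threshold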


@ -0,0 +1,500 @@
import argparse
import csv
import sys
from collections import defaultdict
from pathlib import Path
import helpers
# Most recent list of LDBC datasets available at: https://github.com/ldbc/data-sets-surf-repository
INTERACTIVE_LINK = {
"sf0.1": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf0.1.tar.zst",
"sf0.3": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf0.3.tar.zst",
"sf1": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf1.tar.zst",
"sf3": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf3.tar.zst",
"sf10": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf10.tar.zst",
}
BI_LINK = {
"sf1": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf1-composite-projected-fk.tar.zst",
"sf3": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf3-composite-projected-fk.tar.zst",
"sf10": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf10-composite-projected-fk.tar.zst",
}
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="LDBC CSV to CYPHERL converter",
description="""Converts all LDBC CSV files to CYPHERL transactions, for faster Memgraph load""",
)
parser.add_argument(
"--size",
required=True,
choices=["0.1", "0.3", "1", "3", "10"],
help="Interactive: (0.1 , 0.3, 1, 3, 10) BI: (1, 3, 10)",
)
parser.add_argument("--type", required=True, choices=["interactive", "bi"], help="interactive or bi")
args = parser.parse_args()
output_directory = Path().absolute() / ".cache" / "LDBC_generated"
output_directory.mkdir(exist_ok=True)
if args.type == "interactive":
NODES_INTERACTIVE = [
{"filename": "Place", "label": "Place"},
{"filename": "Organisation", "label": "Organisation"},
{"filename": "TagClass", "label": "TagClass"},
{"filename": "Tag", "label": "Tag"},
{"filename": "Comment", "label": "Message:Comment"},
{"filename": "Forum", "label": "Forum"},
{"filename": "Person", "label": "Person"},
{"filename": "Post", "label": "Message:Post"},
]
EDGES_INTERACTIVE = [
{
"filename": "Place_isPartOf_Place",
"source_label": "Place",
"type": "IS_PART_OF",
"target_label": "Place",
},
{
"filename": "TagClass_isSubclassOf_TagClass",
"source_label": "TagClass",
"type": "IS_SUBCLASS_OF",
"target_label": "TagClass",
},
{
"filename": "Organisation_isLocatedIn_Place",
"source_label": "Organisation",
"type": "IS_LOCATED_IN",
"target_label": "Place",
},
{"filename": "Tag_hasType_TagClass", "source_label": "Tag", "type": "HAS_TYPE", "target_label": "TagClass"},
{
"filename": "Comment_hasCreator_Person",
"source_label": "Comment",
"type": "HAS_CREATOR",
"target_label": "Person",
},
{
"filename": "Comment_isLocatedIn_Place",
"source_label": "Comment",
"type": "IS_LOCATED_IN",
"target_label": "Place",
},
{
"filename": "Comment_replyOf_Comment",
"source_label": "Comment",
"type": "REPLY_OF",
"target_label": "Comment",
},
{"filename": "Comment_replyOf_Post", "source_label": "Comment", "type": "REPLY_OF", "target_label": "Post"},
{
"filename": "Forum_containerOf_Post",
"source_label": "Forum",
"type": "CONTAINER_OF",
"target_label": "Post",
},
{
"filename": "Forum_hasMember_Person",
"source_label": "Forum",
"type": "HAS_MEMBER",
"target_label": "Person",
},
{
"filename": "Forum_hasModerator_Person",
"source_label": "Forum",
"type": "HAS_MODERATOR",
"target_label": "Person",
},
{"filename": "Forum_hasTag_Tag", "source_label": "Forum", "type": "HAS_TAG", "target_label": "Tag"},
{
"filename": "Person_hasInterest_Tag",
"source_label": "Person",
"type": "HAS_INTEREST",
"target_label": "Tag",
},
{
"filename": "Person_isLocatedIn_Place",
"source_label": "Person",
"type": "IS_LOCATED_IN",
"target_label": "Place",
},
{"filename": "Person_knows_Person", "source_label": "Person", "type": "KNOWS", "target_label": "Person"},
{"filename": "Person_likes_Comment", "source_label": "Person", "type": "LIKES", "target_label": "Comment"},
{"filename": "Person_likes_Post", "source_label": "Person", "type": "LIKES", "target_label": "Post"},
{
"filename": "Post_hasCreator_Person",
"source_label": "Post",
"type": "HAS_CREATOR",
"target_label": "Person",
},
{"filename": "Comment_hasTag_Tag", "source_label": "Comment", "type": "HAS_TAG", "target_label": "Tag"},
{"filename": "Post_hasTag_Tag", "source_label": "Post", "type": "HAS_TAG", "target_label": "Tag"},
{
"filename": "Post_isLocatedIn_Place",
"source_label": "Post",
"type": "IS_LOCATED_IN",
"target_label": "Place",
},
{
"filename": "Person_studyAt_Organisation",
"source_label": "Person",
"type": "STUDY_AT",
"target_label": "Organisation",
},
{
"filename": "Person_workAt_Organisation",
"source_label": "Person",
"type": "WORK_AT",
"target_label": "Organisation",
},
]
file_size = "sf{}".format(args.size)
out_file = "ldbc_interactive_{}.cypher".format(file_size)
output = output_directory / out_file
if output.exists():
output.unlink()
files_present = None
for file in output_directory.glob("**/*.tar.zst"):
if "basic-" + file_size in file.name:
files_present = file.with_suffix("").with_suffix("")
break
if not files_present:
try:
print("Downloading the file... " + INTERACTIVE_LINK[file_size])
downloaded_file = helpers.download_file(INTERACTIVE_LINK[file_size], output_directory.absolute())
print("Unpacking the file..." + downloaded_file)
files_present = helpers.unpack_tar_zst(Path(downloaded_file))
except:
print("Issue with downloading and unpacking the file, check if links are working properly.")
raise
input_files = {}
for file in files_present.glob("**/*.csv"):
name = file.name.replace("_0_0.csv", "").lower()
input_files[name] = file
for node_file in NODES_INTERACTIVE:
key = node_file["filename"].lower()
default_label = node_file["label"]
query = None
if key in input_files.keys():
with input_files[key].open("r") as input_f, output.open("a") as output_f:
reader = csv.DictReader(input_f, delimiter="|")
for row in reader:
if "type" in row.keys():
label = default_label + ":" + row.pop("type").capitalize()
else:
label = default_label
query = "CREATE (:{} {{id:{}, ".format(label, row.pop("id"))
# Format properties to fit Memgraph
for k, v in row.items():
if k == "creationDate":
row[k] = 'localDateTime("{}")'.format(v[0:-5])
elif k == "birthday":
row[k] = 'date("{}")'.format(v)
elif k == "length":
row[k] = "toInteger({})".format(v)
else:
row[k] = '"{}"'.format(v)
prop_string = ", ".join("{} : {}".format(k, v) for k, v in row.items())
query = query + prop_string + "});"
output_f.write(query + "\n")
print("Converted file: " + input_files[key].name + " to " + output.name)
else:
print("Didn't process node file: " + key)
raise Exception("Didn't find the file that was needed!")
for edge_file in EDGES_INTERACTIVE:
key = edge_file["filename"].lower()
source_label = edge_file["source_label"]
edge_type = edge_file["type"]
target_label = edge_file["target_label"]
if key in input_files.keys():
query = None
with input_files[key].open("r") as input_f, output.open("a") as output_f:
sufixl = ".id"
sufixr = ".id"
# Handle identical label/key in CSV header
if source_label == target_label:
sufixl = "l"
sufixr = "r"
# Consume the header row; explicit field names are supplied to DictReader below
header = next(input_f).strip().split("|")
reader = csv.DictReader(
input_f, delimiter="|", fieldnames=([source_label + sufixl, target_label + sufixr] + header[2:])
)
for row in reader:
query = "MATCH (n1:{} {{id:{}}}), (n2:{} {{id:{}}}) ".format(
source_label, row.pop(source_label + sufixl), target_label, row.pop(target_label + sufixr)
)
for k, v in row.items():
if "date" in k.lower():
# Take time zone out
row[k] = 'localDateTime("{}")'.format(v[0:-5])
elif "workfrom" in k.lower() or "classyear" in k.lower():
row[k] = 'toInteger("{}")'.format(v)
else:
row[k] = '"{}"'.format(v)
edge_part = "CREATE (n1)-[:{}{{".format(edge_type)
prop_string = ", ".join("{} : {}".format(k, v) for k, v in row.items())
query = query + edge_part + prop_string + "}]->(n2);"
output_f.write(query + "\n")
print("Converted file: " + input_files[key].name + " to " + output.name)
else:
print("Didn't process Edge file: " + key)
raise Exception("Didn't find the file that was needed!")
elif args.type == "bi":
NODES_BI = [
{"filename": "Place", "label": "Place"},
{"filename": "Organisation", "label": "Organisation"},
{"filename": "TagClass", "label": "TagClass"},
{"filename": "Tag", "label": "Tag"},
{"filename": "Comment", "label": "Message:Comment"},
{"filename": "Forum", "label": "Forum"},
{"filename": "Person", "label": "Person"},
{"filename": "Post", "label": "Message:Post"},
]
EDGES_BI = [
{
"filename": "Place_isPartOf_Place",
"source_label": "Place",
"type": "IS_PART_OF",
"target_label": "Place",
},
{
"filename": "TagClass_isSubclassOf_TagClass",
"source_label": "TagClass",
"type": "IS_SUBCLASS_OF",
"target_label": "TagClass",
},
{
"filename": "Organisation_isLocatedIn_Place",
"source_label": "Organisation",
"type": "IS_LOCATED_IN",
"target_label": "Place",
},
{"filename": "Tag_hasType_TagClass", "source_label": "Tag", "type": "HAS_TYPE", "target_label": "TagClass"},
{
"filename": "Comment_hasCreator_Person",
"source_label": "Comment",
"type": "HAS_CREATOR",
"target_label": "Person",
},
# Changed place to Country
{
"filename": "Comment_isLocatedIn_Country",
"source_label": "Comment",
"type": "IS_LOCATED_IN",
"target_label": "Country",
},
{
"filename": "Comment_replyOf_Comment",
"source_label": "Comment",
"type": "REPLY_OF",
"target_label": "Comment",
},
{"filename": "Comment_replyOf_Post", "source_label": "Comment", "type": "REPLY_OF", "target_label": "Post"},
{
"filename": "Forum_containerOf_Post",
"source_label": "Forum",
"type": "CONTAINER_OF",
"target_label": "Post",
},
{
"filename": "Forum_hasMember_Person",
"source_label": "Forum",
"type": "HAS_MEMBER",
"target_label": "Person",
},
{
"filename": "Forum_hasModerator_Person",
"source_label": "Forum",
"type": "HAS_MODERATOR",
"target_label": "Person",
},
{"filename": "Forum_hasTag_Tag", "source_label": "Forum", "type": "HAS_TAG", "target_label": "Tag"},
{
"filename": "Person_hasInterest_Tag",
"source_label": "Person",
"type": "HAS_INTEREST",
"target_label": "Tag",
},
# Changed place to City
{
"filename": "Person_isLocatedIn_City",
"source_label": "Person",
"type": "IS_LOCATED_IN",
"target_label": "City",
},
{"filename": "Person_knows_Person", "source_label": "Person", "type": "KNOWS", "target_label": "Person"},
{"filename": "Person_likes_Comment", "source_label": "Person", "type": "LIKES", "target_label": "Comment"},
{"filename": "Person_likes_Post", "source_label": "Person", "type": "LIKES", "target_label": "Post"},
{
"filename": "Post_hasCreator_Person",
"source_label": "Post",
"type": "HAS_CREATOR",
"target_label": "Person",
},
{"filename": "Comment_hasTag_Tag", "source_label": "Comment", "type": "HAS_TAG", "target_label": "Tag"},
{"filename": "Post_hasTag_Tag", "source_label": "Post", "type": "HAS_TAG", "target_label": "Tag"},
# Changed place to Country
{
"filename": "Post_isLocatedIn_Country",
"source_label": "Post",
"type": "IS_LOCATED_IN",
"target_label": "Country",
},
# Changed organisation to University
{
"filename": "Person_studyAt_University",
"source_label": "Person",
"type": "STUDY_AT",
"target_label": "University",
},
# Changed organisation to Company
{
"filename": "Person_workAt_Company",
"source_label": "Person",
"type": "WORK_AT",
"target_label": "Company",
},
]
file_size = "sf{}".format(args.size)
out_file = "ldbc_bi_{}.cypher".format(file_size)
output = output_directory / out_file
if output.exists():
output.unlink()
files_present = None
for file in output_directory.glob("**/*.tar.zst"):
if "bi-" + file_size in file.name:
files_present = file.with_suffix("").with_suffix("")
break
if not files_present:
try:
print("Downloading the file... " + BI_LINK[file_size])
downloaded_file = helpers.download_file(BI_LINK[file_size], output_directory.absolute())
print("Unpacking the file..." + downloaded_file)
files_present = helpers.unpack_tar_zst(Path(downloaded_file))
except:
print("Issue with downloading and unpacking the file, check if links are working properly.")
raise
for file in files_present.glob("**/*.csv.gz"):
if "initial_snapshot" in file.parts:
helpers.unpack_gz(file)
input_files = defaultdict(list)
for file in files_present.glob("**/*.csv"):
key = file.parents[0].name
input_files[key].append(file)
for node_file in NODES_BI:
key = node_file["filename"]
default_label = node_file["label"]
query = None
if key in input_files.keys():
for part_file in input_files[key]:
with part_file.open("r") as input_f, output.open("a") as output_f:
reader = csv.DictReader(input_f, delimiter="|")
for row in reader:
if "type" in row.keys():
label = default_label + ":" + row.pop("type")
else:
label = default_label
query = "CREATE (:{} {{id:{}, ".format(label, row.pop("id"))
# Format properties to fit Memgraph
for k, v in row.items():
if k == "creationDate":
row[k] = 'localDateTime("{}")'.format(v[0:-6])
elif k == "birthday":
row[k] = 'date("{}")'.format(v)
elif k == "length":
row[k] = "toInteger({})".format(v)
else:
row[k] = '"{}"'.format(v)
prop_string = ", ".join("{} : {}".format(k, v) for k, v in row.items())
query = query + prop_string + "});"
output_f.write(query + "\n")
print("Key: " + key + " Converted file: " + part_file.name + " to " + output.name)
else:
print("Didn't process node file: " + key)
for edge_file in EDGES_BI:
key = edge_file["filename"]
source_label = edge_file["source_label"]
edge_type = edge_file["type"]
target_label = edge_file["target_label"]
if key in input_files.keys():
for part_file in input_files[key]:
query = None
with part_file.open("r") as input_f, output.open("a") as output_f:
sufixl = "Id"
sufixr = "Id"
# Handle identical label/key in CSV header
if source_label == target_label:
sufixl = "l"
sufixr = "r"
# Consume the header row; explicit field names are supplied to DictReader below
header = next(input_f).strip().split("|")
if len(header) >= 3:
reader = csv.DictReader(
input_f,
delimiter="|",
fieldnames=(["date", source_label + sufixl, target_label + sufixr] + header[3:]),
)
else:
reader = csv.DictReader(
input_f,
delimiter="|",
fieldnames=([source_label + sufixl, target_label + sufixr] + header[2:]),
)
for row in reader:
query = "MATCH (n1:{} {{id:{}}}), (n2:{} {{id:{}}}) ".format(
source_label,
row.pop(source_label + sufixl),
target_label,
row.pop(target_label + sufixr),
)
for k, v in row.items():
if "date" in k.lower():
# Take time zone out
row[k] = 'localDateTime("{}")'.format(v[0:-6])
elif k == "classYear" or k == "workFrom":
row[k] = 'toInteger("{}")'.format(v)
else:
row[k] = '"{}"'.format(v)
edge_part = "CREATE (n1)-[:{}{{".format(edge_type)
prop_string = ", ".join("{} : {}".format(k, v) for k, v in row.items())
query = query + edge_part + prop_string + "}]->(n2);"
output_f.write(query + "\n")
print("Key: " + key + " Converted file: " + part_file.name + " to " + output.name)
else:
print("Didn't process Edge file: " + key)
raise Exception("Didn't find the file that was needed!")


@ -16,14 +16,20 @@ def parse_arguments():
help="Forward name and paths to vendors binary"
"Example: --vendor memgraph /path/to/binary --vendor neo4j /path/to/binary",
)
parser.add_argument(
"--dataset-size",
default="small",
choices=["small", "medium", "large"],
help="Pick a dataset size (small, medium, large)",
"--dataset-name",
default="",
help="Dataset name you wish to execute",
)
parser.add_argument("--dataset-group", default="basic", help="Select a group of queries")
parser.add_argument(
"--dataset-size",
default="",
help="Pick a dataset variant you wish to execute",
)
parser.add_argument("--dataset-group", default="", help="Select a group of queries")
parser.add_argument(
"--realistic",
@ -53,88 +59,110 @@ def parse_arguments():
return args
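For orientation, the driver script is launched with vendor name/binary pairs plus the new dataset flags defined above; the filename and paths below are placeholders since they are not shown in this excerpt.

# Hypothetical filename and paths; flags match parse_arguments() above.
python3 graph_bench.py \
    --vendor memgraph /path/to/memgraph \
    --dataset-name pokec \
    --dataset-size small \
    --dataset-group basic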
def run_full_benchmarks(vendor, binary, dataset_size, dataset_group, realistic, mixed):
def run_full_benchmarks(vendor, binary, dataset, dataset_size, dataset_group, realistic, mixed):
configurations = [
# Basic full group test cold
# Basic isolated test cold
[
"--export-results",
vendor + "_" + dataset_size + "_cold_isolated.json",
vendor + "_" + dataset + "_" + dataset_size + "_cold_isolated.json",
],
# Basic full group test hot
# Basic isolated test hot
[
"--export-results",
vendor + "_" + dataset_size + "_hot_isolated.json",
"--warmup-run",
vendor + "_" + dataset + "_" + dataset_size + "_hot_isolated.json",
"--warm-up",
"hot",
],
# Basic isolated test vulcanic
[
"--export-results",
vendor + "_" + dataset + "_" + dataset_size + "_vulcanic_isolated.json",
"--warm-up",
"vulcanic",
],
]
# Configurations for full workload
for count, write, read, update, analytical in realistic:
cold = [
"--export-results",
vendor
+ "_"
+ dataset_size
+ "_cold_realistic_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical),
"--mixed-workload",
count,
write,
read,
update,
analytical,
]
if realistic:
# Configurations for full workload
for count, write, read, update, analytical in realistic:
cold = [
"--export-results",
vendor
+ "_"
+ dataset
+ "_"
+ dataset_size
+ "_cold_realistic_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical),
"--workload-realistic",
count,
write,
read,
update,
analytical,
]
hot = [
"--export-results",
vendor
+ "_"
+ dataset_size
+ "_hot_realistic_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical),
"--warmup-run",
"--mixed-workload",
count,
write,
read,
update,
analytical,
]
configurations.append(cold)
configurations.append(hot)
hot = [
"--export-results",
vendor
+ "_"
+ dataset
+ "_"
+ dataset_size
+ "_hot_realistic_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical),
"--warm-up",
"hot",
"--workload-realistic",
count,
write,
read,
update,
analytical,
]
# Configurations for workload per query
for count, write, read, update, analytical, query in mixed:
cold = [
"--export-results",
vendor
+ "_"
+ dataset_size
+ "_cold_mixed_{}_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical, query),
"--mixed-workload",
count,
write,
read,
update,
analytical,
query,
]
hot = [
"--export-results",
vendor
+ "_"
+ dataset_size
+ "_hot_mixed_{}_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical, query),
"--warmup-run",
"--mixed-workload",
count,
write,
read,
update,
analytical,
query,
]
configurations.append(cold)
configurations.append(hot)
configurations.append(cold)
configurations.append(hot)
if mixed:
# Configurations for workload per query
for count, write, read, update, analytical, query in mixed:
cold = [
"--export-results",
vendor
+ "_"
+ dataset
+ "_"
+ dataset_size
+ "_cold_mixed_{}_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical, query),
"--workload-mixed",
count,
write,
read,
update,
analytical,
query,
]
hot = [
"--export-results",
vendor
+ "_"
+ dataset
+ "_"
+ dataset_size
+ "_hot_mixed_{}_{}_{}_{}_{}_{}.json".format(count, write, read, update, analytical, query),
"--warm-up",
"hot",
"--workload-mixed",
count,
write,
read,
update,
analytical,
query,
]
configurations.append(cold)
configurations.append(hot)
default_args = [
"python3",
@ -146,9 +174,7 @@ def run_full_benchmarks(vendor, binary, dataset_size, dataset_group, realistic,
"--num-workers-for-benchmark",
"12",
"--no-authorization",
"pokec/" + dataset_size + "/" + dataset_group + "/*",
"--tail-latency",
"100",
dataset + "/" + dataset_size + "/" + dataset_group + "/*",
]
for config in configurations:
@ -157,11 +183,11 @@ def run_full_benchmarks(vendor, binary, dataset_size, dataset_group, realistic,
subprocess.run(args=full_config, check=True)
def collect_all_results(vendor_name, dataset_size, dataset_group):
def collect_all_results(vendor_name, dataset, dataset_size, dataset_group):
working_directory = Path().absolute()
print(working_directory)
results = sorted(working_directory.glob(vendor_name + "_" + dataset_size + "_*.json"))
summary = {"pokec": {dataset_size: {dataset_group: {}}}}
results = sorted(working_directory.glob(vendor_name + "_" + dataset + "_" + dataset_size + "_*.json"))
summary = {dataset: {dataset_size: {dataset_group: {}}}}
for file in results:
if "summary" in file.name:
@ -169,19 +195,22 @@ def collect_all_results(vendor_name, dataset_size, dataset_group):
f = file.open()
data = json.loads(f.read())
if data["__run_configuration__"]["condition"] == "hot":
for key, value in data["pokec"][dataset_size][dataset_group].items():
for key, value in data[dataset][dataset_size][dataset_group].items():
key_condition = key + "_hot"
summary["pokec"][dataset_size][dataset_group][key_condition] = value
summary[dataset][dataset_size][dataset_group][key_condition] = value
elif data["__run_configuration__"]["condition"] == "cold":
for key, value in data["pokec"][dataset_size][dataset_group].items():
for key, value in data[dataset][dataset_size][dataset_group].items():
key_condition = key + "_cold"
summary["pokec"][dataset_size][dataset_group][key_condition] = value
summary[dataset][dataset_size][dataset_group][key_condition] = value
elif data["__run_configuration__"]["condition"] == "vulcanic":
for key, value in data[dataset][dataset_size][dataset_group].items():
key_condition = key + "_vulcanic"
summary[dataset][dataset_size][dataset_group][key_condition] = value
print(summary)
json_object = json.dumps(summary, indent=4)
print(json_object)
with open(vendor_name + "_" + dataset_size + "_summary.json", "w") as f:
with open(vendor_name + "_" + dataset + "_" + dataset_size + "_summary.json", "w") as f:
json.dump(summary, f)
@ -194,16 +223,17 @@ if __name__ == "__main__":
vendor_names = {"memgraph", "neo4j"}
for vendor_name, vendor_binary in args.vendor:
path = Path(vendor_binary)
if vendor_name.lower() in vendor_names and (path.is_file() or path.is_dir()):
if vendor_name.lower() in vendor_names and path.is_file():
run_full_benchmarks(
vendor_name,
vendor_binary,
args.dataset_name,
args.dataset_size,
args.dataset_group,
realistic,
mixed,
)
collect_all_results(vendor_name, args.dataset_size, args.dataset_group)
collect_all_results(vendor_name, args.dataset_name, args.dataset_size, args.dataset_group)
else:
raise Exception(
"Check that vendor: {} is supported and you are passing right path: {} to binary.".format(


@ -1,4 +1,4 @@
# Copyright 2021 Memgraph Ltd.
# Copyright 2023 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@ -9,11 +9,21 @@
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.
import collections
import copy
import fnmatch
import importlib
import inspect
import json
import os
import subprocess
import sys
from pathlib import Path
import workloads
from benchmark_context import BenchmarkContext
from workloads import *
from workloads import base
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
@ -28,22 +38,70 @@ def get_binary_path(path, base=""):
def download_file(url, path):
ret = subprocess.run(["wget", "-nv", "--content-disposition", url],
stderr=subprocess.PIPE, cwd=path, check=True)
ret = subprocess.run(["wget", "-nv", "--content-disposition", url], stderr=subprocess.PIPE, cwd=path, check=True)
data = ret.stderr.decode("utf-8")
tmp = data.split("->")[1]
name = tmp[tmp.index('"') + 1:tmp.rindex('"')]
name = tmp[tmp.index('"') + 1 : tmp.rindex('"')]
return os.path.join(path, name)
def unpack_and_move_file(input_path, output_path):
def unpack_gz_and_move_file(input_path, output_path):
if input_path.endswith(".gz"):
subprocess.run(["gunzip", input_path],
stdout=subprocess.DEVNULL, check=True)
subprocess.run(["gunzip", input_path], stdout=subprocess.DEVNULL, check=True)
input_path = input_path[:-3]
os.rename(input_path, output_path)
def unpack_gz(input_path: Path):
if input_path.suffix == ".gz":
subprocess.run(["gzip", "-d", input_path], capture_output=True, check=True)
input_path = input_path.with_suffix("")
return input_path
def unpack_zip(input_path: Path):
if input_path.suffix == ".zip":
subprocess.run(["unzip", input_path], capture_output=True, check=True, cwd=input_path.parent)
input_path = input_path.with_suffix("")
return input_path
def unpack_tar_zst(input_path: Path):
if input_path.suffix == ".zst":
subprocess.run(
["tar", "--use-compress-program=unzstd", "-xvf", input_path],
cwd=input_path.parent,
capture_output=True,
check=True,
)
input_path = input_path.with_suffix("").with_suffix("")
return input_path
def unpack_tar_gz(input_path: Path):
if input_path.suffix == ".gz":
subprocess.run(
["tar", "-xvf", input_path],
cwd=input_path.parent,
capture_output=True,
check=True,
)
input_path = input_path.with_suffix("").with_suffix("")
return input_path
def unpack_tar_zst_and_move(input_path: Path, output_path: Path):
if input_path.suffix == ".zst":
subprocess.run(
["tar", "--use-compress-program=unzstd", "-xvf", input_path],
cwd=input_path.parent,
capture_output=True,
check=True,
)
input_path = input_path.with_suffix("").with_suffix("")
return input_path.rename(output_path)
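The helpers above are meant to be chained the way the LDBC converter uses them; a minimal sketch (the URL comes from the table earlier, the target directory is invented):

# Minimal sketch of chaining the helpers; the target directory is illustrative.
from pathlib import Path
import helpers

archive = helpers.download_file(
    "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf1.tar.zst",
    "/tmp/ldbc_cache",
)
# unpack_tar_zst() extracts next to the archive and returns the path with both
# suffixes stripped, e.g. /tmp/ldbc_cache/social_network-csv_basic-sf1
extracted = helpers.unpack_tar_zst(Path(archive))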
def ensure_directory(path):
if not os.path.exists(path):
os.makedirs(path)
@ -51,6 +109,129 @@ def ensure_directory(path):
raise Exception("The path '{}' should be a directory!".format(path))
def get_available_workloads(customer_workloads: str = None) -> dict:
generators = {}
for module in map(workloads.__dict__.get, workloads.__all__):
for key in dir(module):
if key.startswith("_"):
continue
base_class = getattr(module, key)
if not inspect.isclass(base_class) or not issubclass(base_class, base.Workload):
continue
queries = collections.defaultdict(list)
for funcname in dir(base_class):
if not funcname.startswith("benchmark__"):
continue
group, query = funcname.split("__")[1:]
queries[group].append((query, funcname))
generators[base_class.NAME] = (base_class, dict(queries))
if customer_workloads:
head_tail = os.path.split(customer_workloads)
path_without_dataset_name = head_tail[0]
dataset_name = head_tail[1].split(".")[0]
sys.path.append(path_without_dataset_name)
dataset_to_use = importlib.import_module(dataset_name)
for key in dir(dataset_to_use):
if key.startswith("_"):
continue
base_class = getattr(dataset_to_use, key)
if not inspect.isclass(base_class) or not issubclass(base_class, base.Workload):
continue
queries = collections.defaultdict(list)
for funcname in dir(base_class):
if not funcname.startswith("benchmark__"):
continue
group, query = funcname.split("__")[1:]
queries[group].append((query, funcname))
generators[base_class.NAME] = (base_class, dict(queries))
return generators
def list_available_workloads(customer_workloads: str = None):
generators = get_available_workloads(customer_workloads)
for name in sorted(generators.keys()):
print("Dataset:", name)
dataset, queries = generators[name]
print(
" Variants:",
", ".join(dataset.VARIANTS),
"(default: " + dataset.DEFAULT_VARIANT + ")",
)
for group in sorted(queries.keys()):
print(" Group:", group)
for query_name, query_func in queries[group]:
print(" Query:", query_name)
def match_patterns(workload, variant, group, query, is_default_variant, patterns):
for pattern in patterns:
verdict = [fnmatch.fnmatchcase(workload, pattern[0])]
if pattern[1] != "":
verdict.append(fnmatch.fnmatchcase(variant, pattern[1]))
else:
verdict.append(is_default_variant)
verdict.append(fnmatch.fnmatchcase(group, pattern[2]))
verdict.append(fnmatch.fnmatchcase(query, pattern[3]))
if all(verdict):
return True
return False
def filter_workloads(available_workloads: dict, benchmark_context: BenchmarkContext) -> list:
patterns = benchmark_context.benchmark_target_workload
for i in range(len(patterns)):
pattern = patterns[i].split("/")
if len(pattern) > 5 or len(pattern) == 0:
raise Exception("Invalid benchmark description '" + pattern + "'!")
pattern.extend(["", "*", "*"][len(pattern) - 1 :])
patterns[i] = pattern
filtered = []
for workload in sorted(available_workloads.keys()):
generator, queries = available_workloads[workload]
for variant in generator.VARIANTS:
is_default_variant = variant == generator.DEFAULT_VARIANT
current = collections.defaultdict(list)
for group in queries:
for query_name, query_func in queries[group]:
if match_patterns(
workload,
variant,
group,
query_name,
is_default_variant,
patterns,
):
current[group].append((query_name, query_func))
if len(current) == 0:
continue
# Ignore benchgraph "basic" queries in standard CI/CD run
for pattern in patterns:
res = pattern.count("*")
key = "basic"
if res >= 2 and key in current.keys():
current.pop(key)
filtered.append((generator(variant=variant, benchmark_context=benchmark_context), dict(current)))
return filtered
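To illustrate how the benchmark descriptions are padded before matching (examples invented):

# Illustrative expansions of benchmark descriptions, per the padding logic above:
#   "pokec"                                 -> ["pokec", "", "*", "*"]      (empty variant -> default variant)
#   "pokec/small"                           -> ["pokec", "small", "*", "*"]
#   "pokec/small/basic/single_vertex_read"  -> ["pokec", "small", "basic", "single_vertex_read"]
# Patterns containing two or more "*" entries additionally drop the "basic" group,
# as handled in the loop above.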
def parse_kwargs(items):
"""
Parse a series of key-value pairs and return a dictionary
"""
d = {}
if items:
for item in items:
key, value = item.split("=")
d[key] = value
return d
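A short usage sketch; the key names mirror the bolt-port and no-properties-on-edges lookups done by the runners, and note that the values stay strings:

# Illustrative: vendor-specific arguments arrive as "key=value" strings.
vendor_args = parse_kwargs(["bolt-port=7688", "no-properties-on-edges=True"])
# -> {"bolt-port": "7688", "no-properties-on-edges": "True"}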
class Directory:
def __init__(self, path):
self._path = path
@ -103,6 +284,9 @@ class Cache:
ensure_directory(path)
return Directory(path)
def get_default_cache_directory(self):
return self._directory
def load_config(self):
if not os.path.isfile(self._config):
return RecursiveDict()


@ -9,6 +9,8 @@
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.
import logging
COLOR_GRAY = 0
COLOR_RED = 1
COLOR_GREEN = 2
@ -16,27 +18,45 @@ COLOR_YELLOW = 3
COLOR_BLUE = 4
COLOR_VIOLET = 5
COLOR_CYAN = 6
COLOR_WHITE = 7
def log(color, *args):
logger = logging.Logger("mgbench_logger")
file_handler = logging.FileHandler("mgbench_logs.log")
file_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
file_handler.setFormatter(file_format)
logger.addHandler(file_handler)
def _log(color, *args):
print("\033[1;3{}m~~".format(color), *args, "~~\033[0m")
def log(msg):
print(msg)
logger.info(msg=msg)
def init(*args):
log(COLOR_BLUE, *args)
_log(COLOR_BLUE, *args)
logger.info(*args)
def info(*args):
log(COLOR_CYAN, *args)
_log(COLOR_WHITE, *args)
logger.info(*args)
def success(*args):
log(COLOR_GREEN, *args)
_log(COLOR_GREEN, *args)
logger.info(*args)
def warning(*args):
log(COLOR_YELLOW, *args)
_log(COLOR_YELLOW, *args)
logger.warning(*args)
def error(*args):
log(COLOR_RED, *args)
_log(COLOR_RED, *args)
logger.critical(*args)
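Assuming this module is the mgbench log helper imported as log, the reworked functions now mirror console output into mgbench_logs.log; a usage sketch:

# Illustrative usage; every call prints to the console and appends to mgbench_logs.log.
import log

log.init("Starting benchmark run")   # blue banner, logged at INFO
log.info("Importing dataset")        # white banner, logged at INFO
log.warning("Query was retried")     # yellow banner, logged at WARNING
log.error("Client reported errors")  # red banner, logged at CRITICAL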


@ -1,4 +1,4 @@
# Copyright 2022 Memgraph Ltd.
# Copyright 2023 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@ -17,10 +17,13 @@ import subprocess
import tempfile
import threading
import time
from abc import ABC, abstractmethod
from pathlib import Path
from benchmark_context import BenchmarkContext
def wait_for_server(port, delay=0.1):
def _wait_for_server(port, delay=0.1):
cmd = ["nc", "-z", "-w", "1", "127.0.0.1", str(port)]
while subprocess.call(cmd) != 0:
time.sleep(0.01)
@ -62,50 +65,165 @@ def _get_current_usage(pid):
return rss / 1024
class Memgraph:
def __init__(self, memgraph_binary, temporary_dir, properties_on_edges, bolt_port, performance_tracking):
self._memgraph_binary = memgraph_binary
self._directory = tempfile.TemporaryDirectory(dir=temporary_dir)
self._properties_on_edges = properties_on_edges
class BaseClient(ABC):
@abstractmethod
def __init__(self, benchmark_context: BenchmarkContext):
self.benchmark_context = benchmark_context
@abstractmethod
def execute(self):
pass
class BoltClient(BaseClient):
def __init__(self, benchmark_context: BenchmarkContext):
self._client_binary = benchmark_context.client_binary
self._directory = tempfile.TemporaryDirectory(dir=benchmark_context.temporary_directory)
self._username = ""
self._password = ""
self._bolt_port = (
benchmark_context.vendor_args["bolt-port"] if "bolt-port" in benchmark_context.vendor_args.keys() else 7687
)
def _get_args(self, **kwargs):
return _convert_args_to_flags(self._client_binary, **kwargs)
def set_credentials(self, username: str, password: str):
self._username = username
self._password = password
def execute(
self,
queries=None,
file_path=None,
num_workers=1,
max_retries: int = 50,
validation: bool = False,
time_dependent_execution: int = 0,
):
if (queries is None and file_path is None) or (queries is not None and file_path is not None):
raise ValueError("Either queries or input_path must be specified!")
queries_json = False
if queries is not None:
queries_json = True
file_path = os.path.join(self._directory.name, "queries.json")
with open(file_path, "w") as f:
for query in queries:
json.dump(query, f)
f.write("\n")
args = self._get_args(
input=file_path,
num_workers=num_workers,
max_retries=max_retries,
queries_json=queries_json,
username=self._username,
password=self._password,
port=self._bolt_port,
validation=validation,
time_dependent_execution=time_dependent_execution,
)
ret = None
try:
ret = subprocess.run(args, capture_output=True)
finally:
error = ret.stderr.decode("utf-8").strip().split("\n")
data = ret.stdout.decode("utf-8").strip().split("\n")
if error and error[0] != "":
print("Reported errros from client")
print(error)
data = [x for x in data if not x.startswith("[")]
return list(map(json.loads, data))
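A usage sketch for the new client wrapper; the per-query format is assumed to be (query, params) pairs, matching what the benchmark__* workload functions return, and context stands for a BenchmarkContext like the one sketched earlier.

# Illustrative only; the (query, params) pair format is an assumption.
client = BoltClient(benchmark_context=context)
results = client.execute(
    queries=[("MATCH (n:User {id: $id}) RETURN n;", {"id": 0})] * 100,
    num_workers=4,
)
# execute() returns the client's JSON output parsed line by line.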
class BaseRunner(ABC):
subclasses = {}
def __init_subclass__(cls, **kwargs) -> None:
super().__init_subclass__(**kwargs)
cls.subclasses[cls.__name__.lower()] = cls
return
@classmethod
def create(cls, benchmark_context: BenchmarkContext):
if benchmark_context.vendor_name not in cls.subclasses:
raise ValueError("Missing runner with name: {}".format(benchmark_context.vendor_name))
return cls.subclasses[benchmark_context.vendor_name](
benchmark_context=benchmark_context,
)
@abstractmethod
def __init__(self, benchmark_context: BenchmarkContext):
self.benchmark_context = benchmark_context
@abstractmethod
def start_benchmark(self):
pass
@abstractmethod
def start_preparation(self):
pass
@abstractmethod
def stop(self):
pass
@abstractmethod
def clean_db(self):
pass
@abstractmethod
def fetch_client(self) -> BaseClient:
pass
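The factory registers every subclass under its lowercased class name, so vendor_name selects the runner; a short sketch using a context like the one above:

# Illustrative: vendor_name in the context decides which registered runner is built.
runner = BaseRunner.create(benchmark_context=context)  # e.g. vendor_name == "memgraph" -> Memgraph
client = runner.fetch_client()                         # BoltClient bound to the same context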
class Memgraph(BaseRunner):
def __init__(self, benchmark_context: BenchmarkContext):
super().__init__(benchmark_context=benchmark_context)
self._memgraph_binary = benchmark_context.vendor_binary
self._performance_tracking = benchmark_context.performance_tracking
self._directory = tempfile.TemporaryDirectory(dir=benchmark_context.temporary_directory)
self._vendor_args = benchmark_context.vendor_args
self._properties_on_edges = (
self._vendor_args["no-properties-on-edges"]
if "no-properties-on-edges" in self._vendor_args.keys()
else False
)
self._bolt_port = self._vendor_args["bolt-port"] if "bolt-port" in self._vendor_args.keys() else 7687
self._proc_mg = None
self._bolt_port = bolt_port
self.performance_tracking = performance_tracking
self._stop_event = threading.Event()
self._rss = []
atexit.register(self._cleanup)
# Determine Memgraph version
ret = subprocess.run([memgraph_binary, "--version"], stdout=subprocess.PIPE, check=True)
ret = subprocess.run([self._memgraph_binary, "--version"], stdout=subprocess.PIPE, check=True)
version = re.search(r"[0-9]+\.[0-9]+\.[0-9]+", ret.stdout.decode("utf-8")).group(0)
self._memgraph_version = tuple(map(int, version.split(".")))
atexit.register(self._cleanup)
def __del__(self):
self._cleanup()
atexit.unregister(self._cleanup)
def _get_args(self, **kwargs):
def _set_args(self, **kwargs):
data_directory = os.path.join(self._directory.name, "memgraph")
kwargs["bolt_port"] = self._bolt_port
if self._memgraph_version >= (0, 50, 0):
kwargs["data_directory"] = data_directory
else:
kwargs["durability_directory"] = data_directory
if self._memgraph_version >= (0, 50, 0):
kwargs["storage_properties_on_edges"] = self._properties_on_edges
else:
assert self._properties_on_edges, "Older versions of Memgraph can't disable properties on edges!"
kwargs["data_directory"] = data_directory
kwargs["storage_properties_on_edges"] = self._properties_on_edges
return _convert_args_to_flags(self._memgraph_binary, **kwargs)
def _start(self, **kwargs):
if self._proc_mg is not None:
raise Exception("The database process is already running!")
args = self._get_args(**kwargs)
args = self._set_args(**kwargs)
self._proc_mg = subprocess.Popen(args, stdout=subprocess.DEVNULL)
time.sleep(0.2)
if self._proc_mg.poll() is not None:
self._proc_mg = None
raise Exception("The database process died prematurely!")
wait_for_server(self._bolt_port)
_wait_for_server(self._bolt_port)
ret = self._proc_mg.poll()
assert ret is None, "The database process died prematurely " "({})!".format(ret)
@ -119,7 +237,7 @@ class Memgraph:
return ret, usage
def start_preparation(self, workload):
if self.performance_tracking:
if self._performance_tracking:
p = threading.Thread(target=self.res_background_tracking, args=(self._rss, self._stop_event))
self._stop_event.clear()
self._rss.clear()
@ -127,13 +245,26 @@ class Memgraph:
self._start(storage_snapshot_on_exit=True)
def start_benchmark(self, workload):
if self.performance_tracking:
if self._performance_tracking:
p = threading.Thread(target=self.res_background_tracking, args=(self._rss, self._stop_event))
self._stop_event.clear()
self._rss.clear()
p.start()
self._start(storage_recover_on_startup=True)
def clean_db(self):
if self._proc_mg is not None:
raise Exception("The database process is already running, cannot clear data it!")
else:
out = subprocess.run(
args="rm -Rf memgraph/snapshots/*",
cwd=self._directory.name,
capture_output=True,
shell=True,
)
print(out.stderr.decode("utf-8"))
print(out.stdout.decode("utf-8"))
def res_background_tracking(self, res, stop_event):
print("Started rss tracking.")
while not stop_event.is_set():
@ -154,35 +285,46 @@ class Memgraph:
f.close()
def stop(self, workload):
if self.performance_tracking:
if self._performance_tracking:
self._stop_event.set()
self.dump_rss(workload)
ret, usage = self._cleanup()
assert ret == 0, "The database process exited with a non-zero " "status ({})!".format(ret)
return usage
def fetch_client(self) -> BoltClient:
return BoltClient(benchmark_context=self.benchmark_context)
class Neo4j:
def __init__(self, neo4j_path, temporary_dir, bolt_port, performance_tracking):
self._neo4j_path = Path(neo4j_path)
self._neo4j_binary = Path(neo4j_path) / "bin" / "neo4j"
self._neo4j_config = Path(neo4j_path) / "conf" / "neo4j.conf"
self._neo4j_pid = Path(neo4j_path) / "run" / "neo4j.pid"
self._neo4j_admin = Path(neo4j_path) / "bin" / "neo4j-admin"
self.performance_tracking = performance_tracking
class Neo4j(BaseRunner):
def __init__(self, benchmark_context: BenchmarkContext):
super().__init__(benchmark_context=benchmark_context)
self._neo4j_binary = Path(benchmark_context.vendor_binary)
self._neo4j_path = Path(benchmark_context.vendor_binary).parents[1]
self._neo4j_config = self._neo4j_path / "conf" / "neo4j.conf"
self._neo4j_pid = self._neo4j_path / "run" / "neo4j.pid"
self._neo4j_admin = self._neo4j_path / "bin" / "neo4j-admin"
self._performance_tracking = benchmark_context.performance_tracking
self._vendor_args = benchmark_context.vendor_args
self._stop_event = threading.Event()
self._rss = []
if not self._neo4j_binary.is_file():
raise Exception("Wrong path to binary!")
self._directory = tempfile.TemporaryDirectory(dir=temporary_dir)
self._bolt_port = bolt_port
tempfile.TemporaryDirectory(dir=benchmark_context.temporary_directory)
self._bolt_port = (
self.benchmark_context.vendor_args["bolt-port"]
if "bolt-port" in self.benchmark_context.vendor_args.keys()
else 7687
)
atexit.register(self._cleanup)
configs = []
memory_flag = "server.jvm.additional=-XX:NativeMemoryTracking=detail"
auth_flag = "dbms.security.auth_enabled=false"
if self.performance_tracking:
bolt_flag = "server.bolt.listen_address=:7687"
http_flag = "server.http.listen_address=:7474"
if self._performance_tracking:
configs.append(memory_flag)
else:
lines = []
@ -201,6 +343,8 @@ class Neo4j:
file.close()
configs.append(auth_flag)
configs.append(bolt_flag)
configs.append(http_flag)
print("Check neo4j config flags:")
for conf in configs:
with self._neo4j_config.open("r+") as file:
@ -234,7 +378,7 @@ class Neo4j:
else:
raise Exception("The database process died prematurely!")
print("Run server check:")
wait_for_server(self._bolt_port)
_wait_for_server(self._bolt_port)
def _cleanup(self):
if self._neo4j_pid.exists():
@ -248,7 +392,7 @@ class Neo4j:
return 0
def start_preparation(self, workload):
if self.performance_tracking:
if self._performance_tracking:
p = threading.Thread(target=self.res_background_tracking, args=(self._rss, self._stop_event))
self._stop_event.clear()
self._rss.clear()
@ -257,11 +401,11 @@ class Neo4j:
# Start DB
self._start()
if self.performance_tracking:
if self._performance_tracking:
self.get_memory_usage("start_" + workload)
def start_benchmark(self, workload):
if self.performance_tracking:
if self._performance_tracking:
p = threading.Thread(target=self.res_background_tracking, args=(self._rss, self._stop_event))
self._stop_event.clear()
self._rss.clear()
@ -269,7 +413,7 @@ class Neo4j:
# Start DB
self._start()
if self.performance_tracking:
if self._performance_tracking:
self.get_memory_usage("start_" + workload)
def dump_db(self, path):
@ -290,6 +434,20 @@ class Neo4j:
check=True,
)
def clean_db(self):
print("Cleaning the database")
if self._neo4j_pid.exists():
raise Exception("Cannot clean DB because it is running.")
else:
out = subprocess.run(
args="rm -Rf data/databases/* data/transactions/*",
cwd=self._neo4j_path,
capture_output=True,
shell=True,
)
print(out.stderr.decode("utf-8"))
print(out.stdout.decode("utf-8"))
def load_db_from_dump(self, path):
print("Loading the neo4j database from dump...")
if self._neo4j_pid.exists():
@ -300,7 +458,8 @@ class Neo4j:
self._neo4j_admin,
"database",
"load",
"--from-path=" + path,
"--from-path",
path,
"--overwrite-destination=true",
"neo4j",
],
@ -325,7 +484,7 @@ class Neo4j:
return True
def stop(self, workload):
if self.performance_tracking:
if self._performance_tracking:
self._stop_event.set()
self.get_memory_usage("stop_" + workload)
self.dump_rss(workload)
@ -360,51 +519,5 @@ class Neo4j:
f.write(memory_usage.stdout)
f.close()
class Client:
def __init__(
self, client_binary: str, temporary_directory: str, bolt_port: int, username: str = "", password: str = ""
):
self._client_binary = client_binary
self._directory = tempfile.TemporaryDirectory(dir=temporary_directory)
self._username = username
self._password = password
self._bolt_port = bolt_port
def _get_args(self, **kwargs):
return _convert_args_to_flags(self._client_binary, **kwargs)
def execute(self, queries=None, file_path=None, num_workers=1):
if (queries is None and file_path is None) or (queries is not None and file_path is not None):
raise ValueError("Either queries or input_path must be specified!")
# TODO: check `file_path.endswith(".json")` to support advanced
# input queries
queries_json = False
if queries is not None:
queries_json = True
file_path = os.path.join(self._directory.name, "queries.json")
with open(file_path, "w") as f:
for query in queries:
json.dump(query, f)
f.write("\n")
args = self._get_args(
input=file_path,
num_workers=num_workers,
queries_json=queries_json,
username=self._username,
password=self._password,
port=self._bolt_port,
)
ret = subprocess.run(args, capture_output=True, check=True)
error = ret.stderr.decode("utf-8").strip().split("\n")
if error and error[0] != "":
print("Reported errros from client")
print(error)
data = ret.stdout.decode("utf-8").strip().split("\n")
data = [x for x in data if not x.startswith("[")]
return list(map(json.loads, data))
def fetch_client(self) -> BoltClient:
return BoltClient(benchmark_context=self.benchmark_context)

tests/mgbench/validation.py (new file, 244 lines)

@ -0,0 +1,244 @@
import argparse
import copy
import multiprocessing
import random
import helpers
import runners
import workloads
from benchmark_context import BenchmarkContext
from workloads import base
def parse_args():
parser = argparse.ArgumentParser(
prog="Validator for individual query checking",
description="""Validates that query is running, and validates output between different vendors""",
)
parser.add_argument(
"benchmarks",
nargs="*",
default="",
help="descriptions of benchmarks that should be run; "
"multiple descriptions can be specified to run multiple "
"benchmarks; the description is specified as "
"dataset/variant/group/query; Unix shell-style wildcards "
"can be used in the descriptions; variant, group and query "
"are optional and they can be left out; the default "
"variant is '' which selects the default dataset variant; "
"the default group is '*' which selects all groups; the"
"default query is '*' which selects all queries",
)
parser.add_argument(
"--vendor-binary-1",
help="Vendor binary used for benchmarking, by default it is memgraph",
default=helpers.get_binary_path("memgraph"),
)
parser.add_argument(
"--vendor-name-1",
default="memgraph",
choices=["memgraph", "neo4j"],
help="Input vendor binary name (memgraph, neo4j)",
)
parser.add_argument(
"--vendor-binary-2",
help="Vendor binary used for benchmarking, by default it is memgraph",
default=helpers.get_binary_path("memgraph"),
)
parser.add_argument(
"--vendor-name-2",
default="memgraph",
choices=["memgraph", "neo4j"],
help="Input vendor binary name (memgraph, neo4j)",
)
parser.add_argument(
"--client-binary",
default=helpers.get_binary_path("tests/mgbench/client"),
help="Client binary used for benchmarking",
)
parser.add_argument(
"--temporary-directory",
default="/tmp",
help="directory path where temporary data should " "be stored",
)
parser.add_argument(
"--num-workers-for-import",
type=int,
default=multiprocessing.cpu_count() // 2,
help="number of workers used to import the dataset",
)
return parser.parse_args()
def get_queries(gen, count):
# Make the generator deterministic.
random.seed(gen.__name__)
# Generate queries.
ret = []
for i in range(count):
ret.append(gen())
return ret
if __name__ == "__main__":
args = parse_args()
benchmark_context_db_1 = BenchmarkContext(
vendor_name=args.vendor_name_1,
vendor_binary=args.vendor_binary_1,
benchmark_target_workload=copy.copy(args.benchmarks),
client_binary=args.client_binary,
num_workers_for_import=args.num_workers_for_import,
temporary_directory=args.temporary_directory,
)
available_workloads = helpers.get_available_workloads()
helpers.list_available_workloads()
vendor_runner = runners.BaseRunner.create(
benchmark_context=benchmark_context_db_1,
)
cache = helpers.Cache()
client = vendor_runner.fetch_client()
workloads = helpers.filter_workloads(
available_workloads=available_workloads, benchmark_context=benchmark_context_db_1
)
results_db_1 = {}
for workload, queries in workloads:
vendor_runner.clean_db()
generated_queries = workload.dataset_generator()
if generated_queries:
vendor_runner.start_preparation("import")
client.execute(queries=generated_queries, num_workers=benchmark_context_db_1.num_workers_for_import)
vendor_runner.stop("import")
else:
workload.prepare(cache.cache_directory("datasets", workload.NAME, workload.get_variant()))
imported = workload.custom_import()
if not imported:
vendor_runner.start_preparation("import")
print("Executing database cleanup and index setup...")
client.execute(
file_path=workload.get_index(), num_workers=benchmark_context_db_1.num_workers_for_import
)
print("Importing dataset...")
ret = client.execute(
file_path=workload.get_file(), num_workers=benchmark_context_db_1.num_workers_for_import
)
usage = vendor_runner.stop("import")
for group in sorted(queries.keys()):
for query, funcname in queries[group]:
print("Running query:{}/{}/{}".format(group, query, funcname))
func = getattr(workload, funcname)
count = 1
vendor_runner.start_benchmark("validation")
try:
ret = client.execute(queries=get_queries(func, count), num_workers=1, validation=True)[0]
results_db_1[funcname] = ret["results"].items()
except Exception as e:
print("Issue running the query" + funcname)
print(e)
results_db_1[funcname] = "Query not executed properly"
finally:
usage = vendor_runner.stop("validation")
print("Database used {:.3f} seconds of CPU time.".format(usage["cpu"]))
print("Database peaked at {:.3f} MiB of memory.".format(usage["memory"] / 1024.0 / 1024.0))
benchmark_context_db_2 = BenchmarkContext(
vendor_name=args.vendor_name_2,
vendor_binary=args.vendor_binary_2,
benchmark_target_workload=copy.copy(args.benchmarks),
client_binary=args.client_binary,
num_workers_for_import=args.num_workers_for_import,
temporary_directory=args.temporary_directory,
)
vendor_runner = runners.BaseRunner.create(
benchmark_context=benchmark_context_db_2,
)
available_workloads = helpers.get_available_workloads()
workloads = helpers.filter_workloads(available_workloads, benchmark_context=benchmark_context_db_2)
client = vendor_runner.fetch_client()
results_db_2 = {}
for workload, queries in workloads:
vendor_runner.clean_db()
generated_queries = workload.dataset_generator()
if generated_queries:
vendor_runner.start_preparation("import")
client.execute(queries=generated_queries, num_workers=benchmark_context_db_2.num_workers_for_import)
vendor_runner.stop("import")
else:
workload.prepare(cache.cache_directory("datasets", workload.NAME, workload.get_variant()))
imported = workload.custom_import()
if not imported:
vendor_runner.start_preparation("import")
print("Executing database cleanup and index setup...")
client.execute(
file_path=workload.get_index(), num_workers=benchmark_context_db_2.num_workers_for_import
)
print("Importing dataset...")
ret = client.execute(
file_path=workload.get_file(), num_workers=benchmark_context_db_2.num_workers_for_import
)
usage = vendor_runner.stop("import")
for group in sorted(queries.keys()):
for query, funcname in queries[group]:
print("Running query:{}/{}/{}".format(group, query, funcname))
func = getattr(workload, funcname)
count = 1
vendor_runner.start_benchmark("validation")
try:
ret = client.execute(queries=get_queries(func, count), num_workers=1, validation=True)[0]
results_db_2[funcname] = ret["results"].items()
except Exception as e:
print("Issue running the query" + funcname)
print(e)
results_db_2[funcname] = "Query not executed properly"
finally:
usage = vendor_runner.stop("validation")
print("Database used {:.3f} seconds of CPU time.".format(usage["cpu"]))
print("Database peaked at {:.3f} MiB of memory.".format(usage["memory"] / 1024.0 / 1024.0))
validation = {}
for key in results_db_1.keys():
if type(results_db_1[key]) is str:
validation[key] = "Query not executed properly."
else:
db_1_values = set()
for index, value in results_db_1[key]:
db_1_values.add(value)
db_2_values = set()
for index, value in results_db_2[key]:
db_2_values.add(value)
if db_1_values == db_2_values:
validation[key] = "Identical results"
else:
validation[key] = "Different results, check manually."
for key, value in validation.items():
print(key + " " + value)


@ -0,0 +1,4 @@
from pathlib import Path
modules = Path(__file__).resolve().parent.glob("*.py")
__all__ = [f.name[:-3] for f in modules if f.is_file() and not f.name == "__init__.py"]


@ -0,0 +1,197 @@
# Copyright 2022 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.
from abc import ABC, abstractclassmethod
from pathlib import Path
import helpers
from benchmark_context import BenchmarkContext
# Base dataset class used as a template to create each individual dataset. All
# common logic is handled here.
class Workload(ABC):
# Name of the workload/dataset.
NAME = ""
# List of all variants of the workload/dataset that exist.
VARIANTS = ["default"]
# One of the available variants that should be used as the default variant.
DEFAULT_VARIANT = "default"
# List of local files that should be used to import the dataset.
LOCAL_FILE = None
# URLs of remote dataset files that should be used to import the dataset, compressed in gz format.
URL_FILE = None
# Index files
LOCAL_INDEX_FILE = None
URL_INDEX_FILE = None
# Number of vertices/edges for each variant.
SIZES = {
"default": {"vertices": 0, "edges": 0},
}
# Indicates whether the dataset has properties on edges.
PROPERTIES_ON_EDGES = False
def __init_subclass__(cls) -> None:
name_prerequisite = "NAME" in cls.__dict__
generator_prerequisite = "dataset_generator" in cls.__dict__
custom_import_prerequisite = "custom_import" in cls.__dict__
basic_import_prerequisite = ("LOCAL_FILE" in cls.__dict__ or "URL_FILE" in cls.__dict__) and (
"LOCAL_INDEX_FILE" in cls.__dict__ or "URL_INDEX_FILE" in cls.__dict__
)
if not name_prerequisite:
raise ValueError(
"""Can't define a workload class {} without NAME property:
NAME = "dataset name"
Name property defines the workload you want to execute, for example: "demo/*/*/*"
""".format(
cls.__name__
)
)
# Check at class-definition time (not at runtime) that the workload is either in generator mode or in dataset-import mode, not both.
if generator_prerequisite and (custom_import_prerequisite or basic_import_prerequisite):
raise ValueError(
"""
The workload class {} cannot define both a dataset import and a dataset generator at the same time.
""".format(
cls.__name__
)
)
if not generator_prerequisite and (not custom_import_prerequisite and not basic_import_prerequisite):
raise ValueError(
"""
The workload class {} needs to define either a dataset import or a dataset generator.
""".format(
cls.__name__
)
)
return super().__init_subclass__()
def __init__(self, variant: str = None, benchmark_context: BenchmarkContext = None):
"""
Accepts a `variant` variable that indicates which variant
of the dataset should be executed
"""
self.benchmark_context = benchmark_context
self._variant = variant
self._vendor = benchmark_context.vendor_name
self._file = None
self._file_index = None
if self.NAME == "":
raise ValueError("Give your workload a name, by setting self.NAME")
if variant is None:
variant = self.DEFAULT_VARIANT
if variant not in self.VARIANTS:
raise ValueError("Invalid test variant!")
if (self.LOCAL_FILE and variant not in self.LOCAL_FILE) and (self.URL_FILE and variant not in self.URL_FILE):
raise ValueError("The variant doesn't have a defined URL or LOCAL file path!")
if variant not in self.SIZES:
raise ValueError("The variant doesn't have a defined dataset " "size!")
if (self.LOCAL_INDEX_FILE and self._vendor not in self.LOCAL_INDEX_FILE) and (
self.URL_INDEX_FILE and self._vendor not in self.URL_INDEX_FILE
):
raise ValueError("Vendor does not have INDEX for dataset!")
if self.LOCAL_FILE is not None:
self._local_file = self.LOCAL_FILE.get(variant, None)
else:
self._local_file = None
if self.URL_FILE is not None:
self._url_file = self.URL_FILE.get(variant, None)
else:
self._url_file = None
if self.LOCAL_INDEX_FILE is not None:
self._local_index = self.LOCAL_INDEX_FILE.get(self._vendor, None)
else:
self._local_index = None
if self.URL_INDEX_FILE is not None:
self._url_index = self.URL_INDEX_FILE.get(self._vendor, None)
else:
self._url_index = None
self._size = self.SIZES[variant]
if "vertices" in self._size or "edges" in self._size:
self._num_vertices = self._size["vertices"]
self._num_edges = self._size["edges"]
def prepare(self, directory):
if self._local_file is not None:
print("Using local dataset file:", self._local_file)
self._file = self._local_file
elif self._url_file is not None:
cached_input, exists = directory.get_file("dataset.cypher")
if not exists:
print("Downloading dataset file:", self._url_file)
downloaded_file = helpers.download_file(self._url_file, directory.get_path())
print("Unpacking and caching file:", downloaded_file)
helpers.unpack_gz_and_move_file(downloaded_file, cached_input)
print("Using cached dataset file:", cached_input)
self._file = cached_input
if self._local_index is not None:
print("Using local index file:", self._local_index)
self._file_index = self._local_index
elif self._url_index is not None:
cached_index, exists = directory.get_file(self._vendor + ".cypher")
if not exists:
print("Downloading index file:", self._url_index)
downloaded_file = helpers.download_file(self._url_index, directory.get_path())
print("Unpacking and caching file:", downloaded_file)
helpers.unpack_gz_and_move_file(downloaded_file, cached_index)
print("Using cached index file:", cached_index)
self._file_index = cached_index
def get_variant(self):
"""Returns the current variant of the dataset."""
return self._variant
def get_index(self):
"""Get index file, defined by vendor"""
return self._file_index
def get_file(self):
"""
Returns path to the file that contains dataset creation queries.
"""
return self._file
def get_size(self):
"""Returns number of vertices/edges for the current variant."""
return self._size
def custom_import(self) -> bool:
print("Workload does not have a custom import")
return False
def dataset_generator(self) -> list:
print("Workload is not auto generated")
return []
# All tests should be query generator functions that output all of the
# queries that should be executed by the runner. The functions should be
# named `benchmark__GROUPNAME__TESTNAME` and should not accept any
# arguments.
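# For illustration only (a hypothetical workload, not part of this commit), a
# subclass following that convention could look like:
#
#   class Movies(Workload):
#       NAME = "movies"
#       URL_FILE = {"default": "https://example.com/movies.cypher.gz"}  # placeholder URL
#       URL_INDEX_FILE = {"memgraph": "https://example.com/movies_index.cypher"}  # placeholder URL
#       SIZES = {"default": {"vertices": 100, "edges": 200}}
#
#       def benchmark__read__single_vertex(self):
#           return ("MATCH (m:Movie {id: $id}) RETURN m", {"id": 1})
#
# The runner discovers such methods by their benchmark__GROUPNAME__TESTNAME name and
# treats the returned (query, parameters) tuple as one benchmark query.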

View File

@ -0,0 +1,28 @@
import random
from workloads.base import Workload
class Demo(Workload):
NAME = "demo"
def dataset_generator(self):
queries = [("MATCH (n) DETACH DELETE n;", {})]
for i in range(0, 100):
queries.append(("CREATE (:NodeA{{ id:{}}});".format(i), {}))
queries.append(("CREATE (:NodeB{{ id:{}}});".format(i), {}))
for i in range(0, 100):
a = random.randint(0, 99)
b = random.randint(0, 99)
queries.append(("MATCH(a:NodeA{{ id: {}}}),(b:NodeB{{id: {}}}) CREATE (a)-[:EDGE]->(b)".format(a, b), {}))
return queries
def benchmark__test__sample_query1(self):
return ("MATCH (n) RETURN n", {})
def benchmark__test__sample_query2(self):
return ("MATCH (n) RETURN n", {})

View File

@ -0,0 +1,213 @@
import csv
import subprocess
from collections import defaultdict
from pathlib import Path
import helpers
from benchmark_context import BenchmarkContext
from runners import BaseRunner
HEADERS_URL = "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/headers.tar.gz"
class ImporterLDBCBI:
def __init__(
self, benchmark_context: BenchmarkContext, dataset_name: str, variant: str, index_file: str, csv_dict: dict
) -> None:
self._benchmark_context = benchmark_context
self._dataset_name = dataset_name
self._variant = variant
self._index_file = index_file
self._csv_dict = csv_dict
def execute_import(self):
vendor_runner = BaseRunner.create(
benchmark_context=self._benchmark_context,
)
client = vendor_runner.fetch_client()
if self._benchmark_context.vendor_name == "neo4j":
data_dir = Path() / ".cache" / "datasets" / self._dataset_name / self._variant / "data_neo4j"
data_dir.mkdir(parents=True, exist_ok=True)
dir_name = self._csv_dict[self._variant].split("/")[-1:][0].removesuffix(".tar.zst")
if (data_dir / dir_name).exists():
print("Files downloaded")
data_dir = data_dir / dir_name
else:
print("Downloading files")
downloaded_file = helpers.download_file(self._csv_dict[self._variant], data_dir.absolute())
print("Unpacking the file..." + downloaded_file)
data_dir = helpers.unpack_tar_zst(Path(downloaded_file))
headers_dir = Path() / ".cache" / "datasets" / self._dataset_name / self._variant / "headers_neo4j"
headers_dir.mkdir(parents=True, exist_ok=True)
headers = HEADERS_URL.split("/")[-1:][0].removesuffix(".tar.gz")
if (headers_dir / headers).exists():
print("Header files downloaded.")
else:
print("Downloading files")
downloaded_file = helpers.download_file(HEADERS_URL, headers_dir.absolute())
print("Unpacking the file..." + downloaded_file)
headers_dir = helpers.unpack_tar_gz(Path(downloaded_file))
input_headers = {}
for header_file in headers_dir.glob("**/*.csv"):
key = "/".join(header_file.parts[-2:])[0:-4]
input_headers[key] = header_file.as_posix()
for data_file in data_dir.glob("**/*.gz"):
if "initial_snapshot" in data_file.parts:
data_file = helpers.unpack_gz(data_file)
output = data_file.parent / (data_file.stem + "_neo" + ".csv")
if not output.exists():
with data_file.open("r") as input_f, output.open("a") as output_f:
reader = csv.reader(input_f, delimiter="|")
header = next(reader)
writer = csv.writer(output_f, delimiter="|")
for line in reader:
writer.writerow(line)
else:
print("Files converted")
input_files = defaultdict(list)
for neo_file in data_dir.glob("**/*_neo.csv"):
key = "/".join(neo_file.parts[-3:-1])
input_files[key].append(neo_file.as_posix())
vendor_runner.clean_db()
subprocess.run(
args=[
vendor_runner._neo4j_admin,
"database",
"import",
"full",
"--id-type=INTEGER",
"--ignore-empty-strings=true",
"--bad-tolerance=0",
"--nodes=Place=" + input_headers["static/Place"] + "," + ",".join(input_files["static/Place"]),
"--nodes=Organisation="
+ input_headers["static/Organisation"]
+ ","
+ ",".join(input_files["static/Organisation"]),
"--nodes=TagClass="
+ input_headers["static/TagClass"]
+ ","
+ ",".join(input_files["static/TagClass"]),
"--nodes=Tag=" + input_headers["static/Tag"] + "," + ",".join(input_files["static/Tag"]),
"--nodes=Forum=" + input_headers["dynamic/Forum"] + "," + ",".join(input_files["dynamic/Forum"]),
"--nodes=Person=" + input_headers["dynamic/Person"] + "," + ",".join(input_files["dynamic/Person"]),
"--nodes=Message:Comment="
+ input_headers["dynamic/Comment"]
+ ","
+ ",".join(input_files["dynamic/Comment"]),
"--nodes=Message:Post="
+ input_headers["dynamic/Post"]
+ ","
+ ",".join(input_files["dynamic/Post"]),
"--relationships=IS_PART_OF="
+ input_headers["static/Place_isPartOf_Place"]
+ ","
+ ",".join(input_files["static/Place_isPartOf_Place"]),
"--relationships=IS_SUBCLASS_OF="
+ input_headers["static/TagClass_isSubclassOf_TagClass"]
+ ","
+ ",".join(input_files["static/TagClass_isSubclassOf_TagClass"]),
"--relationships=IS_LOCATED_IN="
+ input_headers["static/Organisation_isLocatedIn_Place"]
+ ","
+ ",".join(input_files["static/Organisation_isLocatedIn_Place"]),
"--relationships=HAS_TYPE="
+ input_headers["static/Tag_hasType_TagClass"]
+ ","
+ ",".join(input_files["static/Tag_hasType_TagClass"]),
"--relationships=HAS_CREATOR="
+ input_headers["dynamic/Comment_hasCreator_Person"]
+ ","
+ ",".join(input_files["dynamic/Comment_hasCreator_Person"]),
"--relationships=IS_LOCATED_IN="
+ input_headers["dynamic/Comment_isLocatedIn_Country"]
+ ","
+ ",".join(input_files["dynamic/Comment_isLocatedIn_Country"]),
"--relationships=REPLY_OF="
+ input_headers["dynamic/Comment_replyOf_Comment"]
+ ","
+ ",".join(input_files["dynamic/Comment_replyOf_Comment"]),
"--relationships=REPLY_OF="
+ input_headers["dynamic/Comment_replyOf_Post"]
+ ","
+ ",".join(input_files["dynamic/Comment_replyOf_Post"]),
"--relationships=CONTAINER_OF="
+ input_headers["dynamic/Forum_containerOf_Post"]
+ ","
+ ",".join(input_files["dynamic/Forum_containerOf_Post"]),
"--relationships=HAS_MEMBER="
+ input_headers["dynamic/Forum_hasMember_Person"]
+ ","
+ ",".join(input_files["dynamic/Forum_hasMember_Person"]),
"--relationships=HAS_MODERATOR="
+ input_headers["dynamic/Forum_hasModerator_Person"]
+ ","
+ ",".join(input_files["dynamic/Forum_hasModerator_Person"]),
"--relationships=HAS_TAG="
+ input_headers["dynamic/Forum_hasTag_Tag"]
+ ","
+ ",".join(input_files["dynamic/Forum_hasTag_Tag"]),
"--relationships=HAS_INTEREST="
+ input_headers["dynamic/Person_hasInterest_Tag"]
+ ","
+ ",".join(input_files["dynamic/Person_hasInterest_Tag"]),
"--relationships=IS_LOCATED_IN="
+ input_headers["dynamic/Person_isLocatedIn_City"]
+ ","
+ ",".join(input_files["dynamic/Person_isLocatedIn_City"]),
"--relationships=KNOWS="
+ input_headers["dynamic/Person_knows_Person"]
+ ","
+ ",".join(input_files["dynamic/Person_knows_Person"]),
"--relationships=LIKES="
+ input_headers["dynamic/Person_likes_Comment"]
+ ","
+ ",".join(input_files["dynamic/Person_likes_Comment"]),
"--relationships=LIKES="
+ input_headers["dynamic/Person_likes_Post"]
+ ","
+ ",".join(input_files["dynamic/Person_likes_Post"]),
"--relationships=HAS_CREATOR="
+ input_headers["dynamic/Post_hasCreator_Person"]
+ ","
+ ",".join(input_files["dynamic/Post_hasCreator_Person"]),
"--relationships=HAS_TAG="
+ input_headers["dynamic/Comment_hasTag_Tag"]
+ ","
+ ",".join(input_files["dynamic/Comment_hasTag_Tag"]),
"--relationships=HAS_TAG="
+ input_headers["dynamic/Post_hasTag_Tag"]
+ ","
+ ",".join(input_files["dynamic/Post_hasTag_Tag"]),
"--relationships=IS_LOCATED_IN="
+ input_headers["dynamic/Post_isLocatedIn_Country"]
+ ","
+ ",".join(input_files["dynamic/Post_isLocatedIn_Country"]),
"--relationships=STUDY_AT="
+ input_headers["dynamic/Person_studyAt_University"]
+ ","
+ ",".join(input_files["dynamic/Person_studyAt_University"]),
"--relationships=WORK_AT="
+ input_headers["dynamic/Person_workAt_Company"]
+ ","
+ ",".join(input_files["dynamic/Person_workAt_Company"]),
"--delimiter",
"|",
"neo4j",
],
check=True,
)
vendor_runner.start_preparation("Index preparation")
print("Executing database index setup")
client.execute(file_path=self._index_file, num_workers=1)
vendor_runner.stop("Stop index preparation")
return True
else:
return False

View File

@ -0,0 +1,163 @@
import csv
import subprocess
from pathlib import Path
import helpers
from benchmark_context import BenchmarkContext
from runners import BaseRunner
# Removed speaks/email from person header
HEADERS_INTERACTIVE = {
"static/organisation": "id:ID(Organisation)|:LABEL|name:STRING|url:STRING",
"static/place": "id:ID(Place)|name:STRING|url:STRING|:LABEL",
"static/tagclass": "id:ID(TagClass)|name:STRING|url:STRING",
"static/tag": "id:ID(Tag)|name:STRING|url:STRING",
"static/tagclass_isSubclassOf_tagclass": ":START_ID(TagClass)|:END_ID(TagClass)",
"static/tag_hasType_tagclass": ":START_ID(Tag)|:END_ID(TagClass)",
"static/organisation_isLocatedIn_place": ":START_ID(Organisation)|:END_ID(Place)",
"static/place_isPartOf_place": ":START_ID(Place)|:END_ID(Place)",
"dynamic/comment": "id:ID(Comment)|creationDate:LOCALDATETIME|locationIP:STRING|browserUsed:STRING|content:STRING|length:INT",
"dynamic/forum": "id:ID(Forum)|title:STRING|creationDate:LOCALDATETIME",
"dynamic/person": "id:ID(Person)|firstName:STRING|lastName:STRING|gender:STRING|birthday:LOCALDATETIME|creationDate:LOCALDATETIME|locationIP:STRING|browserUsed:STRING",
"dynamic/post": "id:ID(Post)|imageFile:STRING|creationDate:LOCALDATETIME|locationIP:STRING|browserUsed:STRING|language:STRING|content:STRING|length:INT",
"dynamic/comment_hasCreator_person": ":START_ID(Comment)|:END_ID(Person)",
"dynamic/comment_isLocatedIn_place": ":START_ID(Comment)|:END_ID(Place)",
"dynamic/comment_replyOf_comment": ":START_ID(Comment)|:END_ID(Comment)",
"dynamic/comment_replyOf_post": ":START_ID(Comment)|:END_ID(Post)",
"dynamic/forum_containerOf_post": ":START_ID(Forum)|:END_ID(Post)",
"dynamic/forum_hasMember_person": ":START_ID(Forum)|:END_ID(Person)|joinDate:LOCALDATETIME",
"dynamic/forum_hasModerator_person": ":START_ID(Forum)|:END_ID(Person)",
"dynamic/forum_hasTag_tag": ":START_ID(Forum)|:END_ID(Tag)",
"dynamic/person_hasInterest_tag": ":START_ID(Person)|:END_ID(Tag)",
"dynamic/person_isLocatedIn_place": ":START_ID(Person)|:END_ID(Place)",
"dynamic/person_knows_person": ":START_ID(Person)|:END_ID(Person)|creationDate:LOCALDATETIME",
"dynamic/person_likes_comment": ":START_ID(Person)|:END_ID(Comment)|creationDate:LOCALDATETIME",
"dynamic/person_likes_post": ":START_ID(Person)|:END_ID(Post)|creationDate:LOCALDATETIME",
"dynamic/person_studyAt_organisation": ":START_ID(Person)|:END_ID(Organisation)|classYear:INT",
"dynamic/person_workAt_organisation": ":START_ID(Person)|:END_ID(Organisation)|workFrom:INT",
"dynamic/post_hasCreator_person": ":START_ID(Post)|:END_ID(Person)",
"dynamic/comment_hasTag_tag": ":START_ID(Comment)|:END_ID(Tag)",
"dynamic/post_hasTag_tag": ":START_ID(Post)|:END_ID(Tag)",
"dynamic/post_isLocatedIn_place": ":START_ID(Post)|:END_ID(Place)",
}
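# Illustration: the "dynamic/person_knows_person" entry above becomes the first row of the
# converted person_knows_person_0_0_neo.csv file, i.e.
#
#   :START_ID(Person)|:END_ID(Person)|creationDate:LOCALDATETIME
#
# so that `neo4j-admin database import` can read node references and property types from the
# header, while the data rows below it keep the original "|" delimiter (with the time zone
# suffix trimmed from creationDate/joinDate values, as done in the conversion loop below).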
class ImporterLDBCInteractive:
def __init__(
self, benchmark_context: BenchmarkContext, dataset_name: str, variant: str, index_file: str, csv_dict: dict
) -> None:
self._benchmark_context = benchmark_context
self._dataset_name = dataset_name
self._variant = variant
self._index_file = index_file
self._csv_dict = csv_dict
def execute_import(self):
vendor_runner = BaseRunner.create(
benchmark_context=self._benchmark_context,
)
client = vendor_runner.fetch_client()
if self._benchmark_context.vendor_name == "neo4j":
print("Runnning Neo4j import")
dump_dir = Path() / ".cache" / "datasets" / self._dataset_name / self._variant / "dump"
dump_dir.mkdir(parents=True, exist_ok=True)
dir_name = self._csv_dict[self._variant].split("/")[-1:][0].removesuffix(".tar.zst")
if (dump_dir / dir_name).exists():
print("Files downloaded")
dump_dir = dump_dir / dir_name
else:
print("Downloading files")
downloaded_file = helpers.download_file(self._csv_dict[self._variant], dump_dir.absolute())
print("Unpacking the file..." + downloaded_file)
dump_dir = helpers.unpack_tar_zst(Path(downloaded_file))
input_files = {}
for file in dump_dir.glob("*/*0.csv"):
parts = file.parts[-2:]
key = parts[0] + "/" + parts[1][:-8]
input_files[key] = file
output_files = {}
for key, file in input_files.items():
output = file.parent / (file.stem + "_neo" + ".csv")
if not output.exists():
with file.open("r") as input_f, output.open("a") as output_f:
reader = csv.reader(input_f, delimiter="|")
header = next(reader)
writer = csv.writer(output_f, delimiter="|")
if key in HEADERS_INTERACTIVE.keys():
updated_header = HEADERS_INTERACTIVE[key].split("|")
writer.writerow(updated_header)
for line in reader:
if "creationDate" in header:
pos = header.index("creationDate")
line[pos] = line[pos][0:-5]
elif "joinDate" in header:
pos = header.index("joinDate")
line[pos] = line[pos][0:-5]
if "organisation_0_0.csv" == file.name:
writer.writerow([line[0], line[1].capitalize(), line[2], line[3]])
elif "place_0_0.csv" == file.name:
writer.writerow([line[0], line[1], line[2], line[3].capitalize()])
else:
writer.writerow(line)
output_files[key] = output.as_posix()
vendor_runner.clean_db()
subprocess.run(
args=[
vendor_runner._neo4j_admin,
"database",
"import",
"full",
"--id-type=INTEGER",
"--nodes=Place=" + output_files["static/place"],
"--nodes=Organisation=" + output_files["static/organisation"],
"--nodes=TagClass=" + output_files["static/tagclass"],
"--nodes=Tag=" + output_files["static/tag"],
"--nodes=Comment:Message=" + output_files["dynamic/comment"],
"--nodes=Forum=" + output_files["dynamic/forum"],
"--nodes=Person=" + output_files["dynamic/person"],
"--nodes=Post:Message=" + output_files["dynamic/post"],
"--relationships=IS_PART_OF=" + output_files["static/place_isPartOf_place"],
"--relationships=IS_SUBCLASS_OF=" + output_files["static/tagclass_isSubclassOf_tagclass"],
"--relationships=IS_LOCATED_IN=" + output_files["static/organisation_isLocatedIn_place"],
"--relationships=HAS_TYPE=" + output_files["static/tag_hasType_tagclass"],
"--relationships=HAS_CREATOR=" + output_files["dynamic/comment_hasCreator_person"],
"--relationships=IS_LOCATED_IN=" + output_files["dynamic/comment_isLocatedIn_place"],
"--relationships=REPLY_OF=" + output_files["dynamic/comment_replyOf_comment"],
"--relationships=REPLY_OF=" + output_files["dynamic/comment_replyOf_post"],
"--relationships=CONTAINER_OF=" + output_files["dynamic/forum_containerOf_post"],
"--relationships=HAS_MEMBER=" + output_files["dynamic/forum_hasMember_person"],
"--relationships=HAS_MODERATOR=" + output_files["dynamic/forum_hasModerator_person"],
"--relationships=HAS_TAG=" + output_files["dynamic/forum_hasTag_tag"],
"--relationships=HAS_INTEREST=" + output_files["dynamic/person_hasInterest_tag"],
"--relationships=IS_LOCATED_IN=" + output_files["dynamic/person_isLocatedIn_place"],
"--relationships=KNOWS=" + output_files["dynamic/person_knows_person"],
"--relationships=LIKES=" + output_files["dynamic/person_likes_comment"],
"--relationships=LIKES=" + output_files["dynamic/person_likes_post"],
"--relationships=HAS_CREATOR=" + output_files["dynamic/post_hasCreator_person"],
"--relationships=HAS_TAG=" + output_files["dynamic/comment_hasTag_tag"],
"--relationships=HAS_TAG=" + output_files["dynamic/post_hasTag_tag"],
"--relationships=IS_LOCATED_IN=" + output_files["dynamic/post_isLocatedIn_place"],
"--relationships=STUDY_AT=" + output_files["dynamic/person_studyAt_organisation"],
"--relationships=WORK_AT=" + output_files["dynamic/person_workAt_organisation"],
"--delimiter",
"|",
"neo4j",
],
check=True,
)
vendor_runner.start_preparation("Index preparation")
print("Executing database index setup")
client.execute(file_path=self._index_file, num_workers=1)
vendor_runner.stop("Stop index preparation")
return True
else:
return False

View File

@ -0,0 +1,41 @@
from pathlib import Path
from benchmark_context import BenchmarkContext
from runners import BaseRunner
class ImporterPokec:
def __init__(
self, benchmark_context: BenchmarkContext, dataset_name: str, variant: str, index_file: str, dataset_file: str
) -> None:
self._benchmark_context = benchmark_context
self._dataset_name = dataset_name
self._variant = variant
self._index_file = index_file
self._dataset_file = dataset_file
def execute_import(self):
if self._benchmark_context.vendor_name == "neo4j":
vendor_runner = BaseRunner.create(
benchmark_context=self._benchmark_context,
)
client = vendor_runner.fetch_client()
vendor_runner.clean_db()
vendor_runner.start_preparation("preparation")
print("Executing database cleanup and index setup...")
client.execute(file_path=self._index_file, num_workers=1)
vendor_runner.stop("preparation")
neo4j_dump = Path() / ".cache" / "datasets" / self._dataset_name / self._variant / "neo4j.dump"
if neo4j_dump.exists():
vendor_runner.load_db_from_dump(path=neo4j_dump.parent)
else:
vendor_runner.start_preparation("import")
print("Importing dataset...")
client.execute(file_path=self._dataset_file, num_workers=self._benchmark_context.num_workers_for_import)
vendor_runner.stop("import")
vendor_runner.dump_db(path=neo4j_dump.parent)
return True
else:
return False

View File

@ -0,0 +1,708 @@
import inspect
import random
from pathlib import Path
import helpers
from benchmark_context import BenchmarkContext
from workloads.base import Workload
from workloads.importers.importer_ldbc_bi import ImporterLDBCBI
class LDBC_BI(Workload):
NAME = "ldbc_bi"
VARIANTS = ["sf1", "sf3", "sf10"]
DEFAULT_VARIANT = "sf1"
URL_FILE = {
"sf1": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/ldbc_bi_sf1.cypher.gz",
"sf3": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/ldbc_bi_sf3.cypher.gz",
"sf10": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/ldbc_bi_sf10.cypher.gz",
}
URL_CSV = {
"sf1": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf1-composite-projected-fk.tar.zst",
"sf3": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf3-composite-projected-fk.tar.zst",
"sf10": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf10-composite-projected-fk.tar.zst",
}
SIZES = {
"sf1": {"vertices": 2997352, "edges": 17196776},
"sf3": {"vertices": 1, "edges": 1},
"sf10": {"vertices": 1, "edges": 1},
}
LOCAL_INDEX_FILE = None
URL_INDEX_FILE = {
"memgraph": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/memgraph_bi_index.cypher",
"neo4j": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/neo4j_bi_index.cypher",
}
QUERY_PARAMETERS = {
"sf1": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/parameters-2022-10-01.zip",
"sf3": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/parameters-2022-10-01.zip",
"sf10": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/parameters-2022-10-01.zip",
}
def custom_import(self) -> bool:
importer = ImporterLDBCBI(
benchmark_context=self.benchmark_context,
dataset_name=self.NAME,
variant=self._variant,
index_file=self._file_index,
csv_dict=self.URL_CSV,
)
return importer.execute_import()
def _prepare_parameters_directory(self):
parameters = Path() / ".cache" / "datasets" / self.NAME / self._variant / "parameters"
parameters.mkdir(parents=True, exist_ok=True)
if parameters.exists() and any(parameters.iterdir()):
print("Files downloaded.")
else:
print("Downloading files")
downloaded_file = helpers.download_file(self.QUERY_PARAMETERS[self._variant], parameters.parent.absolute())
print("Unpacking the file..." + downloaded_file)
parameters = helpers.unpack_zip(Path(downloaded_file))
return parameters / ("parameters-" + self._variant)
def _get_query_parameters(self) -> dict:
func_name = inspect.stack()[1].function
parameters = {}
for file in self._parameters_dir.glob("bi-*.csv"):
file_name_query_id = file.name.split("-")[1][0:-4]
func_name_id = func_name.split("_")[-2]
if file_name_query_id == func_name_id or file_name_query_id == func_name_id + "a":
with file.open("r") as input:
lines = input.readlines()
header = lines[0].strip("\n").split("|")
position = random.randint(1, len(lines) - 1)
data = lines[position].strip("\n").split("|")
for i in range(len(header)):
key, value_type = header[i].split(":")
if value_type == "DATETIME":
# Drop time zone
converted = data[i][0:-6]
parameters[key] = converted
elif value_type == "DATE":
converted = data[i] + "T00:00:00"
parameters[key] = converted
elif value_type == "INT":
parameters[key] = int(data[i])
elif value_type == "STRING[]":
elements = data[i].split(";")
parameters[key] = elements
else:
parameters[key] = data[i]
break
return parameters
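# Illustration (file contents are hypothetical): given a parameter file such as
# parameters-sf1/bi-1.csv containing
#
#   datetime:DATETIME
#   2011-12-01T00:00:00.000+00:00
#
# a call from benchmark__bi__query_1_analytical picks a random data row and returns
# {"datetime": "2011-12-01T00:00:00.000"}, i.e. the time zone suffix is dropped so the
# value can be passed as $datetime to both the memgraph and neo4j variants of the query.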
def __init__(self, variant=None, benchmark_context: BenchmarkContext = None):
super().__init__(variant, benchmark_context=benchmark_context)
self._parameters_dir = self._prepare_parameters_directory()
def benchmark__bi__query_1_analytical(self):
memgraph = (
"""
MATCH (message:Message)
WHERE message.creationDate < localDateTime($datetime)
WITH count(message) AS totalMessageCountInt
WITH toFloat(totalMessageCountInt) AS totalMessageCount
MATCH (message:Message)
WHERE message.creationDate < localDateTime($datetime)
AND message.content IS NOT NULL
WITH
totalMessageCount,
message,
message.creationDate.year AS year
WITH
totalMessageCount,
year,
message:Comment AS isComment,
CASE
WHEN message.length < 40 THEN 0
WHEN message.length < 80 THEN 1
WHEN message.length < 160 THEN 2
ELSE 3
END AS lengthCategory,
count(message) AS messageCount,
sum(message.length) / toFloat(count(message)) AS averageMessageLength,
sum(message.length) AS sumMessageLength
RETURN
year,
isComment,
lengthCategory,
messageCount,
averageMessageLength,
sumMessageLength,
messageCount / totalMessageCount AS percentageOfMessages
ORDER BY
year DESC,
isComment ASC,
lengthCategory ASC
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
neo4j = (
"""
MATCH (message:Message)
WHERE message.creationDate < DateTime($datetime)
WITH count(message) AS totalMessageCountInt
WITH toFloat(totalMessageCountInt) AS totalMessageCount
MATCH (message:Message)
WHERE message.creationDate < DateTime($datetime)
AND message.content IS NOT NULL
WITH
totalMessageCount,
message,
message.creationDate.year AS year
WITH
totalMessageCount,
year,
message:Comment AS isComment,
CASE
WHEN message.length < 40 THEN 0
WHEN message.length < 80 THEN 1
WHEN message.length < 160 THEN 2
ELSE 3
END AS lengthCategory,
count(message) AS messageCount,
sum(message.length) / toFloat(count(message)) AS averageMessageLength,
sum(message.length) AS sumMessageLength
RETURN
year,
isComment,
lengthCategory,
messageCount,
averageMessageLength,
sumMessageLength,
messageCount / totalMessageCount AS percentageOfMessages
ORDER BY
year DESC,
isComment ASC,
lengthCategory ASC
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j
def benchmark__bi__query_2_analytical(self):
memgraph = (
"""
MATCH (tag:Tag)-[:HAS_TYPE]->(:TagClass {name: $tagClass})
OPTIONAL MATCH (message1:Message)-[:HAS_TAG]->(tag)
WHERE localDateTime($date) <= message1.creationDate
AND message1.creationDate < localDateTime($date) + duration({day: 100})
WITH tag, count(message1) AS countWindow1
OPTIONAL MATCH (message2:Message)-[:HAS_TAG]->(tag)
WHERE localDateTime($date) + duration({day: 100}) <= message2.creationDate
AND message2.creationDate < localDateTime($date) + duration({day: 200})
WITH
tag,
countWindow1,
count(message2) AS countWindow2
RETURN
tag.name,
countWindow1,
countWindow2,
abs(countWindow1 - countWindow2) AS diff
ORDER BY
diff DESC,
tag.name ASC
LIMIT 100
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
neo4j = (
"""
MATCH (tag:Tag)-[:HAS_TYPE]->(:TagClass {name: $tagClass})
OPTIONAL MATCH (message1:Message)-[:HAS_TAG]->(tag)
WHERE DateTime($date) <= message1.creationDate
AND message1.creationDate < DateTime($date) + duration({days: 100})
WITH tag, count(message1) AS countWindow1
OPTIONAL MATCH (message2:Message)-[:HAS_TAG]->(tag)
WHERE DateTime($date) + duration({days: 100}) <= message2.creationDate
AND message2.creationDate < DateTime($date) + duration({days: 200})
WITH
tag,
countWindow1,
count(message2) AS countWindow2
RETURN
tag.name,
countWindow1,
countWindow2,
abs(countWindow1 - countWindow2) AS diff
ORDER BY
diff DESC,
tag.name ASC
LIMIT 100
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j
def benchmark__bi__query_3_analytical(self):
return (
"""
MATCH
(:Country {name: $country})<-[:IS_PART_OF]-(:City)<-[:IS_LOCATED_IN]-
(person:Person)<-[:HAS_MODERATOR]-(forum:Forum)-[:CONTAINER_OF]->
(post:Post)<-[:REPLY_OF*0..]-(message:Message)-[:HAS_TAG]->(:Tag)-[:HAS_TYPE]->(:TagClass {name: $tagClass})
RETURN
forum.id as id,
forum.title,
person.id,
count(DISTINCT message) AS messageCount
ORDER BY
messageCount DESC,
id ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__bi__query_5_analytical(self):
return (
"""
MATCH (tag:Tag {name: $tag})<-[:HAS_TAG]-(message:Message)-[:HAS_CREATOR]->(person:Person)
OPTIONAL MATCH (message)<-[likes:LIKES]-(:Person)
WITH person, message, count(likes) AS likeCount
OPTIONAL MATCH (message)<-[:REPLY_OF]-(reply:Comment)
WITH person, message, likeCount, count(reply) AS replyCount
WITH person, count(message) AS messageCount, sum(likeCount) AS likeCount, sum(replyCount) AS replyCount
RETURN
person.id,
replyCount,
likeCount,
messageCount,
1*messageCount + 2*replyCount + 10*likeCount AS score
ORDER BY
score DESC,
person.id ASC
LIMIT 100
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__bi__query_6_analytical(self):
return (
"""
MATCH (tag:Tag {name: $tag})<-[:HAS_TAG]-(message1:Message)-[:HAS_CREATOR]->(person1:Person)
OPTIONAL MATCH (message1)<-[:LIKES]-(person2:Person)
OPTIONAL MATCH (person2)<-[:HAS_CREATOR]-(message2:Message)<-[like:LIKES]-(person3:Person)
RETURN
person1.id as id,
count(DISTINCT like) AS authorityScore
ORDER BY
authorityScore DESC,
id ASC
LIMIT 100
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__bi__query_7_analytical(self):
memgraph = (
"""
MATCH
(tag:Tag {name: $tag})<-[:HAS_TAG]-(message:Message),
(message)<-[:REPLY_OF]-(comment:Comment)-[:HAS_TAG]->(relatedTag:Tag)
OPTIONAL MATCH (comment)-[:HAS_TAG]->(tag)
WHERE tag IS NOT NULL
RETURN
relatedTag,
count(DISTINCT comment) AS count
ORDER BY
relatedTag.name ASC,
count DESC
LIMIT 100
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
neo4j = (
"""
MATCH
(tag:Tag {name: $tag})<-[:HAS_TAG]-(message:Message),
(message)<-[:REPLY_OF]-(comment:Comment)-[:HAS_TAG]->(relatedTag:Tag)
WHERE NOT (comment)-[:HAS_TAG]->(tag)
RETURN
relatedTag.name,
count(DISTINCT comment) AS count
ORDER BY
relatedTag.name ASC,
count DESC
LIMIT 100
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j
def benchmark__bi__query_9_analytical(self):
memgraph = (
"""
MATCH (person:Person)<-[:HAS_CREATOR]-(post:Post)<-[:REPLY_OF*0..]-(reply:Message)
WHERE post.creationDate >= localDateTime($startDate)
AND post.creationDate <= localDateTime($endDate)
AND reply.creationDate >= localDateTime($startDate)
AND reply.creationDate <= localDateTime($endDate)
RETURN
person.id as id,
person.firstName,
person.lastName,
count(DISTINCT post) AS threadCount,
count(DISTINCT reply) AS messageCount
ORDER BY
messageCount DESC,
id ASC
LIMIT 100
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
neo4j = (
"""
MATCH (person:Person)<-[:HAS_CREATOR]-(post:Post)<-[:REPLY_OF*0..]-(reply:Message)
WHERE post.creationDate >= DateTime($startDate)
AND post.creationDate <= DateTime($endDate)
AND reply.creationDate >= DateTime($startDate)
AND reply.creationDate <= DateTime($endDate)
RETURN
person.id as id,
person.firstName,
person.lastName,
count(DISTINCT post) AS threadCount,
count(DISTINCT reply) AS messageCount
ORDER BY
messageCount DESC,
id ASC
LIMIT 100
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j
def benchmark__bi__query_11_analytical(self):
return (
"""
MATCH (a:Person)-[:IS_LOCATED_IN]->(:City)-[:IS_PART_OF]->(country:Country {name: $country}),
(a)-[k1:KNOWS]-(b:Person)
WHERE a.id < b.id
AND localDateTime($startDate) <= k1.creationDate AND k1.creationDate <= localDateTime($endDate)
WITH DISTINCT country, a, b
MATCH (b)-[:IS_LOCATED_IN]->(:City)-[:IS_PART_OF]->(country)
WITH DISTINCT country, a, b
MATCH (b)-[k2:KNOWS]-(c:Person),
(c)-[:IS_LOCATED_IN]->(:City)-[:IS_PART_OF]->(country)
WHERE b.id < c.id
AND localDateTime($startDate) <= k2.creationDate AND k2.creationDate <= localDateTime($endDate)
WITH DISTINCT a, b, c
MATCH (c)-[k3:KNOWS]-(a)
WHERE localDateTime($startDate) <= k3.creationDate AND k3.creationDate <= localDateTime($endDate)
WITH DISTINCT a, b, c
RETURN count(*) AS count
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__bi__query_12_analytical(self):
return (
"""
MATCH (person:Person)
OPTIONAL MATCH (person)<-[:HAS_CREATOR]-(message:Message)-[:REPLY_OF*0..]->(post:Post)
WHERE message.content IS NOT NULL
AND message.length < $lengthThreshold
AND message.creationDate > localDateTime($startDate)
AND post.language IN $languages
WITH
person,
count(message) AS messageCount
RETURN
messageCount,
count(person) AS personCount
ORDER BY
personCount DESC,
messageCount DESC
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__bi__query_13_analytical(self):
memgraph = (
"""
MATCH (country:Country {name: $country})<-[:IS_PART_OF]-(:City)<-[:IS_LOCATED_IN]-(zombie:Person)
WHERE zombie.creationDate < localDateTime($endDate)
WITH country, zombie
OPTIONAL MATCH (zombie)<-[:HAS_CREATOR]-(message:Message)
WHERE message.creationDate < localDateTime($endDate)
WITH
country,
zombie,
count(message) AS messageCount
WITH
country,
zombie,
12 * (localDateTime($endDate).year - zombie.creationDate.year )
+ (localDateTime($endDate).month - zombie.creationDate.month)
+ 1 AS months,
messageCount
WHERE messageCount / months < 1
WITH
country,
collect(zombie) AS zombies
UNWIND zombies AS zombie
OPTIONAL MATCH
(zombie)<-[:HAS_CREATOR]-(message:Message)<-[:LIKES]-(likerZombie:Person)
WHERE likerZombie IN zombies
WITH
zombie,
count(likerZombie) AS zombieLikeCount
OPTIONAL MATCH
(zombie)<-[:HAS_CREATOR]-(message:Message)<-[:LIKES]-(likerPerson:Person)
WHERE likerPerson.creationDate < localDateTime($endDate)
WITH
zombie,
zombieLikeCount,
count(likerPerson) AS totalLikeCount
RETURN
zombie.id,
zombieLikeCount,
totalLikeCount,
CASE totalLikeCount
WHEN 0 THEN 0.0
ELSE zombieLikeCount / toFloat(totalLikeCount)
END AS zombieScore
ORDER BY
zombieScore DESC,
zombie.id ASC
LIMIT 100
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
neo4j = (
"""
MATCH (country:Country {name: $country})<-[:IS_PART_OF]-(:City)<-[:IS_LOCATED_IN]-(zombie:Person)
WHERE zombie.creationDate < DateTime($endDate)
WITH country, zombie
OPTIONAL MATCH (zombie)<-[:HAS_CREATOR]-(message:Message)
WHERE message.creationDate < DateTime($endDate)
WITH
country,
zombie,
count(message) AS messageCount
WITH
country,
zombie,
12 * (DateTime($endDate).year - zombie.creationDate.year )
+ (DateTime($endDate).month - zombie.creationDate.month)
+ 1 AS months,
messageCount
WHERE messageCount / months < 1
WITH
country,
collect(zombie) AS zombies
UNWIND zombies AS zombie
OPTIONAL MATCH
(zombie)<-[:HAS_CREATOR]-(message:Message)<-[:LIKES]-(likerZombie:Person)
WHERE likerZombie IN zombies
WITH
zombie,
count(likerZombie) AS zombieLikeCount
OPTIONAL MATCH
(zombie)<-[:HAS_CREATOR]-(message:Message)<-[:LIKES]-(likerPerson:Person)
WHERE likerPerson.creationDate < DateTime($endDate)
WITH
zombie,
zombieLikeCount,
count(likerPerson) AS totalLikeCount
RETURN
zombie.id,
zombieLikeCount,
totalLikeCount,
CASE totalLikeCount
WHEN 0 THEN 0.0
ELSE zombieLikeCount / toFloat(totalLikeCount)
END AS zombieScore
ORDER BY
zombieScore DESC,
zombie.id ASC
LIMIT 100
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j
def benchmark__bi__query_14_analytical(self):
return (
"""
MATCH
(country1:Country {name: $country1})<-[:IS_PART_OF]-(city1:City)<-[:IS_LOCATED_IN]-(person1:Person),
(country2:Country {name: $country2})<-[:IS_PART_OF]-(city2:City)<-[:IS_LOCATED_IN]-(person2:Person),
(person1)-[:KNOWS]-(person2)
WITH person1, person2, city1, 0 AS score
OPTIONAL MATCH (person1)<-[:HAS_CREATOR]-(c:Comment)-[:REPLY_OF]->(:Message)-[:HAS_CREATOR]->(person2)
WITH DISTINCT person1, person2, city1, score + (CASE c WHEN null THEN 0 ELSE 4 END) AS score
OPTIONAL MATCH (person1)<-[:HAS_CREATOR]-(m:Message)<-[:REPLY_OF]-(:Comment)-[:HAS_CREATOR]->(person2)
WITH DISTINCT person1, person2, city1, score + (CASE m WHEN null THEN 0 ELSE 1 END) AS score
OPTIONAL MATCH (person1)-[:LIKES]->(m:Message)-[:HAS_CREATOR]->(person2)
WITH DISTINCT person1, person2, city1, score + (CASE m WHEN null THEN 0 ELSE 10 END) AS score
OPTIONAL MATCH (person1)<-[:HAS_CREATOR]-(m:Message)<-[:LIKES]-(person2)
WITH DISTINCT person1, person2, city1, score + (CASE m WHEN null THEN 0 ELSE 1 END) AS score
ORDER BY
city1.name ASC,
score DESC,
person1.id ASC,
person2.id ASC
WITH city1, collect({score: score, person1Id: person1.id, person2Id: person2.id})[0] AS top
RETURN
top.person1Id,
top.person2Id,
city1.name,
top.score
ORDER BY
top.score DESC,
top.person1Id ASC,
top.person2Id ASC
LIMIT 100
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__bi__query_17_analytical(self):
memgraph = (
"""
MATCH
(tag:Tag {name: $tag}),
(person1:Person)<-[:HAS_CREATOR]-(message1:Message)-[:REPLY_OF*0..]->(post1:Post)<-[:CONTAINER_OF]-(forum1:Forum),
(message1)-[:HAS_TAG]->(tag),
(forum1)<-[:HAS_MEMBER]->(person2:Person)<-[:HAS_CREATOR]-(comment:Comment)-[:HAS_TAG]->(tag),
(forum1)<-[:HAS_MEMBER]->(person3:Person)<-[:HAS_CREATOR]-(message2:Message),
(comment)-[:REPLY_OF]->(message2)-[:REPLY_OF*0..]->(post2:Post)<-[:CONTAINER_OF]-(forum2:Forum)
MATCH (comment)-[:HAS_TAG]->(tag)
MATCH (message2)-[:HAS_TAG]->(tag)
OPTIONAL MATCH (forum2)-[:HAS_MEMBER]->(person1)
WHERE forum1 <> forum2 AND message2.creationDate > message1.creationDate + duration({hours: $delta}) AND person1 IS NULL
RETURN person1, count(DISTINCT message2) AS messageCount
ORDER BY messageCount DESC, person1.id ASC
LIMIT 10
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
neo4j = (
"""
MATCH
(tag:Tag {name: $tag}),
(person1:Person)<-[:HAS_CREATOR]-(message1:Message)-[:REPLY_OF*0..]->(post1:Post)<-[:CONTAINER_OF]-(forum1:Forum),
(message1)-[:HAS_TAG]->(tag),
(forum1)<-[:HAS_MEMBER]->(person2:Person)<-[:HAS_CREATOR]-(comment:Comment)-[:HAS_TAG]->(tag),
(forum1)<-[:HAS_MEMBER]->(person3:Person)<-[:HAS_CREATOR]-(message2:Message),
(comment)-[:REPLY_OF]->(message2)-[:REPLY_OF*0..]->(post2:Post)<-[:CONTAINER_OF]-(forum2:Forum)
MATCH (comment)-[:HAS_TAG]->(tag)
MATCH (message2)-[:HAS_TAG]->(tag)
WHERE forum1 <> forum2
AND message2.creationDate > message1.creationDate + duration({hours: $delta})
AND NOT (forum2)-[:HAS_MEMBER]->(person1)
RETURN person1, count(DISTINCT message2) AS messageCount
ORDER BY messageCount DESC, person1.id ASC
LIMIT 10
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j
def benchmark__bi__query_18_analytical(self):
memgraph = (
"""
MATCH (tag:Tag {name: $tag})<-[:HAS_INTEREST]-(person1:Person)-[:KNOWS]-(mutualFriend:Person)-[:KNOWS]-(person2:Person)-[:HAS_INTEREST]->(tag)
OPTIONAL MATCH (person1)-[:KNOWS]-(person2)
WHERE person1 <> person2
RETURN person1.id AS person1Id, person2.id AS person2Id, count(DISTINCT mutualFriend) AS mutualFriendCount
ORDER BY mutualFriendCount DESC, person1Id ASC, person2Id ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
neo4j = (
"""
MATCH (tag:Tag {name: $tag})<-[:HAS_INTEREST]-(person1:Person)-[:KNOWS]-(mutualFriend:Person)-[:KNOWS]-(person2:Person)-[:HAS_INTEREST]->(tag)
WHERE person1 <> person2
AND NOT (person1)-[:KNOWS]-(person2)
RETURN person1.id AS person1Id, person2.id AS person2Id, count(DISTINCT mutualFriend) AS mutualFriendCount
ORDER BY mutualFriendCount DESC, person1Id ASC, person2Id ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j

View File

@ -0,0 +1,684 @@
import inspect
import random
from datetime import datetime
from pathlib import Path
import helpers
from benchmark_context import BenchmarkContext
from workloads.base import Workload
from workloads.importers.importer_ldbc_interactive import *
class LDBC_Interactive(Workload):
NAME = "ldbc_interactive"
VARIANTS = ["sf0.1", "sf1", "sf3", "sf10"]
DEFAULT_VARIANT = "sf1"
URL_FILE = {
"sf0.1": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/interactive/ldbc_interactive_sf0.1.cypher.gz",
"sf1": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/interactive/ldbc_interactive_sf1.cypher.gz",
"sf3": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/interactive/ldbc_interactive_sf3.cypher.gz",
"sf10": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/interactive/ldbc_interactive_sf10.cypher.gz",
}
URL_CSV = {
"sf0.1": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf0.1.tar.zst",
"sf1": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf1.tar.zst",
"sf3": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf3.tar.zst",
"sf10": "https://repository.surfsara.nl/datasets/cwi/snb/files/social_network-csv_basic/social_network-csv_basic-sf10.tar.zst",
}
SIZES = {
"sf0.1": {"vertices": 327588, "edges": 1477965},
"sf1": {"vertices": 3181724, "edges": 17256038},
"sf3": {"vertices": 1, "edges": 1},
"sf10": {"vertices": 1, "edges": 1},
}
URL_INDEX_FILE = {
"memgraph": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/interactive/memgraph_interactive_index.cypher",
"neo4j": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/interactive/neo4j_interactive_index.cypher",
}
PROPERTIES_ON_EDGES = True
QUERY_PARAMETERS = {
"sf0.1": "https://repository.surfsara.nl/datasets/cwi/snb/files/substitution_parameters/substitution_parameters-sf0.1.tar.zst",
"sf1": "https://repository.surfsara.nl/datasets/cwi/snb/files/substitution_parameters/substitution_parameters-sf0.1.tar.zst",
"sf3": "https://repository.surfsara.nl/datasets/cwi/snb/files/substitution_parameters/substitution_parameters-sf0.1.tar.zst",
}
def custom_import(self) -> bool:
importer = ImporterLDBCInteractive(
benchmark_context=self.benchmark_context,
dataset_name=self.NAME,
variant=self._variant,
index_file=self._file_index,
csv_dict=self.URL_CSV,
)
return importer.execute_import()
def _prepare_parameters_directory(self):
parameters = Path() / ".cache" / "datasets" / self.NAME / self._variant / "parameters"
parameters.mkdir(parents=True, exist_ok=True)
dir_name = self.QUERY_PARAMETERS[self._variant].split("/")[-1:][0].removesuffix(".tar.zst")
if (parameters / dir_name).exists():
print("Files downloaded:")
parameters = parameters / dir_name
else:
print("Downloading files")
downloaded_file = helpers.download_file(self.QUERY_PARAMETERS[self._variant], parameters.absolute())
print("Unpacking the file..." + downloaded_file)
parameters = helpers.unpack_tar_zst(Path(downloaded_file))
return parameters
def _get_query_parameters(self) -> dict:
func_name = inspect.stack()[1].function
parameters = {}
for file in self._parameters_dir.glob("interactive_*.txt"):
if file.name.split("_")[1] == func_name.split("_")[-2]:
with file.open("r") as input:
lines = input.readlines()
position = random.randint(1, len(lines) - 1)
header = lines[0].strip("\n").split("|")
data = lines[position].strip("\n").split("|")
for i in range(len(header)):
if "Date" in header[i]:
time = int(data[i]) / 1000
converted = datetime.utcfromtimestamp(time).strftime("%Y-%m-%dT%H:%M:%S")
parameters[header[i]] = converted
elif data[i].isdigit():
parameters[header[i]] = int(data[i])
else:
parameters[header[i]] = data[i]
return parameters
def __init__(self, variant: str = None, benchmark_context: BenchmarkContext = None):
super().__init__(variant, benchmark_context=benchmark_context)
self._parameters_dir = self._prepare_parameters_directory()
self.benchmark_context = benchmark_context
def benchmark__interactive__complex_query_1_analytical(self):
memgraph = (
"""
MATCH (p:Person {id: $personId}), (friend:Person {firstName: $firstName})
WHERE NOT p=friend
WITH p, friend
MATCH path =((p)-[:KNOWS *BFS 1..3]-(friend))
WITH min(size(path)) AS distance, friend
ORDER BY
distance ASC,
friend.lastName ASC,
toInteger(friend.id) ASC
LIMIT 20
MATCH (friend)-[:IS_LOCATED_IN]->(friendCity:City)
OPTIONAL MATCH (friend)-[studyAt:STUDY_AT]->(uni:University)-[:IS_LOCATED_IN]->(uniCity:City)
WITH friend, collect(
CASE uni.name
WHEN null THEN null
ELSE [uni.name, studyAt.classYear, uniCity.name]
END ) AS unis, friendCity, distance
OPTIONAL MATCH (friend)-[workAt:WORK_AT]->(company:Company)-[:IS_LOCATED_IN]->(companyCountry:Country)
WITH friend, collect(
CASE company.name
WHEN null THEN null
ELSE [company.name, workAt.workFrom, companyCountry.name]
END ) AS companies, unis, friendCity, distance
RETURN
friend.id AS friendId,
friend.lastName AS friendLastName,
distance AS distanceFromPerson,
friend.birthday AS friendBirthday,
friend.gender AS friendGender,
friend.browserUsed AS friendBrowserUsed,
friend.locationIP AS friendLocationIp,
friend.email AS friendEmails,
friend.speaks AS friendLanguages,
friendCity.name AS friendCityName,
unis AS friendUniversities,
companies AS friendCompanies
ORDER BY
distanceFromPerson ASC,
friendLastName ASC,
toInteger(friendId) ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
neo4j = (
"""
MATCH (p:Person {id: $personId}), (friend:Person {firstName: $firstName})
WHERE NOT p=friend
WITH p, friend
MATCH path = shortestPath((p)-[:KNOWS*1..3]-(friend))
WITH min(length(path)) AS distance, friend
ORDER BY
distance ASC,
friend.lastName ASC,
toInteger(friend.id) ASC
LIMIT 20
MATCH (friend)-[:IS_LOCATED_IN]->(friendCity:City)
OPTIONAL MATCH (friend)-[studyAt:STUDY_AT]->(uni:University)-[:IS_LOCATED_IN]->(uniCity:City)
WITH friend, collect(
CASE uni.name
WHEN null THEN null
ELSE [uni.name, studyAt.classYear, uniCity.name]
END ) AS unis, friendCity, distance
OPTIONAL MATCH (friend)-[workAt:WORK_AT]->(company:Company)-[:IS_LOCATED_IN]->(companyCountry:Country)
WITH friend, collect(
CASE company.name
WHEN null THEN null
ELSE [company.name, workAt.workFrom, companyCountry.name]
END ) AS companies, unis, friendCity, distance
RETURN
friend.id AS friendId,
friend.lastName AS friendLastName,
distance AS distanceFromPerson,
friend.birthday AS friendBirthday,
friend.gender AS friendGender,
friend.browserUsed AS friendBrowserUsed,
friend.locationIP AS friendLocationIp,
friend.email AS friendEmails,
friend.speaks AS friendLanguages,
friendCity.name AS friendCityName,
unis AS friendUniversities,
companies AS friendCompanies
ORDER BY
distanceFromPerson ASC,
friendLastName ASC,
toInteger(friendId) ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j
def benchmark__interactive__complex_query_2_analytical(self):
return (
"""
MATCH (:Person {id: $personId })-[:KNOWS]-(friend:Person)<-[:HAS_CREATOR]-(message:Message)
WHERE message.creationDate <= localDateTime($maxDate)
RETURN
friend.id AS personId,
friend.firstName AS personFirstName,
friend.lastName AS personLastName,
message.id AS postOrCommentId,
coalesce(message.content,message.imageFile) AS postOrCommentContent,
message.creationDate AS postOrCommentCreationDate
ORDER BY
postOrCommentCreationDate DESC,
toInteger(postOrCommentId) ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__interactive__complex_query_3_analytical(self):
memgraph = (
"""
MATCH (countryX:Country {name: $countryXName }),
(countryY:Country {name: $countryYName }),
(person:Person {id: $personId })
WITH person, countryX, countryY
LIMIT 1
MATCH (city:City)-[:IS_PART_OF]->(country:Country)
WHERE country IN [countryX, countryY]
WITH person, countryX, countryY, collect(city) AS cities
MATCH (person)-[:KNOWS*1..2]-(friend)-[:IS_LOCATED_IN]->(city)
WHERE NOT person=friend AND NOT city IN cities
WITH DISTINCT friend, countryX, countryY
MATCH (friend)<-[:HAS_CREATOR]-(message),
(message)-[:IS_LOCATED_IN]->(country)
WHERE localDateTime($startDate) + duration({day:$durationDays}) > message.creationDate >= localDateTime($startDate) AND
country IN [countryX, countryY]
WITH friend,
CASE WHEN country=countryX THEN 1 ELSE 0 END AS messageX,
CASE WHEN country=countryY THEN 1 ELSE 0 END AS messageY
WITH friend, sum(messageX) AS xCount, sum(messageY) AS yCount
WHERE xCount>0 AND yCount>0
RETURN friend.id AS friendId,
friend.firstName AS friendFirstName,
friend.lastName AS friendLastName,
xCount,
yCount,
xCount + yCount AS xyCount
ORDER BY xyCount DESC, friendId ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
neo4j = (
"""
MATCH (countryX:Country {name: $countryXName }),
(countryY:Country {name: $countryYName }),
(person:Person {id: $personId })
WITH person, countryX, countryY
LIMIT 1
MATCH (city:City)-[:IS_PART_OF]->(country:Country)
WHERE country IN [countryX, countryY]
WITH person, countryX, countryY, collect(city) AS cities
MATCH (person)-[:KNOWS*1..2]-(friend)-[:IS_LOCATED_IN]->(city)
WHERE NOT person=friend AND NOT city IN cities
WITH DISTINCT friend, countryX, countryY
MATCH (friend)<-[:HAS_CREATOR]-(message),
(message)-[:IS_LOCATED_IN]->(country)
WHERE localDateTime($startDate) + duration({days:$durationDays}) > message.creationDate >= localDateTime($startDate) AND
country IN [countryX, countryY]
WITH friend,
CASE WHEN country=countryX THEN 1 ELSE 0 END AS messageX,
CASE WHEN country=countryY THEN 1 ELSE 0 END AS messageY
WITH friend, sum(messageX) AS xCount, sum(messageY) AS yCount
WHERE xCount>0 AND yCount>0
RETURN friend.id AS friendId,
friend.firstName AS friendFirstName,
friend.lastName AS friendLastName,
xCount,
yCount,
xCount + yCount AS xyCount
ORDER BY xyCount DESC, friendId ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j
def benchmark__interactive__complex_query_4_analytical(self):
memgraph = (
"""
MATCH (person:Person {id: $personId })-[:KNOWS]-(friend:Person),
(friend)<-[:HAS_CREATOR]-(post:Post)-[:HAS_TAG]->(tag)
WITH DISTINCT tag, post
WITH tag,
CASE
WHEN localDateTime($startDate) + duration({day:$durationDays}) > post.creationDate >= localDateTime($startDate) THEN 1
ELSE 0
END AS valid,
CASE
WHEN localDateTime($startDate) > post.creationDate THEN 1
ELSE 0
END AS inValid
WITH tag, sum(valid) AS postCount, sum(inValid) AS inValidPostCount
WHERE postCount>0 AND inValidPostCount=0
RETURN tag.name AS tagName, postCount
ORDER BY postCount DESC, tagName ASC
LIMIT 10
""",
self._get_query_parameters(),
)
neo4j = (
"""
MATCH (person:Person {id: $personId })-[:KNOWS]-(friend:Person),
(friend)<-[:HAS_CREATOR]-(post:Post)-[:HAS_TAG]->(tag)
WITH DISTINCT tag, post
WITH tag,
CASE
WHEN localDateTime($startDate) + duration({days:$durationDays}) > post.creationDate >= localDateTime($startDate) THEN 1
ELSE 0
END AS valid,
CASE
WHEN localDateTime($startDate) > post.creationDate THEN 1
ELSE 0
END AS inValid
WITH tag, sum(valid) AS postCount, sum(inValid) AS inValidPostCount
WHERE postCount>0 AND inValidPostCount=0
RETURN tag.name AS tagName, postCount
ORDER BY postCount DESC, tagName ASC
LIMIT 10
""",
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j
def benchmark__interactive__complex_query_5_analytical(self):
return (
"""
MATCH (person:Person { id: $personId })-[:KNOWS*1..2]-(friend)
WHERE
NOT person=friend
WITH DISTINCT friend
MATCH (friend)<-[membership:HAS_MEMBER]-(forum)
WHERE
membership.joinDate > localDateTime($minDate)
WITH
forum,
collect(friend) AS friends
OPTIONAL MATCH (friend)<-[:HAS_CREATOR]-(post)<-[:CONTAINER_OF]-(forum)
WHERE
friend IN friends
WITH
forum,
count(post) AS postCount
RETURN
forum.title AS forumName,
postCount
ORDER BY
postCount DESC,
forum.id ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__interactive__complex_query_6_analytical(self):
return (
"""
MATCH (knownTag:Tag { name: $tagName })
WITH knownTag.id as knownTagId
MATCH (person:Person { id: $personId })-[:KNOWS*1..2]-(friend)
WHERE NOT person=friend
WITH
knownTagId,
collect(distinct friend) as friends
UNWIND friends as f
MATCH (f)<-[:HAS_CREATOR]-(post:Post),
(post)-[:HAS_TAG]->(t:Tag{id: knownTagId}),
(post)-[:HAS_TAG]->(tag:Tag)
WHERE NOT t = tag
WITH
tag.name as tagName,
count(post) as postCount
RETURN
tagName,
postCount
ORDER BY
postCount DESC,
tagName ASC
LIMIT 10
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__interactive__complex_query_7_analytical(self):
memgraph = (
"""
MATCH (person:Person {id: $personId})<-[:HAS_CREATOR]-(message:Message)<-[like:LIKES]-(liker:Person)
WITH liker, message, like.creationDate AS likeTime, person
ORDER BY likeTime DESC, toInteger(message.id) ASC
WITH liker, head(collect({msg: message, likeTime: likeTime})) AS latestLike, person
OPTIONAL MATCH (liker)-[:KNOWS]-(person)
WITH liker, latestLike, person,
CASE WHEN person IS null THEN TRUE ELSE FALSE END AS isNew
RETURN
liker.id AS personId,
liker.firstName AS personFirstName,
liker.lastName AS personLastName,
latestLike.likeTime AS likeCreationDate,
latestLike.msg.id AS commentOrPostId,
coalesce(latestLike.msg.content, latestLike.msg.imageFile) AS commentOrPostContent,
(latestLike.likeTime - latestLike.msg.creationDate).minute AS minutesLatency
ORDER BY
likeCreationDate DESC,
toInteger(personId) ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
neo4j = (
"""
MATCH (person:Person {id: $personId})<-[:HAS_CREATOR]-(message:Message)<-[like:LIKES]-(liker:Person)
WITH liker, message, like.creationDate AS likeTime, person
ORDER BY likeTime DESC, toInteger(message.id) ASC
WITH liker, head(collect({msg: message, likeTime: likeTime})) AS latestLike, person
RETURN
liker.id AS personId,
liker.firstName AS personFirstName,
liker.lastName AS personLastName,
latestLike.likeTime AS likeCreationDate,
latestLike.msg.id AS commentOrPostId,
coalesce(latestLike.msg.content, latestLike.msg.imageFile) AS commentOrPostContent,
duration.between(latestLike.likeTime, latestLike.msg.creationDate).minutes AS minutesLatency,
not((liker)-[:KNOWS]-(person)) AS isNew
ORDER BY
likeCreationDate DESC,
toInteger(personId) ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j
def benchmark__interactive__complex_query_8_analytical(self):
return (
"""
MATCH (start:Person {id: $personId})<-[:HAS_CREATOR]-(:Message)<-[:REPLY_OF]-(comment:Comment)-[:HAS_CREATOR]->(person:Person)
RETURN
person.id AS personId,
person.firstName AS personFirstName,
person.lastName AS personLastName,
comment.creationDate AS commentCreationDate,
comment.id AS commentId,
comment.content AS commentContent
ORDER BY
commentCreationDate DESC,
commentId ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__interactive__complex_query_9_analytical(self):
return (
"""
MATCH (root:Person {id: $personId })-[:KNOWS*1..2]-(friend:Person)
WHERE NOT friend = root
WITH collect(distinct friend) as friends
UNWIND friends as friend
MATCH (friend)<-[:HAS_CREATOR]-(message:Message)
WHERE message.creationDate < localDateTime($maxDate)
RETURN
friend.id AS personId,
friend.firstName AS personFirstName,
friend.lastName AS personLastName,
message.id AS commentOrPostId,
coalesce(message.content,message.imageFile) AS commentOrPostContent,
message.creationDate AS commentOrPostCreationDate
ORDER BY
commentOrPostCreationDate DESC,
message.id ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__interactive__complex_query_10_analytical(self):
memgraph = (
"""
MATCH (person:Person {id: $personId})-[:KNOWS*2..2]-(friend),
(friend)-[:IS_LOCATED_IN]->(city:City)
WHERE NOT friend=person AND
NOT (friend)-[:KNOWS]-(person)
WITH person, city, friend, datetime({epochMillis: friend.birthday}) as birthday
WHERE (birthday.month=$month AND birthday.day>=21) OR
(birthday.month=($month%12)+1 AND birthday.day<22)
WITH DISTINCT friend, city, person
OPTIONAL MATCH (friend)<-[:HAS_CREATOR]-(post:Post)
WITH friend, city, collect(post) AS posts, person
WITH friend,
city,
size(posts) AS postCount,
size([p IN posts WHERE (p)-[:HAS_TAG]->()<-[:HAS_INTEREST]-(person)]) AS commonPostCount
RETURN friend.id AS personId,
friend.firstName AS personFirstName,
friend.lastName AS personLastName,
commonPostCount - (postCount - commonPostCount) AS commonInterestScore,
friend.gender AS personGender,
city.name AS personCityName
ORDER BY commonInterestScore DESC, personId ASC
LIMIT 10
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
neo4j = (
"""
MATCH (person:Person {id: $personId})-[:KNOWS*2..2]-(friend),
(friend)-[:IS_LOCATED_IN]->(city:City)
WHERE NOT friend=person AND
NOT (friend)-[:KNOWS]-(person)
WITH person, city, friend, datetime({epochMillis: friend.birthday}) as birthday
WHERE (birthday.month=$month AND birthday.day>=21) OR
(birthday.month=($month%12)+1 AND birthday.day<22)
WITH DISTINCT friend, city, person
OPTIONAL MATCH (friend)<-[:HAS_CREATOR]-(post:Post)
WITH friend, city, collect(post) AS posts, person
WITH friend,
city,
size(posts) AS postCount,
size([p IN posts WHERE (p)-[:HAS_TAG]->()<-[:HAS_INTEREST]-(person)]) AS commonPostCount
RETURN friend.id AS personId,
friend.firstName AS personFirstName,
friend.lastName AS personLastName,
commonPostCount - (postCount - commonPostCount) AS commonInterestScore,
friend.gender AS personGender,
city.name AS personCityName
ORDER BY commonInterestScore DESC, personId ASC
LIMIT 10
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j
def benchmark__interactive__complex_query_11_analytical(self):
return (
"""
MATCH (person:Person {id: $personId })-[:KNOWS*1..2]-(friend:Person)
WHERE not(person=friend)
WITH DISTINCT friend
MATCH (friend)-[workAt:WORK_AT]->(company:Company)-[:IS_LOCATED_IN]->(:Country {name: $countryName })
WHERE workAt.workFrom < $workFromYear
RETURN
friend.id AS personId,
friend.firstName AS personFirstName,
friend.lastName AS personLastName,
company.name AS organizationName,
workAt.workFrom AS organizationWorkFromYear
ORDER BY
organizationWorkFromYear ASC,
toInteger(personId) ASC,
organizationName DESC
LIMIT 10
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__interactive__complex_query_12_analytical(self):
return (
"""
MATCH (tag:Tag)-[:HAS_TYPE|IS_SUBCLASS_OF*0..]->(baseTagClass:TagClass)
WHERE tag.name = $tagClassName OR baseTagClass.name = $tagClassName
WITH collect(tag.id) as tags
MATCH (:Person {id: $personId })-[:KNOWS]-(friend:Person)<-[:HAS_CREATOR]-(comment:Comment)-[:REPLY_OF]->(:Post)-[:HAS_TAG]->(tag:Tag)
WHERE tag.id in tags
RETURN
friend.id AS personId,
friend.firstName AS personFirstName,
friend.lastName AS personLastName,
collect(DISTINCT tag.name) AS tagNames,
count(DISTINCT comment) AS replyCount
ORDER BY
replyCount DESC,
toInteger(personId) ASC
LIMIT 20
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
def benchmark__interactive__complex_query_13_analytical(self):
memgraph = (
"""
MATCH
(person1:Person {id: $person1Id}),
(person2:Person {id: $person2Id}),
path = (person1)-[:KNOWS *BFS]-(person2)
RETURN
CASE path IS NULL
WHEN true THEN -1
ELSE size(path)
END AS shortestPathLength
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
neo4j = (
"""
MATCH
(person1:Person {id: $person1Id}),
(person2:Person {id: $person2Id}),
path = shortestPath((person1)-[:KNOWS*]-(person2))
RETURN
CASE path IS NULL
WHEN true THEN -1
ELSE length(path)
END AS shortestPathLength
""".replace(
"\n", ""
),
self._get_query_parameters(),
)
if self._vendor == "memgraph":
return memgraph
else:
return neo4j
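# The two texts above differ only in how the shortest path is expressed: the Memgraph
# variant uses the built-in breadth-first expansion (-[:KNOWS *BFS]-) with size(path),
# while the Neo4j variant uses shortestPath() with length(path); both report -1 when no
# path between the two persons exists.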

View File

@ -1,134 +1,17 @@
# Copyright 2022 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.
import random
import helpers
from benchmark_context import BenchmarkContext
from workloads.base import Workload
from workloads.importers.importer_pokec import ImporterPokec
# Base dataset class used as a template to create each individual dataset. All
# common logic is handled here.
class Dataset:
# Name of the dataset.
NAME = "Base dataset"
# List of all variants of the dataset that exist.
VARIANTS = ["default"]
# One of the available variants that should be used as the default variant.
DEFAULT_VARIANT = "default"
# List of query files that should be used to import the dataset.
FILES = {
"default": "/foo/bar",
}
INDEX = None
INDEX_FILES = {"default": ""}
# List of query file URLs that should be used to import the dataset.
URLS = None
# Number of vertices/edges for each variant.
SIZES = {
"default": {"vertices": 0, "edges": 0},
}
# Indicates whether the dataset has properties on edges.
PROPERTIES_ON_EDGES = False
def __init__(self, variant=None, vendor=None):
"""
Accepts a `variant` variable that indicates which variant
of the dataset should be executed.
"""
if variant is None:
variant = self.DEFAULT_VARIANT
if variant not in self.VARIANTS:
raise ValueError("Invalid test variant!")
if (self.FILES and variant not in self.FILES) and (self.URLS and variant not in self.URLS):
raise ValueError("The variant doesn't have a defined URL or " "file path!")
if variant not in self.SIZES:
raise ValueError("The variant doesn't have a defined dataset " "size!")
if vendor not in self.INDEX_FILES:
raise ValueError("Vendor does not have INDEX for dataset!")
self._variant = variant
self._vendor = vendor
if self.FILES is not None:
self._file = self.FILES.get(variant, None)
else:
self._file = None
if self.URLS is not None:
self._url = self.URLS.get(variant, None)
else:
self._url = None
if self.INDEX_FILES is not None:
self._index = self.INDEX_FILES.get(vendor, None)
else:
self._index = None
self._size = self.SIZES[variant]
if "vertices" not in self._size or "edges" not in self._size:
raise ValueError("The size defined for this variant doesn't " "have the number of vertices and/or edges!")
self._num_vertices = self._size["vertices"]
self._num_edges = self._size["edges"]
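# Resolves the dataset and index files for the selected variant/vendor: a locally
# configured dataset file is used directly, otherwise the files are downloaded once and
# cached in the benchmark directory for subsequent runs.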
def prepare(self, directory):
if self._file is not None:
print("Using dataset file:", self._file)
else:
# TODO: add support for JSON datasets
cached_input, exists = directory.get_file("dataset.cypher")
if not exists:
print("Downloading dataset file:", self._url)
downloaded_file = helpers.download_file(self._url, directory.get_path())
print("Unpacking and caching file:", downloaded_file)
helpers.unpack_and_move_file(downloaded_file, cached_input)
print("Using cached dataset file:", cached_input)
self._file = cached_input
cached_index, exists = directory.get_file(self._vendor + ".cypher")
if not exists:
print("Downloading index file:", self._index)
downloaded_file = helpers.download_file(self._index, directory.get_path())
print("Unpacking and caching file:", downloaded_file)
helpers.unpack_and_move_file(downloaded_file, cached_index)
print("Using cached index file:", cached_index)
self._index = cached_index
def get_variant(self):
"""Returns the current variant of the dataset."""
return self._variant
def get_index(self):
"""Get index file, defined by vendor"""
return self._index
def get_file(self):
"""
Returns path to the file that contains dataset creation queries.
"""
return self._file
def get_size(self):
"""Returns number of vertices/edges for the current variant."""
return self._size
# All tests should be query generator functions that output all of the
# queries that should be executed by the runner. The functions should be
# named `benchmark__GROUPNAME__TESTNAME` and should not accept any
# arguments.
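# A minimal, hypothetical sketch (not code from this repository) of how a runner could
# discover such generator functions purely from the naming convention described above;
# `workload` is assumed to be an instance of one of the dataset/workload classes.
def _collect_query_generators(workload):
    for name in dir(workload):
        if not name.startswith("benchmark__"):
            continue
        # "benchmark__GROUPNAME__TESTNAME" -> ("GROUPNAME", "TESTNAME")
        _, group, test = name.split("__", 2)
        yield group, test, getattr(workload, name)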
class Pokec(Dataset):
class Pokec(Workload):
NAME = "pokec"
VARIANTS = ["small", "medium", "large"]
DEFAULT_VARIANT = "small"
FILES = None
FILE = None
URLS = {
URL_FILE = {
"small": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/benchmark/pokec_small_import.cypher",
"medium": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/benchmark/pokec_medium_import.cypher",
"large": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/benchmark/pokec_large.setup.cypher.gz",
@ -138,16 +21,28 @@ class Pokec(Dataset):
"medium": {"vertices": 100000, "edges": 1768515},
"large": {"vertices": 1632803, "edges": 30622564},
}
INDEX = None
INDEX_FILES = {
URL_INDEX_FILE = {
"memgraph": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/benchmark/memgraph.cypher",
"neo4j": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/benchmark/neo4j.cypher",
}
PROPERTIES_ON_EDGES = False
# Helpers used to generate the queries
def __init__(self, variant: str = None, benchmark_context: BenchmarkContext = None):
super().__init__(variant, benchmark_context=benchmark_context)
def custom_import(self) -> bool:
importer = ImporterPokec(
benchmark_context=self.benchmark_context,
dataset_name=self.NAME,
index_file=self._file_index,
dataset_file=self._file,
variant=self._variant,
)
return importer.execute_import()
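# Delegates loading of the selected Pokec variant to ImporterPokec and reports whether
# that custom import path was taken (an assumption based on the bool return type; the
# calling runner logic is not shown in this hunk).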
# Helpers used to generate the queries
def _get_random_vertex(self):
# All vertices in the Pokec dataset have an ID in the range
# [1, _num_vertices].
@ -343,7 +238,7 @@ class Pokec(Dataset):
return ("MATCH (n:User {id: $id}) RETURN n", {"id": self._get_random_vertex()})
def benchmark__match__vertex_on_property(self):
return ("MATCH (n {id: $id}) RETURN n", {"id": self._get_random_vertex()})
return ("MATCH (n:User {id: $id}) RETURN n", {"id": self._get_random_vertex()})
def benchmark__update__vertex_on_property(self):
return (
@ -364,7 +259,7 @@ class Pokec(Dataset):
def benchmark__basic__single_vertex_property_update_update(self):
return (
"MATCH (n {id: $id}) SET n.property = -1",
"MATCH (n:User {id: $id}) SET n.property = -1",
{"id": self._get_random_vertex()},
)
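# A minimal, illustrative sketch of executing one generated (query, params) pair over
# Bolt with the official `neo4j` Python driver. The URI, the absence of authentication
# and the driver choice are assumptions for illustration only; mgbench itself runs these
# queries through its own benchmark client binary.
def _run_single_query_example(workload, uri="bolt://localhost:7687"):
    from neo4j import GraphDatabase

    query, params = workload.benchmark__basic__single_vertex_property_update_update()
    driver = GraphDatabase.driver(uri, auth=None)
    try:
        with driver.session() as session:
            # Execute the query with its generated parameters and drain the result.
            return session.run(query, params).consume()
    finally:
        driver.close()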