From a5dc818e19f314f6e0990fd89913c0f8911cce66 Mon Sep 17 00:00:00 2001 From: jeremy Date: Wed, 12 Oct 2022 16:35:47 +0200 Subject: [PATCH 01/38] Add new dataset for mgbench --- tests/mgbench/dataset_creator.py | 94 +++++++++++ tests/mgbench/datasets.py | 272 +++++++++++++++++++++---------- 2 files changed, 277 insertions(+), 89 deletions(-) create mode 100644 tests/mgbench/dataset_creator.py diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py new file mode 100644 index 000000000..432a44ec7 --- /dev/null +++ b/tests/mgbench/dataset_creator.py @@ -0,0 +1,94 @@ +# Copyright 2021 Memgraph Ltd. +# +# Use of this software is governed by the Business Source License +# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source +# License, and you may not use this file except in compliance with the Business Source License. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0, included in the file +# licenses/APL.txt. + +import random + +import helpers + +# Explaination of datasets: +# - empty_only_index: contains index; contains no data +# - small: contains index; contains data (small dataset) +# +# Datamodel is as follow: +# +# ┌──────────────┐ +# │ Permission │ +# ┌────────────────┐ │ Schema:uuid │ ┌────────────┐ +# │:IS_FOR_IDENTITY├────┤ Index:name ├───┤:IS_FOR_USER│ +# └┬───────────────┘ └──────────────┘ └────────────┤ +# │ │ +# ┌──────▼──────────────┐ ┌──▼───────────┐ +# │ Identity │ │ User │ +# │ Schema:uuid │ │ Schema:uuid │ +# │ Index:platformId │ │ Index:email │ +# │ Index:name │ └──────────────┘ +# └─────────────────────┘ +# +# +# - User: attributes: ["uuid", "name", "platformId"] +# - Permission: attributes: ["uuid", "name"] +# - Identity: attributes: ["uuid", "email"] +# +# Indexes: +# - User: [User(uuid), User(platformId), User(name)] +# - Permission: [Permission(uuid), Permission(name)] +# - Identity: [Identity(uuid), Identity(email)] +# +# Edges: +# - (:Permission)-[:IS_FOR_USER]->(:User) +# - (:Permission)-[:IS_FOR_IDENTITYR]->(:Identity) +# +# Distributed specific: uuid is the schema + +filename = "dataset.cypher" +f = open(filename, "x") + +f.write("MATCH (n) DETACH DELETE n;\n") + +# Create the indexes +f.write("CREATE INDEX ON :User;\n") +f.write("CREATE INDEX ON :Permission;\n") +f.write("CREATE INDEX ON :Identity;\n") +f.write("CREATE INDEX ON :User(platformId);\n") +f.write("CREATE INDEX ON :User(name);\n") +f.write("CREATE INDEX ON :Permission(name);\n") +f.write("CREATE INDEX ON :Identity(email);\n") + +# Create extra index: in distributed, this will be the schema +f.write("CREATE INDEX ON :User(uuid);\n") +f.write("CREATE INDEX ON :Permission(uuid);\n") +f.write("CREATE INDEX ON :Identity(uuid);\n") + +platform_ids = [f"somePlatformId_{id}" for id in range(10)] + +# This is the number of clusters to change if you want a bigger dataset +number_of_clusters = 3000000 + +for index in range(1, number_of_clusters + 1): + platform_id = platform_ids[random.randint(0, len(platform_ids) - 1)] + user_uuid = index + platform_uuid = number_of_clusters + index + identity_uuid = 2 * number_of_clusters + index + + # Create the nodes + f.write(f'CREATE (:User {{uuid: {user_uuid}, platformId: "{platform_id}", name: "name_user_{user_uuid}"}});\n') + f.write(f'CREATE (:Permission {{uuid: {platform_uuid}, name: "name_permission_{platform_uuid}"}});\n') + f.write(f'CREATE (:Permission 
{{uuid: {identity_uuid}, name: "mail_{identity_uuid}@something.com"}});\n') + + # Create the edges + f.write( + f"MATCH (permission:Permission {{uuid: {platform_uuid}}}), (user:User {{uuid: {user_uuid}}}) CREATE (permission)-[e: IS_FOR_USER]->(user);\n" + ) + f.write( + f"MATCH (permission:Permission {{uuid: {platform_uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e: IS_FOR_IDENTITY]->(identity);\n" + ) + +f.close() diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index dbaaa2de9..45fdf67db 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -45,13 +45,10 @@ class Dataset: variant = self.DEFAULT_VARIANT if variant not in self.VARIANTS: raise ValueError("Invalid test variant!") - if (self.FILES and variant not in self.FILES) and \ - (self.URLS and variant not in self.URLS): - raise ValueError("The variant doesn't have a defined URL or " - "file path!") + if (self.FILES and variant not in self.FILES) and (self.URLS and variant not in self.URLS): + raise ValueError("The variant doesn't have a defined URL or " "file path!") if variant not in self.SIZES: - raise ValueError("The variant doesn't have a defined dataset " - "size!") + raise ValueError("The variant doesn't have a defined dataset " "size!") self._variant = variant if self.FILES is not None: self._file = self.FILES.get(variant, None) @@ -63,8 +60,7 @@ class Dataset: self._url = None self._size = self.SIZES[variant] if "vertices" not in self._size or "edges" not in self._size: - raise ValueError("The size defined for this variant doesn't " - "have the number of vertices and/or edges!") + raise ValueError("The size defined for this variant doesn't " "have the number of vertices and/or edges!") self._num_vertices = self._size["vertices"] self._num_edges = self._size["edges"] @@ -76,8 +72,7 @@ class Dataset: cached_input, exists = directory.get_file("dataset.cypher") if not exists: print("Downloading dataset file:", self._url) - downloaded_file = helpers.download_file( - self._url, directory.get_path()) + downloaded_file = helpers.download_file(self._url, directory.get_path()) print("Unpacking and caching file:", downloaded_file) helpers.unpack_and_move_file(downloaded_file, cached_input) print("Using cached dataset file:", cached_input) @@ -137,18 +132,17 @@ class Pokec(Dataset): # Arango benchmarks def benchmark__arango__single_vertex_read(self): - return ("MATCH (n:User {id : $id}) RETURN n", - {"id": self._get_random_vertex()}) + return ("MATCH (n:User {id : $id}) RETURN n", {"id": self._get_random_vertex()}) def benchmark__arango__single_vertex_write(self): - return ("CREATE (n:UserTemp {id : $id}) RETURN n", - {"id": random.randint(1, self._num_vertices * 10)}) + return ("CREATE (n:UserTemp {id : $id}) RETURN n", {"id": random.randint(1, self._num_vertices * 10)}) def benchmark__arango__single_edge_write(self): vertex_from, vertex_to = self._get_random_from_to() - return ("MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " - "CREATE (n)-[e:Temp]->(m) RETURN e", - {"from": vertex_from, "to": vertex_to}) + return ( + "MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " "CREATE (n)-[e:Temp]->(m) RETURN e", + {"from": vertex_from, "to": vertex_to}, + ) def benchmark__arango__aggregate(self): return ("MATCH (n:User) RETURN n.age, COUNT(*)", {}) @@ -157,92 +151,94 @@ class Pokec(Dataset): return ("MATCH (n:User) WHERE n.age >= 18 RETURN n.age, COUNT(*)", {}) def benchmark__arango__expansion_1(self): - return ("MATCH (s:User {id: $id})-->(n:User) " - 
"RETURN n.id", - {"id": self._get_random_vertex()}) + return ("MATCH (s:User {id: $id})-->(n:User) " "RETURN n.id", {"id": self._get_random_vertex()}) def benchmark__arango__expansion_1_with_filter(self): - return ("MATCH (s:User {id: $id})-->(n:User) " - "WHERE n.age >= 18 " - "RETURN n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->(n:User) " "WHERE n.age >= 18 " "RETURN n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_2(self): - return ("MATCH (s:User {id: $id})-->()-->(n:User) " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ("MATCH (s:User {id: $id})-->()-->(n:User) " "RETURN DISTINCT n.id", {"id": self._get_random_vertex()}) def benchmark__arango__expansion_2_with_filter(self): - return ("MATCH (s:User {id: $id})-->()-->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->(n:User) " "WHERE n.age >= 18 " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_3(self): - return ("MATCH (s:User {id: $id})-->()-->()-->(n:User) " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->()-->(n:User) " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_3_with_filter(self): - return ("MATCH (s:User {id: $id})-->()-->()-->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->()-->(n:User) " "WHERE n.age >= 18 " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_4(self): - return ("MATCH (s:User {id: $id})-->()-->()-->()-->(n:User) " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->()-->()-->(n:User) " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_4_with_filter(self): - return ("MATCH (s:User {id: $id})-->()-->()-->()-->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->()-->()-->(n:User) " "WHERE n.age >= 18 " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__neighbours_2(self): - return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " "RETURN DISTINCT n.id", {"id": self._get_random_vertex()}) def benchmark__arango__neighbours_2_with_filter(self): - return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-[*1..2]->(n:User) " "WHERE n.age >= 18 " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__neighbours_2_with_data(self): - return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " - "RETURN DISTINCT n.id, n", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-[*1..2]->(n:User) " "RETURN DISTINCT n.id, n", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__neighbours_2_with_data_and_filter(self): - return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id, n", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-[*1..2]->(n:User) " 
"WHERE n.age >= 18 " "RETURN DISTINCT n.id, n", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__shortest_path(self): vertex_from, vertex_to = self._get_random_from_to() - return ("MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " - "MATCH p=(n)-[*bfs..15]->(m) " - "RETURN extract(n in nodes(p) | n.id) AS path", - {"from": vertex_from, "to": vertex_to}) + return ( + "MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " + "MATCH p=(n)-[*bfs..15]->(m) " + "RETURN extract(n in nodes(p) | n.id) AS path", + {"from": vertex_from, "to": vertex_to}, + ) def benchmark__arango__shortest_path_with_filter(self): vertex_from, vertex_to = self._get_random_from_to() - return ("MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " - "MATCH p=(n)-[*bfs..15 (e, n | n.age >= 18)]->(m) " - "RETURN extract(n in nodes(p) | n.id) AS path", - {"from": vertex_from, "to": vertex_to}) + return ( + "MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " + "MATCH p=(n)-[*bfs..15 (e, n | n.age >= 18)]->(m) " + "RETURN extract(n in nodes(p) | n.id) AS path", + {"from": vertex_from, "to": vertex_to}, + ) # Our benchmark queries def benchmark__create__edge(self): vertex_from, vertex_to = self._get_random_from_to() - return ("MATCH (a:User {id: $from}), (b:User {id: $to}) " - "CREATE (a)-[:TempEdge]->(b)", - {"from": vertex_from, "to": vertex_to}) + return ( + "MATCH (a:User {id: $from}), (b:User {id: $to}) " "CREATE (a)-[:TempEdge]->(b)", + {"from": vertex_from, "to": vertex_to}, + ) def benchmark__create__pattern(self): return ("CREATE ()-[:TempEdge]->()", {}) @@ -251,9 +247,12 @@ class Pokec(Dataset): return ("CREATE ()", {}) def benchmark__create__vertex_big(self): - return ("CREATE (:L1:L2:L3:L4:L5:L6:L7 {p1: true, p2: 42, " - "p3: \"Here is some text that is not extremely short\", " - "p4:\"Short text\", p5: 234.434, p6: 11.11, p7: false})", {}) + return ( + "CREATE (:L1:L2:L3:L4:L5:L6:L7 {p1: true, p2: 42, " + 'p3: "Here is some text that is not extremely short", ' + 'p4:"Short text", p5: 234.434, p6: 11.11, p7: false})', + {}, + ) def benchmark__aggregation__count(self): return ("MATCH (n) RETURN count(n), count(n.age)", {}) @@ -262,29 +261,124 @@ class Pokec(Dataset): return ("MATCH (n) RETURN min(n.age), max(n.age), avg(n.age)", {}) def benchmark__match__pattern_cycle(self): - return ("MATCH (n:User {id: $id})-[e1]->(m)-[e2]->(n) " - "RETURN e1, m, e2", - {"id": self._get_random_vertex()}) + return ("MATCH (n:User {id: $id})-[e1]->(m)-[e2]->(n) " "RETURN e1, m, e2", {"id": self._get_random_vertex()}) def benchmark__match__pattern_long(self): - return ("MATCH (n1:User {id: $id})-[e1]->(n2)-[e2]->" - "(n3)-[e3]->(n4)<-[e4]-(n5) " - "RETURN n5 LIMIT 1", - {"id": self._get_random_vertex()}) + return ( + "MATCH (n1:User {id: $id})-[e1]->(n2)-[e2]->" "(n3)-[e3]->(n4)<-[e4]-(n5) " "RETURN n5 LIMIT 1", + {"id": self._get_random_vertex()}, + ) def benchmark__match__pattern_short(self): - return ("MATCH (n:User {id: $id})-[e]->(m) " - "RETURN m LIMIT 1", - {"id": self._get_random_vertex()}) + return ("MATCH (n:User {id: $id})-[e]->(m) " "RETURN m LIMIT 1", {"id": self._get_random_vertex()}) def benchmark__match__vertex_on_label_property(self): - return ("MATCH (n:User) WITH n WHERE n.id = $id RETURN n", - {"id": self._get_random_vertex()}) + return ("MATCH (n:User) WITH n WHERE n.id = $id RETURN n", {"id": self._get_random_vertex()}) def benchmark__match__vertex_on_label_property_index(self): - return ("MATCH (n:User {id: $id}) RETURN n", - {"id": self._get_random_vertex()}) + return 
("MATCH (n:User {id: $id}) RETURN n", {"id": self._get_random_vertex()}) def benchmark__match__vertex_on_property(self): - return ("MATCH (n {id: $id}) RETURN n", - {"id": self._get_random_vertex()}) + return ("MATCH (n {id: $id}) RETURN n", {"id": self._get_random_vertex()}) + + +class Distributed(Dataset): + + # Explaination of datasets: + # - empty_only_index: contains index; contains no data + # - small/medium/large: contains index; contains data (respectively small/medium/large dataset) + # + # See dataset_creator.py to understand the datamodel and generate a dataset + + NAME = "distributed" + VARIANTS = ["empty_only_index", "small", "medium", "large"] + DEFAULT_VARIANT = "empty_only_index" + URLS = { + "empty_only_index": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_empty_only_index.setup.cypher.gz", + "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_small.setup.cypher.gz", + "medium": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_medium.setup.cypher.gz", + "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_large.setup.cypher.gz", + } + SIZES = { + "empty_only_index": { + "vertices": 0, + "edges": -1, # not used + "uuid_ranges": { + "User": {"first_uuid": 0, "last_uuid": 0}, + "Permission": {"first_uuid": 0, "last_uuid": 0}, + "Identity": {"first_uuid": 0, "last_uuid": 0}, + }, + }, + "small": { + "vertices": 30, + "edges": -1, # not used + "uuid_ranges": { + "User": {"first_uuid": 1, "last_uuid": 10}, + "Permission": {"first_uuid": 11, "last_uuid": 20}, + "Identity": {"first_uuid": 21, "last_uuid": 30}, + }, + }, + "medium": { + "vertices": 30000, + "edges": -1, # not used + "uuid_ranges": { + "User": {"first_uuid": 1, "last_uuid": 10000}, + "Permission": {"first_uuid": 10001, "last_uuid": 20000}, + "Identity": {"first_uuid": 10001, "last_uuid": 30000}, + }, + }, + "large": { + "vertices": 3000000, + "edges": -1, # not used + "uuid_ranges": { + "User": {"first_uuid": 1, "last_uuid": 1000000}, + "Permission": {"first_uuid": 100001, "last_uuid": 2000000}, + "Identity": {"first_uuid": 1000001, "last_uuid": 3000000}, + }, + }, + } + + def _get_random_uuid(self, type): + assert type in ["User", "Permission", "Identity"] + + first_uuid = Dataset.get_size(self)["uuid_ranges"][type]["first_uuid"] + last_uuid = Dataset.get_size(self)["uuid_ranges"][type]["last_uuid"] + + random_value = random.randint(first_uuid, last_uuid) + return random_value + + def __init__(self, variant=None): + Dataset.__init__(self, variant) + self.next_value_idx = Dataset.get_size(self)["vertices"] + 1 + + def benchmark__create__vertex(self): + self.next_value_idx += 1 + query = (f"CREATE (:User {{uuid: {self.next_value_idx}}});", {}) + return query + + def benchmark__create__edges(self): + permission_uuid = self._get_random_uuid("Permission") + user_uuid = self._get_random_uuid("User") + + query = ( + "MATCH (permission:Permission {uuid: $permission_uuid}), (user:User {uuid: $user_uuid}) " + "CREATE (permission)-[:IS_FOR_USER]->(user)", + {"permission_uuid": permission_uuid, "user_uuid": user_uuid}, + ) + + return query + + def benchmark__match__match_all_vertices(self): + self.next_value_idx += 1 + query = ("MATCH (n) RETURN *", {}) + return query + + def benchmark__match__match_on_labelled_vertices(self): + self.next_value_idx += 1 + query = ("MATCH (n:User) RETURN *", {}) + return query + + def benchmark__match__match_all_verteices_with_edges(self): + self.next_value_idx += 1 + query = ("MATCH 
(permission:Permission)-[e:IS_FOR_USER]->(user:User) RETURN *", {}) + return query From 58243f4a268e0a0745060f2247da07e466b93069 Mon Sep 17 00:00:00 2001 From: jeremy Date: Tue, 18 Oct 2022 15:47:13 +0200 Subject: [PATCH 02/38] Rename User->File Use parser for argument i.o. simple variable in script --- tests/mgbench/dataset_creator.py | 118 ++++++++++++++++++------------- tests/mgbench/datasets.py | 36 +++++----- 2 files changed, 88 insertions(+), 66 deletions(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 432a44ec7..754419ccc 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -10,8 +10,8 @@ # licenses/APL.txt. import random - import helpers +import argparse # Explaination of datasets: # - empty_only_index: contains index; contains no data @@ -22,73 +22,95 @@ import helpers # ┌──────────────┐ # │ Permission │ # ┌────────────────┐ │ Schema:uuid │ ┌────────────┐ -# │:IS_FOR_IDENTITY├────┤ Index:name ├───┤:IS_FOR_USER│ +# │:IS_FOR_IDENTITY├────┤ Index:name ├───┤:IS_FOR_FILE│ # └┬───────────────┘ └──────────────┘ └────────────┤ # │ │ -# ┌──────▼──────────────┐ ┌──▼───────────┐ -# │ Identity │ │ User │ -# │ Schema:uuid │ │ Schema:uuid │ -# │ Index:platformId │ │ Index:email │ -# │ Index:name │ └──────────────┘ -# └─────────────────────┘ +# ┌──────▼──────────────┐ ┌──▼────────────────┐ +# │ Identity │ │ File │ +# │ Schema:uuid │ │ Schema:uuid │ +# │ Index:email │ │ Index:name │ +# └─────────────────────┘ │ Index:platformId │ +# └───────────────────┘ # -# -# - User: attributes: ["uuid", "name", "platformId"] +# - File: attributes: ["uuid", "name", "platformId"] # - Permission: attributes: ["uuid", "name"] # - Identity: attributes: ["uuid", "email"] # # Indexes: -# - User: [User(uuid), User(platformId), User(name)] +# - File: [File(uuid), File(platformId), File(name)] # - Permission: [Permission(uuid), Permission(name)] # - Identity: [Identity(uuid), Identity(email)] # # Edges: -# - (:Permission)-[:IS_FOR_USER]->(:User) +# - (:Permission)-[:IS_FOR_FILE]->(:File) # - (:Permission)-[:IS_FOR_IDENTITYR]->(:Identity) # -# Distributed specific: uuid is the schema +# AccessControl specific: uuid is the schema -filename = "dataset.cypher" -f = open(filename, "x") +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--number_of_identities", type=int, default=10) + parser.add_argument("--number_of_files", type=int, default=10) + parser.add_argument("--percentage_of_permissions", type=float, default=1.0) + parser.add_argument("--filename", default="dataset.cypher") -f.write("MATCH (n) DETACH DELETE n;\n") + args = parser.parse_args() -# Create the indexes -f.write("CREATE INDEX ON :User;\n") -f.write("CREATE INDEX ON :Permission;\n") -f.write("CREATE INDEX ON :Identity;\n") -f.write("CREATE INDEX ON :User(platformId);\n") -f.write("CREATE INDEX ON :User(name);\n") -f.write("CREATE INDEX ON :Permission(name);\n") -f.write("CREATE INDEX ON :Identity(email);\n") + number_of_identities = args.number_of_identities + number_of_files = args.number_of_files + percentage_of_permissions = args.percentage_of_permissions + filename = args.filename -# Create extra index: in distributed, this will be the schema -f.write("CREATE INDEX ON :User(uuid);\n") -f.write("CREATE INDEX ON :Permission(uuid);\n") -f.write("CREATE INDEX ON :Identity(uuid);\n") + assert number_of_identities > 0 + assert number_of_files > 0 + assert percentage_of_permissions > 0.0 and percentage_of_permissions <= 1.0 + assert filename != 
"" -platform_ids = [f"somePlatformId_{id}" for id in range(10)] + f = open(filename, "w") -# This is the number of clusters to change if you want a bigger dataset -number_of_clusters = 3000000 + f.write("MATCH (n) DETACH DELETE n;\n") -for index in range(1, number_of_clusters + 1): - platform_id = platform_ids[random.randint(0, len(platform_ids) - 1)] - user_uuid = index - platform_uuid = number_of_clusters + index - identity_uuid = 2 * number_of_clusters + index + # Create the indexes + f.write("CREATE INDEX ON :File;\n") + f.write("CREATE INDEX ON :Permission;\n") + f.write("CREATE INDEX ON :Identity;\n") + f.write("CREATE INDEX ON :File(platformId);\n") + f.write("CREATE INDEX ON :File(name);\n") + f.write("CREATE INDEX ON :Permission(name);\n") + f.write("CREATE INDEX ON :Identity(email);\n") - # Create the nodes - f.write(f'CREATE (:User {{uuid: {user_uuid}, platformId: "{platform_id}", name: "name_user_{user_uuid}"}});\n') - f.write(f'CREATE (:Permission {{uuid: {platform_uuid}, name: "name_permission_{platform_uuid}"}});\n') - f.write(f'CREATE (:Permission {{uuid: {identity_uuid}, name: "mail_{identity_uuid}@something.com"}});\n') + # Create extra index: in distributed, this will be the schema + f.write("CREATE INDEX ON :File(uuid);\n") + f.write("CREATE INDEX ON :Permission(uuid);\n") + f.write("CREATE INDEX ON :Identity(uuid);\n") - # Create the edges - f.write( - f"MATCH (permission:Permission {{uuid: {platform_uuid}}}), (user:User {{uuid: {user_uuid}}}) CREATE (permission)-[e: IS_FOR_USER]->(user);\n" - ) - f.write( - f"MATCH (permission:Permission {{uuid: {platform_uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e: IS_FOR_IDENTITY]->(identity);\n" - ) + uuid = 1 -f.close() + files = [] + # Create the nodes File + for index in range(0, number_of_files): + f.write(f'CREATE (:File {{uuid: {uuid}, platformId: platform_id, name: "name_file_{uuid}"}});\n') + uuid += 1 + + identities = [] + # Create the nodes Identity + for index in range(0, number_of_identities): + f.write(f'CREATE (:Identity {{uuid: {uuid}, name: "mail_{uuid}@something.com"}});\n') + uuid += 1 + + for outer_index in range(0, number_of_files): + for inner_index in range(0, number_of_identities): + file_uuid = outer_index + identity_uuid = number_of_files + inner_index + + if random.random() <= percentage_of_permissions: + f.write(f'CREATE (:Permission {{uuid: {uuid}, name: "name_permission_{uuid}"}});\n') + f.write( + f"MATCH (permission:Permission {{uuid: {uuid}}}), (file:File {{uuid: {file_uuid}}}) CREATE (permission)-[e: IS_FOR_FILE]->(file);\n" + ) + f.write( + f"MATCH (permission:Permission {{uuid: {uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e: IS_FOR_IDENTITY]->(identity);\n" + ) + uuid += 1 + + f.close() diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 45fdf67db..4c5f19043 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -282,7 +282,7 @@ class Pokec(Dataset): return ("MATCH (n {id: $id}) RETURN n", {"id": self._get_random_vertex()}) -class Distributed(Dataset): +class AccessControl(Dataset): # Explaination of datasets: # - empty_only_index: contains index; contains no data @@ -290,21 +290,21 @@ class Distributed(Dataset): # # See dataset_creator.py to understand the datamodel and generate a dataset - NAME = "distributed" + NAME = "accesscontrol" VARIANTS = ["empty_only_index", "small", "medium", "large"] DEFAULT_VARIANT = "empty_only_index" URLS = { - "empty_only_index": 
"https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_empty_only_index.setup.cypher.gz", - "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_small.setup.cypher.gz", - "medium": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_medium.setup.cypher.gz", - "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_large.setup.cypher.gz", + "empty_only_index": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_empty_only_index.setup.cypher.gz", + "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_small.setup.cypher.gz", + "medium": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_medium.setup.cypher.gz", + "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_large.setup.cypher.gz", } SIZES = { "empty_only_index": { "vertices": 0, "edges": -1, # not used "uuid_ranges": { - "User": {"first_uuid": 0, "last_uuid": 0}, + "File": {"first_uuid": 0, "last_uuid": 0}, "Permission": {"first_uuid": 0, "last_uuid": 0}, "Identity": {"first_uuid": 0, "last_uuid": 0}, }, @@ -313,7 +313,7 @@ class Distributed(Dataset): "vertices": 30, "edges": -1, # not used "uuid_ranges": { - "User": {"first_uuid": 1, "last_uuid": 10}, + "File": {"first_uuid": 1, "last_uuid": 10}, "Permission": {"first_uuid": 11, "last_uuid": 20}, "Identity": {"first_uuid": 21, "last_uuid": 30}, }, @@ -322,7 +322,7 @@ class Distributed(Dataset): "vertices": 30000, "edges": -1, # not used "uuid_ranges": { - "User": {"first_uuid": 1, "last_uuid": 10000}, + "File": {"first_uuid": 1, "last_uuid": 10000}, "Permission": {"first_uuid": 10001, "last_uuid": 20000}, "Identity": {"first_uuid": 10001, "last_uuid": 30000}, }, @@ -331,7 +331,7 @@ class Distributed(Dataset): "vertices": 3000000, "edges": -1, # not used "uuid_ranges": { - "User": {"first_uuid": 1, "last_uuid": 1000000}, + "File": {"first_uuid": 1, "last_uuid": 1000000}, "Permission": {"first_uuid": 100001, "last_uuid": 2000000}, "Identity": {"first_uuid": 1000001, "last_uuid": 3000000}, }, @@ -339,7 +339,7 @@ class Distributed(Dataset): } def _get_random_uuid(self, type): - assert type in ["User", "Permission", "Identity"] + assert type in ["File", "Permission", "Identity"] first_uuid = Dataset.get_size(self)["uuid_ranges"][type]["first_uuid"] last_uuid = Dataset.get_size(self)["uuid_ranges"][type]["last_uuid"] @@ -353,17 +353,17 @@ class Distributed(Dataset): def benchmark__create__vertex(self): self.next_value_idx += 1 - query = (f"CREATE (:User {{uuid: {self.next_value_idx}}});", {}) + query = (f"CREATE (:File {{uuid: {self.next_value_idx}}});", {}) return query def benchmark__create__edges(self): permission_uuid = self._get_random_uuid("Permission") - user_uuid = self._get_random_uuid("User") + file_uuid = self._get_random_uuid("File") query = ( - "MATCH (permission:Permission {uuid: $permission_uuid}), (user:User {uuid: $user_uuid}) " - "CREATE (permission)-[:IS_FOR_USER]->(user)", - {"permission_uuid": permission_uuid, "user_uuid": user_uuid}, + "MATCH (permission:Permission {uuid: $permission_uuid}), (file:File {uuid: $file_uuid}) " + "CREATE (permission)-[:IS_FOR_FILE]->(file)", + {"permission_uuid": permission_uuid, "file_uuid": file_uuid}, ) return query @@ -375,10 +375,10 @@ class Distributed(Dataset): def benchmark__match__match_on_labelled_vertices(self): self.next_value_idx += 1 - query = ("MATCH (n:User) RETURN *", {}) + query = ("MATCH (n:File) RETURN *", {}) return query def benchmark__match__match_all_verteices_with_edges(self): 
self.next_value_idx += 1 - query = ("MATCH (permission:Permission)-[e:IS_FOR_USER]->(user:User) RETURN *", {}) + query = ("MATCH (permission:Permission)-[e:IS_FOR_FILE]->(file:File) RETURN *", {}) return query From ddb30f49ea843f703ff33231e83edae9bbe5d7b2 Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Mon, 31 Oct 2022 09:43:22 +0100 Subject: [PATCH 03/38] Update datasets.py --- tests/mgbench/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 4c5f19043..c2e8499bd 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -378,7 +378,7 @@ class AccessControl(Dataset): query = ("MATCH (n:File) RETURN *", {}) return query - def benchmark__match__match_all_verteices_with_edges(self): + def benchmark__match__match_all_vertices_with_edges(self): self.next_value_idx += 1 query = ("MATCH (permission:Permission)-[e:IS_FOR_FILE]->(file:File) RETURN *", {}) return query From 03c095e780df92f41da1c298a05030a245610d6c Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 10:52:06 +0100 Subject: [PATCH 04/38] Update assert --- tests/mgbench/dataset_creator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 754419ccc..1b48838b6 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -61,8 +61,8 @@ if __name__ == "__main__": percentage_of_permissions = args.percentage_of_permissions filename = args.filename - assert number_of_identities > 0 - assert number_of_files > 0 + assert number_of_identities >= 0 + assert number_of_files >= 0 assert percentage_of_permissions > 0.0 and percentage_of_permissions <= 1.0 assert filename != "" From 5ef08f841ae8a0d6a299020e37f4da2220057c4a Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 11:56:20 +0100 Subject: [PATCH 05/38] Update Dataset creation script --- tests/mgbench/dataset_creator.py | 4 ++-- tests/mgbench/datasets.py | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 1b48838b6..c63ae9731 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -100,8 +100,8 @@ if __name__ == "__main__": for outer_index in range(0, number_of_files): for inner_index in range(0, number_of_identities): - file_uuid = outer_index - identity_uuid = number_of_files + inner_index + file_uuid = outer_index + 1 + identity_uuid = number_of_files + inner_index + 1 if random.random() <= percentage_of_permissions: f.write(f'CREATE (:Permission {{uuid: {uuid}, name: "name_permission_{uuid}"}});\n') diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index c2e8499bd..953111807 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -314,26 +314,26 @@ class AccessControl(Dataset): "edges": -1, # not used "uuid_ranges": { "File": {"first_uuid": 1, "last_uuid": 10}, - "Permission": {"first_uuid": 11, "last_uuid": 20}, - "Identity": {"first_uuid": 21, "last_uuid": 30}, + "Identity": {"first_uuid": 11, "last_uuid": 20}, + "Permission": {"first_uuid": 21, "last_uuid": 120}, # 120=10*10+20 }, }, "medium": { + "vertices": 3000, + "edges": -1, # not used + "uuid_ranges": { + "File": {"first_uuid": 1, "last_uuid": 1000}, + "Identity": {"first_uuid": 1001, "last_uuid": 2000}, + "Permission": {"first_uuid": 2001, "last_uuid": 1002000}, # 
1002000=1000*1000+2000 + }, + }, + "large": { "vertices": 30000, "edges": -1, # not used "uuid_ranges": { "File": {"first_uuid": 1, "last_uuid": 10000}, - "Permission": {"first_uuid": 10001, "last_uuid": 20000}, - "Identity": {"first_uuid": 10001, "last_uuid": 30000}, - }, - }, - "large": { - "vertices": 3000000, - "edges": -1, # not used - "uuid_ranges": { - "File": {"first_uuid": 1, "last_uuid": 1000000}, - "Permission": {"first_uuid": 100001, "last_uuid": 2000000}, - "Identity": {"first_uuid": 1000001, "last_uuid": 3000000}, + "Identity": {"first_uuid": 10001, "last_uuid": 20000}, + "Permission": {"first_uuid": 20001, "last_uuid": 100020000}, # 100020000=10000*10000+20000 }, }, } From c5ee6ffbc2cfed49dfc2e6dba106b0fa6736747f Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 12:41:28 +0100 Subject: [PATCH 06/38] Update dataset_creator script --- tests/mgbench/dataset_creator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index c63ae9731..dc98350cb 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -89,7 +89,7 @@ if __name__ == "__main__": files = [] # Create the nodes File for index in range(0, number_of_files): - f.write(f'CREATE (:File {{uuid: {uuid}, platformId: platform_id, name: "name_file_{uuid}"}});\n') + f.write(f'CREATE (:File {{uuid: {uuid}, platformId: "platform_id", name: "name_file_{uuid}"}});\n') uuid += 1 identities = [] From d62a45752a24e22063d9ae0d5eebe906b93a4608 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 12:55:38 +0100 Subject: [PATCH 07/38] Remove unused variable --- tests/mgbench/dataset_creator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index dc98350cb..712fae1bc 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -86,7 +86,6 @@ if __name__ == "__main__": uuid = 1 - files = [] # Create the nodes File for index in range(0, number_of_files): f.write(f'CREATE (:File {{uuid: {uuid}, platformId: "platform_id", name: "name_file_{uuid}"}});\n') From 2898120eeb7a5a2398216c95f9dea2908ecf2328 Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Mon, 31 Oct 2022 12:55:42 +0100 Subject: [PATCH 08/38] Update tests/mgbench/datasets.py Co-authored-by: Jure Bajic --- tests/mgbench/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 953111807..6e9a68186 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -348,7 +348,7 @@ class AccessControl(Dataset): return random_value def __init__(self, variant=None): - Dataset.__init__(self, variant) + super().__init__(self, variant) self.next_value_idx = Dataset.get_size(self)["vertices"] + 1 def benchmark__create__vertex(self): From bae8c084b119d87939f900273cddc853fa96a217 Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Mon, 31 Oct 2022 12:56:02 +0100 Subject: [PATCH 09/38] Update tests/mgbench/datasets.py Co-authored-by: Jure Bajic --- tests/mgbench/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 6e9a68186..60660c297 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -341,8 +341,8 @@ class AccessControl(Dataset): def _get_random_uuid(self, type): assert type in ["File", 
"Permission", "Identity"] - first_uuid = Dataset.get_size(self)["uuid_ranges"][type]["first_uuid"] - last_uuid = Dataset.get_size(self)["uuid_ranges"][type]["last_uuid"] + first_uuid = self.get_size()["uuid_ranges"][type]["first_uuid"] + last_uuid = self.get_size()["uuid_ranges"][type]["last_uuid"] random_value = random.randint(first_uuid, last_uuid) return random_value From f28ba89584b412e00c06e1401df5ab3677521813 Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Mon, 31 Oct 2022 13:01:42 +0100 Subject: [PATCH 10/38] Update tests/mgbench/dataset_creator.py Co-authored-by: Jure Bajic --- tests/mgbench/dataset_creator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 712fae1bc..78cf9ca64 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -1,4 +1,4 @@ -# Copyright 2021 Memgraph Ltd. +# Copyright 2022 Memgraph Ltd. # # Use of this software is governed by the Business Source License # included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source From f04e1cda4bb70ef7ea19856467a5797bb3f509ab Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 13:02:05 +0100 Subject: [PATCH 11/38] Add function --- tests/mgbench/dataset_creator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 712fae1bc..7ba2827f5 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -47,7 +47,8 @@ import argparse # # AccessControl specific: uuid is the schema -if __name__ == "__main__": + +def main(): parser = argparse.ArgumentParser() parser.add_argument("--number_of_identities", type=int, default=10) parser.add_argument("--number_of_files", type=int, default=10) @@ -113,3 +114,7 @@ if __name__ == "__main__": uuid += 1 f.close() + + +if __name__ == "__main__": + main() From acbf3c764c8e9d61375930da2bf91e7bddf4df2b Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 13:35:41 +0100 Subject: [PATCH 12/38] Remove arg from __init__ --- tests/mgbench/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 60660c297..6e9412af4 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -348,7 +348,7 @@ class AccessControl(Dataset): return random_value def __init__(self, variant=None): - super().__init__(self, variant) + super().__init__(variant) self.next_value_idx = Dataset.get_size(self)["vertices"] + 1 def benchmark__create__vertex(self): From c90b38faf0b27f897df5b1805d9a640fc7905c5d Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 14:49:32 +0100 Subject: [PATCH 13/38] Update aws address for datasets --- tests/mgbench/datasets.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 6e9412af4..3f6ad9eb0 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -294,10 +294,10 @@ class AccessControl(Dataset): VARIANTS = ["empty_only_index", "small", "medium", "large"] DEFAULT_VARIANT = "empty_only_index" URLS = { - "empty_only_index": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_empty_only_index.setup.cypher.gz", - "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_small.setup.cypher.gz", - "medium": 
"https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_medium.setup.cypher.gz", - "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_large.setup.cypher.gz", + "empty_only_index": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_empty_only_index.setup.cypher.gz", + "small": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_small.setup.cypher.gz", + "medium": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_medium.setup.cypher.gz", + "large": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_large.setup.cypher.gz", } SIZES = { "empty_only_index": { From e1f18f37337606ef763cecc5c05513d5659e1896 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 15:19:34 +0100 Subject: [PATCH 14/38] Update location of Pokec datasets on aws --- tests/mgbench/datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 3f6ad9eb0..40a76b96f 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -104,9 +104,9 @@ class Pokec(Dataset): DEFAULT_VARIANT = "small" FILES = None URLS = { - "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/pokec_small.setup.cypher", - "medium": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/pokec_medium.setup.cypher", - "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/pokec_large.setup.cypher.gz", + "small": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/pokec_small.setup.cypher", + "medium": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/pokec_medium.setup.cypher", + "large": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/pokec_large.setup.cypher.gz", } SIZES = { "small": {"vertices": 10000, "edges": 121716}, From 4c5cd1f847c41d554ed9b213a1bb2aefb616bdc3 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 16:10:04 +0100 Subject: [PATCH 15/38] Add possibility to have MgBench working against local file --- tests/mgbench/helpers.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/mgbench/helpers.py b/tests/mgbench/helpers.py index 7488b1443..1a4cd3c3e 100644 --- a/tests/mgbench/helpers.py +++ b/tests/mgbench/helpers.py @@ -28,18 +28,25 @@ def get_binary_path(path, base=""): def download_file(url, path): - ret = subprocess.run(["wget", "-nv", "--content-disposition", url], - stderr=subprocess.PIPE, cwd=path, check=True) - data = ret.stderr.decode("utf-8") - tmp = data.split("->")[1] - name = tmp[tmp.index('"') + 1:tmp.rindex('"')] - return os.path.join(path, name) + if "https://" in url: + ret = subprocess.run( + ["wget", "-nv", "--content-disposition", url], stderr=subprocess.PIPE, cwd=path, check=True + ) + data = ret.stderr.decode("utf-8") + tmp = data.split("->")[1] + name = tmp[tmp.index('"') + 1 : tmp.rindex('"')] + return os.path.join(path, name) + else: + assert os.path.exists(url) + subprocess.run(["cp", url, path], stderr=subprocess.PIPE, cwd=path, check=True) + tmp = url.split("/") + name = tmp[len(tmp) - 1] + return os.path.join(path, name) def unpack_and_move_file(input_path, output_path): if input_path.endswith(".gz"): - subprocess.run(["gunzip", input_path], - stdout=subprocess.DEVNULL, check=True) + subprocess.run(["gunzip", input_path], stdout=subprocess.DEVNULL, check=True) input_path = input_path[:-3] os.rename(input_path, 
output_path) From 787987168cc6b2a956a227884ed53a869e459c6c Mon Sep 17 00:00:00 2001 From: jeremy Date: Tue, 1 Nov 2022 12:51:01 +0100 Subject: [PATCH 16/38] Make benchmark work with any customer datasets --- tests/mgbench/benchmark.py | 221 ++++++++++++++++++++----------------- 1 file changed, 118 insertions(+), 103 deletions(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 5ce715571..498f04f44 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -25,6 +25,7 @@ import datasets import log import helpers import runners +import importlib def get_queries(gen, count): @@ -37,8 +38,7 @@ def get_queries(gen, count): return ret -def match_patterns(dataset, variant, group, test, is_default_variant, - patterns): +def match_patterns(dataset, variant, group, test, is_default_variant, patterns): for pattern in patterns: verdict = [fnmatch.fnmatchcase(dataset, pattern[0])] if pattern[1] != "": @@ -58,7 +58,7 @@ def filter_benchmarks(generators, patterns): pattern = patterns[i].split("/") if len(pattern) > 4 or len(pattern) == 0: raise Exception("Invalid benchmark description '" + pattern + "'!") - pattern.extend(["", "*", "*"][len(pattern) - 1:]) + pattern.extend(["", "*", "*"][len(pattern) - 1 :]) patterns[i] = pattern filtered = [] for dataset in sorted(generators.keys()): @@ -68,8 +68,7 @@ def filter_benchmarks(generators, patterns): current = collections.defaultdict(list) for group in tests: for test_name, test_func in tests[group]: - if match_patterns(dataset, variant, group, test_name, - is_default_variant, patterns): + if match_patterns(dataset, variant, group, test_name, is_default_variant, patterns): current[group].append((test_name, test_func)) if len(current) > 0: filtered.append((generator(variant), dict(current))) @@ -78,54 +77,61 @@ def filter_benchmarks(generators, patterns): # Parse options. 
parser = argparse.ArgumentParser( - description="Memgraph benchmark executor.", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("benchmarks", nargs="*", default="", - help="descriptions of benchmarks that should be run; " - "multiple descriptions can be specified to run multiple " - "benchmarks; the description is specified as " - "dataset/variant/group/test; Unix shell-style wildcards " - "can be used in the descriptions; variant, group and test " - "are optional and they can be left out; the default " - "variant is '' which selects the default dataset variant; " - "the default group is '*' which selects all groups; the " - "default test is '*' which selects all tests") -parser.add_argument("--memgraph-binary", - default=helpers.get_binary_path("memgraph"), - help="Memgraph binary used for benchmarking") -parser.add_argument("--client-binary", - default=helpers.get_binary_path("tests/mgbench/client"), - help="client binary used for benchmarking") -parser.add_argument("--num-workers-for-import", type=int, - default=multiprocessing.cpu_count() // 2, - help="number of workers used to import the dataset") -parser.add_argument("--num-workers-for-benchmark", type=int, - default=1, - help="number of workers used to execute the benchmark") -parser.add_argument("--single-threaded-runtime-sec", type=int, - default=10, - help="single threaded duration of each test") -parser.add_argument("--no-load-query-counts", action="store_true", - help="disable loading of cached query counts") -parser.add_argument("--no-save-query-counts", action="store_true", - help="disable storing of cached query counts") -parser.add_argument("--export-results", default="", - help="file path into which results should be exported") -parser.add_argument("--temporary-directory", default="/tmp", - help="directory path where temporary data should " - "be stored") -parser.add_argument("--no-properties-on-edges", action="store_true", - help="disable properties on edges") + description="Memgraph benchmark executor.", formatter_class=argparse.ArgumentDefaultsHelpFormatter +) +parser.add_argument( + "benchmarks", + nargs="*", + default="", + help="descriptions of benchmarks that should be run; " + "multiple descriptions can be specified to run multiple " + "benchmarks; the description is specified as " + "dataset/variant/group/test; Unix shell-style wildcards " + "can be used in the descriptions; variant, group and test " + "are optional and they can be left out; the default " + "variant is '' which selects the default dataset variant; " + "the default group is '*' which selects all groups; the " + "default test is '*' which selects all tests", +) +parser.add_argument( + "--memgraph-binary", default=helpers.get_binary_path("memgraph"), help="Memgraph binary used for benchmarking" +) +parser.add_argument( + "--client-binary", + default=helpers.get_binary_path("tests/mgbench/client"), + help="client binary used for benchmarking", +) +parser.add_argument( + "--num-workers-for-import", + type=int, + default=multiprocessing.cpu_count() // 2, + help="number of workers used to import the dataset", +) +parser.add_argument( + "--num-workers-for-benchmark", type=int, default=1, help="number of workers used to execute the benchmark" +) +parser.add_argument("--single-threaded-runtime-sec", type=int, default=10, help="single threaded duration of each test") +parser.add_argument("--no-load-query-counts", action="store_true", help="disable loading of cached query counts") +parser.add_argument("--no-save-query-counts", 
action="store_true", help="disable storing of cached query counts") +parser.add_argument("--export-results", default="", help="file path into which results should be exported") +parser.add_argument( + "--temporary-directory", default="/tmp", help="directory path where temporary data should " "be stored" +) +parser.add_argument("--no-properties-on-edges", action="store_true", help="disable properties on edges") +parser.add_argument("--datasets", default="datasets", help="datasets to scan") +parser.add_argument("--datasets-path", default=".", help="path to datasets to scan") args = parser.parse_args() +sys.path.append(args.datasets_path) +dataset_to_use = importlib.import_module(args.datasets) + # Detect available datasets. generators = {} -for key in dir(datasets): +for key in dir(dataset_to_use): if key.startswith("_"): continue - dataset = getattr(datasets, key) - if not inspect.isclass(dataset) or dataset == datasets.Dataset or \ - not issubclass(dataset, datasets.Dataset): + dataset = getattr(dataset_to_use, key) + if not inspect.isclass(dataset) or dataset == datasets.Dataset or not issubclass(dataset, datasets.Dataset): continue tests = collections.defaultdict(list) for funcname in dir(dataset): @@ -135,8 +141,9 @@ for key in dir(datasets): tests[group].append((test, funcname)) generators[dataset.NAME] = (dataset, dict(tests)) if dataset.PROPERTIES_ON_EDGES and args.no_properties_on_edges: - raise Exception("The \"{}\" dataset requires properties on edges, " - "but you have disabled them!".format(dataset.NAME)) + raise Exception( + 'The "{}" dataset requires properties on edges, ' "but you have disabled them!".format(dataset.NAME) + ) # List datasets if there is no specified dataset. if len(args.benchmarks) == 0: @@ -144,8 +151,7 @@ if len(args.benchmarks) == 0: for name in sorted(generators.keys()): print("Dataset:", name) dataset, tests = generators[name] - print(" Variants:", ", ".join(dataset.VARIANTS), - "(default: " + dataset.DEFAULT_VARIANT + ")") + print(" Variants:", ", ".join(dataset.VARIANTS), "(default: " + dataset.DEFAULT_VARIANT + ")") for group in sorted(tests.keys()): print(" Group:", group) for test_name, test_func in tests[group]: @@ -165,31 +171,38 @@ benchmarks = filter_benchmarks(generators, args.benchmarks) # Run all specified benchmarks. for dataset, tests in benchmarks: - log.init("Preparing", dataset.NAME + "/" + dataset.get_variant(), - "dataset") - dataset.prepare(cache.cache_directory("datasets", dataset.NAME, - dataset.get_variant())) + log.init("Preparing", dataset.NAME + "/" + dataset.get_variant(), "dataset") + dataset.prepare(cache.cache_directory("datasets", dataset.NAME, dataset.get_variant())) # Prepare runners and import the dataset. - memgraph = runners.Memgraph(args.memgraph_binary, args.temporary_directory, - not args.no_properties_on_edges) + memgraph = runners.Memgraph(args.memgraph_binary, args.temporary_directory, not args.no_properties_on_edges) client = runners.Client(args.client_binary, args.temporary_directory) memgraph.start_preparation() - ret = client.execute(file_path=dataset.get_file(), - num_workers=args.num_workers_for_import) + ret = client.execute(file_path=dataset.get_file(), num_workers=args.num_workers_for_import) usage = memgraph.stop() # Display import statistics. 
print() for row in ret: - print("Executed", row["count"], "queries in", row["duration"], - "seconds using", row["num_workers"], - "workers with a total throughput of", row["throughput"], - "queries/second.") + print( + "Executed", + row["count"], + "queries in", + row["duration"], + "seconds using", + row["num_workers"], + "workers with a total throughput of", + row["throughput"], + "queries/second.", + ) print() - print("The database used", usage["cpu"], - "seconds of CPU time and peaked at", - usage["memory"] / 1024 / 1024, "MiB of RAM.") + print( + "The database used", + usage["cpu"], + "seconds of CPU time and peaked at", + usage["memory"] / 1024 / 1024, + "MiB of RAM.", + ) # Save import results. import_key = [dataset.NAME, dataset.get_variant(), "__import__"] @@ -208,24 +221,26 @@ for dataset, tests in benchmarks: config_key = [dataset.NAME, dataset.get_variant(), group, test] cached_count = config.get_value(*config_key) if cached_count is None: - print("Determining the number of queries necessary for", - args.single_threaded_runtime_sec, - "seconds of single-threaded runtime...") + print( + "Determining the number of queries necessary for", + args.single_threaded_runtime_sec, + "seconds of single-threaded runtime...", + ) # First run to prime the query caches. memgraph.start_benchmark() client.execute(queries=get_queries(func, 1), num_workers=1) # Get a sense of the runtime. count = 1 while True: - ret = client.execute(queries=get_queries(func, count), - num_workers=1) + ret = client.execute(queries=get_queries(func, count), num_workers=1) duration = ret[0]["duration"] - should_execute = int(args.single_threaded_runtime_sec / - (duration / count)) - print("executed_queries={}, total_duration={}, " - "query_duration={}, estimated_count={}".format( - count, duration, duration / count, - should_execute)) + should_execute = int(args.single_threaded_runtime_sec / (duration / count)) + print( + "executed_queries={}, total_duration={}, " + "query_duration={}, estimated_count={}".format( + count, duration, duration / count, should_execute + ) + ) # We don't have to execute the next iteration when # `should_execute` becomes the same order of magnitude as # `count * 10`. @@ -235,45 +250,45 @@ for dataset, tests in benchmarks: else: count = count * 10 memgraph.stop() - config.set_value(*config_key, value={ - "count": count, - "duration": args.single_threaded_runtime_sec}) + config.set_value(*config_key, value={"count": count, "duration": args.single_threaded_runtime_sec}) else: - print("Using cached query count of", cached_count["count"], - "queries for", cached_count["duration"], - "seconds of single-threaded runtime.") - count = int(cached_count["count"] * - args.single_threaded_runtime_sec / - cached_count["duration"]) + print( + "Using cached query count of", + cached_count["count"], + "queries for", + cached_count["duration"], + "seconds of single-threaded runtime.", + ) + count = int(cached_count["count"] * args.single_threaded_runtime_sec / cached_count["duration"]) # Benchmark run. 
print("Sample query:", get_queries(func, 1)[0][0]) - print("Executing benchmark with", count, "queries that should " - "yield a single-threaded runtime of", - args.single_threaded_runtime_sec, "seconds.") - print("Queries are executed using", args.num_workers_for_benchmark, - "concurrent clients.") + print( + "Executing benchmark with", + count, + "queries that should " "yield a single-threaded runtime of", + args.single_threaded_runtime_sec, + "seconds.", + ) + print("Queries are executed using", args.num_workers_for_benchmark, "concurrent clients.") memgraph.start_benchmark() - ret = client.execute(queries=get_queries(func, count), - num_workers=args.num_workers_for_benchmark)[0] + ret = client.execute(queries=get_queries(func, count), num_workers=args.num_workers_for_benchmark)[0] usage = memgraph.stop() ret["database"] = usage # Output summary. print() - print("Executed", ret["count"], "queries in", - ret["duration"], "seconds.") + print("Executed", ret["count"], "queries in", ret["duration"], "seconds.") print("Queries have been retried", ret["retries"], "times.") - print("Database used {:.3f} seconds of CPU time.".format( - usage["cpu"])) - print("Database peaked at {:.3f} MiB of memory.".format( - usage["memory"] / 1024.0 / 1024.0)) - print("{:<31} {:>20} {:>20} {:>20}".format("Metadata:", "min", - "avg", "max")) + print("Database used {:.3f} seconds of CPU time.".format(usage["cpu"])) + print("Database peaked at {:.3f} MiB of memory.".format(usage["memory"] / 1024.0 / 1024.0)) + print("{:<31} {:>20} {:>20} {:>20}".format("Metadata:", "min", "avg", "max")) metadata = ret["metadata"] for key in sorted(metadata.keys()): - print("{name:>30}: {minimum:>20.06f} {average:>20.06f} " - "{maximum:>20.06f}".format(name=key, **metadata[key])) + print( + "{name:>30}: {minimum:>20.06f} {average:>20.06f} " + "{maximum:>20.06f}".format(name=key, **metadata[key]) + ) log.success("Throughput: {:02f} QPS".format(ret["throughput"])) # Save results. From 1f778ba5f3b9d146f50f8ffd5ef7bc7b62b18f39 Mon Sep 17 00:00:00 2001 From: jeremy Date: Tue, 1 Nov 2022 14:14:55 +0100 Subject: [PATCH 17/38] Add possibility to give extra tests arg to MGBench --- tests/mgbench/benchmark.py | 7 ++++++- tests/mgbench/runners.py | 34 +++++++++++++++++----------------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 498f04f44..b5f1abee0 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -26,6 +26,7 @@ import log import helpers import runners import importlib +import time def get_queries(gen, count): @@ -120,6 +121,7 @@ parser.add_argument( parser.add_argument("--no-properties-on-edges", action="store_true", help="disable properties on edges") parser.add_argument("--datasets", default="datasets", help="datasets to scan") parser.add_argument("--datasets-path", default=".", help="path to datasets to scan") +parser.add_argument("--test-system-args", default="") args = parser.parse_args() sys.path.append(args.datasets_path) @@ -175,9 +177,12 @@ for dataset, tests in benchmarks: dataset.prepare(cache.cache_directory("datasets", dataset.NAME, dataset.get_variant())) # Prepare runners and import the dataset. 
- memgraph = runners.Memgraph(args.memgraph_binary, args.temporary_directory, not args.no_properties_on_edges) + memgraph = runners.Memgraph( + args.memgraph_binary, args.temporary_directory, not args.no_properties_on_edges, args.test_system_args + ) client = runners.Client(args.client_binary, args.temporary_directory) memgraph.start_preparation() + time.sleep(5.0) # giving enough time to machine manager and all to start up ret = client.execute(file_path=dataset.get_file(), num_workers=args.num_workers_for_import) usage = memgraph.stop() diff --git a/tests/mgbench/runners.py b/tests/mgbench/runners.py index 891a7cddd..067a58006 100644 --- a/tests/mgbench/runners.py +++ b/tests/mgbench/runners.py @@ -40,8 +40,7 @@ def _convert_args_to_flags(*args, **kwargs): def _get_usage(pid): total_cpu = 0 with open("/proc/{}/stat".format(pid)) as f: - total_cpu = (sum(map(int, f.read().split(")")[1].split()[11:15])) / - os.sysconf(os.sysconf_names["SC_CLK_TCK"])) + total_cpu = sum(map(int, f.read().split(")")[1].split()[11:15])) / os.sysconf(os.sysconf_names["SC_CLK_TCK"]) peak_rss = 0 with open("/proc/{}/status".format(pid)) as f: for row in f: @@ -52,18 +51,17 @@ def _get_usage(pid): class Memgraph: - def __init__(self, memgraph_binary, temporary_dir, properties_on_edges): + def __init__(self, memgraph_binary, temporary_dir, properties_on_edges, extra_args): self._memgraph_binary = memgraph_binary self._directory = tempfile.TemporaryDirectory(dir=temporary_dir) self._properties_on_edges = properties_on_edges self._proc_mg = None + self._extra_args = extra_args atexit.register(self._cleanup) # Determine Memgraph version - ret = subprocess.run([memgraph_binary, "--version"], - stdout=subprocess.PIPE, check=True) - version = re.search(r"[0-9]+\.[0-9]+\.[0-9]+", - ret.stdout.decode("utf-8")).group(0) + ret = subprocess.run([memgraph_binary, "--version"], stdout=subprocess.PIPE, check=True) + version = re.search(r"[0-9]+\.[0-9]+\.[0-9]+", ret.stdout.decode("utf-8")).group(0) self._memgraph_version = tuple(map(int, version.split("."))) def __del__(self): @@ -79,8 +77,14 @@ class Memgraph: if self._memgraph_version >= (0, 50, 0): kwargs["storage_properties_on_edges"] = self._properties_on_edges else: - assert self._properties_on_edges, \ - "Older versions of Memgraph can't disable properties on edges!" + assert self._properties_on_edges, "Older versions of Memgraph can't disable properties on edges!" 
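The fixed `time.sleep(5.0)` above is a blunt way to wait for the machine manager to come up; the runner's own `wait_for_server(7687)` only covers the Bolt port after `_start`. A hedged alternative is to poll the port with a deadline instead of sleeping a hard-coded five seconds (a sketch, not what runners.py actually does):

import socket
import time

def wait_for_port(host="127.0.0.1", port=7687, timeout=30.0):
    # Retry until the port accepts TCP connections or the deadline passes.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1.0):
                return
        except OSError:
            time.sleep(0.1)
    raise TimeoutError("port {} not ready after {}s".format(port, timeout))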
+ + if self._extra_args != "": + args_list = self._extra_args.split(" ") + assert len(args_list) % 2 == 0 + for i in range(0, len(args_list) // 2): + kwargs[args_list[i]] = args_list[i + 1] + return _convert_args_to_flags(self._memgraph_binary, **kwargs) def _start(self, **kwargs): @@ -94,8 +98,7 @@ class Memgraph: raise Exception("The database process died prematurely!") wait_for_server(7687) ret = self._proc_mg.poll() - assert ret is None, "The database process died prematurely " \ - "({})!".format(ret) + assert ret is None, "The database process died prematurely " "({})!".format(ret) def _cleanup(self): if self._proc_mg is None: @@ -121,8 +124,7 @@ class Memgraph: def stop(self): ret, usage = self._cleanup() - assert ret == 0, "The database process exited with a non-zero " \ - "status ({})!".format(ret) + assert ret == 0, "The database process exited with a non-zero " "status ({})!".format(ret) return usage @@ -135,8 +137,7 @@ class Client: return _convert_args_to_flags(self._client_binary, **kwargs) def execute(self, queries=None, file_path=None, num_workers=1): - if (queries is None and file_path is None) or \ - (queries is not None and file_path is not None): + if (queries is None and file_path is None) or (queries is not None and file_path is not None): raise ValueError("Either queries or input_path must be specified!") # TODO: check `file_path.endswith(".json")` to support advanced @@ -151,8 +152,7 @@ class Client: json.dump(query, f) f.write("\n") - args = self._get_args(input=file_path, num_workers=num_workers, - queries_json=queries_json) + args = self._get_args(input=file_path, num_workers=num_workers, queries_json=queries_json) ret = subprocess.run(args, stdout=subprocess.PIPE, check=True) data = ret.stdout.decode("utf-8").strip().split("\n") return list(map(json.loads, data)) From 1148fe9aad61251a9f4a968d96aa54c3771e3a44 Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Wed, 2 Nov 2022 14:13:40 +0100 Subject: [PATCH 18/38] Update tests/mgbench/datasets.py Co-authored-by: Jure Bajic --- tests/mgbench/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 40a76b96f..3a5806629 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -349,7 +349,7 @@ class AccessControl(Dataset): def __init__(self, variant=None): super().__init__(variant) - self.next_value_idx = Dataset.get_size(self)["vertices"] + 1 + self.next_value_idx = self.get_size()["vertices"] + 1 def benchmark__create__vertex(self): self.next_value_idx += 1 From edeebf46ec8efced3fb1d5949289717532307a6e Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Wed, 2 Nov 2022 14:13:58 +0100 Subject: [PATCH 19/38] Update tests/mgbench/benchmark.py Co-authored-by: Jure Bajic --- tests/mgbench/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index b5f1abee0..6458447d0 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -271,7 +271,7 @@ for dataset, tests in benchmarks: print( "Executing benchmark with", count, - "queries that should " "yield a single-threaded runtime of", + "queries that should yield a single-threaded runtime of", args.single_threaded_runtime_sec, "seconds.", ) From 1d18f1197fcd14ef30ca9db92f8b9f7f8b8f102b Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Wed, 2 Nov 2022 14:14:16 +0100 
Subject: [PATCH 20/38] Update tests/mgbench/dataset_creator.py Co-authored-by: Jure Bajic --- tests/mgbench/dataset_creator.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 991135cdd..d2b4a67d9 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -67,18 +67,18 @@ def main(): assert percentage_of_permissions > 0.0 and percentage_of_permissions <= 1.0 assert filename != "" - f = open(filename, "w") + with open(filename, "w") as f: - f.write("MATCH (n) DETACH DELETE n;\n") + f.write("MATCH (n) DETACH DELETE n;\n") - # Create the indexes - f.write("CREATE INDEX ON :File;\n") - f.write("CREATE INDEX ON :Permission;\n") - f.write("CREATE INDEX ON :Identity;\n") - f.write("CREATE INDEX ON :File(platformId);\n") - f.write("CREATE INDEX ON :File(name);\n") - f.write("CREATE INDEX ON :Permission(name);\n") - f.write("CREATE INDEX ON :Identity(email);\n") + # Create the indexes + f.write("CREATE INDEX ON :File;\n") + f.write("CREATE INDEX ON :Permission;\n") + f.write("CREATE INDEX ON :Identity;\n") + f.write("CREATE INDEX ON :File(platformId);\n") + f.write("CREATE INDEX ON :File(name);\n") + f.write("CREATE INDEX ON :Permission(name);\n") + f.write("CREATE INDEX ON :Identity(email);\n") # Create extra index: in distributed, this will be the schema f.write("CREATE INDEX ON :File(uuid);\n") From e909e7d2d8cc64770ade67d8f92cb08c8150ad9c Mon Sep 17 00:00:00 2001 From: jeremy Date: Wed, 2 Nov 2022 14:18:04 +0100 Subject: [PATCH 21/38] Format --- tests/mgbench/dataset_creator.py | 75 +++++++++++++++----------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index d2b4a67d9..72b773593 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -68,52 +68,49 @@ def main(): assert filename != "" with open(filename, "w") as f: + f.write("MATCH (n) DETACH DELETE n;\n") - f.write("MATCH (n) DETACH DELETE n;\n") + # Create the indexes + f.write("CREATE INDEX ON :File;\n") + f.write("CREATE INDEX ON :Permission;\n") + f.write("CREATE INDEX ON :Identity;\n") + f.write("CREATE INDEX ON :File(platformId);\n") + f.write("CREATE INDEX ON :File(name);\n") + f.write("CREATE INDEX ON :Permission(name);\n") + f.write("CREATE INDEX ON :Identity(email);\n") - # Create the indexes - f.write("CREATE INDEX ON :File;\n") - f.write("CREATE INDEX ON :Permission;\n") - f.write("CREATE INDEX ON :Identity;\n") - f.write("CREATE INDEX ON :File(platformId);\n") - f.write("CREATE INDEX ON :File(name);\n") - f.write("CREATE INDEX ON :Permission(name);\n") - f.write("CREATE INDEX ON :Identity(email);\n") + # Create extra index: in distributed, this will be the schema + f.write("CREATE INDEX ON :File(uuid);\n") + f.write("CREATE INDEX ON :Permission(uuid);\n") + f.write("CREATE INDEX ON :Identity(uuid);\n") - # Create extra index: in distributed, this will be the schema - f.write("CREATE INDEX ON :File(uuid);\n") - f.write("CREATE INDEX ON :Permission(uuid);\n") - f.write("CREATE INDEX ON :Identity(uuid);\n") + uuid = 1 - uuid = 1 + # Create the nodes File + for index in range(0, number_of_files): + f.write(f'CREATE (:File {{uuid: {uuid}, platformId: "platform_id", name: "name_file_{uuid}"}});\n') + uuid += 1 - # Create the nodes File - for index in range(0, number_of_files): - f.write(f'CREATE (:File {{uuid: {uuid}, platformId: "platform_id", name: 
"name_file_{uuid}"}});\n') - uuid += 1 + identities = [] + # Create the nodes Identity + for index in range(0, number_of_identities): + f.write(f'CREATE (:Identity {{uuid: {uuid}, name: "mail_{uuid}@something.com"}});\n') + uuid += 1 - identities = [] - # Create the nodes Identity - for index in range(0, number_of_identities): - f.write(f'CREATE (:Identity {{uuid: {uuid}, name: "mail_{uuid}@something.com"}});\n') - uuid += 1 + for outer_index in range(0, number_of_files): + for inner_index in range(0, number_of_identities): + file_uuid = outer_index + 1 + identity_uuid = number_of_files + inner_index + 1 - for outer_index in range(0, number_of_files): - for inner_index in range(0, number_of_identities): - file_uuid = outer_index + 1 - identity_uuid = number_of_files + inner_index + 1 - - if random.random() <= percentage_of_permissions: - f.write(f'CREATE (:Permission {{uuid: {uuid}, name: "name_permission_{uuid}"}});\n') - f.write( - f"MATCH (permission:Permission {{uuid: {uuid}}}), (file:File {{uuid: {file_uuid}}}) CREATE (permission)-[e: IS_FOR_FILE]->(file);\n" - ) - f.write( - f"MATCH (permission:Permission {{uuid: {uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e: IS_FOR_IDENTITY]->(identity);\n" - ) - uuid += 1 - - f.close() + if random.random() <= percentage_of_permissions: + f.write(f'CREATE (:Permission {{uuid: {uuid}, name: "name_permission_{uuid}"}});\n') + f.write( + f"MATCH (permission:Permission {{uuid: {uuid}}}), (file:File {{uuid: {file_uuid}}}) CREATE (permission)-[e: IS_FOR_FILE]->(file);\n" + ) + f.write( + f"MATCH (permission:Permission {{uuid: {uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e: IS_FOR_IDENTITY]->(identity);\n" + ) + uuid += 1 if __name__ == "__main__": From 70dc19dfdb4e4ef7f7acd79f1aee6ef2942e15db Mon Sep 17 00:00:00 2001 From: jeremy Date: Thu, 3 Nov 2022 11:03:21 +0100 Subject: [PATCH 22/38] Mgbench: apply filtering on results from client --- tests/mgbench/runners.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/mgbench/runners.py b/tests/mgbench/runners.py index 067a58006..363595c69 100644 --- a/tests/mgbench/runners.py +++ b/tests/mgbench/runners.py @@ -155,4 +155,5 @@ class Client: args = self._get_args(input=file_path, num_workers=num_workers, queries_json=queries_json) ret = subprocess.run(args, stdout=subprocess.PIPE, check=True) data = ret.stdout.decode("utf-8").strip().split("\n") + data = [x for x in data if not x.startswith("[")] return list(map(json.loads, data)) From 14e3e725658aa046efe071d09cb3ffc870d97c29 Mon Sep 17 00:00:00 2001 From: jeremy Date: Fri, 4 Nov 2022 08:52:47 +0100 Subject: [PATCH 23/38] Correct badly written range --- tests/mgbench/runners.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/runners.py b/tests/mgbench/runners.py index 363595c69..2b69a811f 100644 --- a/tests/mgbench/runners.py +++ b/tests/mgbench/runners.py @@ -82,7 +82,7 @@ class Memgraph: if self._extra_args != "": args_list = self._extra_args.split(" ") assert len(args_list) % 2 == 0 - for i in range(0, len(args_list) // 2): + for i in range(0, len(args_list), 2): kwargs[args_list[i]] = args_list[i + 1] return _convert_args_to_flags(self._memgraph_binary, **kwargs) From e41073bc2ca83b768a58f251e3802841ba573a93 Mon Sep 17 00:00:00 2001 From: jeremy Date: Fri, 4 Nov 2022 09:17:09 +0100 Subject: [PATCH 24/38] Update script to need single argument for local dataset --- tests/mgbench/benchmark.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) 
diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 6458447d0..f6b0a19ca 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -27,6 +27,7 @@ import helpers import runners import importlib import time +import os def get_queries(gen, count): @@ -119,13 +120,15 @@ parser.add_argument( "--temporary-directory", default="/tmp", help="directory path where temporary data should " "be stored" ) parser.add_argument("--no-properties-on-edges", action="store_true", help="disable properties on edges") -parser.add_argument("--datasets", default="datasets", help="datasets to scan") parser.add_argument("--datasets-path", default=".", help="path to datasets to scan") parser.add_argument("--test-system-args", default="") args = parser.parse_args() -sys.path.append(args.datasets_path) -dataset_to_use = importlib.import_module(args.datasets) +head_tail = os.path.split(args.datasets_path) +path_without_dataset_name = head_tail[0] +dataset_name = head_tail[1] +sys.path.append(path_without_dataset_name) +dataset_to_use = importlib.import_module(dataset_name) # Detect available datasets. generators = {} From 9e72c7cb54e1186baa0a1464a1bdb5bc4f938a85 Mon Sep 17 00:00:00 2001 From: jeremy Date: Fri, 4 Nov 2022 15:57:26 +0100 Subject: [PATCH 25/38] Add extra safety check in case we call on dataset.py --- tests/mgbench/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index f6b0a19ca..619723caf 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -126,7 +126,7 @@ args = parser.parse_args() head_tail = os.path.split(args.datasets_path) path_without_dataset_name = head_tail[0] -dataset_name = head_tail[1] +dataset_name = head_tail[1].split(".")[0] sys.path.append(path_without_dataset_name) dataset_to_use = importlib.import_module(dataset_name) From d17970f6d9dba056e3f06022de3849d93667cab6 Mon Sep 17 00:00:00 2001 From: jeremy Date: Fri, 4 Nov 2022 16:04:45 +0100 Subject: [PATCH 26/38] Update default value for --datasets-path --- tests/mgbench/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 619723caf..44eb2290d 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -120,7 +120,7 @@ parser.add_argument( "--temporary-directory", default="/tmp", help="directory path where temporary data should " "be stored" ) parser.add_argument("--no-properties-on-edges", action="store_true", help="disable properties on edges") -parser.add_argument("--datasets-path", default=".", help="path to datasets to scan") +parser.add_argument("--datasets-path", default="datasets", help="path to datasets to scan") parser.add_argument("--test-system-args", default="") args = parser.parse_args() From 5273d319e2316aadb62b6754dec68c8b90e58d4b Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 09:53:29 +0100 Subject: [PATCH 27/38] Add split file for access control --- .../accesscontrol_large.shard_configuration | 36 +++++++++++++++++++ .../accesscontrol_medium.shard_configuration | 36 +++++++++++++++++++ .../accesscontrol_small.shard_configuration | 36 +++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100644 tests/mgbench/splitfiles/accesscontrol_large.shard_configuration create mode 100644 tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration create mode 100644 tests/mgbench/splitfiles/accesscontrol_small.shard_configuration diff --git
a/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration new file mode 100644 index 000000000..b7ce91b58 --- /dev/null +++ b/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration @@ -0,0 +1,36 @@ +4 +uuid +email +name +platformId +2 +IS_FOR_IDENTITY +IS_FOR_FILE│ +3 +File +1 +uuid +string +1 +[1] +Identity +1 +uuid +string +1 +[10001] +Permission +1 +uuid +string +10 +[20001] +[10020000] +[20002000] +[30002000] +[40002000] +[50002000] +[60002000] +[70002000] +[80002000] +[90002000] diff --git a/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration new file mode 100644 index 000000000..ff01a53d2 --- /dev/null +++ b/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration @@ -0,0 +1,36 @@ +4 +uuid +email +name +platformId +2 +IS_FOR_IDENTITY +IS_FOR_FILE│ +3 +File +1 +uuid +string +1 +[1] +Identity +1 +uuid +string +1 +[1001] +Permission +1 +uuid +string +10 +[2001] +[102000] +[202000] +[302000] +[402000] +[502000] +[602000] +[702000] +[802000] +[902000] diff --git a/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration new file mode 100644 index 000000000..101c40cca --- /dev/null +++ b/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration @@ -0,0 +1,36 @@ +4 +uuid +email +name +platformId +2 +IS_FOR_IDENTITY +IS_FOR_FILE│ +3 +File +1 +uuid +string +1 +[1] +Identity +1 +uuid +string +1 +[11] +Permission +1 +uuid +string +10 +[21] +[31] +[41] +[51] +[61] +[71] +[81] +[91] +[100] +[110] From baacc52a65fbb373bca79a390a7334abb0580113 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 09:54:14 +0100 Subject: [PATCH 28/38] Add support for split file configuration --- tests/mgbench/benchmark.py | 6 +++++- tests/mgbench/datasets.py | 15 +++++++++++++++ tests/mgbench/runners.py | 5 ++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 44eb2290d..40760f63e 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -181,7 +181,11 @@ for dataset, tests in benchmarks: # Prepare runners and import the dataset. 
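The shard_configuration files above use a count-prefixed, line-oriented layout: global property names, edge types, then one block per label with its primary-key property, key type, and the split points bounding each shard. The field meanings here are inferred from the file shape only; a heuristic reader under that assumption:

def read_split_file(path):
    # Assumed layout: N, N property names, M, M edge types, L, then per
    # label: name, K, K key properties, key type, S, S split points.
    with open(path) as f:
        lines = [line.strip() for line in f if line.strip()]
    it = iter(lines)
    properties = [next(it) for _ in range(int(next(it)))]
    edge_types = [next(it) for _ in range(int(next(it)))]
    labels = {}
    for _ in range(int(next(it))):
        name = next(it)
        keys = [next(it) for _ in range(int(next(it)))]
        key_type = next(it)
        splits = [next(it) for _ in range(int(next(it)))]
        labels[name] = {"keys": keys, "type": key_type, "splits": splits}
    return {"properties": properties, "edge_types": edge_types, "labels": labels}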
memgraph = runners.Memgraph( - args.memgraph_binary, args.temporary_directory, not args.no_properties_on_edges, args.test_system_args + args.memgraph_binary, + args.temporary_directory, + not args.no_properties_on_edges, + args.test_system_args, + dataset.get_split_file(), ) client = runners.Client(args.client_binary, args.temporary_directory) memgraph.start_preparation() diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 3a5806629..e73169aab 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -63,6 +63,10 @@ class Dataset: raise ValueError("The size defined for this variant doesn't " "have the number of vertices and/or edges!") self._num_vertices = self._size["vertices"] self._num_edges = self._size["edges"] + if self.SPLIT_FILES is not None: + self._split_file = self.SPLIT_FILES.get(variant, None) + else: + self._split_file = None def prepare(self, directory): if self._file is not None: @@ -92,6 +96,11 @@ class Dataset: """Returns number of vertices/edges for the current variant.""" return self._size + def get_split_file(self): + """Returns the location of the split file of the dataset.""" + assert self._split_file is not None + return self._split_file + # All tests should be query generator functions that output all of the # queries that should be executed by the runner. The functions should be # named `benchmark__GROUPNAME__TESTNAME` and should not accept any @@ -299,6 +308,12 @@ class AccessControl(Dataset): "medium": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_medium.setup.cypher.gz", "large": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_large.setup.cypher.gz", } + SPLIT_FILES = { + "empty_only_index": "splitfiles/accesscontrol_small.shard_configuration", + "small": "splitfiles/accesscontrol_small.shard_configuration", + "medium": "splitfiles/accesscontrol_medium.shard_configuration", + "large": "splitfiles/accesscontrol_large.shard_configuration", + } SIZES = { "empty_only_index": { "vertices": 0, diff --git a/tests/mgbench/runners.py b/tests/mgbench/runners.py index 2b69a811f..acee68e07 100644 --- a/tests/mgbench/runners.py +++ b/tests/mgbench/runners.py @@ -51,12 +51,13 @@ def _get_usage(pid): class Memgraph: - def __init__(self, memgraph_binary, temporary_dir, properties_on_edges, extra_args): + def __init__(self, memgraph_binary, temporary_dir, properties_on_edges, extra_args, split_file): self._memgraph_binary = memgraph_binary self._directory = tempfile.TemporaryDirectory(dir=temporary_dir) self._properties_on_edges = properties_on_edges self._proc_mg = None self._extra_args = extra_args + self._split_file = split_file atexit.register(self._cleanup) # Determine Memgraph version @@ -85,6 +86,8 @@ class Memgraph: for i in range(0, len(args_list), 2): kwargs[args_list[i]] = args_list[i + 1] + kwargs["split-file"] = self._split_file + return _convert_args_to_flags(self._memgraph_binary, **kwargs) def _start(self, **kwargs): From dca94f42bb543daaf25be0939a09dacfb61e2bb8 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 10:14:49 +0100 Subject: [PATCH 29/38] Update key type in shard configuration --- .../splitfiles/accesscontrol_large.shard_configuration | 6 +++--- .../splitfiles/accesscontrol_medium.shard_configuration | 6 +++--- .../splitfiles/accesscontrol_small.shard_configuration | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration 
b/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration index b7ce91b58..1f5759e0c 100644 --- a/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration +++ b/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration @@ -10,19 +10,19 @@ IS_FOR_FILE│ File 1 uuid -string +int 1 [1] Identity 1 uuid -string +int 1 [10001] Permission 1 uuid -string +int 10 [20001] [10020000] diff --git a/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration index ff01a53d2..f346d4b96 100644 --- a/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration +++ b/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration @@ -10,19 +10,19 @@ IS_FOR_FILE│ File 1 uuid -string +int 1 [1] Identity 1 uuid -string +int 1 [1001] Permission 1 uuid -string +int 10 [2001] [102000] diff --git a/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration index 101c40cca..86ea13346 100644 --- a/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration +++ b/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration @@ -10,19 +10,19 @@ IS_FOR_FILE│ File 1 uuid -string +int 1 [1] Identity 1 uuid -string +int 1 [11] Permission 1 uuid -string +int 10 [21] [31] From b10b1eb23982d9f72d6f25c74f7248f6de5c7889 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 10:33:24 +0100 Subject: [PATCH 30/38] Correct shard configuration --- .../mgbench/splitfiles/accesscontrol_large.shard_configuration | 2 +- .../mgbench/splitfiles/accesscontrol_medium.shard_configuration | 2 +- .../mgbench/splitfiles/accesscontrol_small.shard_configuration | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration index 1f5759e0c..34dca66be 100644 --- a/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration +++ b/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration @@ -5,7 +5,7 @@ name platformId 2 IS_FOR_IDENTITY -IS_FOR_FILE│ +IS_FOR_FILE 3 File 1 diff --git a/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration index f346d4b96..a807e783f 100644 --- a/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration +++ b/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration @@ -5,7 +5,7 @@ name platformId 2 IS_FOR_IDENTITY -IS_FOR_FILE│ +IS_FOR_FILE 3 File 1 diff --git a/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration index 86ea13346..9c11b6258 100644 --- a/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration +++ b/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration @@ -5,7 +5,7 @@ name platformId 2 IS_FOR_IDENTITY -IS_FOR_FILE│ +IS_FOR_FILE 3 File 1 From 5201db46d20aedf71b21ecd367e0227cf04c8bf4 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 12:15:57 +0100 Subject: [PATCH 31/38] Add assert for split_file --- tests/mgbench/datasets.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index e73169aab..8722ada1d 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -63,10 +63,9 @@ class Dataset: raise ValueError("The size defined for this variant doesn't " "have the number of vertices 
and/or edges!") self._num_vertices = self._size["vertices"] self._num_edges = self._size["edges"] - if self.SPLIT_FILES is not None: - self._split_file = self.SPLIT_FILES.get(variant, None) - else: - self._split_file = None + self._split_file = self.SPLIT_FILES.get(variant, None) + assert self._split_file is not None + assert self._split_file != "" def prepare(self, directory): if self._file is not None: From 2a7ed1ad829ad87075450de879a09c3bb2a4c33b Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 12:55:08 +0100 Subject: [PATCH 32/38] Add single e2e benchmark test --- .github/workflows/diff.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/diff.yaml b/.github/workflows/diff.yaml index ef5cf2ee2..550452b85 100644 --- a/.github/workflows/diff.yaml +++ b/.github/workflows/diff.yaml @@ -173,6 +173,15 @@ jobs: cd build ctest -R memgraph__simulation --output-on-failure -j$THREADS + - name: Run single benchmark test + run: | + # Activate toolchain. + source /opt/toolchain-v4/activate + + # Run simulation tests. + cd tests/mgbench + ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" + release_build: name: "Release build" runs-on: [self-hosted, Linux, X64, Diff] @@ -220,6 +229,15 @@ jobs: cd build ctest -R memgraph__simulation --output-on-failure -j$THREADS + - name: Run single benchmark test + run: | + # Activate toolchain. + source /opt/toolchain-v4/activate + + # Run simulation tests. + cd tests/mgbench + ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" + - name: Run e2e tests run: | # TODO(gitbuda): Setup mgclient and pymgclient properly. From c16f948de9d27ebeafa4821f1d94536faed5e1f3 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 13:08:35 +0100 Subject: [PATCH 33/38] Delete cache folder before running benchmark test --- .github/workflows/diff.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/diff.yaml b/.github/workflows/diff.yaml index 550452b85..731e9e816 100644 --- a/.github/workflows/diff.yaml +++ b/.github/workflows/diff.yaml @@ -180,6 +180,7 @@ jobs: # Run simulation tests. cd tests/mgbench + rm -r .cache ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" release_build: @@ -236,6 +237,7 @@ jobs: # Run simulation tests. cd tests/mgbench + rm -r .cache ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" - name: Run e2e tests From a54bcb9819bdded10e49ea3aefe3dbd1ba1f1b52 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 17:32:09 +0100 Subject: [PATCH 34/38] Remove un-necessary rm in workflow --- .github/workflows/diff.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/diff.yaml b/.github/workflows/diff.yaml index 731e9e816..550452b85 100644 --- a/.github/workflows/diff.yaml +++ b/.github/workflows/diff.yaml @@ -180,7 +180,6 @@ jobs: # Run simulation tests. cd tests/mgbench - rm -r .cache ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" release_build: @@ -237,7 +236,6 @@ jobs: # Run simulation tests. 
cd tests/mgbench - rm -r .cache ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" - name: Run e2e tests From 61b9457718423f045309a730539ae44060e48842 Mon Sep 17 00:00:00 2001 From: jeremy Date: Tue, 8 Nov 2022 11:35:54 +0100 Subject: [PATCH 35/38] Remove split-files logic from test code --- .github/workflows/diff.yaml | 4 ++-- tests/mgbench/benchmark.py | 1 - tests/mgbench/datasets.py | 14 -------------- tests/mgbench/runners.py | 5 +---- 4 files changed, 3 insertions(+), 21 deletions(-) diff --git a/.github/workflows/diff.yaml b/.github/workflows/diff.yaml index 550452b85..41573fc67 100644 --- a/.github/workflows/diff.yaml +++ b/.github/workflows/diff.yaml @@ -180,7 +180,7 @@ jobs: # Run simulation tests. cd tests/mgbench - ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" + ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "split-file splitfiles/accesscontrol_small.shard_configuration bolt-num-workers 1" release_build: name: "Release build" @@ -236,7 +236,7 @@ jobs: # Run simulation tests. cd tests/mgbench - ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" + ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "split-file splitfiles/accesscontrol_small.shard_configuration bolt-num-workers 1" - name: Run e2e tests run: | diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 40760f63e..6f37c9570 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -185,7 +185,6 @@ for dataset, tests in benchmarks: args.temporary_directory, not args.no_properties_on_edges, args.test_system_args, - dataset.get_split_file(), ) client = runners.Client(args.client_binary, args.temporary_directory) memgraph.start_preparation() diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 8722ada1d..3a5806629 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -63,9 +63,6 @@ class Dataset: raise ValueError("The size defined for this variant doesn't " "have the number of vertices and/or edges!") self._num_vertices = self._size["vertices"] self._num_edges = self._size["edges"] - self._split_file = self.SPLIT_FILES.get(variant, None) - assert self._split_file is not None - assert self._split_file != "" def prepare(self, directory): if self._file is not None: @@ -95,11 +92,6 @@ class Dataset: """Returns number of vertices/edges for the current variant.""" return self._size - def get_split_file(self): - """Returns the location of the split file of the dataset.""" - assert self._split_file is not None - return self._split_file - # All tests should be query generator functions that output all of the # queries that should be executed by the runner. 
The functions should be # named `benchmark__GROUPNAME__TESTNAME` and should not accept any @@ -307,12 +299,6 @@ class AccessControl(Dataset): "medium": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_medium.setup.cypher.gz", "large": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_large.setup.cypher.gz", } - SPLIT_FILES = { - "empty_only_index": "splitfiles/accesscontrol_small.shard_configuration", - "small": "splitfiles/accesscontrol_small.shard_configuration", - "medium": "splitfiles/accesscontrol_medium.shard_configuration", - "large": "splitfiles/accesscontrol_large.shard_configuration", - } SIZES = { "empty_only_index": { "vertices": 0, diff --git a/tests/mgbench/runners.py b/tests/mgbench/runners.py index acee68e07..2b69a811f 100644 --- a/tests/mgbench/runners.py +++ b/tests/mgbench/runners.py @@ -51,13 +51,12 @@ def _get_usage(pid): class Memgraph: - def __init__(self, memgraph_binary, temporary_dir, properties_on_edges, extra_args, split_file): + def __init__(self, memgraph_binary, temporary_dir, properties_on_edges, extra_args): self._memgraph_binary = memgraph_binary self._directory = tempfile.TemporaryDirectory(dir=temporary_dir) self._properties_on_edges = properties_on_edges self._proc_mg = None self._extra_args = extra_args - self._split_file = split_file atexit.register(self._cleanup) # Determine Memgraph version @@ -86,8 +85,6 @@ class Memgraph: for i in range(0, len(args_list), 2): kwargs[args_list[i]] = args_list[i + 1] - kwargs["split-file"] = self._split_file - return _convert_args_to_flags(self._memgraph_binary, **kwargs) def _start(self, **kwargs): From 33add3ecd067d7beb948a0fdeda6817470efefd3 Mon Sep 17 00:00:00 2001 From: jeremy Date: Wed, 9 Nov 2022 15:38:51 +0100 Subject: [PATCH 36/38] Force formatting --- tests/mgbench/benchmark.py | 8 ++++---- tests/mgbench/dataset_creator.py | 5 +++-- tests/mgbench/helpers.py | 1 - 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 6f37c9570..9d8423e89 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -15,19 +15,19 @@ import argparse import collections import copy import fnmatch +import importlib import inspect import json import multiprocessing +import os import random import sys +import time import datasets -import log import helpers +import log import runners -import importlib -import time -import os def get_queries(gen, count): diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 72b773593..9ebeb8cd1 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -9,9 +9,10 @@ # by the Apache License, Version 2.0, included in the file # licenses/APL.txt. 
-import random -import helpers import argparse +import random + +import helpers # Explaination of datasets: # - empty_only_index: contains index; contains no data diff --git a/tests/mgbench/helpers.py b/tests/mgbench/helpers.py index 1a4cd3c3e..b46e51db4 100644 --- a/tests/mgbench/helpers.py +++ b/tests/mgbench/helpers.py @@ -14,7 +14,6 @@ import json import os import subprocess - SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) From 968584a8fcf7ed1467c19f2c3ba808967c4c5b5c Mon Sep 17 00:00:00 2001 From: jeremy Date: Wed, 9 Nov 2022 16:02:25 +0100 Subject: [PATCH 37/38] Add comment force github workflow --- tests/mgbench/helpers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/mgbench/helpers.py b/tests/mgbench/helpers.py index b46e51db4..8a122b9e0 100644 --- a/tests/mgbench/helpers.py +++ b/tests/mgbench/helpers.py @@ -118,3 +118,6 @@ class Cache: def save_config(self, config): with open(self._config, "w") as f: json.dump(config.get_data(), f) + + +# Comment to force github workflow From 6df2db0d1911e16c22241f7cf24b13d780799637 Mon Sep 17 00:00:00 2001 From: jeremy Date: Wed, 9 Nov 2022 16:02:59 +0100 Subject: [PATCH 38/38] Remove comment force github workflow --- tests/mgbench/helpers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/mgbench/helpers.py b/tests/mgbench/helpers.py index 8a122b9e0..b46e51db4 100644 --- a/tests/mgbench/helpers.py +++ b/tests/mgbench/helpers.py @@ -118,6 +118,3 @@ class Cache: def save_config(self, config): with open(self._config, "w") as f: json.dump(config.get_data(), f) - - -# Comment to force github workflow
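After the final cleanup, benchmark.py ends up wiring the runner as summarized below. Note that the CI workflow passes `--test-system-arg`, which argparse accepts as an unambiguous abbreviation of `--test-system-args`. This is a hypothetical end-to-end sketch using the signatures from the diffs above; binary paths and argument values are examples only:

import runners

memgraph = runners.Memgraph(
    "memgraph",  # --memgraph-binary (example path)
    "/tmp",      # --temporary-directory
    True,        # properties on edges enabled
    "split-file splitfiles/accesscontrol_small.shard_configuration bolt-num-workers 1",
)
client = runners.Client("client", "/tmp")
memgraph.start_preparation()
ret = client.execute(file_path="dataset.cypher", num_workers=1)
usage = memgraph.stop()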